# 1 Imports

In [1]:
# imports

import os
import re
import json
import math
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from openai import OpenAI

In [2]:
# Call it with Tester.test(function_name, *args)

from testing import Tester

# 2 Connect to OpenAI & HuggingFace

In [3]:
# Load environment variables from a file called .env

load_dotenv()
# The OpenAI SDK authenticates with OPENAI_API_KEY. The earlier lookup used the
# misspelt name 'OPEN_API_KEY', which left api_key as None and unused.
api_key = os.getenv('OPENAI_API_KEY')
# Passing None makes the client fall back to the OPENAI_API_KEY environment
# variable itself, so this stays compatible with the previous bare OpenAI() call.
openai = OpenAI(api_key=api_key)

In [4]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN value from .env

load_dotenv()
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# 3 Load Dataset

In [5]:
# Load the training and test datasets from pickle files in the working directory.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only open
# .pkl files that you created yourself or otherwise trust.
with open('train.pkl', 'rb') as file:
    train = pickle.load(file)

with open('test.pkl', 'rb') as file:
    test = pickle.load(file)

# 4 Fine-Tuning Preparation

## 4.1 Data Selection

In [6]:
# Take the first 500 training items for fine-tuning and the following 500
# (indices 500-999, no overlap) as the validation set.
fine_tune_train = train[:500]
fine_tune_validation = train[500:1000]

## 4.2 JSONL Preparation

In [7]:
# Create a good prompt for the model

def message_for(item):
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar", "").replace("\n\nPrice is $", "")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": f"Price is $ {item.price:.2f}"}
    ]

In [8]:
# Sanity check: print the chat messages built for the first training item
print(message_for(train[0]))

[{'role': 'system', 'content': 'You estimate prices of items. Reply only with the price, no explanation'}, {'role': 'user', 'content': 'How much does this cost?\n\nAMSUNBO Brand Musical Instruments Crystal Singing Bowl A Note Third Eye Chakra 8 inch with Free Suede Striker and O ring\nBrand AMSUNBO Size 8 inch/ 20cm The Power of Crystal Singing Bowls “ Quartz is a vibrational transmitter” Wilson says. “We’re water . We are a perfect receptor for this kind of information.” Consider how running your finger around the edge of a crystal glass full of water makes it ring and the water itself shift. Because we are about 65–70% water, we react to the sound of the crystal bowls similarly, as their vibration penetrates into us. How to Play Quartz Crystal Singing Bowls Place the AMSUNBO crystal singing bowl symmetrically on its rubber o ring. Run the suede or'}, {'role': 'assistant', 'content': 'Price is $ 89.99'}]


In [9]:
# Convert the items into a list of json objects - 'jsonl' string

def make_jsonl(items):
    """Serialise items into a JSONL string, one ``{"messages": [...]}`` object per line.

    Args:
        items: Iterable of objects accepted by ``message_for``.

    Returns:
        The JSON lines joined with newlines and no trailing newline; an empty
        string when ``items`` is empty.
    """
    # json.dumps on the wrapper dict yields exactly '{"messages": [...]}', and
    # joining with "\n" leaves no trailing newline to strip afterwards.
    lines = [json.dumps({"messages": message_for(item)}) for item in items]
    return "\n".join(lines)

In [10]:
# Preview the JSONL text produced for the first three training items
print(make_jsonl(train[:3]))

{"messages": [{"role": "system", "content": "You estimate prices of items. Reply only with the price, no explanation"}, {"role": "user", "content": "How much does this cost?\n\nAMSUNBO Brand Musical Instruments Crystal Singing Bowl A Note Third Eye Chakra 8 inch with Free Suede Striker and O ring\nBrand AMSUNBO Size 8 inch/ 20cm The Power of Crystal Singing Bowls \u201c Quartz is a vibrational transmitter\u201d Wilson says. \u201cWe\u2019re water . We are a perfect receptor for this kind of information.\u201d Consider how running your finger around the edge of a crystal glass full of water makes it ring and the water itself shift. Because we are about 65\u201370% water, we react to the sound of the crystal bowls similarly, as their vibration penetrates into us. How to Play Quartz Crystal Singing Bowls Place the AMSUNBO crystal singing bowl symmetrically on its rubber o ring. Run the suede or"}, {"role": "assistant", "content": "Price is $ 89.99"}]}
{"messages": [{"role": "system", "con

## 4.3 Save JSONL

In [11]:
def write_jsonl(items, filename):
    """Write ``items`` to ``filename`` in JSONL format.

    Args:
        items: Iterable of objects accepted by ``make_jsonl``.
        filename: Path of the file to create or overwrite.

    Raises:
        Whatever ``make_jsonl`` raises for a bad item; in that case the target
        file is left untouched.
    """
    # Serialise before opening: mode 'w' truncates the file immediately, so
    # building the payload inside the `with` block would wipe an existing file
    # even when serialisation fails part-way through.
    jsonl = make_jsonl(items)
    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(jsonl)

In [12]:
# Write the training and validation splits to JSONL files in the working directory
write_jsonl(fine_tune_train, 'fine_tune_train.jsonl')
write_jsonl(fine_tune_validation, 'fine_tune_validation.jsonl')

## 4.4 Upload JSONL to OpenAI

In [13]:
# Upload the training JSONL to OpenAI for fine-tuning; the file is read in
# binary mode and the returned file object is kept for its id.
with open('fine_tune_train.jsonl', 'rb') as f:
    train_file = openai.files.create(file=f, purpose='fine-tune')

In [14]:
# Show the object returned for the training upload
print(train_file)

FileObject(id='file-VwYAG1NPviFoKE7XPymrMT', bytes=470782, created_at=1744753698, filename='fine_tune_train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None)


In [15]:
# Upload the validation JSONL to OpenAI for fine-tuning; the file is read in
# binary mode and the returned file object is kept for its id.
with open('fine_tune_validation.jsonl', 'rb') as f:
    validation_file = openai.files.create(file=f, purpose='fine-tune')

In [16]:
# Show the object returned for the validation upload
print(validation_file)

FileObject(id='file-526BNB6ZgYZUQ2cTJKWz3r', bytes=466051, created_at=1744753700, filename='fine_tune_validation.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None)


# 5 Fine-Tuning

In [17]:
# Start a fine-tuning job on the two uploaded files.
# NOTE(review): the returned job object is only displayed, not stored; a later
# cell recovers the job id through jobs.list - consider keeping the return value.
openai.fine_tuning.jobs.create(
    training_file=train_file.id,  # id of the uploaded training JSONL
    validation_file=validation_file.id,  # id of the uploaded validation JSONL
    model='gpt-4o-mini-2024-07-18',  # base model to fine-tune
    seed=42,  # fixed seed - presumably for reproducibility between runs
    hyperparameters={'n_epochs': 1},  # one epoch; other hyperparameters are left unset
    suffix='pricer'  # custom suffix passed for this job
)

FineTuningJob(id='ftjob-puz8yMngcy67ziHdVEFZGmn0', created_at=1744753703, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-8cYu3pUev8eHFUUPBndE5KXh', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-VwYAG1NPviFoKE7XPymrMT', validation_file='file-526BNB6ZgYZUQ2cTJKWz3r', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1)), type='supervised'), user_provided_suffix='pricer', metadata=None)

In [18]:
# List fine-tuning jobs, limited to a single result
openai.fine_tuning.jobs.list(limit=1)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-puz8yMngcy67ziHdVEFZGmn0', created_at=1744753703, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-8cYu3pUev8eHFUUPBndE5KXh', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-VwYAG1NPviFoKE7XPymrMT', validation_file='file-526BNB6ZgYZUQ2cTJKWz3r', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1)), type='supervised'), user_provided_suffix='pricer', metadata=None)], has_more=True, object='list')

In [19]:
# Pick the job that was started from this notebook's training upload instead of
# blindly taking the newest job in the organisation, which may belong to another
# run or another user.
recent_jobs = openai.fine_tuning.jobs.list(limit=20).data
job_id = next(
    (job.id for job in recent_jobs if job.training_file == train_file.id),
    None,
)
if job_id is None:
    raise LookupError(
        f"No fine-tuning job found for training file {train_file.id} among the "
        f"{len(recent_jobs)} most recently listed jobs"
    )

In [21]:
# Retrieve the fine-tuning job identified by job_id and display it
openai.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-puz8yMngcy67ziHdVEFZGmn0', created_at=1744753703, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-8cYu3pUev8eHFUUPBndE5KXh', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-VwYAG1NPviFoKE7XPymrMT', validation_file='file-526BNB6ZgYZUQ2cTJKWz3r', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1)), type='supervised'), user_provided_suffix='pricer', metadata=None)