In [6]:
## Fine Tune GPT 3.5 Turbo with custom dataset


In [21]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [22]:
df=pd.read_excel("traffic_complaints.xlsx")
df.head()

Unnamed: 0,Complaint Description,Category,Sub-category
0,"Buses at Ring Road stop at random places, bloc...",Public Transport,Unscheduled Stops
1,"Buses at Main Market stop at random places, bl...",Public Transport,Unscheduled Stops
2,Important traffic signs are missing at Ring Ro...,Road Infrastructure,Missing Signage
3,Important traffic signs are missing at Tech Pa...,Road Infrastructure,Missing Signage
4,There is a severe shortage of legal parking sp...,Parking Issues,Lack of Parking Spaces


In [9]:
df.shape

(200, 3)

In [23]:
#Format DataSet like above message

In [24]:
def convert_to_gpt35(data):
    converted_data = []
    for _, row in data.iterrows():
        json_response='{"Category":"'+ row["Category"]+'","Sub-category":"'+ row["Sub-category"]+'"}'
        converted_data.append({
            "messages": [
                {"role": "user", "content": row['Complaint Description']},
                {"role": "assistant", "content": json_response}
            ]
        })
    return converted_data

In [25]:
converted_data = convert_to_gpt35(df)
converted_data[0]['messages']

[{'role': 'user',
  'content': 'Buses at Ring Road stop at random places, blocking lanes and causing traffic disruption.'},
 {'role': 'assistant',
  'content': '{"Category":"Public Transport","Sub-category":"Unscheduled Stops"}'}]

In [26]:
# import json
# json.loads(converted_data[0]['messages'][1]['content'])

In [27]:
#Train & Test split
train_data,test_data=train_test_split(converted_data,test_size=0.2,stratify=df["Category"],random_state=42)

# New Section

In [28]:
#Create JSONL file
def write_to_jsonl(data, file_path):
    with open(file_path, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')

train_file_name="training.jsonl"
validation_file_name="validation.jsonl"

write_to_jsonl(train_data,train_file_name)
write_to_jsonl(test_data,validation_file_name)

In [None]:
from openai import OpenAI

OPENAI_API_KEY='your open api key'
client = OpenAI(api_key=OPENAI_API_KEY)

In [30]:
#Upload training and validation file
training_file=client.files.create(file=open(train_file_name,'rb'),purpose='fine-tune')
validation_file=client.files.create(file=open(validation_file_name,'rb'),purpose='fine-tune')

In [31]:
print("Training   Id:",training_file.id)
print("Validation Id:",validation_file.id)

Training   Id: file-13gxzfYtJWBh9bAPSe1cpE
Validation Id: file-B6V5qqPP62w2ZfyuQK5cze


In [39]:
#Create Fine Tuning Job

suffix="finetune-gpt35"

response=client.fine_tuning.jobs.create(training_file=training_file.id,
                                        validation_file=validation_file.id,
                                        model="gpt-3.5-turbo",
                                        suffix=suffix,)

response

FineTuningJob(id='ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx', created_at=1751728085, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-h8OAQgHSlxnHulUbd4UK9zah', result_files=[], seed=1014096164, status='validating_files', trained_tokens=None, training_file='file-13gxzfYtJWBh9bAPSe1cpE', validation_file='file-B6V5qqPP62w2ZfyuQK5cze', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'))), user_provided_suffix='finetune-gpt35', usage_metrics=None, shared_with_openai=False, eval_id=None)

In [40]:
#All Finetuning Jobs

client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx', created_at=1751728085, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-h8OAQgHSlxnHulUbd4UK9zah', result_files=[], seed=1014096164, status='validating_files', trained_tokens=None, training_file='file-13gxzfYtJWBh9bAPSe1cpE', validation_file='file-B6V5qqPP62w2ZfyuQK5cze', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'))), user_provided_suffix='finetune-gpt35', usage_metrics=None, shared_with_openai=False, eval_id=None), FineTuningJob(id='ftjob-AtrGPvTyZQssPfHGDlyVI6vY', created_at

In [48]:
#Retrieve Specific Job

response = client.fine_tuning.jobs.retrieve(response.id)
print('Job Status :',response.status)
print(response)

Job Status : succeeded
FineTuningJob(id='ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx', created_at=1751728085, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal:finetune-gpt35:Bpz2JuJT', finished_at=1751729049, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-h8OAQgHSlxnHulUbd4UK9zah', result_files=['file-4NJtSy6T2mGoeTxpQi3hXH'], seed=1014096164, status='succeeded', trained_tokens=20235, training_file='file-13gxzfYtJWBh9bAPSe1cpE', validation_file='file-B6V5qqPP62w2ZfyuQK5cze', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3))), user_provided_suffix='finetune-gpt35', usage_metrics=None, shared_with_openai=False, eval_id=None)


In [42]:
from time import sleep
from datetime import datetime

def my_function():
    start_time = datetime.now()
    print("Timestamp:", start_time)

    # # your code logic here

    # end_time = datetime.now()
    # print("End Time:", end_time)

def monitor_job(job_id):
    """Monitor fine-tuning job progress"""
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")
        my_function()

        if job.status in ["succeeded", "failed"]:
            return job

        # List latest events
        events = client.fine_tuning.jobs.list_events(
            fine_tuning_job_id=job_id,
            limit=5
        )
        for event in events.data:
            print(f"Event: {event.message}")

        sleep(30)  # Check every 30 seconds

# Monitor the job until completion
job = monitor_job(response.id)
if job.status == "succeeded":
    fine_tuned_model = job.fine_tuned_model
    print(f"Fine-tuned model ID: {fine_tuned_model}")
else:
    print("Fine-tuning failed.")


Status: validating_files
Timestamp: 2025-07-05 15:08:12.889014
Event: Validating training file: file-13gxzfYtJWBh9bAPSe1cpE and validation file: file-B6V5qqPP62w2ZfyuQK5cze
Event: Created fine-tuning job: ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx
Status: running
Timestamp: 2025-07-05 15:08:43.365952
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Validating training file: file-13gxzfYtJWBh9bAPSe1cpE and validation file: file-B6V5qqPP62w2ZfyuQK5cze
Event: Created fine-tuning job: ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx
Status: running
Timestamp: 2025-07-05 15:09:13.822927
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Validating training file: file-13gxzfYtJWBh9bAPSe1cpE and validation file: file-B6V5qqPP62w2ZfyuQK5cze
Event: Created fine-tuning job: ftjob-ZNIsvdx4bBsqGBEnwx2o3LKx
Status: running
Timestamp: 2025-07-05 15:09:44.306637
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Val

In [49]:

fine_tuned_model_id = response.fine_tuned_model
print("\nFine-tuned model id:", fine_tuned_model_id)



Fine-tuned model id: ft:gpt-3.5-turbo-0125:personal:finetune-gpt35:Bpz2JuJT


In [50]:
#Test Finetuned Model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def format_test(row):

    formatted_message = [
        {
            "role": "user",
            "content": row['Complaint Description']
        }
    ]
    return formatted_message


def predict(test_messages, fine_tuned_model_id):

    response = client.chat.completions.create(
        model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=50
    )

    return response.choices[0].message.content

In [53]:
def store_predictions(test_df, fine_tuned_model_id):
    start_time = datetime.now()
    print("Start Time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

    print("fine_tuned_model_id:", fine_tuned_model_id)

    test_df['Prediction'] = None

    for index, row in test_df.iterrows():
        test_message = format_test(row)  # Assume this formats the input correctly
        prediction_result = predict(test_message, fine_tuned_model_id)  # Your prediction logic
        test_df.at[index, 'Prediction'] = prediction_result

    end_time = datetime.now()
    print("End Time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))

    # Save to Excel
    test_df.to_excel("predictions.xlsx", index=False)
    print("Predictions saved to predictions.xlsx")

In [54]:
test_df = pd.read_excel("traffic_complaints_test_data.xlsx")
store_predictions(test_df, fine_tuned_model_id)

Start Time: 2025-07-05 15:38:51
fine_tuned_model_id: ft:gpt-3.5-turbo-0125:personal:finetune-gpt35:Bpz2JuJT
End Time: 2025-07-05 15:39:02
Predictions saved to predictions.xlsx
