**GPT FINE-TUNING USING GPT-4.1**, i.e., to differentiate between two sports (baseball and hockey)

STEP 1 - INSTALLING THE REQUIRED LIBRARY

In [None]:
!pip install --quiet openai

In [None]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-2.3.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-2.3.0-py3-none-any.whl (999 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m999.8/999.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.109.1
    Uninstalling openai-1.109.1:
      Successfully uninstalled openai-1.109.1
Successfully installed openai-2.3.0


In [None]:
from sklearn.datasets import fetch_20newsgroups # dataset from sklearn
import pandas as pd
import openai
import json
from sklearn.model_selection import train_test_split

# Restrict the 20 Newsgroups training split to the two sports newsgroups we want to tell apart.
categories = ["rec.sport.baseball", "rec.sport.hockey"]
sports_dataset = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42,
)

STEP 2 - DATA PREPARATION

The data is divided into two fields, following the ChatGPT example:
1) texts - contains the text of the email (newsgroup post)
2) labels - contains the name of the sport (baseball or hockey)

In [None]:
# Put every post on a single line: trim the outer whitespace, then turn newlines into spaces.
texts = [
    raw_post.strip().replace("\n", " ")
    for raw_post in sports_dataset.data
]

# Look up each target's newsgroup name and keep only the part after the last dot
# (e.g. 'rec.sport.hockey' -> 'hockey').
labels = [
    sports_dataset.target_names[target_idx].rsplit(".", 1)[-1]
    for target_idx in sports_dataset.target
]

In [None]:
# Split the data so each part can be stored as a JSONL file in the chat format used for GPT-4.1 fine-tuning.
# 20% of the examples are held out for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

def create_jsonl_gpt4(file_path, texts, labels,
                      system_prompt="Classify the sport into either baseball or hockey."):
    """Write (text, label) pairs to a JSONL file of chat-format training records.

    Each output line is one JSON object with a "messages" list holding a
    system message (``system_prompt``), a user message (the cleaned text) and
    an assistant message (the cleaned label).

    Args:
        file_path: Destination path; an existing file is overwritten.
        texts: Iterable of input strings.
        labels: Iterable of label strings, paired positionally with ``texts``.
        system_prompt: Content of the system message written into every
            record. Defaults to the baseball/hockey classification prompt.

    Pairs whose text or label is empty after cleaning are skipped.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        for text, label in zip(texts, labels):
            # One record per line: newlines inside the text become spaces.
            text_clean = text.replace("\n", " ").strip()
            label_clean = label.strip()
            if not text_clean or not label_clean:
                continue
            record = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text_clean},
                    {"role": "assistant", "content": label_clean},
                ]
            }
            # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Write the training split, then the validation split, each to its own JSONL file.
create_jsonl_gpt4(
    file_path="sports_train_gpt4.jsonl",
    texts=train_texts,
    labels=train_labels,
)
create_jsonl_gpt4(
    file_path="sports_valid_gpt4.jsonl",
    texts=val_texts,
    labels=val_labels,
)





**STEP 3 - USING OPENAI TO UPLOAD FILES IN JSONL FORMAT**

In [None]:
# Make the OpenAI API key available through the environment without storing it in the notebook
import os
from getpass import getpass

# Keep a key that is already exported; otherwise prompt for it so the secret is never saved in a cell.
# (Assigning "" here would overwrite a valid key and make every API call fail authentication.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Upload training file
!openai api files.create -f "sports_prepared_train.jsonl" -p fine-tune

# Upload validation file
!openai api files.create -f "sports_prepared_valid.jsonl" -p fine-tune


Upload progress: 100% 1.82M/1.82M [00:00<00:00, 7.12Mit/s]
{
  "id": "file-Ya2AwbNcyCUC8a1UwEYiyo",
  "bytes": 1820561,
  "created_at": 1760282560,
  "filename": "sports_prepared_train.jsonl",
  "object": "file",
  "purpose": "fine-tune",
  "status": "processed",
  "expires_at": null,
  "status_details": null
}
Upload progress: 100% 414k/414k [00:00<00:00, 1.42Mit/s]
{
  "id": "file-RosDc3NQ6YV8NXScKPmWHr",
  "bytes": 414400,
  "created_at": 1760282562,
  "filename": "sports_prepared_valid.jsonl",
  "object": "file",
  "purpose": "fine-tune",
  "status": "processed",
  "expires_at": null,
  "status_details": null
}


**STEP 4 - FINETUNING OF THE MODEL**

In [None]:
# Create the fine-tuning job; the training itself runs on OpenAI's servers.
TRAINING_FILE_ID = "file-Ya2AwbNcyCUC8a1UwEYiyo" # "id" of the training file, copied from the upload output above
VALIDATION_FILE_ID = "file-RosDc3NQ6YV8NXScKPmWHr" # "id" of the validation file, copied from the upload output above

# IPython substitutes {TRAINING_FILE_ID} and {VALIDATION_FILE_ID} with the Python values before running the command.
!openai api fine_tuning.jobs.create \
  -F "{TRAINING_FILE_ID}" \
  -V "{VALIDATION_FILE_ID}" \
  -m "gpt-4.1-2025-04-14"

{
  "id": "ftjob-bM7TsZtpFOM657lSBPmR6OeY",
  "created_at": 1760282577,
  "error": {
    "code": null,
    "message": null,
    "param": null
  },
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "batch_size": "auto",
    "learning_rate_multiplier": "auto",
    "n_epochs": "auto"
  },
  "model": "gpt-4.1-2025-04-14",
  "object": "fine_tuning.job",
  "organization_id": "org-3seLMfR9IYGHHmACBVoy1Y8a",
  "result_files": [],
  "seed": 1200733570,
  "status": "validating_files",
  "trained_tokens": null,
  "training_file": "file-Ya2AwbNcyCUC8a1UwEYiyo",
  "validation_file": "file-RosDc3NQ6YV8NXScKPmWHr",
  "estimated_finish": null,
  "integrations": [],
  "metadata": null,
  "method": {
    "type": "supervised",
    "dpo": null,
    "reinforcement": null,
    "supervised": {
      "hyperparameters": {
        "batch_size": "auto",
        "learning_rate_multiplier": "auto",
        "n_epochs": "auto"
      }
    }
  },
  "user_provided_suffix": null,
  "usage_me

In [None]:
# Check the current status of the fine-tuning job; the -i value is the job "id" from the job-creation output above
!openai api fine_tuning.jobs.retrieve -i "ftjob-bM7TsZtpFOM657lSBPmR6OeY" # key from above


In [None]:
# Wait until the fine-tuning job reaches a terminal state, then automatically test the model on sample data
import json, time, subprocess, openai  # json and subprocess are unused in this cell; kept in case other cells rely on them
from openai import OpenAI

FT_JOB_ID = "ftjob-bM7TsZtpFOM657lSBPmR6OeY"
POLL_INTERVAL_SECONDS = 30
# "cancelled" is terminal too; without it the loop would spin forever on a cancelled job.
TERMINAL_STATUSES = {"succeeded", "failed", "cancelled"}

# The client reads OPENAI_API_KEY from the environment.
client = OpenAI()

# Poll through the SDK rather than parsing CLI stdout, so API errors surface as exceptions
# instead of an opaque JSON decoding failure on empty output.
while True:
    job = client.fine_tuning.jobs.retrieve(FT_JOB_ID)
    if job.status in TERMINAL_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

if job.status == "succeeded":
    model = job.fine_tuned_model
    # openai>=1.0 removed openai.ChatCompletion; requests go through client.chat.completions
    # and responses are objects read by attribute, not dicts.
    resp = client.chat.completions.create(
        model=model,
        messages=[
            # Same system prompt as in the training records, so inference matches the training format.
            {"role": "system", "content": "Classify the sport into either baseball or hockey."},
            {"role": "user", "content": "Classify this text: 'The hockey match was intense.'"},
        ],
    )
    print(resp.choices[0].message.content)
else:
    print(f"Fine-tuning did not succeed (status: {job.status}); error: {job.error}")
