## Install Dependencies

In [None]:
!pip install openai



## Import Libraries

In [None]:
import openai
import os
import json
from google.colab import drive, runtime

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## Access Data

In [None]:
# Directory on the mounted Drive that is scanned for the per-category *.json files.
data = '/content/drive/MyDrive/Taboo data/en'

## Load Data

In [None]:
# Maps a category name (the JSON file's stem) to the parsed content of that file.
taboo_data = {}

# Load each JSON file found directly inside the data directory
for filename in os.listdir(data):
    if filename.endswith(".json"):
        # splitext removes only the trailing ".json", so a file name that itself
        # contains dots keeps its full stem instead of being cut at the first dot.
        category = os.path.splitext(filename)[0]  # e.g., 'animals', 'cars'
        # Read as UTF-8 explicitly: open()'s default encoding depends on the
        # platform locale, which can garble or reject non-ASCII entries.
        with open(os.path.join(data, filename), 'r', encoding='utf-8') as file:
            taboo_data[category] = json.load(file)

# Display data
print(f"Categories loaded: {list(taboo_data.keys())}")

Categories loaded: ['literature', 'food', 'tv', 'sports', 'cars', 'web', 'animals', 'city-country', 'people', 'things']


## Prepare Training Data

### Prompt-completion format

In [None]:
# Build one prompt/completion record per target word, across every category.
# Category names are not part of the generated text, so only the per-category
# word -> taboo-word mappings are walked.
# Note: the completion is a fixed template string, not an actual description.
training_data = [
    {
        'prompt': f"Describe '{target}' without using the words: {', '.join(forbidden)}.",
        'completion': f"A description for '{target}'.",
    }
    for entries in taboo_data.values()
    for target, forbidden in entries.items()
]

# Show the first three records as a sanity check
print("Sample training data:", training_data[:3])

Sample training data: [{'prompt': "Describe '1984' without using the words: George Orwell.", 'completion': "A description for '1984'."}, {'prompt': "Describe 'A Doll's House' without using the words: Henrik Ibsen.", 'completion': "A description for 'A Doll's House'."}, {'prompt': "Describe 'Absalom, Absalom!' without using the words: William Faulkner.", 'completion': "A description for 'Absalom, Absalom!'."}]


### Chat format (compatible with gpt-4o)

In [None]:
# Build one chat-format example per target word: a user turn carrying the
# instruction, followed by an assistant turn carrying the (template) answer.
chat_training_data = [
    {
        "messages": [
            {
                "role": "user",
                "content": f"Describe '{target}' without using the words: {', '.join(forbidden)}.",
            },
            {
                "role": "assistant",
                "content": f"A description for '{target}'.",
            },
        ]
    }
    for entries in taboo_data.values()
    for target, forbidden in entries.items()
]

# Save chat-style data to a new JSONL file (one JSON object per line)
chat_output_path = '/content/drive/MyDrive/Taboo data/chat_training_data.jsonl'
with open(chat_output_path, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(example) + '\n' for example in chat_training_data)

print(f"Chat training data saved to {chat_output_path}")

Chat training data saved to /content/drive/MyDrive/Taboo data/chat_training_data.jsonl


## Save Training Data in JSONL Format

In [None]:
# Path of the JSONL file that is uploaded for fine-tuning further below.
# Note: this is the same path the chat-format cell writes to, so the file is
# simply rewritten with the same chat examples.
output_path = '/content/drive/MyDrive/Taboo data/chat_training_data.jsonl'

# Write one JSON document per line (JSONL); print() appends the newline.
with open(output_path, 'w') as jsonl_file:
    for example in chat_training_data:
        print(json.dumps(example), file=jsonl_file)

print(f"Training data saved to {output_path}")

Training data saved to /content/drive/MyDrive/Taboo data/chat_training_data.jsonl


## Set up OpenAI API Key

In [None]:
from google.colab import userdata

# Fetch the API key through google.colab's userdata.get instead of hardcoding it in the notebook.
# NOTE(review): the value is looked up under the name "OPEN_API_KEY" (not "OPENAI_API_KEY") —
# confirm this matches the name under which the key is actually stored.
OPENAI_API_KEY = userdata.get("OPEN_API_KEY")
# Also set the key on the openai module itself, in addition to keeping it in OPENAI_API_KEY.
openai.api_key = OPENAI_API_KEY

## Upload Training File to OpenAI

In [None]:
from openai import OpenAI

# Client authenticated with the key loaded in the API-key cell.
client = OpenAI(api_key=OPENAI_API_KEY)

# Upload the JSONL file (opened in binary mode) and tag it for fine-tuning.
with open(output_path, "rb") as jsonl_handle:
    training_file = client.files.create(purpose="fine-tune", file=jsonl_handle)

# The returned file ID is what the fine-tuning job refers to.
print("Training file ID:", training_file.id)

Training file ID: file-R9qd4t2bI5AonslJJHR4ckPI


## Start the Fine-Tuning Job

In [None]:
# Parameters of the fine-tuning job: the uploaded training file, the base
# model to tune, and the suffix attached to the job.
job_request = {
    "training_file": training_file.id,
    "model": "gpt-4o-2024-08-06",
    "suffix": "TabooGPT-4o",
}

# Submit the job; the returned job object carries the job ID used for monitoring.
fine_tune_response = client.fine_tuning.jobs.create(**job_request)

print("Fine-tuning job details:", fine_tune_response)

Fine-tuning job details: FineTuningJob(id='ftjob-TiNk77Rnyxv2AYNKM8wxh2Lb', created_at=1731425914, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-0zORcMd10Hj9lDB13OACLHnu', result_files=[], seed=1528435292, status='validating_files', trained_tokens=None, training_file='file-R9qd4t2bI5AonslJJHR4ckPI', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='TabooGPT-4o')


## Monitor the Fine-Tuning Job Status (polls every 60 seconds while the job is in progress)

In [None]:
import time

# ID of the job created in the previous cell; used to look the job up again.
fine_tune_job_id = fine_tune_response.id

# Seconds to wait between two consecutive status checks.
POLL_INTERVAL_SECONDS = 60

# Poll until the job reaches a terminal state. Besides 'succeeded' and 'failed',
# a job can also end as 'cancelled'; without a branch for it the loop would
# never exit and the cell would have to be interrupted by hand.
while True:
    status_response = client.fine_tuning.jobs.retrieve(fine_tune_job_id)
    print("Fine-tuning job status:", status_response.status)

    if status_response.status == 'succeeded':
        print("Fine-tuning job completed successfully!")
        break
    elif status_response.status == 'failed':
        print("Fine-tuning job failed.")
        print("Error details:", status_response.error)
        break
    elif status_response.status == 'cancelled':
        print("Fine-tuning job was cancelled.")
        break

    # Wait before checking again
    time.sleep(POLL_INTERVAL_SECONDS)

Fine-tuning job status: validating_files
Fine-tuning job status: validating_files
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status: running
Fine-tuning job status

KeyboardInterrupt: 