In [1]:
!pip install -U openai




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import transformers
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [4]:
from openai import OpenAI
client = OpenAI(api_key="your_open_ai_key")

In [5]:
dataset = load_dataset("conv_ai_2")
df = pd.DataFrame(dataset['train'])

In [6]:
def preprocess_dialog(dialog):
    try:
        dialog_list = dialog
        dialog_text = " ".join([d['text'] for d in dialog_list])
        return dialog_text
    except:
        return ""

def preprocess_profile(profile):
    try:
        profile_list = profile
        profile_text = ' '.join([''.join(char_list) for char_list in profile_list])
        return profile_text
    except:
        return ""

# Apply preprocessing
df['dialog_text'] = df['dialog'].apply(preprocess_dialog)
df['bot_profile_text'] = df['bot_profile'].apply(preprocess_profile)
df['user_profile_text'] = df['user_profile'].apply(preprocess_profile)

# Combine dialog and profiles
df['combined_text'] = df['bot_profile_text'] + " [SEP] " + df['user_profile_text'] + " [SEP] " + df['dialog_text']

# Display the processed data
df[['combined_text', 'profile_match']].head()

Unnamed: 0,combined_text,profile_match
0,i have amazing children and grandchildren. i c...,0
1,my father was a door to door salesman. i've th...,1
2,i am a gold medalist olympian. i love italian ...,1
3,i fantasize about taking over the world. i'm a...,1
4,i am 40 years old. i work as a car salesman. m...,1


In [7]:
df = df[['combined_text', 'profile_match']]

In [8]:
# Stratified splitting. Assuming 'Top Category' can be used for stratification
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['profile_match'],
    random_state=42  # for reproducibility
)

In [9]:
def convert_to_gpt35_format(dataset):
    fine_tuning_data = []
    for _, row in dataset.iterrows():
        json_response = '{"profile_match": "' + str(row['profile_match']) + '"}'
        fine_tuning_data.append({
            "messages": [
                {"role": "user", "content": row['combined_text']},
                {"role": "system", "content": json_response}
            ]
        })
    return fine_tuning_data


In [10]:
gpt_train_data = convert_to_gpt35_format(train_data)
gpt_val_data = convert_to_gpt35_format(val_data)

In [12]:
gpt_val_data

[{'messages': [{'role': 'user',
    'content': 'i just bought a house recently. i eat a raw diet. i see the rolling stones in concert every year. i am married with two kids. i am an economics major. [SEP] my pets name is charlie. i drive a 2015 honda civic. i produce music for artists. my favourite food is pizza. i listen to rap music. [SEP] Hey! Are you an economics major? Because I am. How are you? I am great, how are you? Fuck you I am a big fan of classic rock Stupped bot Sorry this is as clear as mud to me. I am sure your ego is from mommy issues ? Maybe a pet can help calm you down.'},
   {'role': 'system', 'content': '{"profile_match": "1"}'}]},
 {'messages': [{'role': 'user',
    'content': "my boyfriend and i are moving into an apartment together next week. my favorite colors are red and silver. i'm fluent in english spanish and french. i'm an elementary school teacher. [SEP] i own two vintage mustangs. my favorite music is country. i like to work on vintage cars. i've two dog

In [11]:
def write_to_jsonl(data, file_path):
    with open(file_path, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')

training_file_name = "train.jsonl"
validation_file_name = "val.jsonl"

write_to_jsonl(train_data, training_file_name)
write_to_jsonl(val_data, validation_file_name)

In [None]:
# Upload Training and Validation Files
training_file = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
validation_file = client.files.create(
    file=open(validation_file_name, "rb"), purpose="fine-tune"
)

# Create Fine-Tuning Job
suffix_name = "gpt_conv_ai"
response = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)
response 

In [None]:

client.fine_tuning.jobs.list(limit=10) # search for suffix_name

In [None]:
response = client.fine_tuning.jobs.retrieve("ftjob-h5P9gmmDXysjbOD99jz8TtYv") # retrieve based on yours job
response
     

In [None]:
fine_tuned_model_id = response.fine_tuned_model
print("\nFine-tuned model id:", fine_tuned_model_id)

In [None]:
def format_test(row):
    formatted_message = [{"role": "user", "content": row['Support Query']}]
    return formatted_message

def predict(test_messages, fine_tuned_model_id):
    response = client.chat.completions.create(
        model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=50
    )
    return response.choices[0].message.content

def store_predictions(test_df,y_test, fine_tuned_model_id):
    test_df['Prediction'] = None
    pred = []
    for index, row in test_df.iterrows():
        test_message = format_test(row)
        prediction_result = predict(test_message, fine_tuned_model_id)
        pred.append(prediction_result)
    print("F1 Score : ", f1_score(pred,y_test))

    # test_df.to_csv("predictions.csv")

In [None]:
store_predictions(val_data,val_data['profile_match'], fine_tuned_model_id)