# Initializing the setup

In [None]:
import pandas as pd
import openai

In [None]:
from IPython.display import clear_output
import time

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import json

import os

In [None]:
# Load the API key from a local text file. Surrounding whitespace is stripped so
# that a trailing newline in the file does not become part of the key.
with open("OPENAI_API_KEY.txt", "r") as file:
    openai.api_key = file.read().strip()

# Open dataframe

In [None]:
# Load the dataset; the first CSV column is used as the DataFrame index.
original_df = pd.read_csv('FACT-GPT dataset.csv', index_col=[0])

In [None]:
def get_train_and_validation_sets(original_df, model_name):
    """Build labelled (claim, generated tweet) examples and split them 80:20.

    Every row of ``original_df`` contributes three examples, one per label,
    taken from the ``generated_*_tweet_{model_name}`` columns.

    Args:
        original_df: DataFrame with ``claim_number``, ``claim`` and the three
            ``generated_{entail,neutral,contradict}_tweet_{model_name}`` columns.
        model_name: Suffix selecting which model's generated tweets to use.

    Returns:
        A ``(train_df, val_df)`` tuple of DataFrames with the columns
        ``old_index``, ``claim_number``, ``claim``, ``generated_tweet`` and
        ``ground_truth``.
    """
    # (label, source column) pairs, in the order the examples are emitted per
    # claim; this order feeds the seeded shuffle below, so it must not change.
    label_sources = (
        ('CONTRADICTION', f'generated_contradict_tweet_{model_name}'),
        ('ENTAILMENT', f'generated_entail_tweet_{model_name}'),
        ('NEUTRAL', f'generated_neutral_tweet_{model_name}'),
    )

    examples = []
    for source_index, source_row in original_df.iterrows():
        for label, tweet_column in label_sources:
            examples.append({
                'old_index': source_index,
                'claim_number': source_row['claim_number'],
                'claim': source_row['claim'],
                'generated_tweet': source_row[tweet_column],
                'ground_truth': label,
            })

    examples_df = pd.DataFrame(examples)

    # Seeded shuffle with a fresh 0..n-1 index, followed by a seeded 80:20 split
    shuffled_df = examples_df.sample(frac=1, random_state=42).reset_index(drop=True)
    train_df, val_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)

    return train_df, val_df

# Build the splits from the 'gpt-4' generated-tweet columns of the dataset.
train_df, val_df = get_train_and_validation_sets(original_df, 'gpt-4')

In [None]:
# Convert a DataFrame of labelled examples into chat-format training records
def create_dataset(df):
    """Turn each row of ``df`` into one chat-format example.

    Args:
        df: DataFrame with ``generated_tweet``, ``claim`` and ``ground_truth``
            columns.

    Returns:
        A list with one ``{"messages": [...]}`` dict per row, in row order.
        Each holds a system instruction, a user message carrying the tweet and
        the claim, and an assistant message carrying the ground-truth label.
    """
    system_prompt = """Which of the following best describes the relationship between TWEET and CLAIM?

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false."""

    def to_example(record):
        # One record -> system / user / assistant message triple
        tweet = record['generated_tweet']
        claim = record['claim']
        ground_truth = record['ground_truth']
        return {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:"},
                {"role": "assistant", "content": f"{ground_truth}"},
            ]
        }

    return [to_example(record) for _, record in df.iterrows()]

# Build the chat-format examples for both splits
train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)

# Write each split as JSON Lines: one serialized example per line
for jsonl_path, examples in (('train_gpt-4.jsonl', train_dataset),
                             ('val_gpt-4.jsonl', val_dataset)):
    with open(jsonl_path, 'w') as jsonl_file:
        jsonl_file.writelines(json.dumps(example) + "\n" for example in examples)

# Make and upload the JSONL files

In [None]:
# Upload the training file. The handle is opened in a `with` block so it is
# closed once the upload call returns instead of being left open.
with open('train_gpt-4.jsonl', "rb") as train_file:
    train_upload = openai.File.create(
        file=train_file,
        purpose='fine-tune'
    )

# Keep the response as the cell's last expression so the notebook still shows it
train_upload

In [None]:
# Upload the validation file. The handle is opened in a `with` block so it is
# closed once the upload call returns instead of being left open.
with open('val_gpt-4.jsonl', "rb") as val_file:
    val_upload = openai.File.create(
        file=val_file,
        purpose='fine-tune'
    )

# Keep the response as the cell's last expression so the notebook still shows it
val_upload

In [None]:
# Maps each local JSONL file name to its OpenAI file "id".
# The values are placeholders and must be filled in by hand before this cell
# can run (presumably with the ids shown by the upload cells - confirm).
openai_file_dict = {
 'train_gpt-4.jsonl': ### file "id" ###,
 'val_gpt-4.jsonl': ### file "id" ###,
}

# Fine-tuning (gpt-3.5-turbo)

In [None]:
import time

# Job creation is retried on failure (presumably because the uploaded files may
# not be ready yet - confirm). The retries are bounded so that a permanent
# error, such as a wrong key or file id, surfaces instead of looping forever.
MAX_ATTEMPTS = 12
RETRY_DELAY_SECONDS = 300

for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        openai.FineTuningJob.create(training_file=openai_file_dict['train_gpt-4.jsonl'],
                                    validation_file=openai_file_dict['val_gpt-4.jsonl'],
                                    model="gpt-3.5-turbo",
                                    hyperparameters={"n_epochs": 3})
        break
    except Exception as e:
        print(e)
        if attempt == MAX_ATTEMPTS:
            # Out of attempts: re-raise the last error rather than hiding it
            raise
        time.sleep(RETRY_DELAY_SECONDS)

In [None]:
# Show the 'data' entry of the fine-tuning job listing (presumably used to look
# up the fine-tuned model id needed in the next cell - confirm).
openai.FineTuningJob.list()['data']

In [None]:
# Placeholder: replace with the fine-tuned model "id" before running this cell.
model_name = ### model "id" ###

# Annotation loop

In [None]:
# NOTE: test_df is an alias of original_df, not a copy, so the annotation
# column is added to original_df as well.
test_df = original_df

# Create the annotation column only when it is missing. Resetting it on every
# run would wipe annotations already stored in the dataset and defeat the
# pd.isnull() check the annotation loop uses to skip finished rows.
if 'gpt-3_5_finetuned_on_gpt_4' not in test_df.columns:
    test_df['gpt-3_5_finetuned_on_gpt_4'] = None

In [None]:
start_time = time.time()

annotation_column = 'gpt-3_5_finetuned_on_gpt_4'

# The instruction text is the same for every row, so it is built once
system_prompt = """Which of the following best describes the relationship between TWEET and CLAIM?

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false."""

# Index labels of rows that could not be annotated. Per-row error prints are
# erased by clear_output() below, so failures are also reported after the loop.
failed_rows = []

# Iterate through the DataFrame
for i, row in test_df.iterrows():
    # Rows that already hold an annotation need neither an API call nor a re-save
    if pd.isnull(row[annotation_column]):
        tweet = row['tweet']
        claim = row['claim']

        try:
            retries = 3
            while retries > 0:
                try:
                    completion = openai.ChatCompletion.create(
                        model=model_name,
                        messages=[
                            {'role': 'system', 'content': system_prompt},
                            {"role": "user", "content": f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:"},
                        ],
                        temperature=0
                    )
                    test_df.at[i, annotation_column] = completion.choices[0].message['content']
                    break
                except openai.error.OpenAIError as e:
                    print(f"Error: {e}")
                    retries -= 1
                    time.sleep(5)  # Wait for 5 seconds before retrying
            else:
                # All retries failed: the row stays unannotated
                failed_rows.append(i)
        except Exception as e:
            print(e)
            failed_rows.append(i)

        # Checkpoint after every processed row so an interruption loses at most one call
        test_df.to_csv('FACT-GPT dataset.csv')

    end_time = time.time()
    runtime = end_time - start_time

    clear_output(wait=True)

    print(f"Iteration: {i+1}, Runtime: {runtime} seconds")

# Printed after the last clear_output(), so it stays visible
if failed_rows:
    print(f"Rows left unannotated after errors: {failed_rows}")