In [20]:
from google.colab import drive

# Mount Google Drive so the dataset and output files under
# /content/drive/MyDrive are reachable from the cells below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [21]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
import json

In [13]:
# Load the source CSV from the mounted Drive; every later cell reads from
# `original_df`.
dataset_csv_path = '/content/drive/MyDrive/FACT-GPT dataset.csv'
original_df = pd.read_csv(dataset_csv_path)

In [14]:
def generate_entailment_prompt(tweet, claim):
    """Return the three-way entailment prompt for one tweet/claim pair.

    The prompt is wrapped in ``<s>[INST] <<SYS>> ... <</SYS>> ... [/INST]``
    markers, asks for exactly one of ENTAILMENT, NEUTRAL or CONTRADICTION,
    and ends with ``ANSWER: [/INST]`` so the label is the next thing to be
    generated.

    Args:
        tweet: Text substituted after ``TWEET:``.
        claim: Text substituted after ``CLAIM:``.

    Returns:
        The fully formatted prompt string.
    """
    return f"""<s>[INST] <<SYS>> Which of the following best describes the relationship between TWEET and CLAIM? Your answer should only be either ENTAILMENT, NEUTRAL, or CONTRADICTION.

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false. <</SYS>>

TWEET: {tweet}
CLAIM: {claim}
ANSWER: [/INST]"""

# Create Train Dataset

In [15]:
def get_train_set(original_df, model_name, output_path=None):
    """Build a shuffled instruction-tuning train set and save it as JSON.

    For every row of ``original_df`` three examples are created, pairing the
    row's claim with its generated contradicting, entailing and neutral tweet
    (appended in that order). The examples are shuffled with a fixed seed,
    rendered as ``instruction`` / ``input`` / ``output`` records and written
    to ``output_path``.

    Args:
        original_df: DataFrame providing the columns ``claim_number``,
            ``claim`` and ``generated_entail_tweet_<model_name>``,
            ``generated_neutral_tweet_<model_name>``,
            ``generated_contradict_tweet_<model_name>``.
        model_name: Suffix selecting which generated-tweet columns are read;
            also used in the default output file name.
        output_path: Destination of the JSON file. When ``None`` (default) the
            file is written to
            ``/content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/data/train_<model_name>.json``.

    Returns:
        A tuple ``(df, train_dataset)``: ``df`` is the shuffled DataFrame with
        columns ``old_index``, ``claim_number``, ``claim``,
        ``generated_tweet`` and ``ground_truth``; ``train_dataset`` is the
        list of record dicts that was written to disk, in the same order.

    Raises:
        KeyError: If ``original_df`` has rows but lacks a required column.
    """
    if output_path is None:
        output_path = f'/content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/data/train_{model_name}.json'

    # (label, source column) in the order examples are appended per row.
    # The order is significant: it fixes the pre-shuffle layout and therefore
    # the result of the seeded shuffle below.
    label_columns = (
        ('CONTRADICTION', f'generated_contradict_tweet_{model_name}'),
        ('ENTAILMENT', f'generated_entail_tweet_{model_name}'),
        ('NEUTRAL', f'generated_neutral_tweet_{model_name}'),
    )

    training_list = []
    for index, row in original_df.iterrows():
        for label, tweet_column in label_columns:
            training_list.append({
                'old_index': index,
                'claim_number': row['claim_number'],
                'claim': row['claim'],
                'generated_tweet': row[tweet_column],
                'ground_truth': label,
            })

    # Explicit columns keep the schema intact even when there are no rows.
    df = pd.DataFrame(
        training_list,
        columns=['old_index', 'claim_number', 'claim', 'generated_tweet', 'ground_truth'],
    )

    # Fixed seed so the shuffled order is reproducible across runs.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # The instruction is identical for every example, so build it once.
    instruction = (
        "<<SYS>> Which of the following best describes the relationship between TWEET and CLAIM?\n"
        "\n"
        "If TWEET is true:\n"
        "(ENTAILMENT) then CLAIM is also true.\n"
        "(NEUTRAL) CLAIM cannot be said to be true or false.\n"
        "(CONTRADICTION) then CLAIM is false. <</SYS>>"
    )

    train_dataset = [
        {
            "instruction": instruction,
            "input": f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:",
            "output": f"{ground_truth}",
        }
        for tweet, claim, ground_truth in zip(
            df['generated_tweet'], df['claim'], df['ground_truth']
        )
    ]

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=4)

    return df, train_dataset

In [22]:
df, train_dataset = get_train_set(original_df, 'gpt-4')

# Print the three fields of the first record as a quick sanity check.
first_example = train_dataset[0]
for field in ('instruction', 'input', 'output'):
    print(first_example[field])

# Cell output: number of records and how many examples each label has.
len(train_dataset), df['ground_truth'].value_counts()

# Make test set JSON (placeholder)

In [23]:
# test (original order)
# One record per row of `original_df`, kept in the original row order (no
# shuffle). The tweet comes from the `tweet` column and the expected output
# from the `Mturk_1` column.

# The instruction is the same for every record, so it is built once.
test_instruction = """<<SYS>> Which of the following best describes the relationship between TWEET and CLAIM?

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false. <</SYS>>"""

test_dataset = []
for i, row in original_df.iterrows():
    tweet = row['tweet']
    claim = row['claim']
    ground_truth = row['Mturk_1']

    # Field labels are upper-case ("CLAIM:", "ANSWER:") so the test input uses
    # exactly the same template as the training records written by
    # get_train_set; previously this cell used "Claim:" / "Answer:".
    datum = {
        "instruction": test_instruction,
        "input": f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:",
        "output": f"{ground_truth}",
    }

    test_dataset.append(datum)

In [19]:
# Write the test records next to the training file in the fine-tuning repo's
# data directory.
test_json_path = '/content/drive/MyDrive/Finetuning/LLaMA-Efficient-Tuning/data/test.json'
with open(test_json_path, 'w') as test_file:
    json.dump(test_dataset, test_file, indent=4)