In [None]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.utils import resample

In [None]:
# Read the source table; the first CSV column is used as the DataFrame index.
load_path = 'data/data_final.csv'
original_df = pd.read_csv(
    load_path,
    index_col=[0],
)

In [None]:
def generate_entailment_prompt(tweet, claim, cot_true, shot_true, front_true=False):
    """Assemble an [INST]/<<SYS>> style prompt asking how TWEET relates to CLAIM.

    Args:
        tweet: Text interpolated after "TWEET:".
        claim: Text interpolated after "CLAIM:".
        cot_true: When equal to True, " Let's think step by step." is appended
            right after "ANSWER:".
        shot_true: When equal to True, three worked examples (one per label)
            are inserted between the system block and the query.
        front_true: Chooses the answer-format constraint in the system block.
            True  -> label first, then explanation.
            False -> explanation first, then label (the default).
            Anything else (e.g. None) -> label only.

    Returns:
        The complete prompt string.
    """
    # Chain-of-thought trigger; empty when not requested.
    cot_suffix = " Let's think step by step." if cot_true == True else ""

    # Few-shot demonstrations; the surrounding newlines are part of the layout.
    few_shot = ""
    if shot_true == True:
        few_shot = """
TWEET: A dog is running in a field.
CLAIM: An animal is running in a field.
ANSWER: A dog is an animal. A dog running in a field is an animal running in a field. So the final answer is ENTAILMENT.

TWEET: A man is breaking three eggs in a bowl.
CLAIM: A girl is pouring some milk in a bowl.
ANSWER: A man is breaking three eggs in a bowl does not imply that a girl is pouring some milk in a bowl. So the final answer is NEUTRAL.

TWEET: A man is playing golf.
CLAIM: No man is playing golf.
ANSWER: A man is playing golf and no man is playing golf cannot be true at the same time. So the final answer is CONTRADICTION.
"""

    # Start from the label-only wording and override it for the two boolean cases.
    constraint = "Your answer should only be either ENTAILMENT, NEUTRAL, or CONTRADICTION."
    if front_true == True:
        constraint = "You must first choose from ENTAILMENT, NEUTRAL, or CONTRADICTION, and then provide an explanation."
    elif front_true == False:
        constraint = "You must provide an explanation, and then a final choice as ENTAILMENT, NEUTRAL, or CONTRADICTION."

    system_block = f"""<<SYS>> Which of the following best describes the relationship between TWEET and CLAIM? {constraint}

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false. <</SYS>>"""

    query_block = f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:{cot_suffix}"

    return f"<s>[INST] {system_block}\n{few_shot}\n{query_block} [/INST]"

In [None]:
# Show every column name so the generated-tweet and label columns can be looked up.
original_df.columns.tolist()

# Create balanced and imbalanced training datasets, in both original and reversed order

In [None]:
def get_train_set(original_df, model_name, balanced, reversed=False):
    """Build an instruction-tuning training set from generated tweets and save it as JSON.

    Every row of ``original_df`` yields three examples (CONTRADICTION,
    ENTAILMENT, NEUTRAL), each pairing the row's claim with the tweet stored in
    the matching ``generated_<kind>_tweet{suffix}_{model_name}`` column.

    Args:
        original_df: DataFrame with the columns 'claim_number', 'claim' and
            'generated_{entail,neutral,contradict}_tweet{suffix}_{model_name}',
            where ``suffix`` is '_reversed' when ``reversed`` is truthy and ''
            otherwise.
        model_name: Suffix of the generated-tweet columns; also used in the
            output file name.
        balanced: Truthy keeps the three labels equally frequent. Falsy
            resamples to roughly 50% ENTAILMENT, 35% NEUTRAL and the remainder
            CONTRADICTION, keeping the total number of examples unchanged.
        reversed: Falsy builds "TWEET -> CLAIM" prompts; truthy builds
            "CLAIM -> TWEET" prompts from the '_reversed' tweet columns.

    Returns:
        (df, train_dataset): the shuffled example DataFrame and the list of
        {"instruction", "input", "output"} dicts, in the same order.

    Side effects:
        Writes ``train_dataset`` to
        'data/llama_train_json/train_{model_name}{b_suffix}{suffix}.json',
        creating the directory if it does not exist.
    """
    # Truthiness (rather than `== True` / `== False` chains) guarantees both
    # suffixes are always bound, whatever flag value the caller passes.
    b_suffix = '_balanced' if balanced else '_unbalanced'
    suffix = '_reversed' if reversed else ''

    # (label, source column) pairs, in the order examples are emitted per row.
    label_columns = (
        ('CONTRADICTION', f'generated_contradict_tweet{suffix}_{model_name}'),
        ('ENTAILMENT', f'generated_entail_tweet{suffix}_{model_name}'),
        ('NEUTRAL', f'generated_neutral_tweet{suffix}_{model_name}'),
    )

    training_list = []
    for index, row in original_df.iterrows():
        for label, column in label_columns:
            training_list.append({
                'old_index': index,
                'claim_number': row['claim_number'],
                'claim': row['claim'],
                'generated_tweet': row[column],
                'ground_truth': label,
            })

    # Explicit columns keep the frame well-formed even when there are no rows.
    df = pd.DataFrame(
        training_list,
        columns=['old_index', 'claim_number', 'claim', 'generated_tweet', 'ground_truth'],
    )

    if not balanced:
        # Separate the classes
        df_entailment = df[df.ground_truth == 'ENTAILMENT']
        df_neutral = df[df.ground_truth == 'NEUTRAL']
        df_contradiction = df[df.ground_truth == 'CONTRADICTION']

        # Target class sizes: 50% / 35% / remainder of the original total.
        n_total = len(df)
        n_entailment = int(0.5 * n_total)
        n_neutral = int(0.35 * n_total)
        n_contradiction = n_total - n_entailment - n_neutral

        # ENTAILMENT and NEUTRAL are drawn with replacement because their
        # targets exceed the one-third of rows each class starts with;
        # CONTRADICTION is drawn without replacement.
        df = pd.concat([
            resample(df_entailment, replace=True, n_samples=n_entailment, random_state=42),
            resample(df_neutral, replace=True, n_samples=n_neutral, random_state=42),
            resample(df_contradiction, replace=False, n_samples=n_contradiction, random_state=42),
        ])

    # Deterministic shuffle so the three labels are interleaved.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # The instruction is the same for every example, so build it once.
    if reversed:
        # Opening tag is '<<SYS>>' (it was previously the malformed '<<SYS>'),
        # matching the non-reversed prompt and the reversed test-set prompt.
        instruction = """<<SYS>> Which of the following best describes the relationship between CLAIM and TWEET?

If CLAIM is true:
(ENTAILMENT) then TWEET is also true.
(NEUTRAL) TWEET cannot be said to be true or false.
(CONTRADICTION) then TWEET is false. <</SYS>>"""
    else:
        instruction = """<<SYS>> Which of the following best describes the relationship between TWEET and CLAIM?

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false. <</SYS>>"""

    train_dataset = []
    for _, row in df.iterrows():
        tweet = row['generated_tweet']
        claim = row['claim']
        ground_truth = row['ground_truth']

        if reversed:
            model_input = f"CLAIM: {claim}\nTWEET: {tweet}\nANSWER:"
        else:
            model_input = f"TWEET: {tweet}\nCLAIM: {claim}\nANSWER:"

        train_dataset.append({
            "instruction": instruction,
            "input": model_input,
            "output": f"{ground_truth}",
        })

    # Create the output directory up front so a fresh checkout does not fail
    # only at the final write.
    out_dir = 'data/llama_train_json'
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/train_{model_name}{b_suffix}{suffix}.json', 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=4)

    return df, train_dataset

In [None]:
# gpt-4, balanced, original order: build the set, print the first example, then show size and label counts.
df, train_dataset = get_train_set(original_df, 'gpt-4', balanced=True, reversed=False)
print(*(train_dataset[0][key] for key in ('instruction', 'input', 'output')), sep='\n')
len(train_dataset), df['ground_truth'].value_counts()

In [None]:
# gpt-4, balanced, reversed order: build the set, print the first example, then show size and label counts.
df, train_dataset = get_train_set(original_df, 'gpt-4', balanced=True, reversed=True)
print(*(train_dataset[0][key] for key in ('instruction', 'input', 'output')), sep='\n')
len(train_dataset), df['ground_truth'].value_counts()

In [None]:
# gpt-4, unbalanced, original order: build the set, print the first example, then show size and label counts.
df, train_dataset = get_train_set(original_df, 'gpt-4', balanced=False, reversed=False)
print(*(train_dataset[0][key] for key in ('instruction', 'input', 'output')), sep='\n')
len(train_dataset), df['ground_truth'].value_counts()

In [None]:
# gpt-4, unbalanced, reversed order: build the set, print the first example, then show size and label counts.
df, train_dataset = get_train_set(original_df, 'gpt-4', balanced=False, reversed=True)
print(*(train_dataset[0][key] for key in ('instruction', 'input', 'output')), sep='\n')
len(train_dataset), df['ground_truth'].value_counts()

In [None]:
# Write all four (balanced x order) training files for each remaining generator model.
# Loop nesting reproduces the call order: balanced before unbalanced, original before reversed.
for generator_name in ('gpt-3_5', '70b'):
    for is_balanced in (True, False):
        for is_reversed in (False, True):
            df, train_dataset = get_train_set(
                original_df,
                generator_name,
                balanced=is_balanced,
                reversed=is_reversed,
            )

# Build the test set JSON files

In [None]:
# test (original order)
# One example per row of original_df: the human-written tweet ('tweet'), the
# claim ('claim') and the label stored in the 'Mturk_1' column.
# The input uses the upper-case TWEET / CLAIM / ANSWER markers that the
# instruction text refers to and that get_train_set emits for training
# examples, so the test prompt has the same format the model was tuned on
# (it previously used 'Claim:' / 'Answer:').
test_dataset = []
for i, row in original_df.iterrows():
    tweet = row['tweet']
    claim = row['claim']
    ground_truth = row['Mturk_1']

    datum = {

"instruction": f"""<<SYS>> Which of the following best describes the relationship between TWEET and CLAIM?

If TWEET is true:
(ENTAILMENT) then CLAIM is also true.
(NEUTRAL) CLAIM cannot be said to be true or false.
(CONTRADICTION) then CLAIM is false. <</SYS>>""",

"input": f"""TWEET: {tweet}
CLAIM: {claim}
ANSWER:""",

"output": f"{ground_truth}"
    }

    test_dataset.append(datum)

In [None]:
# Save the original-order test examples as pretty-printed JSON.
with open('data/LLaMA-Efficient-Tuning/data/test.json', mode='w') as test_file:
    json.dump(test_dataset, fp=test_file, indent=4)

In [None]:
# test (reversed order)
# Same rows as the original-order test set, but the prompt asks whether CLAIM
# implies TWEET and the label is read from the 'Mturk_1_reversed' column.
reversed_test_instruction = """<<SYS>> Which of the following best describes the relationship between CLAIM and TWEET?

If CLAIM is true:
(ENTAILMENT) then TWEET is also true.
(NEUTRAL) TWEET cannot be said to be true or false.
(CONTRADICTION) then TWEET is false. <</SYS>>"""

test_dataset_r = [
    {
        "instruction": reversed_test_instruction,
        "input": f"CLAIM: {record['claim']}\nTWEET: {record['tweet']}\nANSWER:",
        "output": f"{record['Mturk_1_reversed']}",
    }
    for _, record in original_df.iterrows()
]

In [None]:
# Save the reversed-order test examples as pretty-printed JSON.
with open('data/LLaMA-Efficient-Tuning/data/test_r.json', mode='w') as test_file_r:
    json.dump(test_dataset_r, fp=test_file_r, indent=4)