# Open dataframe

In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
import json

In [3]:
original_df = pd.read_csv('FACT-GPT dataset.csv', index_col=[0])

In [4]:
original_df.columns

Index(['platform', 'claim_number', 'claim', 'date_claimed', 'date_checked',
       'verdict', 'tweet_id', 'tweet_date', 'score', 'Mturk_1', 'Mturk_2',
       'Mturk_3', 'Mturk_4', 'Mturk_5', 'generated_entail_tweet_gpt-4',
       'generated_contradict_tweet_gpt-4', 'generated_neutral_tweet_gpt-4',
       'annotation_only_gpt-4', 'annotation_only_gpt-3_5',
       'annotation_only_70b', 'annotation_only_13b', 'annotation_only_7b',
       'gpt-3_5_finetuned_on_gpt_4', 'gpt-3_5_finetuned_on_gpt_3_5',
       'gpt-3_5_finetuned_on_70b', '13b_finetuned_on_gpt_4',
       '13b_finetuned_on_gpt_3_5', '13b_finetuned_on_70b',
       '7b_finetuned_on_gpt_4', '7b_finetuned_on_gpt_3_5',
       '7b_finetuned_on_70b'],
      dtype='object')

In [5]:
columns_eval = ['annotation_only_gpt-4',
 'annotation_only_gpt-3_5',
 'annotation_only_13b',
 'annotation_only_7b',
 'gpt-3_5_finetuned_on_gpt_4',
 'gpt-3_5_finetuned_on_gpt_3_5',
 'gpt-3_5_finetuned_on_70b',
 '13b_finetuned_on_gpt_4',
 '13b_finetuned_on_gpt_3_5',
 '13b_finetuned_on_70b',
 '7b_finetuned_on_gpt_4',
 '7b_finetuned_on_gpt_3_5',
 '7b_finetuned_on_70b'
]

## Preprocess predicted classes

In [7]:
import re

def find_all_annotation(s):
    # Convert the string to uppercase for case-insensitive matching.
    s = str(s).upper()
    tokens = re.findall(r'\w+', s)

    # Map for valid tokens that simplifies various forms to their base form.
    valid_tokens_map = {
        'ENTAIL': 'ENTAILMENT',
        'ENTAILS': 'ENTAILMENT',
        'ENTAILING': 'ENTAILMENT',
        'ENTAILMENT': 'ENTAILMENT',

        'CONTRADICT': 'CONTRADICTION',
        'CONTRADICTS': 'CONTRADICTION',
        'CONTRADICTING': 'CONTRADICTION',
        'CONTRADICTION': 'CONTRADICTION',

        'NEUTRAL': 'NEUTRAL',
    }

    valid_tokens = set([
        'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION',
    ])

    # We map each token to its valid form if present in the map, otherwise keep the token as is
    annotate_list = [valid_tokens_map.get(token, token) for token in tokens if token in valid_tokens_map.keys()]

    # Return the first valid annotation found, or None if none are found.
    if len(annotate_list) == 0:
        return None
    else:
        return annotate_list[0]

# Apply the modified function:
original_df[columns_eval] = original_df[columns_eval].applymap(find_all_annotation)

# This line is meant to demonstrate how to find indices where the column values are None, which might not directly relate to fixing the function.
none_indices = {col: original_df[original_df[col].isna()].index.tolist() for col in columns_eval}

In [8]:
original_df[columns_eval].head()

Unnamed: 0,annotation_only_gpt-4,annotation_only_gpt-3_5,annotation_only_13b,annotation_only_7b,gpt-3_5_finetuned_on_gpt_4,gpt-3_5_finetuned_on_gpt_3_5,gpt-3_5_finetuned_on_70b,13b_finetuned_on_gpt_4,13b_finetuned_on_gpt_3_5,13b_finetuned_on_70b,7b_finetuned_on_gpt_4,7b_finetuned_on_gpt_3_5,7b_finetuned_on_70b
0,NEUTRAL,CONTRADICTION,CONTRADICTION,CONTRADICTION,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT
1,NEUTRAL,ENTAILMENT,CONTRADICTION,ENTAILMENT,ENTAILMENT,ENTAILMENT,ENTAILMENT,NEUTRAL,NEUTRAL,NEUTRAL,ENTAILMENT,ENTAILMENT,ENTAILMENT
2,NEUTRAL,NEUTRAL,CONTRADICTION,CONTRADICTION,CONTRADICTION,CONTRADICTION,CONTRADICTION,NEUTRAL,CONTRADICTION,CONTRADICTION,CONTRADICTION,ENTAILMENT,NEUTRAL
3,CONTRADICTION,NEUTRAL,CONTRADICTION,CONTRADICTION,NEUTRAL,ENTAILMENT,NEUTRAL,NEUTRAL,NEUTRAL,NEUTRAL,NEUTRAL,ENTAILMENT,ENTAILMENT
4,NEUTRAL,NEUTRAL,CONTRADICTION,CONTRADICTION,ENTAILMENT,ENTAILMENT,NEUTRAL,ENTAILMENT,NEUTRAL,NEUTRAL,ENTAILMENT,ENTAILMENT,ENTAILMENT


In [9]:
# Function to count rows that are not one word
def count_non_single_word_rows(column):
    return sum(original_df[column].apply(lambda x: len(str(x).split()) != 1 or x is None))

counts = {column: count_non_single_word_rows(column) for column in original_df.columns if column in columns_eval}

# Print the results
for column, count in counts.items():
    print(f"{column}: {count} rows are not one word")

annotation_only_gpt-4: 0 rows are not one word
annotation_only_gpt-3_5: 0 rows are not one word
annotation_only_13b: 2 rows are not one word
annotation_only_7b: 0 rows are not one word
gpt-3_5_finetuned_on_gpt_4: 0 rows are not one word
gpt-3_5_finetuned_on_gpt_3_5: 0 rows are not one word
gpt-3_5_finetuned_on_70b: 0 rows are not one word
13b_finetuned_on_gpt_4: 0 rows are not one word
13b_finetuned_on_gpt_3_5: 0 rows are not one word
13b_finetuned_on_70b: 1 rows are not one word
7b_finetuned_on_gpt_4: 0 rows are not one word
7b_finetuned_on_gpt_3_5: 0 rows are not one word
7b_finetuned_on_70b: 0 rows are not one word


# Evaluation

## Set ground truth

In [11]:
from collections import Counter

# Columns for 'Mturk'
mturk_columns = ['Mturk_1', 'Mturk_2', 'Mturk_3', 'Mturk_4', 'Mturk_5']

# Function to calculate the majority vote, considering ties
def majority_vote(row):
    counter = Counter(row)
    max_count = max(counter.values())
    return [k for k, v in counter.items() if v == max_count]

# Calculate the majority vote for 'Mturk~'
original_df['Majority_Mturk'] = original_df[mturk_columns].apply(majority_vote, axis=1)

In [12]:
original_df['Majority_Mturk'].value_counts()

[ENTAILMENT]                   647
[NEUTRAL]                      433
[CONTRADICTION]                 90
[NEUTRAL, ENTAILMENT]           17
[ENTAILMENT, NEUTRAL]           17
[CONTRADICTION, NEUTRAL]         8
[CONTRADICTION, ENTAILMENT]      5
[ENTAILMENT, CONTRADICTION]      5
[NEUTRAL, CONTRADICTION]         3
Name: Majority_Mturk, dtype: int64

In [14]:
import pickle

# # Function to randomly tie-break and aggregate based on the given scheme
# def random_tie_break(val1_list):
#     val1 = np.random.choice(val1_list, 1)[0] if isinstance(val1_list, list) else val1_list
#     label_set = {'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION'}

#     if val1 == 'ENTAILMENT':
#         return 'ENTAILMENT'
#     elif val1 == 'CONTRADICTION':
#         return 'CONTRADICTION'
#     elif val1 not in label_set:
#         return None
#     else:
#         return 'NEUTRAL'

# # Set the seed for reproducibility
# np.random.seed(42)

# # Convert the relevant columns to NumPy arrays for faster computation
# majority_mturk = original_df['Majority_Mturk'].to_numpy()

# # Initialize a list to store the randomly tie-broken and aggregated lists
# random_aggregated_mturks = []

# # Create 1000 random tie-broken and aggregated lists
# for i in range(1000):
#     random_aggregated_mturk = [random_tie_break(val1) for val1 in majority_mturk]
#     random_aggregated_mturks.append(random_aggregated_mturk)


# # Pickle the random_aggregated_mturks list
# pickle_file_path = 'FACT-GPT eval tiebreak.pkl'
# with open(pickle_file_path, 'wb') as f:
#     pickle.dump(random_aggregated_mturks, f)

In [16]:
# Load the pickled random_aggregated_mturks list
pickle_file_path = 'FACT-GPT eval tiebreak.pkl'
with open(pickle_file_path, 'rb') as f:
    random_aggregated_mturks = pickle.load(f)

In [17]:
# Initialize a Counter to store the frequency of each class across all numpies
overall_counter = Counter()

# Count class frequencies across the 1,000 randomly aggregated lists
for random_aggregated_mturk in random_aggregated_mturks:
    counter = Counter(random_aggregated_mturk)
    overall_counter += counter

# Calculate the average frequency of each class
total_count = sum(overall_counter.values())
average_ratio = {k: v / total_count for k, v in overall_counter.items()}
average_frequency = {k: v / 1000 for k, v in overall_counter.items()}
average_ratio, average_frequency

({'ENTAILMENT': 0.5460334693877551,
  'CONTRADICTION': 0.08204244897959184,
  'NEUTRAL': 0.37192408163265306},
 {'ENTAILMENT': 668.891, 'CONTRADICTION': 100.502, 'NEUTRAL': 455.607})

## Evaluate over 1,000 samples

In [18]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, accuracy_score, f1_score
from itertools import product

def evaluate_aggregated_mturks(df, column_name, sample=1000):
    # Extract the relevant columns
    column = df[column_name].to_numpy()

    # Initialize lists to store precision, recall, and accuracy
    precision_list = []
    recall_list = []
    mic_precision_list = []
    mic_recall_list = []
    accuracy_list = []

    # Initialize a counter for None values
    none_counter = 0

    # Initialize a list to store the indices of None values
    none_indices = []

    # Generate 1000 random tie-broken and aggregated numpy arrays and evaluate metrics
    for i in range(sample):
        aggregated_col = []
        for idx, val1 in enumerate(column):
            if val1 is None:
                none_counter += 1
                none_indices.append(idx)
                aggregated_col.append(None)
                continue
            aggregated_col.append(random_tie_break(val1))

        aggregated_col = np.array(aggregated_col)

        # Filter out None values for the metric calculations
        valid_indices = aggregated_col != None
        filtered_aggregated_col = aggregated_col[valid_indices]
        filtered_reference = np.array([random_aggregated_mturk[i] for i in range(len(random_aggregated_mturk)) if valid_indices[i]])

        # Calculate and store precision, recall, and accuracy
        precision_list.append(precision_score(filtered_reference, filtered_aggregated_col, average='macro', zero_division=0))
        recall_list.append(recall_score(filtered_reference, filtered_aggregated_col, average='macro', zero_division=0))
        mic_precision_list.append(precision_score(filtered_reference, filtered_aggregated_col, average='micro', zero_division=0))
        mic_recall_list.append(recall_score(filtered_reference, filtered_aggregated_col, average='micro', zero_division=0))
        accuracy_list.append(accuracy_score(filtered_reference, filtered_aggregated_col))

        # Calculate precision for each label
        con_precision_list = precision_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[0]
        ent_precision_list = precision_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[1]
        neu_precision_list = precision_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[2]

        # Calculate recall for each label
        con_recall_list = recall_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[0]
        ent_recall_list = recall_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[1]
        neu_recall_list = recall_score(filtered_reference, filtered_aggregated_col, average=None, zero_division=0)[2]

        # Calculate f1 for each label
        con_f1_list = f1_score(filtered_reference, filtered_aggregated_col, average=None)[0]
        ent_f1_list = f1_score(filtered_reference, filtered_aggregated_col, average=None)[1]
        neu_f1_list = f1_score(filtered_reference, filtered_aggregated_col, average=None)[2]


    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_mic_precision = np.mean(mic_precision_list)
    avg_mic_recall = np.mean(mic_recall_list)
    avg_accuracy = np.mean(accuracy_list)

    ent_precision = np.mean(ent_precision_list)
    neu_precision = np.mean(neu_precision_list)
    con_precision = np.mean(con_precision_list)

    ent_recall = np.mean(ent_recall_list)
    neu_recall = np.mean(neu_recall_list)
    con_recall = np.mean(con_recall_list)

    ent_f1 = np.mean(ent_f1_list)
    neu_f1 = np.mean(neu_f1_list)
    con_f1 = np.mean(con_f1_list)


    return f"""Overall performance
Macro precision: {avg_precision}
Macro recall: {avg_recall}
Micro precision: {avg_mic_precision}
Micro recall: {avg_mic_recall}
Accuracy: {avg_accuracy}

Label-by-label (order: E-N-C)
Precision: {ent_precision}, {neu_precision}, {con_precision}
Recall: {ent_recall}, {neu_recall}, {con_recall}
F1-score: {ent_f1}, {neu_f1}, {con_f1}

None: {none_counter}"""


In [19]:
for col in columns_eval:
    try:

        print(col)
        print(evaluate_aggregated_mturks(original_df, col, sample=1000))
        print()


    except Exception as e:
        print(e)

annotation_only_gpt-4
Overall performance
Macro precision: 0.6354674452014268
Macro recall: 0.6966010606309113
Micro precision: 0.6261224489795918
Micro recall: 0.6261224489795918
Accuracy: 0.6261224489795918

Label-by-label (order: E-N-C)
Precision: 0.9804560260586319, 0.5443037974683544, 0.38164251207729466
Recall: 0.44925373134328356, 0.8505494505494505, 0.79
F1-score: 0.616171954964176, 0.6638078902229846, 0.5146579804560261

None: 0

annotation_only_gpt-3_5
Overall performance
Macro precision: 0.5622234149134105
Macro recall: 0.6093395659067301
Micro precision: 0.5844897959183674
Micro recall: 0.5844897959183674
Accuracy: 0.5844897959183674

Label-by-label (order: E-N-C)
Precision: 0.8588235294117647, 0.5328467153284672, 0.295
Recall: 0.43582089552238806, 0.8021978021978022, 0.59
F1-score: 0.5782178217821782, 0.6403508771929826, 0.3933333333333333

None: 0

annotation_only_13b
Overall performance
Macro precision: 0.5098200561974132
Macro recall: 0.4650455229371904
Micro precision: