# Open dataframe

In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
import json

In [None]:
# Read the final dataset; the first CSV column is used as the row index.
load_path = 'data/data_final.csv'
original_df = pd.read_csv(load_path, index_col=[0])

In [None]:
# Names of the original_df columns that are preprocessed and evaluated further down.
columns_eval = ['annotation_only_gpt-4',
 'annotation_only_gpt-3_5',
 'annotation_only_13b',
 'annotation_only_7b',
 'gpt-3_5_finetuned_on_gpt_4',
 'gpt-3_5_finetuned_on_gpt_3_5',
 'gpt-3_5_finetuned_on_70b']

In [None]:
# Display the loaded dataframe as the cell output.
original_df

# Evaluation by Monte Carlo Simulation

## Set ground truth

In [None]:
from collections import Counter

# Column names of the five 'Mturk~' annotations and of their 'Mturk~_reversed' counterparts.
# Only mturk_columns is used for the majority vote computed in this cell.
mturk_columns = ['Mturk_1', 'Mturk_2', 'Mturk_3', 'Mturk_4', 'Mturk_5']
mturk_reversed_columns = ['Mturk_1_reversed', 'Mturk_2_reversed', 'Mturk_3_reversed', 'Mturk_4_reversed', 'Mturk_5_reversed']

# Majority vote over one row of annotations; ties are kept, not broken
def majority_vote(row):
    """Return every label that received the highest number of votes in ``row``.

    Args:
        row: Iterable of hashable labels (one per annotator).

    Returns:
        List of the label(s) sharing the top vote count, in order of first
        appearance. More than one element means the vote was tied.

    Raises:
        ValueError: If ``row`` is empty.
    """
    tally = Counter(row)
    top_votes = max(tally.values())
    winners = []
    for label, votes in tally.items():
        if votes == top_votes:
            winners.append(label)
    return winners

# Calculate the majority vote for 'Mturk~': each row gets the list of labels
# tied for the most votes across the five columns in mturk_columns
original_df['Majority_Mturk'] = original_df[mturk_columns].apply(majority_vote, axis=1)

In [None]:
# Hard-coded shares out of 1225 (the four numerators add up to 1225).
# NOTE(review): presumably copied by hand from the value_counts() output below — confirm,
# and consider deriving them from the data instead.
print(647/1225)
print(433/1225)
print(90/1225)
print(55/1225)

# Frequency of each distinct majority-vote outcome.
# NOTE(review): the column holds Python lists (see majority_vote); value_counts may need
# hashable values such as tuples — confirm this runs on the pandas version in use.
original_df['Majority_Mturk'].value_counts()

In [None]:
import pickle

# Resolve a (possibly tied) vote to one canonical label, picking at random among ties
def random_tie_break(val1_list):
    """Reduce a label or a list of tied labels to a single canonical label.

    Args:
        val1_list: Either a single value or a list of candidate labels. For a
            list, one element is drawn with ``np.random.choice`` (this consumes
            NumPy's global random state).

    Returns:
        'ENTAILMENT', 'CONTRADICTION' or 'NEUTRAL' when the chosen value is one
        of these labels, otherwise None.
    """
    chosen = val1_list
    if isinstance(val1_list, list):
        chosen = np.random.choice(val1_list, 1)[0]

    if chosen == 'ENTAILMENT':
        return 'ENTAILMENT'
    if chosen == 'CONTRADICTION':
        return 'CONTRADICTION'

    known_labels = {'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION'}
    if chosen in known_labels:
        # Only NEUTRAL can reach this point
        return 'NEUTRAL'
    return None

# Set the seed for reproducibility (random_tie_break draws from NumPy's global RNG)
np.random.seed(42)

# Convert the majority-vote column to a NumPy array for faster iteration
majority_mturk = original_df['Majority_Mturk'].to_numpy()

# Initialize a list to store the randomly tie-broken and aggregated lists
random_aggregated_mturks = []

# Create 1,000 random tie-broken label lists, one label per dataframe row
for i in range(1000):
    random_aggregated_mturk = [random_tie_break(val1) for val1 in majority_mturk]
    random_aggregated_mturks.append(random_aggregated_mturk)

# Pickle the random_aggregated_mturks list so later cells can reload the same samples
pickle_file_path = 'data/aggregate.pkl'
with open(pickle_file_path, 'wb') as f:
    pickle.dump(random_aggregated_mturks, f)

In [None]:
# Load the pickled random_aggregated_mturks list (the file written by the cell above).
# pickle.load executes arbitrary code from the file, so only load files you created.
pickle_file_path = 'data/aggregate.pkl'
with open(pickle_file_path, 'rb') as f:
    random_aggregated_mturks = pickle.load(f)

In [None]:
# Initialize a Counter to store the frequency of each class across all sampled lists
overall_counter = Counter()

# Count class frequencies across the 1,000 randomly aggregated lists.
# The loop variable random_aggregated_mturk stays bound to the last list afterwards.
for random_aggregated_mturk in random_aggregated_mturks:
    counter = Counter(random_aggregated_mturk)
    overall_counter += counter

# Calculate the average frequency of each class:
#   average_ratio     - share of each class among all labels in all lists
#   average_frequency - count per list, using the hard-coded number of lists (1000)
total_count = sum(overall_counter.values())
average_ratio = {k: v / total_count for k, v in overall_counter.items()}
average_frequency = {k: v / 1000 for k, v in overall_counter.items()}
average_ratio, average_frequency

## Preprocess predicted classes

In [None]:
import re

def find_all_annotation(s, front):
    """Extract a single NLI label from free text.

    The input is converted to an upper-case string and split on non-word
    characters. Tokens that are a label, or one of the listed verb forms of a
    label, are collected in order of appearance.

    Args:
        s: Value to scan; it is passed through ``str`` first.
        front: If truthy, return the first label found; otherwise the last.

    Returns:
        'ENTAILMENT', 'NEUTRAL' or 'CONTRADICTION', or None when the text
        contains none of the recognised tokens.
    """
    # Every recognised token mapped to its canonical label
    canonical = {
        'ENTAILMENT': 'ENTAILMENT',
        'NEUTRAL': 'NEUTRAL',
        'CONTRADICTION': 'CONTRADICTION',
        'ENTAIL': 'ENTAILMENT',
        'ENTAILS': 'ENTAILMENT',
        'ENTAILING': 'ENTAILMENT',
        'CONTRADICT': 'CONTRADICTION',
        'CONTRADICTS': 'CONTRADICTION',
        'CONTRADICTING': 'CONTRADICTION',
    }

    hits = []
    for word in re.split(r'\W', str(s).upper()):
        label = canonical.get(word)
        if label is not None:
            hits.append(label)

    if not hits:
        return None
    return hits[0] if front else hits[-1]

# Replace each evaluated column by the extracted label: columns named
# 'zero_shot_cot*' or 'few_shot*' keep the last label found, all others the first.
for col in columns_eval:
    take_first = not col.startswith(('zero_shot_cot', 'few_shot'))
    original_df[col] = original_df[col].apply(lambda x: find_all_annotation(x, front=take_first))


In [None]:
# For every evaluated column, the index labels of rows where no label could be extracted.
none_indices = {col: original_df[original_df[col].isna()].index.tolist() for col in columns_eval}

In [None]:
# Count the rows of a column whose value is not exactly one word
def count_non_single_word_rows(column):
    """Return how many rows of ``original_df[column]`` are not a single word.

    Each value is converted with ``str`` and split on whitespace; a row counts
    when that yields anything other than exactly one piece.
    """
    def is_not_single_word(value):
        return len(str(value).split()) != 1

    flags = original_df[column].apply(is_not_single_word)
    return sum(flags)

# Apply the counting function to every dataframe column that is listed in columns_eval
counts = {column: count_non_single_word_rows(column) for column in original_df.columns if column in columns_eval}

# Print the results
for column, count in counts.items():
    print(f"{column}: {count} rows are not one word")

## Evaluate over 1,000 samples

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, accuracy_score, f1_score
from itertools import product

def evaluate_aggregated_mturks(df, column_name, sample=1000, references=None):
    """Score one prediction column against Monte Carlo tie-broken MTurk labels.

    In each of ``sample`` rounds the predictions are passed through
    ``random_tie_break`` and compared with one tie-broken ground-truth list.
    Rows whose prediction or ground-truth label is None are left out of the
    metrics. Every metric is averaged over all rounds and the summary is
    printed.

    Args:
        df: DataFrame holding the prediction column.
        column_name: Name of the column in ``df`` to evaluate.
        sample: Number of Monte Carlo rounds (at least 1).
        references: Sequence of ground-truth label lists, one used per round
            (reused cyclically when shorter than ``sample``). Defaults to the
            module-level ``random_aggregated_mturks``.

    Returns:
        Tuple ``(macro precision, macro recall, accuracy)``, each averaged over
        the rounds.

    Raises:
        ValueError: If ``sample`` is smaller than 1 or a reference list does
            not have one label per row of the column.
    """
    if sample < 1:
        raise ValueError("sample must be at least 1")
    if references is None:
        # Tie-broken ground-truth lists created/loaded earlier in the notebook
        references = random_aggregated_mturks

    # Fixed label order: per-label scores are indexed by position below, so the
    # order must not depend on which labels happen to occur in a round
    labels = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']

    # Extract the relevant column
    column = df[column_name].to_numpy()

    # Rows without any parsed prediction; counted once, not once per round
    none_counter = sum(1 for value in column if value is None)

    # Per-round overall metrics
    precision_list = []
    recall_list = []
    mic_precision_list = []
    mic_recall_list = []
    accuracy_list = []

    # Per-round arrays of per-label scores, in the order given by `labels`
    label_precision_list = []
    label_recall_list = []
    label_f1_list = []

    for i in range(sample):
        # Use a different tie-broken ground truth in every round
        reference = references[i % len(references)]
        if len(reference) != len(column):
            raise ValueError(
                f"reference {i % len(references)} has {len(reference)} labels "
                f"but column '{column_name}' has {len(column)} rows"
            )

        y_true = []
        y_pred = []
        for raw_prediction, truth in zip(column, reference):
            if raw_prediction is None:
                continue
            prediction = random_tie_break(raw_prediction)
            # A row can only be scored when both sides carry a valid label
            if prediction is None or truth is None:
                continue
            y_true.append(truth)
            y_pred.append(prediction)

        # Per-label scores, computed once per round
        label_precision = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
        label_recall = recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
        label_f1 = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
        label_precision_list.append(label_precision)
        label_recall_list.append(label_recall)
        label_f1_list.append(label_f1)

        # Macro scores are the unweighted mean of the per-label scores
        precision_list.append(np.mean(label_precision))
        recall_list.append(np.mean(label_recall))
        mic_precision_list.append(precision_score(y_true, y_pred, labels=labels, average='micro', zero_division=0))
        mic_recall_list.append(recall_score(y_true, y_pred, labels=labels, average='micro', zero_division=0))
        accuracy_list.append(accuracy_score(y_true, y_pred))

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_mic_precision = np.mean(mic_precision_list)
    avg_mic_recall = np.mean(mic_recall_list)
    avg_accuracy = np.mean(accuracy_list)

    # Average per label over all rounds; positions follow `labels` (C, E, N)
    con_precision, ent_precision, neu_precision = np.mean(label_precision_list, axis=0)
    con_recall, ent_recall, neu_recall = np.mean(label_recall_list, axis=0)
    con_f1, ent_f1, neu_f1 = np.mean(label_f1_list, axis=0)

    print(f"""Overall performance
Macro precision: {avg_precision}
Macro recall: {avg_recall}
Micro precision: {avg_mic_precision}
Micro recall: {avg_mic_recall}
Accuracy: {avg_accuracy}

Label-by-label (order: E-N-C)
Precision: {ent_precision}, {neu_precision}, {con_precision}
Recall: {ent_recall}, {neu_recall}, {con_recall}
F1-score: {ent_f1}, {neu_f1}, {con_f1}

None: {none_counter}""")

    return avg_precision, avg_recall, avg_accuracy


In [None]:
# Evaluate every column listed in columns_eval (from original_df) over 1,000 rounds.
# Errors are printed and the loop moves on, so one failing column does not stop the others.
for col in columns_eval:
    try:
        print(col)
        evaluate_aggregated_mturks(original_df, col, sample=1000)
        print()
        
    except Exception as e:
        print(e)

# Fine-tune llamas and plot them

In [None]:
# Fine-tuned model sizes and the models whose labels formed the training set;
# every (size, training-set) pair maps to one folder under data/Finetuning/.
finetuned_model_size_list = ['13b', '7b']
trainset_model_list = ['gpt-4', 'gpt-3_5', 'llama_2_70b']

# One column per (number, configuration), named '<number>_<model_config_folder>'
df = pd.DataFrame()

for finetuned_model_size, trainset_model in product(finetuned_model_size_list, trainset_model_list):
    model_config_folder = f'train_valid_{finetuned_model_size}_train_{trainset_model}_balanced'

    # Numbers 61, 122, ..., 549 select the 'test-<number>' sub-folders
    for number in range(61, 550, 61):
        file_path = f'data/Finetuning/{model_config_folder}/test-{number}/generated_predictions.jsonl'
        # Keep only the 'predict' field of each JSONL record
        df[f'{number}_{model_config_folder}'] = pd.read_json(file_path, lines=True)['predict']

# Function to clean cells and standardize to first occurrence of {ENTAILMENT, NEUTRAL, CONTRADICTION}
def clean_and_standardize(cell):
    """Map a raw prediction string to the label that occurs first in it.

    The phrase 'no relationship' is treated as a synonym of NEUTRAL. Matching is
    case-sensitive substring search. The earliest match in the text wins, so the
    result does not depend on the iteration order of the label collection.

    Args:
        cell: Cell value; only strings are inspected.

    Returns:
        'ENTAILMENT', 'NEUTRAL' or 'CONTRADICTION' when the string contains a
        label; otherwise ``cell`` unchanged (including all non-string values).
    """
    if not isinstance(cell, str):
        return cell

    standard_labels = ('ENTAILMENT', 'NEUTRAL', 'CONTRADICTION', 'no relationship')
    # (position, label) for every label present; min() picks the earliest one
    found = [(cell.find(label), label) for label in standard_labels if label in cell]
    if not found:
        return cell

    _, first_label = min(found)
    return 'NEUTRAL' if first_label == 'no relationship' else first_label


# Apply the function to clean and standardize every cell of the DataFrame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in favour of
# DataFrame.map — confirm against the pandas version this notebook targets.
df = df.applymap(clean_and_standardize)

In [None]:
# Evaluate the '549_...' column of every (model size, training set) configuration.
# Errors are printed and the loop continues with the next configuration.
for finetuned_model_size, trainset_model in product(finetuned_model_size_list, trainset_model_list):

    try:
        model_config_folder = f'train_valid_{finetuned_model_size}_train_{trainset_model}_balanced'
        col = f'549_{model_config_folder}'
        print(col)
        # Prints the returned (macro precision, macro recall, accuracy) tuple
        print(evaluate_aggregated_mturks(df, col, sample=1000))

    except Exception as e:
        print(e)


In [None]:
import re
def find_all_occurrences_entailment(s):
    """Return the distinct NLI labels found in ``s``, joined by single spaces.

    The input is converted with ``str`` and split on non-word characters; only
    the exact upper-case tokens ENTAILMENT, NEUTRAL and CONTRADICTION count.
    Labels appear in order of first occurrence, so the result is deterministic.

    Args:
        s: Value to scan.

    Returns:
        Space-separated distinct labels, or an empty string if none occur.
    """
    valid_tokens = ('ENTAILMENT', 'NEUTRAL', 'CONTRADICTION')
    tokens = re.split(r'\W', str(s))  # split on non-alphanumeric characters
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the order of the joined labels vary between runs
    distinct = dict.fromkeys(token for token in tokens if token in valid_tokens)
    return ' '.join(distinct)

# Count the rows of a column whose value is not exactly one word
def count_non_single_word_rows(column):
    """Return how many rows of ``original_df[column]`` are not a single word.

    Each value is converted with ``str`` and split on whitespace; a row counts
    when that yields anything other than exactly one piece.
    """
    def is_not_single_word(value):
        return len(str(value).split()) != 1

    flags = original_df[column].apply(is_not_single_word)
    return sum(flags)

In [None]:
# Numbers used as the prefix of the prediction column names
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# Collect macro precision, macro recall and accuracy for each of those columns
for number in steps:
    column_name = f"{number}_train_valid_13b_train_gpt-4_balanced"
    P, R, A = evaluate_aggregated_mturks(df, column_name, sample=1000)
    print(column_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average train / eval loss per epoch.

    Every non-blank line must be a JSON object with an 'epoch' key. The keys
    'loss' and 'eval_loss' are optional; a missing key is treated like a null
    value. Blank lines are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        Tuple ``(train_epochs, train_loss, validation_epochs, validation_loss)``:
        epochs in ascending order, each paired with the mean of the losses
        logged for that epoch.

    Raises:
        KeyError: If an entry has no 'epoch' key.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Group the losses by epoch while streaming through the file
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; json.loads('') would raise
                continue
            entry = json.loads(line)
            epoch = entry['epoch']
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)
            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    def _mean_by_sorted_epoch(loss_by_epoch):
        # Epochs in ascending order with the mean loss of each epoch
        epochs = sorted(loss_by_epoch)
        return epochs, [np.mean(loss_by_epoch[epoch]) for epoch in epochs]

    sorted_train_epochs, sorted_train_loss = _mean_by_sorted_epoch(train_loss_by_epoch)
    sorted_validation_epochs, sorted_validation_loss = _mean_by_sorted_epoch(validation_loss_by_epoch)

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders.
# Each call returns (train epochs, train loss, validation epochs, validation loss);
# the second log comes from the run directory with the '_reversed' suffix.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced_reversed/trainer_log.jsonl')

# Centered moving average; the window shrinks at both ends of the sequence
def smooth_data(data, window_size=5):
    """Smooth ``data`` with a centered simple moving average.

    Args:
        data: Sequence of numbers (must support ``len`` and slicing).
        window_size: Nominal window width; ``window_size // 2`` neighbours are
            taken on each side, fewer near the boundaries.

    Returns:
        List with one averaged value per input element.
    """
    half = window_size // 2
    count = len(data)
    return [
        np.mean(data[max(0, i - half):min(count, i + half + 1)])
        for i in range(count)
    ]

# Smooth the training curves
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Calculate the corresponding epochs for each step (183 steps are treated as one epoch)
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: text properties of the axes are
# resolved when the axes are created, so a later update would not reach them
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines; each validation curve names its presentation
# order so the two legend entries can be told apart
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss (Tweet-Claim)', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss (Claim-Tweet)', linewidth=5, color='red')

# Labels, title and grid for ax1
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-13b-chat-hf\nTraining set: GPT-4', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Horizontal guide lines at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background: losses on the left, metrics on the right
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()


In [None]:
# Numbers used as the prefix of the prediction column names
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# Collect macro precision, macro recall and accuracy for each of those columns
for number in steps:
    column_name = f"{number}_train_valid_13b_train_gpt-3_5_balanced"
    P, R, A = evaluate_aggregated_mturks(df, column_name, sample=1000)
    print(column_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average train / eval loss per epoch.

    Every non-blank line must be a JSON object with an 'epoch' key. The keys
    'loss' and 'eval_loss' are optional; a missing key is treated like a null
    value. Blank lines are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        Tuple ``(train_epochs, train_loss, validation_epochs, validation_loss)``:
        epochs in ascending order, each paired with the mean of the losses
        logged for that epoch.

    Raises:
        KeyError: If an entry has no 'epoch' key.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Group the losses by epoch while streaming through the file
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; json.loads('') would raise
                continue
            entry = json.loads(line)
            epoch = entry['epoch']
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)
            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    def _mean_by_sorted_epoch(loss_by_epoch):
        # Epochs in ascending order with the mean loss of each epoch
        epochs = sorted(loss_by_epoch)
        return epochs, [np.mean(loss_by_epoch[epoch]) for epoch in epochs]

    sorted_train_epochs, sorted_train_loss = _mean_by_sorted_epoch(train_loss_by_epoch)
    sorted_validation_epochs, sorted_validation_loss = _mean_by_sorted_epoch(validation_loss_by_epoch)

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders.
# The logs must belong to the configuration evaluated and titled in this cell
# (13b fine-tuned on GPT-3.5 labels), not to the GPT-4 run.
# NOTE(review): assumes the matching '_reversed' run directory exists — confirm.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-3_5_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-3_5_balanced_reversed/trainer_log.jsonl')

# Centered moving average; the window shrinks at both ends of the sequence
def smooth_data(data, window_size=5):
    """Smooth ``data`` with a centered simple moving average.

    Args:
        data: Sequence of numbers (must support ``len`` and slicing).
        window_size: Nominal window width; ``window_size // 2`` neighbours are
            taken on each side, fewer near the boundaries.

    Returns:
        List with one averaged value per input element.
    """
    half = window_size // 2
    count = len(data)
    return [
        np.mean(data[max(0, i - half):min(count, i + half + 1)])
        for i in range(count)
    ]

# Smooth the training curves
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Calculate the corresponding epochs for each step (183 steps are treated as one epoch)
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: text properties of the axes are
# resolved when the axes are created, so a later update would not reach them
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines; each validation curve names its presentation
# order so the two legend entries can be told apart
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss (Tweet-Claim)', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss (Claim-Tweet)', linewidth=5, color='red')

# Labels, title and grid for ax1
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-13b-chat-hf\nTrain set: GPT-3.5-Turbo', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Horizontal guide lines at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background: losses on the left, metrics on the right
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()


In [None]:
# Numbers used as the prefix of the prediction column names
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# Collect macro precision, macro recall and accuracy for each of those columns
for number in steps:
    column_name = f"{number}_train_valid_13b_train_llama_2_70b_balanced"
    P, R, A = evaluate_aggregated_mturks(df, column_name, sample=1000)
    print(column_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average train / eval loss per epoch.

    Every non-blank line must be a JSON object with an 'epoch' key. The keys
    'loss' and 'eval_loss' are optional; a missing key is treated like a null
    value. Blank lines are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        Tuple ``(train_epochs, train_loss, validation_epochs, validation_loss)``:
        epochs in ascending order, each paired with the mean of the losses
        logged for that epoch.

    Raises:
        KeyError: If an entry has no 'epoch' key.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Group the losses by epoch while streaming through the file
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; json.loads('') would raise
                continue
            entry = json.loads(line)
            epoch = entry['epoch']
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)
            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    def _mean_by_sorted_epoch(loss_by_epoch):
        # Epochs in ascending order with the mean loss of each epoch
        epochs = sorted(loss_by_epoch)
        return epochs, [np.mean(loss_by_epoch[epoch]) for epoch in epochs]

    sorted_train_epochs, sorted_train_loss = _mean_by_sorted_epoch(train_loss_by_epoch)
    sorted_validation_epochs, sorted_validation_loss = _mean_by_sorted_epoch(validation_loss_by_epoch)

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders.
# The logs must belong to the configuration evaluated and titled in this cell
# (13b fine-tuned on Llama-2-70b labels), not to the GPT-4 run.
# NOTE(review): assumes the matching '_reversed' run directory exists — confirm.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_llama_2_70b_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_llama_2_70b_balanced_reversed/trainer_log.jsonl')

# Centered moving average; the window shrinks at both ends of the sequence
def smooth_data(data, window_size=5):
    """Smooth ``data`` with a centered simple moving average.

    Args:
        data: Sequence of numbers (must support ``len`` and slicing).
        window_size: Nominal window width; ``window_size // 2`` neighbours are
            taken on each side, fewer near the boundaries.

    Returns:
        List with one averaged value per input element.
    """
    half = window_size // 2
    count = len(data)
    return [
        np.mean(data[max(0, i - half):min(count, i + half + 1)])
        for i in range(count)
    ]

# Smooth the training curves
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Calculate the corresponding epochs for each step (183 steps are treated as one epoch)
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: text properties of the axes are
# resolved when the axes are created, so a later update would not reach them
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines; each validation curve names its presentation
# order so the two legend entries can be told apart
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss (Tweet-Claim)', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss (Claim-Tweet)', linewidth=5, color='red')

# Labels, title and grid for ax1
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-13b-chat-hf\nTrain set: Llama-2-70b-chat-hf', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Horizontal guide lines at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background: losses on the left, metrics on the right
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()


In [None]:
# Numbers used as the prefix of the prediction column names
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# Collect macro precision, macro recall and accuracy for each of those columns
for number in steps:
    column_name = f"{number}_train_valid_7b_train_gpt-4_balanced"
    P, R, A = evaluate_aggregated_mturks(df, column_name, sample=1000)
    print(column_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average train / eval loss per epoch.

    Every non-blank line must be a JSON object with an 'epoch' key. The keys
    'loss' and 'eval_loss' are optional; a missing key is treated like a null
    value. Blank lines are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        Tuple ``(train_epochs, train_loss, validation_epochs, validation_loss)``:
        epochs in ascending order, each paired with the mean of the losses
        logged for that epoch.

    Raises:
        KeyError: If an entry has no 'epoch' key.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Group the losses by epoch while streaming through the file
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; json.loads('') would raise
                continue
            entry = json.loads(line)
            epoch = entry['epoch']
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)
            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    def _mean_by_sorted_epoch(loss_by_epoch):
        # Epochs in ascending order with the mean loss of each epoch
        epochs = sorted(loss_by_epoch)
        return epochs, [np.mean(loss_by_epoch[epoch]) for epoch in epochs]

    sorted_train_epochs, sorted_train_loss = _mean_by_sorted_epoch(train_loss_by_epoch)
    sorted_validation_epochs, sorted_validation_loss = _mean_by_sorted_epoch(validation_loss_by_epoch)

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders.
# The logs must belong to the configuration evaluated and titled in this cell
# (7b fine-tuned on GPT-4 labels), not to the 13b run.
# NOTE(review): assumes the matching '_reversed' run directory exists — confirm.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_7b_train_gpt-4_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_7b_train_gpt-4_balanced_reversed/trainer_log.jsonl')

# Centered moving average; the window shrinks at both ends of the sequence
def smooth_data(data, window_size=5):
    """Smooth ``data`` with a centered simple moving average.

    Args:
        data: Sequence of numbers (must support ``len`` and slicing).
        window_size: Nominal window width; ``window_size // 2`` neighbours are
            taken on each side, fewer near the boundaries.

    Returns:
        List with one averaged value per input element.
    """
    half = window_size // 2
    count = len(data)
    return [
        np.mean(data[max(0, i - half):min(count, i + half + 1)])
        for i in range(count)
    ]

# Smooth the training curves
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Calculate the corresponding epochs for each step (183 steps are treated as one epoch)
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: text properties of the axes are
# resolved when the axes are created, so a later update would not reach them
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines; each validation curve names its presentation
# order so the two legend entries can be told apart
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss (Tweet-Claim)', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss (Claim-Tweet)', linewidth=5, color='red')

# Labels, title and grid for ax1
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-7b-chat-hf\nTrain set: GPT-4', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Horizontal guide lines at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background: losses on the left, metrics on the right
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()


In [None]:
# Numbers used as the prefix of the prediction column names
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# Collect macro precision, macro recall and accuracy for each of those columns
for number in steps:
    column_name = f"{number}_train_valid_7b_train_gpt-3_5_balanced"
    P, R, A = evaluate_aggregated_mturks(df, column_name, sample=1000)
    print(column_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average its losses per epoch.

    Every non-blank line must be a JSON object with an ``'epoch'`` key. An
    entry contributes to the training curve when it has a non-null ``'loss'``
    value and to the validation curve when it has a non-null ``'eval_loss'``
    value; a missing key is treated the same as a null value. Blank lines
    are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        A 4-tuple ``(train_epochs, train_loss, validation_epochs,
        validation_loss)``. Each ``*_epochs`` list holds the distinct epoch
        values in ascending order, and the matching ``*_loss`` list holds the
        mean loss recorded for each of those epochs.

    Raises:
        KeyError: If an entry has no ``'epoch'`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Stream the file line by line; there is no need to keep the raw entries.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            entry = json.loads(line)
            epoch = entry['epoch']

            # .get() so that entries which omit a key behave like null values.
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)

            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    # Sort by epoch for plotting and average the losses logged within each epoch.
    sorted_train_epochs = sorted(train_loss_by_epoch)
    sorted_train_loss = [np.mean(train_loss_by_epoch[epoch]) for epoch in sorted_train_epochs]

    sorted_validation_epochs = sorted(validation_loss_by_epoch)
    sorted_validation_loss = [np.mean(validation_loss_by_epoch[epoch]) for epoch in sorted_validation_epochs]

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders
# (the '_reversed' run directory supplies the claim-tweet curves).
# NOTE(review): both paths point at the 13b / GPT-4 run, while the metrics
# evaluated in this cell and the plot title refer to the 7b model trained on
# GPT-3.5 labels — confirm these are the intended log files.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced_reversed/trainer_log.jsonl')

# Function to smooth data using a simple moving average
def smooth_data(data, window_size=5):
    """Smooth a sequence with a centred moving average.

    For each position the mean is taken over the element itself and up to
    ``window_size // 2`` neighbours on either side; near the ends the window
    is simply cut off at the sequence boundary, so the output has the same
    length as the input.

    Args:
        data: Sliceable sequence of numbers.
        window_size: Nominal width of the averaging window.

    Returns:
        A list with one averaged value per input element.
    """
    half_width = window_size // 2
    length = len(data)
    return [
        np.mean(data[max(0, pos - half_width):min(length, pos + half_width + 1)])
        for pos in range(length)
    ]

# Smooth the training curves (validation curves are plotted unsmoothed)
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Convert the checkpoint steps to (fractional) epochs for the x-axis.
# NOTE(review): 183 is presumably the number of training steps per epoch — confirm.
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: rcParams only affect artists
# created afterwards, so updating them after plt.subplots() would leave the
# text created together with ax1 at the previous font size.
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines
# NOTE(review): both validation curves share the label 'Validation Loss', so
# the legend distinguishes them by colour only.
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss', linewidth=5, color='red')

# Labels, title and grid for ax1 (left axis: loss, limited to [0, 1])
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-7b-chat-hf\nTrain set: GPT-3.5 Turbo', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics (right axis, sharing the x-axis with ax1)
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Draw the horizontal guide lines by hand at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

# Use the same three ticks on both y-axes so the loss and metric scales line up
ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background:
# loss curves in the upper-left legend, metrics in the upper-right one
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()


In [None]:
# Checkpoint steps at which the fine-tuned model is evaluated.
steps = [61, 122, 183, 244, 305, 366, 427, 488, 549]
precision, recall, accuracy = [], [], []

# One evaluation per checkpoint; the three lists stay aligned with `steps`.
for number in steps:
    eval_name = f"{number}_train_valid_7b_train_llama_2_70b_balanced"
    P, R, A = evaluate_aggregated_mturks(df, eval_name, sample=1000)
    print(eval_name)
    precision.append(P)
    recall.append(R)
    accuracy.append(A)

import json
import numpy as np
import matplotlib.pyplot as plt

def read_and_process_log(file_path):
    """Read a JSONL trainer log and average its losses per epoch.

    Every non-blank line must be a JSON object with an ``'epoch'`` key. An
    entry contributes to the training curve when it has a non-null ``'loss'``
    value and to the validation curve when it has a non-null ``'eval_loss'``
    value; a missing key is treated the same as a null value. Blank lines
    are skipped.

    Args:
        file_path: Path of the JSONL log file.

    Returns:
        A 4-tuple ``(train_epochs, train_loss, validation_epochs,
        validation_loss)``. Each ``*_epochs`` list holds the distinct epoch
        values in ascending order, and the matching ``*_loss`` list holds the
        mean loss recorded for each of those epochs.

    Raises:
        KeyError: If an entry has no ``'epoch'`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    train_loss_by_epoch = {}
    validation_loss_by_epoch = {}

    # Stream the file line by line; there is no need to keep the raw entries.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            entry = json.loads(line)
            epoch = entry['epoch']

            # .get() so that entries which omit a key behave like null values.
            loss = entry.get('loss')
            if loss is not None:
                train_loss_by_epoch.setdefault(epoch, []).append(loss)

            eval_loss = entry.get('eval_loss')
            if eval_loss is not None:
                validation_loss_by_epoch.setdefault(epoch, []).append(eval_loss)

    # Sort by epoch for plotting and average the losses logged within each epoch.
    sorted_train_epochs = sorted(train_loss_by_epoch)
    sorted_train_loss = [np.mean(train_loss_by_epoch[epoch]) for epoch in sorted_train_epochs]

    sorted_validation_epochs = sorted(validation_loss_by_epoch)
    sorted_validation_loss = [np.mean(validation_loss_by_epoch[epoch]) for epoch in sorted_validation_epochs]

    return sorted_train_epochs, sorted_train_loss, sorted_validation_epochs, sorted_validation_loss

# Read and process the logs for 'tweet-claim' and 'claim-tweet' presentation orders
# (the '_reversed' run directory supplies the claim-tweet curves).
# NOTE(review): both paths point at the 13b / GPT-4 run, while the metrics
# evaluated in this cell and the plot title refer to the 7b model trained on
# Llama-2-70b labels — confirm these are the intended log files.
tweet_claim_epochs, tweet_claim_train_loss, tweet_claim_val_epochs, tweet_claim_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced/trainer_log.jsonl')
claim_tweet_epochs, claim_tweet_train_loss, claim_tweet_val_epochs, claim_tweet_val_loss = read_and_process_log('data/Finetuning/train_valid_13b_train_gpt-4_balanced_reversed/trainer_log.jsonl')

# Function to smooth data using a simple moving average
def smooth_data(data, window_size=5):
    """Smooth a sequence with a centred moving average.

    For each position the mean is taken over the element itself and up to
    ``window_size // 2`` neighbours on either side; near the ends the window
    is simply cut off at the sequence boundary, so the output has the same
    length as the input.

    Args:
        data: Sliceable sequence of numbers.
        window_size: Nominal width of the averaging window.

    Returns:
        A list with one averaged value per input element.
    """
    half_width = window_size // 2
    length = len(data)
    return [
        np.mean(data[max(0, pos - half_width):min(length, pos + half_width + 1)])
        for pos in range(length)
    ]

# Smooth the training curves (validation curves are plotted unsmoothed)
smoothed_tweet_claim_train_loss = smooth_data(tweet_claim_train_loss)
smoothed_claim_tweet_train_loss = smooth_data(claim_tweet_train_loss)

# Convert the checkpoint steps to (fractional) epochs for the x-axis.
# NOTE(review): 183 is presumably the number of training steps per epoch — confirm.
epochs_for_metrics = np.array(steps) / 183

# Set the font size BEFORE creating the figure: rcParams only affect artists
# created afterwards, so updating them after plt.subplots() would leave the
# text created together with ax1 at the previous font size.
plt.rcParams.update({'font.size': 16})
fig, ax1 = plt.subplots(figsize=(10, 8))

# Plot loss curves with thicker lines
# NOTE(review): both validation curves share the label 'Validation Loss', so
# the legend distinguishes them by colour only.
ax1.plot(tweet_claim_epochs, smoothed_tweet_claim_train_loss, label='Train Loss (Tweet-Claim)', linewidth=5, color='teal')
ax1.plot(tweet_claim_val_epochs, tweet_claim_val_loss, label='Validation Loss', linewidth=5, color='orange')
ax1.plot(claim_tweet_epochs, smoothed_claim_tweet_train_loss, label='Train Loss (Claim-Tweet)', linewidth=5, color='dodgerblue')
ax1.plot(claim_tweet_val_epochs, claim_tweet_val_loss, label='Validation Loss', linewidth=5, color='red')

# Labels, title and grid for ax1 (left axis: loss, limited to [0, 1])
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Average Loss')
ax1.set_ylim([0, 1])
ax1.set_title('Base model: Llama-2-7b-chat-hf\nTrain set: Llama-2-70b-chat-hf', fontsize=30, loc='left')
ax1.grid(False)


# Create another y-axis for the metrics (right axis, sharing the x-axis with ax1)
ax2 = ax1.twinx()
ax2.set_ylim([0, 1])  # Limit y-axis to [0, 1]
ax2.set_ylabel('Performance Metrics')
# Draw the horizontal guide lines by hand at 0, 0.25, 0.5, 0.75 and 1
for y in np.linspace(0, 1, num=5):
    ax2.axhline(y, color='gray', linestyle='--', linewidth=0.7)

# Use the same three ticks on both y-axes so the loss and metric scales line up
ax1.set_yticks([0.25, 0.5, 0.75])
ax1.set_yticklabels(['0.25', '0.5', '0.75'])

ax2.set_yticks([0.25, 0.5, 0.75])
ax2.set_yticklabels(['0.25', '0.5', '0.75'])

# Plot the metrics on the new y-axis with thicker lines
ax2.plot(epochs_for_metrics, precision, label='Precision', marker='o', linestyle='--', linewidth=5, color='magenta')
ax2.plot(epochs_for_metrics, recall, label='Recall', marker='s', linestyle='--', linewidth=5, color='green')
ax2.plot(epochs_for_metrics, accuracy, label='Accuracy', marker='x', linestyle='--', linewidth=5, color='purple')


# Add divided legend with solid background:
# loss curves in the upper-left legend, metrics in the upper-right one
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()

ax1.legend(lines1, labels1, loc='upper left', framealpha=1.0)
ax2.legend(lines2, labels2, loc='upper right', framealpha=1.0)


# Show the plot
plt.show()
