In [10]:
import numpy as np
from scipy.stats import ttest_rel
import pandas as pd
import os
from contextlib import redirect_stdout

In [2]:
# Names of the two prompting methods being compared. They are substituted
# into the `method=` field of the per-task result file names.
baseline_method = "zero-shot"
our_method = "totally-cross-sim"

In [3]:
# Datasets used as the `sourcce_dataset=` field of "our method" result files;
# one significance test is reported per entry.
SOURCE_TASKS = [
    "ARC-Easy",
    "ag_news",
    "boolq",
    "commonsense_qa",
    "conll2003_pos",
    "conll2003_ner",
    "mnli",
    "qqp",
    "race",
    "sst2",
]

# Datasets used as the `dataset_name=` directory; per-example results of all
# of them are pooled before each test.
TARGET_TASKS = [
    "ARC-Challenge",
    "financial_phrasebank",
    "medmcqa",
    "sciq",
    "social_i_qa",
]

In [None]:
# Directory name of the accuracy result files.
# NOTE(review): this name shadows the built-in `dir()` for the rest of the
# kernel session, and the cells visible here hard-code 'acc-results' instead
# of reading it -- consider renaming (e.g. RESULTS_DIR) once callers are checked.
dir = "acc-results"

In [None]:
# Template for a per-task result CSV. The fields are, in order: target dataset,
# source dataset, model id, method and number of shots.
# NOTE(review): "sourcce" (sic) is presumably how the files are named on disk;
# do not correct the spelling here without renaming the files.
result_path='results/dataset_name={}/sourcce_dataset={}-model={}-method={}-shots={}.csv'
def eval(dataset_name,source_dataset,model='kglm_text_davinci_003',method='totally-cross-sim',k=1):
    """Compute and print the accuracy (in percent) of one result CSV.

    The CSV located by ``result_path`` must contain a ``pred`` and a
    ``true_label`` column. Missing cells are treated as empty strings.
    Each prediction is normalised before comparison: a ``'Label:'`` marker
    is removed, surrounding spaces/punctuation are stripped, the text is
    lower-cased and cut at the first ``.``, ``,``, ``:`` and ``-`` (in that
    order), and finally any whitespace left over by the cut is stripped so
    that e.g. ``"A - because ..."`` matches the gold label ``"a"``.
    Gold labels are only stringified and lower-cased.

    NOTE(review): this function shadows the built-in ``eval`` for the rest
    of the notebook; the name is kept because callers may rely on it.

    Parameters
    ----------
    dataset_name, source_dataset, model, method, k
        Values substituted into ``result_path`` to locate the CSV.

    Returns
    -------
    float
        Percentage of rows whose normalised prediction equals the gold label
        (``nan`` for a CSV without rows).
    """
    df = pd.read_csv(result_path.format(dataset_name, source_dataset, model, method, k))
    df = df.fillna('')

    gold = np.array([str(label).lower() for label in np.array(df['true_label'])])

    cleaned = []
    for raw in list(df['pred']):
        text = str(raw).replace('Label:', '')
        text = text.strip(' .[]":\'').lower()
        # Keep only what precedes the first of each delimiter, in this order.
        for delimiter in ('.', ',', ':', '-'):
            text = text.split(delimiter)[0]
        # The cut can leave trailing whitespace ("a - x" -> "a "); drop it.
        cleaned.append(text.strip())

    output = np.array(cleaned)
    acc = np.mean(output == gold) * 100
    print(f"Acc for {dataset_name} using {source_dataset} is ",acc)
    return acc

In [8]:
# Scratch check: an empty array can be concatenated with a string array,
# so an empty array is usable as the seed of an accumulator.
arr1 = np.array([])
arr2 = np.array(["c2", "d", "e"])
arr3 = np.concatenate([arr1, arr2])
arr3

array(['c2', 'd', 'e'], dtype='<U32')

In [12]:
# Short model alias -> model identifier as it appears in the `model=` field
# of the result file names. Insertion order is the order the models are run in.
model_path = {
    "llama_7b": "_home_models_savedModels_Llama-2-7b-hf_",
    "llama_13b": "_home_eshaan_models_Llama-2-13b-hf_",
    "gpt": "kglm_text_davinci_003",
}

In [9]:
def get_processed_output(path):
    """Return a per-row correctness mask for one result CSV.

    The CSV at ``path`` must contain a ``pred`` and a ``true_label`` column.
    Missing cells are treated as empty strings. Each prediction is
    normalised before comparison: a ``'Label:'`` marker is removed,
    surrounding spaces/punctuation are stripped, the text is lower-cased and
    cut at the first ``.``, ``,``, ``:`` and ``-`` (in that order), and
    finally any whitespace left over by the cut is stripped so that e.g.
    ``"A - because ..."`` matches the gold label ``"a"``. Gold labels are
    only stringified and lower-cased.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per CSV row; ``True`` where the
        normalised prediction equals the gold label.
    """
    df = pd.read_csv(path)
    df = df.fillna('')

    gold = np.array([str(label).lower() for label in np.array(df['true_label'])])

    cleaned = []
    for raw in list(df['pred']):
        text = str(raw).replace('Label:', '')
        text = text.strip(' .[]":\'').lower()
        # Keep only what precedes the first of each delimiter, in this order.
        for delimiter in ('.', ',', ':', '-'):
            text = text.split(delimiter)[0]
        # The cut can leave trailing whitespace ("a - x" -> "a "); drop it.
        cleaned.append(text.strip())

    output = np.array(cleaned)
    correct = (output == gold)

    return correct

In [26]:
def get_p_t_value(model, model_path):
    """Write a one-tailed paired t-test report comparing our method to the baseline.

    For every task in ``SOURCE_TASKS`` the per-example correctness of the
    baseline (``baseline_method``) and of our method (``our_method``) is
    loaded with ``get_processed_output`` for each task in ``TARGET_TASKS``
    and pooled. A paired t-test is then run on the pooled values with the
    alternative "our method is more often correct than the baseline".

    The report is written to ``{model}_one_tail_t_test.txt`` in the current
    working directory: the file is truncated and given a header line, then
    one line per source task is appended. A progress line is printed to
    stdout after each source task.

    Parameters
    ----------
    model : str
        Key into ``model_path``; also the prefix of the report file name.
        For ``'gpt'`` the baseline files are read with ``shots=1``, for any
        other model with ``shots=0``.
    model_path : dict
        Maps ``model`` to the identifier used in the ``model=`` field of the
        result file names.

    Returns
    -------
    None
    """
    report_path = f'{model}_one_tail_t_test.txt'
    # Significance level of the one-tailed test.
    alpha = 0.05

    # Start a fresh report; the per-task lines are appended below.
    with open(report_path, 'w') as f:
        print(f'############## {model} ##############', file=f)

    baseline_shots = 1 if model == 'gpt' else 0

    for source_task in SOURCE_TASKS:
        # Seed with an empty float array so the pooled result is float64
        # (boolean arrays cannot be subtracted) and an empty TARGET_TASKS
        # still concatenates.
        baseline_chunks = [np.array([])]
        our_chunks = [np.array([])]

        for target_task in TARGET_TASKS:
            baseline_result_path=f'acc-results/dataset_name={target_task}/sourcce_dataset=None-model={model_path[model]}-method={baseline_method}-shots={baseline_shots}.csv'
            our_result_path=f'acc-results/dataset_name={target_task}/sourcce_dataset={source_task}-model={model_path[model]}-method={our_method}-shots={1}.csv'

            baseline_chunks.append(get_processed_output(baseline_result_path))
            our_chunks.append(get_processed_output(our_result_path))

        baseline_predictions = np.concatenate(baseline_chunks)
        our_predictions = np.concatenate(our_chunks)

        # ttest_rel returns the two-sided p-value. For the alternative
        # "ours > baseline" the one-tailed value is p/2 only when t > 0;
        # when t <= 0 it is the complementary tail, 1 - p/2.
        t_statistic, p_value = ttest_rel(our_predictions, baseline_predictions)
        if t_statistic > 0:
            one_tail_p_value = p_value / 2
        else:
            one_tail_p_value = 1 - p_value / 2

        # A NaN p-value (e.g. all paired differences are zero) compares False
        # and is therefore reported as not significant.
        verdict = 'SIGNIFICANT' if one_tail_p_value < alpha else 'NOT SIGNIFICANT'

        # NOTE(review): the unmatched ')' after the t-statistic is kept so the
        # report format stays byte-identical for anything parsing it.
        with open(report_path, 'a') as f:
            print(f'{source_task} => p-value: {one_tail_p_value:.2e}, t-statistic: {t_statistic:.4f}) => {verdict}', file=f)

        print(f"Done with {source_task} !")
            

In [27]:
# Run the significance test once per configured model; every call writes its
# own `<model>_one_tail_t_test.txt` report.
for key in model_path:
    print(f"################ {key} ##################")
    get_p_t_value(key, model_path)

################ llama_7b ##################
Done with ARC-Easy !
Done with ag_news !
Done with boolq !
Done with commonsense_qa !
Done with conll2003_pos !
Done with conll2003_ner !
Done with mnli !
Done with qqp !
Done with race !
Done with sst2 !
################ llama_13b ##################
Done with ARC-Easy !
Done with ag_news !
Done with boolq !
Done with commonsense_qa !
Done with conll2003_pos !
Done with conll2003_ner !
Done with mnli !
Done with qqp !
Done with race !
Done with sst2 !
################ gpt ##################
Done with ARC-Easy !
Done with ag_news !
Done with boolq !
Done with commonsense_qa !
Done with conll2003_pos !
Done with conll2003_ner !
Done with mnli !
Done with qqp !
Done with race !
Done with sst2 !
