In [2]:
# Notebook configuration: every path below is derived from LEWIDI_ROOT and
# RUN_NAME, so moving the data or switching to another model run only needs
# these two values changed.
LEWIDI_ROOT = '/scratch/ankita.maity/LeWiDi'
RUN_NAME = 'better_mixer'

#reference files
REFS_DIR = f'{LEWIDI_ROOT}/refs'
REF_TRAIN = f'{REFS_DIR}/train_ref_md.tsv'
REF_VAL = f'{REFS_DIR}/val_ref_md.tsv'
REF_TEST = f'{REFS_DIR}/test_ref_md.tsv'

#logits for train, val and test
PREDICTIONS_DIR = f'{LEWIDI_ROOT}/predictions_all/predictions_{RUN_NAME}'
TRAIN = f'{PREDICTIONS_DIR}/train_md_new.csv'
VAL = f'{PREDICTIONS_DIR}/val_md_new.csv'
TEST = f'{PREDICTIONS_DIR}/test_md_new.csv'

#output path to store the processed train, val and test tsv files in required format
SUBMIT_DIR = f'{LEWIDI_ROOT}/predictions_all/submit_{RUN_NAME}'
TRAIN_TSV = f'{SUBMIT_DIR}/train_md_final_new.tsv'
VAL_TSV = f'{SUBMIT_DIR}/val_md_final_new.tsv'
TEST_TSV = f'{SUBMIT_DIR}/test_md_final_new.tsv'

In [None]:
import os

import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [4]:
# Load the per-annotator prediction files for each split.
train, val, test = (pd.read_csv(csv_path) for csv_path in (TRAIN, VAL, TEST))

In [5]:
# Order the train rows by column '0' and renumber the index from 0.
train = train.sort_values(by='0').reset_index(drop=True)

In [6]:
# Column '0' is not needed past this point; keep only the remaining columns.
train = train.drop(columns=['0'])
val = val.drop(columns=['0'])
test = test.drop(columns=['0'])

In [7]:
#pass annotator_columns to match the size of the dataframe (defaults to the 5 'md' columns)
def compute(df, annotator_columns=None):
    """Turn per-annotator logits into a hard label and a soft label per row.

    Each annotator column is passed through a sigmoid and thresholded at 0.5
    to obtain a 0/1 vote. For every row the votes are then aggregated into:

    * ``hard_label`` -- 1 when a strict majority of annotators voted 1, else 0
      (with the default 5 annotators this is ``votes >= 3``);
    * ``prob_0`` / ``prob_1`` -- the fraction of annotators voting 0 / 1.

    The input frame is NOT modified, so the function can safely be called
    again on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one logit column per annotator.
    annotator_columns : sequence of str, optional
        Names of the annotator columns. Defaults to ``'1'`` .. ``'5'``.
        Only these columns are used; any other column in ``df`` is ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``['hard_label', 'prob_0', 'prob_1']`` with a fresh 0..n-1
        index; ``hard_label`` is an integer column.

    Raises
    ------
    ValueError
        If ``annotator_columns`` is empty.
    KeyError
        If one of the annotator columns is missing from ``df``.
    """
    if annotator_columns is None:
        annotator_columns = [f'{a}' for a in range(1, 6)]
    annotator_columns = list(annotator_columns)
    n_annotators = len(annotator_columns)
    if n_annotators == 0:
        raise ValueError("annotator_columns must contain at least one column")

    # copy=True gives torch a writable buffer and keeps the caller's frame untouched.
    logits = df[annotator_columns].to_numpy(dtype=float, copy=True)
    predictions = torch.sigmoid(torch.as_tensor(logits)).detach().cpu().numpy()

    # Number of annotators whose sigmoid output reaches the 0.5 threshold.
    votes = (predictions >= 0.5).sum(axis=1)

    columns = ['hard_label', 'prob_0', 'prob_1']
    df_final = pd.DataFrame(
        {
            'hard_label': (2 * votes > n_annotators).astype(int),
            # Computed from the vote counts directly (not as 1 - prob_1) so the
            # written values carry no artefacts like 0.19999999999999996.
            'prob_0': (n_annotators - votes) / n_annotators,
            'prob_1': votes / n_annotators,
        },
        columns=columns,
    )
    return df_final


In [8]:
# Aggregate the annotator predictions of every split into hard/soft labels.
train_final, val_final, test_final = (
    compute(split_frame) for split_frame in (train, val, test)
)

In [9]:
#saving the train, val and test (for submitting on CodaLab)
for split_frame, tsv_path in (
    (train_final, TRAIN_TSV),
    (val_final, VAL_TSV),
    (test_final, TEST_TSV),
):
    # Create the target directory first so a fresh checkout does not fail on write.
    out_dir = os.path.dirname(tsv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Headerless, index-free, tab-separated rows: hard_label, prob_0, prob_1.
    split_frame.to_csv(tsv_path, header=False, sep="\t", index=False)

In [10]:
def cross_entropy(targets_soft, predictions_soft, epsilon = 1e-12):
    """Mean cross-entropy between soft target and soft predicted distributions.

    Predictions are clipped to ``[epsilon, 1 - epsilon]`` and a further 1e-9 is
    added before taking the log. The summed ``-target * log(prediction)`` over
    all entries is divided by the number of rows of the predictions.
    """
    clipped = np.clip(predictions_soft, epsilon, 1. - epsilon)
    n_items = clipped.shape[0]
    log_predictions = np.log(clipped + 1e-9)
    total = np.sum(targets_soft * log_predictions)
    return -total / n_items

In [11]:
def f1_metric(targets_hard, prediction_hard):
    """Return the micro-averaged F1 score of the hard predictions against the hard targets."""
    return sklearn.metrics.f1_score(targets_hard, prediction_hard, average='micro')

In [12]:
def get_data (myfile):
    """Read a headerless, tab-separated label file.

    Each non-blank line is expected to hold at least three tab-separated
    fields: the hard label, then the two soft-label probabilities.

    Parameters
    ----------
    myfile : str
        Path of the TSV file to read.

    Returns
    -------
    tuple
        ``(soft, hard)`` where ``soft`` is a list of ``[float, float]`` pairs
        (fields 2 and 3 of every line) and ``hard`` is a list with the first
        field of every line, kept as a string.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields.
    ValueError
        If a probability field cannot be parsed as a float.
    """
    soft = []
    hard = []
    with open(myfile, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            # Blank lines (typically a trailing one at end of file) carry no record.
            if not line.strip():
                continue
            parts = line.split('\t')
            soft.append([float(parts[1]), float(parts[2])])
            hard.append(parts[0])
    return (soft, hard)

In [None]:
#cross entropy and F1 for train dataset
# Reference labels first, then the predictions written by this notebook.
soft_ref, hard_ref = get_data(REF_TRAIN)
soft_pred, hard_pred = get_data(TRAIN_TSV)

soft_score = cross_entropy(soft_ref, soft_pred)
print("cross entropy (soft score): ", soft_score)

hard_score = f1_metric(hard_ref, hard_pred)
print("micro F1 (hard score): ", hard_score)

In [None]:
#validating on val dataset
# Reference labels first, then the predictions written by this notebook.
soft_ref, hard_ref = get_data(REF_VAL)
soft_pred, hard_pred = get_data(VAL_TSV)

soft_score = cross_entropy(soft_ref, soft_pred)
print("cross entropy (soft score): ", soft_score)

hard_score = f1_metric(hard_ref, hard_pred)
print("micro F1 (hard score): ", hard_score)

In [None]:
#scores on test dataset
# Reference labels first, then the predictions written by this notebook.
soft_ref, hard_ref = get_data(REF_TEST)
soft_pred, hard_pred = get_data(TEST_TSV)

soft_score = cross_entropy(soft_ref, soft_pred)
print("cross entropy (soft score): ", soft_score)

hard_score = f1_metric(hard_ref, hard_pred)
print("micro F1 (hard score): ", hard_score)