In [None]:
import ast
import json
import random
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, precision_recall_curve, auc as auc_score
from tqdm import tqdm

# Python Class for embedding extraction
from extraction import WordEmbeddingExtraction

In [None]:
# Root folder of the WiC-ITA data and folder for the cached embeddings.
# NOTE(review): later cells build paths both as f'{wicita_path}/binary/...' and as
# f"{wicita_path}binary.jsonl", so the trailing slash on wicita_path matters.
wicita_path = '/project/folder/' # Use your project folder
embs_path = '/project/folder/embeddings' # Folder where embeddings will be stored

In [None]:
def formatname(txt):
    """Return ``txt`` with every ``.jsonl`` occurrence turned into ``.new.jsonl``."""
    pieces = txt.split('.jsonl')
    return '.new.jsonl'.join(pieces)

def change_format(data_filename, binary_filename, ranking_filename, test=None):
    """Flatten a pair-per-line JSONL file into one record per sentence.

    Every input line holds a sentence pair; two records (sentence 1, then
    sentence 2) with keys ``id``, ``lemma``, ``sent``, ``start``, ``end`` are
    written to ``formatname(data_filename)``.

    Args:
        data_filename: path of the pair-level JSONL file.
        binary_filename: output path for one ``label`` per line
            (written only when ``test is None``).
        ranking_filename: output path for one ``score`` per line
            (written only when ``test is None``).
        test: ``None`` for splits that carry gold data; any other value skips
            the gold files. ``'eng'`` additionally reads the lemma from
            ``lemma1`` / ``lemma2`` instead of ``lemma``.
    """
    has_gold = test is None
    lemma_keys = ('lemma1', 'lemma2') if test == 'eng' else ('lemma', 'lemma')

    # The companion ranking file is only needed for the gold scores, so it is
    # not opened at all for test splits.
    ranking_lines = []
    if has_gold:
        ranking_path = data_filename.replace('/binary/', '/ranking/')
        with open(ranking_path, mode='r', encoding='utf-8') as fh:
            ranking_lines = fh.readlines()

    dataset, gold, ranking = [], [], []
    with open(data_filename, mode='r', encoding='utf-8') as fh:
        for i, line in enumerate(fh):
            pair = json.loads(line)
            for side, lemma_key in zip(('1', '2'), lemma_keys):
                dataset.append(dict(id=pair['id'],
                                    lemma=pair[lemma_key],
                                    sent=pair[f'sentence{side}'],
                                    start=pair[f'start{side}'],
                                    end=pair[f'end{side}']))

            if has_gold:
                gold.append(f'{pair["label"]}\n')
                # Line i of the ranking file is taken to describe the same pair.
                ranking.append(f'{json.loads(ranking_lines[i])["score"]}\n')

    pd.DataFrame(dataset).to_json(formatname(data_filename), orient='records', lines=True)

    if has_gold:
        with open(binary_filename, mode='w') as fh:
            fh.writelines(gold)
        with open(ranking_filename, mode='w') as fh:
            fh.writelines(ranking)

def extract(model, dataset, batch_size=32, max_length=512, agg_sub_words='mean', layers=12):
    """Extract target-word embeddings and split them by sentence position.

    Args:
        model: model identifier handed to ``WordEmbeddingExtraction``.
        dataset: dataset argument forwarded to ``WordEmbeddingExtraction.extract``.
        batch_size, max_length, agg_sub_words: forwarded unchanged.
        layers: forwarded to the extractor and used as the highest layer index
            that is split (layers ``1..layers``).

    Returns:
        ``{'sent1': {layer: tensor}, 'sent2': {layer: tensor}}`` where, for each
        layer, even rows of the extractor output go to ``sent1`` and odd rows to
        ``sent2``.

    Note:
        Assumes the extractor result is indexable by layer number and that its
        rows alternate sentence 1 / sentence 2 of each pair — TODO confirm
        against ``WordEmbeddingExtraction``.
    """
    embs = WordEmbeddingExtraction(model).extract(dataset=dataset,
                                                  batch_size=batch_size,
                                                  max_length=max_length,
                                                  agg_sub_words=agg_sub_words,
                                                  layers=layers)
    split_embs = defaultdict(dict)

    # Honour the `layers` argument (the loop bound used to be hard-coded to 12).
    for layer in range(1, layers + 1):
        layer_embs = embs[layer]
        pair_starts = range(0, layer_embs.shape[0], 2)
        # Explicit row indexing keeps the IndexError on an odd number of rows.
        split_embs['sent1'][layer] = torch.stack([layer_embs[i] for i in pair_starts])
        split_embs['sent2'][layer] = torch.stack([layer_embs[i + 1] for i in pair_starts])

    return dict(split_embs)

def compute_scores(embeddings, layers=12):
    """Compute per-pair similarity scores from layer-wise embeddings.

    Args:
        embeddings: ``{'sent1': {layer: tensor}, 'sent2': {layer: tensor}}``;
            row ``i`` of each tensor is the embedding of pair ``i``.
        layers: number of layers to use (layers ``1..layers``).

    Returns:
        A ``defaultdict`` mapping score name to a NumPy array with one value
        per pair:

        * ``CS{j}``   cosine similarity of the two layer-``j`` embeddings;
        * ``CS_AVG``  cosine similarity of the mean of the last four layers;
        * ``-COND``   negated Frobenius condition number of the full
          layer-by-layer cosine-similarity matrix;
        * ``-COND_F`` / ``-COND_M`` / ``-COND_L`` the same for the sub-matrices
          ``[:4, :4]``, ``[4:8, 4:8]`` and ``[-4:, -4:]``.

    Note:
        The fixed sub-matrix slices are written for a 12-layer model.
    """
    scores = defaultdict(list)

    n_pairs = embeddings['sent1'][1].shape[0]

    for i in range(n_pairs):
        embs_t1, embs_t2 = list(), list()

        for j in range(1, layers + 1):
            embs_t1_lj = embeddings['sent1'][j][i].cpu()
            embs_t2_lj = embeddings['sent2'][j][i].cpu()
            embs_t1.append(embs_t1_lj)
            embs_t2.append(embs_t2_lj)

            # cosine similarity = 1 - cosine distance
            cd = cdist([embs_t1_lj.numpy()], [embs_t2_lj.numpy()], metric='cosine')[0][0]
            scores[f'CS{j}'].append(1 - cd)

        avg_t1 = torch.stack(embs_t1[-4:]).mean(axis=0).numpy()
        avg_t2 = torch.stack(embs_t2[-4:]).mean(axis=0).numpy()
        scores['CS_AVG'].append(1 - cdist([avg_t1], [avg_t2], metric='cosine')[0][0])

        # Cosine-similarity matrix between every layer of sentence 1 (rows) and
        # every layer of sentence 2 (columns); computed once.
        cs_matrix = 1 - cdist([e.numpy() for e in embs_t1],
                              [e.numpy() for e in embs_t2],
                              metric='cosine')

        # Insertion order matters: callers iterate the returned mapping.
        blocks = {
            '-COND': cs_matrix,              # full matrix (was the undefined `cond_cs`)
            '-COND_L': cs_matrix[-4:, -4:],  # last four layers
            '-COND_M': cs_matrix[4:8, 4:8],  # layers 5-8
            '-COND_F': cs_matrix[:4, :4],    # first four layers
        }
        for name, block in blocks.items():
            scores[name].append(-np.linalg.cond(block, 'fro'))

    for s in scores:
        scores[s] = np.array(scores[s])

    return scores

def best_threshold(y_true: np.array, y: np.array, func='accuracy') -> tuple:
    """
    Find the decision threshold on ``y`` that maximises accuracy or weighted F1.

    Candidate thresholds are the ones returned by Scikit-learn's ``roc_curve``.
    For each candidate, items with ``y >= threshold`` are predicted positive and
    the chosen metric is computed against ``y_true``. The first candidate that
    reaches the maximum is returned.

    Args:
        y_true(np.array): array containing ground truth values.
        y(np.array): array containing predicted scores.
        func(str): ``'accuracy'`` or ``'f1'`` (F1 with ``average='weighted'``).
    Returns:
        (best metric value rounded to 3 decimals, corresponding threshold)
    Raises:
        ValueError: if ``func`` is neither ``'accuracy'`` nor ``'f1'``.
    """
    metrics = {
        'accuracy': accuracy_score,
        'f1': lambda gold, pred: f1_score(gold, pred, average='weighted'),
    }
    if func not in metrics:
        # Previously this surfaced as an obscure empty-array ValueError from NumPy.
        raise ValueError(f"func must be 'accuracy' or 'f1', got {func!r}")
    metric = metrics[func]

    y = np.asarray(y)

    # Only the thresholds are needed; FPR/TPR are discarded.
    _, _, thresholds = roc_curve(y_true, y)

    # Vectorised comparison instead of a Python loop over every score.
    scores = np.array([metric(y_true, y >= thresh) for thresh in thresholds])

    best = scores.argmax()
    return round(float(scores[best]), 3), thresholds[best]

In [None]:
# Hugging Face model identifiers, keyed by the short name used in embedding file names
models = {'it': 'dbmdz/bert-base-italian-cased',
          'm': 'bert-base-multilingual-cased',
          'xl': 'xlm-roberta-base'}

# WiC-ITA datasets: pair-level JSONL files of the binary task, one entry per split
datasets = {'train': f'{wicita_path}/binary/train.jsonl',
            'dev': f'{wicita_path}/binary/dev.jsonl',
            'test': f'{wicita_path}/binary/test.jsonl',
            'test_eng': f'{wicita_path}/binary/test_eng.jsonl'}

# Filenames of the new gold data files (one label per line), passed to change_format
it_binary_golds = {'train': f'{wicita_path}/train_gold_binary.txt',
                   'dev': f'{wicita_path}/dev_gold_binary.txt',
                   'test': f'{wicita_path}/test_gold_binary.txt',
                   'test_eng': f'{wicita_path}/test_eng_gold_binary.txt'}

# Same, for the ranking gold scores (one score per line)
it_ranking_golds = {'train': f'{wicita_path}/train_gold_ranking.txt',
                   'dev': f'{wicita_path}/dev_gold_ranking.txt',
                   'test': f'{wicita_path}/test_gold_ranking.txt',
                   'test_eng': f'{wicita_path}/test_eng_gold_ranking.txt'}

In [None]:
# Change data format: make sure you downloaded the data and created the directory.
# Train and dev carry gold data (test=None), so their gold files are written as well;
# the two test splits pass a non-None `test` and only get the reformatted .new.jsonl file.
change_format(datasets['train'], it_binary_golds['train'], it_ranking_golds['train'])
change_format(datasets['dev'], it_binary_golds['dev'], it_ranking_golds['dev'])
change_format(datasets['test'], it_binary_golds['test'], it_ranking_golds['test'], test='it')
change_format(datasets['test_eng'], it_binary_golds['test_eng'], it_ranking_golds['test_eng'], test='eng')

In [None]:
# Collect embeddings for each model: every split is extracted from its reformatted
# (.new.jsonl) file and saved as {embs_path}/{model key}_{split}.pt
for m in list(models):
    train_embs = extract(models[m], formatname(datasets['train']))
    dev_embs = extract(models[m], formatname(datasets['dev']))
    test_embs = extract(models[m], formatname(datasets['test']))
    test_eng_embs = extract(models[m], formatname(datasets['test_eng']))
    
    torch.save(train_embs, f'{embs_path}/{m}_train.pt')
    torch.save(dev_embs, f'{embs_path}/{m}_dev.pt')
    torch.save(test_embs, f'{embs_path}/{m}_test.pt')
    torch.save(test_eng_embs, f'{embs_path}/{m}_test_eng.pt')

In [None]:
# Choose a model: must be one of the keys of `models`, because it selects
# which saved {model}_{split}.pt embedding files are loaded next
model = 'xl'

In [None]:
# Load embeddings saved by the extraction cell for the chosen model.
# This overwrites the train/dev/test variables left over from the extraction loop.
train_embs = torch.load(f'{embs_path}/{model}_train.pt')
dev_embs = torch.load(f'{embs_path}/{model}_dev.pt')
test_embs = torch.load(f'{embs_path}/{model}_test.pt')
test_eng_embs = torch.load(f'{embs_path}/{model}_test_eng.pt')

In [None]:
# Compute scores [cosine similarities and condition numbers] for every split,
# using compute_scores' default of 12 layers
scores_train = compute_scores(train_embs)
scores_dev = compute_scores(dev_embs)
scores_test = compute_scores(test_embs)
scores_test_eng = compute_scores(test_eng_embs)

In [None]:
# Binary and Ranking ground truth
def _read_gold(path):
    """Parse one Python literal per line of ``path`` into a NumPy array."""
    # ast.literal_eval replaces the former eval(): it yields the same int/float
    # values for numeric literals but cannot execute code found in the file.
    # The file is also closed deterministically.
    with open(path, mode='r') as fh:
        return np.array([ast.literal_eval(line.strip()) for line in fh])

bin_gold_train = _read_gold(it_binary_golds['train'])
bin_gold_dev = _read_gold(it_binary_golds['dev'])
rank_gold_train = _read_gold(it_ranking_golds['train'])
rank_gold_dev = _read_gold(it_ranking_golds['dev'])

# No gold files are written for the test splits (change_format is called with `test` set):
#gold_test = _read_gold(it_binary_golds['test'])
#gold_test_eng = _read_gold(it_binary_golds['test_eng'])

In [None]:
def full_eval_binary(Y_train, Y_test, scores_train, scores_test):
    """Tune a threshold per measure on the train scores and evaluate it on test.

    For every measure in ``scores_train`` two thresholds are tuned with
    ``best_threshold``: one maximising accuracy and one maximising weighted F1.

    Args:
        Y_train, Y_test: gold binary labels for the two sets.
        scores_train, scores_test: mappings ``measure -> array of scores``.

    Returns:
        A DataFrame with one row per measure, sorted by ``acc_test`` then
        ``measure`` (descending), with columns:

        * ``score_train`` / ``score_test``: 0/1 predictions at the accuracy threshold;
        * ``acc_train`` / ``acc_test``: accuracy at the accuracy threshold;
        * ``f_wscore_train`` / ``f_wscore_test``: weighted F1 at the F1 threshold;
        * ``thr``: the F1 threshold (not the accuracy one).
    """
    stats = defaultdict(lambda: defaultdict(int))

    for s in scores_train:
        train_scores = np.asarray(scores_train[s])
        test_scores = np.asarray(scores_test[s])

        acc, acc_thr = best_threshold(Y_train, train_scores, 'accuracy')
        test_preds = (test_scores >= acc_thr).astype(int)
        stats[s]['score_train'] = (train_scores >= acc_thr).astype(int)
        stats[s]['score_test'] = test_preds
        stats[s]['acc_train'] = acc
        stats[s]['acc_test'] = accuracy_score(Y_test, test_preds)

        f1, f1_thr = best_threshold(Y_train, train_scores, 'f1')
        stats[s]['f_wscore_train'] = f1
        # average='weighted' so the test value is the same metric as the one
        # tuned on train (it used to fall back to the default binary F1).
        stats[s]['f_wscore_test'] = f1_score(Y_test,
                                             (test_scores >= f1_thr).astype(int),
                                             average='weighted')
        stats[s]['thr'] = f1_thr

    df = pd.DataFrame(stats).T.reset_index().rename(columns={'index': 'measure'})
    df = df.sort_values(by=['acc_test', 'measure'], ascending=False)
    return df

def full_eval_ranking(Y_train, Y_test, scores_train, scores_test):
    """Spearman-correlate every measure with the gold ranking on train and test.

    Returns a DataFrame with one row per measure holding the raw scores
    (``score_train`` / ``score_test``) plus the correlation and p-value for
    both sets, sorted by ``corr_test`` then ``measure`` in descending order.
    """
    per_measure = {}

    for name in scores_train:
        train_scores = scores_train[name]
        test_scores = scores_test[name]
        rho_train, p_train = spearmanr(Y_train, train_scores)
        rho_test, p_test = spearmanr(Y_test, test_scores)
        per_measure[name] = {
            'score_train': train_scores,
            'score_test': test_scores,
            'corr_train': rho_train,
            'pvalue_train': p_train,
            'corr_test': rho_test,
            'pvalue_test': p_test,
        }

    table = pd.DataFrame(per_measure).T.reset_index()
    table = table.rename(columns={'index': 'measure'})
    return table.sort_values(by=['corr_test', 'measure'], ascending=False)

In [None]:
# Thresholds tuned on train, evaluated on dev (the "_t" tables)
binary_t = full_eval_binary(bin_gold_train, bin_gold_dev, scores_train, scores_dev)
ranking_t = full_eval_ranking(rank_gold_train, rank_gold_dev, scores_train, scores_dev)

In [None]:
# Reverse direction: thresholds tuned on dev, evaluated on train (the "_d" tables)
binary_d = full_eval_binary(bin_gold_dev, bin_gold_train, scores_dev, scores_train)
ranking_d = full_eval_ranking(rank_gold_dev, rank_gold_train, scores_dev, scores_train)

In [None]:
# Score names used in the repeated-split evaluation; each must be a key of the
# dictionaries returned by compute_scores
measures = ['-COND', '-COND_L', '-COND_M', '-COND_F', 
            'CS10', 'CS_AVG', 'CS9', 'CS8', 'CS11', 'CS7',
            'CS6', 'CS5', 'CS12', 'CS4', 'CS3', 'CS1', 'CS2']

In [None]:
# Pool dev and train (dev first, in all three structures so rows stay aligned)
bin_gold_tot = np.concatenate([bin_gold_dev, bin_gold_train])
rank_gold_tot = np.concatenate([rank_gold_dev, rank_gold_train])
scores_tot = {m: np.concatenate([scores_dev[m], scores_train[m]]) for m in measures}

In [None]:
# Single source of truth for the random seed used by the split evaluation below
seed = 42

def set_seed(seed: int = 42) -> None:
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Use the configured constant instead of repeating the literal 42
set_seed(seed)

# Repeated random-split evaluation on the pooled dev+train data:
# stats[measure][metric] collects one value per split.
stats=defaultdict(lambda: defaultdict(list))
    
n_splits = 100
n_example_train = 2000  # the first 2000 shuffled indices form the train part, the rest the test part
mask = list(range(0, bin_gold_tot.shape[0]))

for i in tqdm(range(n_splits)):
    # In-place shuffle with Python's `random` module; each split starts from the previous order
    random.shuffle(mask)
    
    # Gold true: Train - Test
    bin_cv_train_gold = bin_gold_tot[mask[:n_example_train]]
    bin_cv_test_gold = bin_gold_tot[mask[n_example_train:]]
    rank_cv_train_gold = rank_gold_tot[mask[:n_example_train]]
    rank_cv_test_gold = rank_gold_tot[mask[n_example_train:]]
    
    # Predictions: Train - Test
    cv_scores_train = {m: scores_tot[m][mask[:n_example_train]] for m in measures}
    cv_scores_test = {m: scores_tot[m][mask[n_example_train:]] for m in measures}
    
    for s in cv_scores_train:
        # Ranking task: Spearman correlation between gold scores and the measure
        corr, pvalue=spearmanr(rank_cv_train_gold, cv_scores_train[s])
        stats[s]['corr_train'].append(corr)
        stats[s]['pvalue_train'].append(pvalue)
        
        corr, pvalue=spearmanr(rank_cv_test_gold, cv_scores_test[s])
        stats[s]['corr_test'].append(corr)
        stats[s]['pvalue_test'].append(pvalue)
        
        # Binary task: threshold tuned for weighted F1 on the train part only,
        # then applied unchanged to both parts
        _, thr=best_threshold(bin_cv_train_gold, cv_scores_train[s], func='f1')
        train_preds = [int(i>=thr) for i in cv_scores_train[s]]
        pr, re, f1, _ = precision_recall_fscore_support(bin_cv_train_gold, train_preds, average='weighted')
        stats[s]['pr_train'].append(pr)
        stats[s]['re_train'].append(re)
        stats[s]['f1_train'].append(f1)
        
        test_preds = [int(i>=thr) for i in cv_scores_test[s]]
        pr, re, f1, _ = precision_recall_fscore_support(bin_cv_test_gold, test_preds, average='weighted')
        stats[s]['pr_test'].append(pr)
        stats[s]['re_test'].append(re)
        stats[s]['f1_test'].append(f1)
            
        stats[s]['thr'].append(thr)

In [None]:
# Development Stats: average every collected metric (including 'thr') over the splits
df_stats=defaultdict(lambda: defaultdict(list))

for m in stats:
    for s in list(stats[m]):
        df_stats[m][s] = np.array(stats[m][s]).mean()

# One row per measure, rounded to 3 decimals; the cell displays it sorted by mean test F1
df = pd.DataFrame(df_stats).T.round(3).sort_index()
df.sort_values('f1_test', ascending=False)

In [None]:
def submit(filename, goldname, scores, output):
    """Write one prediction per sentence pair as JSON lines, without a trailing newline.

    Args:
        filename: sentence-level JSONL file (two consecutive lines per pair);
            the ``id`` of each pair is read from the first of its two lines.
        goldname: unused; kept so existing calls keep working.
        scores: sequence with one prediction per pair, in file order.
        output: destination path. If the string ``'binary'`` occurs anywhere in
            it, predictions are stored under ``label``, otherwise under ``score``.
            NOTE(review): this is a substring test on the whole path, so a
            directory named e.g. ``binary`` would also select ``label``.
    """
    with open(filename, mode='r', encoding='utf-8') as fh:
        lines = fh.readlines()

    field = 'label' if 'binary' in output else 'score'
    res = [{'id': json.loads(lines[i])['id'], field: scores[k]}
           for k, i in enumerate(range(0, len(lines), 2))]

    # Serialise in memory and strip the final newline there, instead of writing
    # the file, reading it back and rewriting it. An empty input yields an empty
    # file rather than an IndexError.
    payload = pd.DataFrame(res).to_json(orient='records', lines=True).strip() if res else ''

    with open(output, mode='w', encoding='utf-8') as fh:
        fh.write(payload)

In [None]:
# Write binary (thresholded) and ranking (raw score) prediction files for every split
# using a single measure.
# NOTE(review): thr is a hard-coded constant; presumably copied from the 'thr' value
# obtained for this measure in the evaluation above — confirm before re-running.
# Output paths are built as f"{wicita_path}<name>" with no separator, so they rely on
# wicita_path ending with a slash.
measure = '-COND_M'
thr= -1195.522

submit(formatname(datasets['dev']), it_binary_golds['dev'], [int(i>=thr) for i in scores_dev[measure]], f"{wicita_path}binary_dev.jsonl")
submit(formatname(datasets['dev']), it_ranking_golds['dev'], scores_dev[measure], f"{wicita_path}ranking_dev.jsonl")

submit(formatname(datasets['train']), it_binary_golds['train'],  [int(i>=thr) for i in scores_train[measure]], f"{wicita_path}binary_train.jsonl")
submit(formatname(datasets['train']), it_ranking_golds['train'], scores_train[measure], f"{wicita_path}ranking_train.jsonl")

submit(formatname(datasets['test']), it_binary_golds['test'], [int(i>=thr) for i in scores_test[measure]], f"{wicita_path}binary.jsonl")
submit(formatname(datasets['test']), it_ranking_golds['test'], scores_test[measure], f"{wicita_path}ranking.jsonl")

submit(formatname(datasets['test_eng']), it_binary_golds['test_eng'], [int(i>=thr) for i in scores_test_eng[measure]], f"{wicita_path}binary_eng.jsonl")
submit(formatname(datasets['test_eng']), it_ranking_golds['test_eng'], scores_test_eng[measure], f"{wicita_path}ranking_eng.jsonl")