## Objective
This notebook takes the word-wise decisions produced by the individual models and layers them, in order to evaluate selected combinations of models on the TEST dataset.

In [None]:
import pandas as pd
import re
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pickle



In [None]:
## word-level data (train / dev / test splits)
words_train = pd.read_csv('../data/train_wordwise_clean.csv')
words_dev = pd.read_csv('../data/dev_wordwise_clean.csv')
words_test = pd.read_csv('../data/test_wordwise_clean.csv')

## fragment-level data (train / dev / test splits); column '0' holds the fragment text
frag_train = pd.read_csv('../data/all_train_aligned.csv')
frag_dev = pd.read_csv('../data/all_dev_aligned.csv')
frag_test = pd.read_csv('../data/all_test_aligned.csv')

# Keep only the rows whose fragment is an actual string (missing cells are not `str`).
# The check runs on the single column instead of a row-wise DataFrame.apply, and uses
# isinstance() rather than comparing type objects.
frag_train = frag_train[frag_train['0'].map(lambda frag: isinstance(frag, str))]
frag_dev = frag_dev[frag_dev['0'].map(lambda frag: isinstance(frag, str))]
frag_test = frag_test[frag_test['0'].map(lambda frag: isinstance(frag, str))]

In [None]:

# Load the pickled TEST-split decisions of each individual model.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced
# by this project.

# BERT word-wise decisions
with open('../models/BERT/pickled_results/test_wordwise_decisions.pkl', 'rb') as handle:
    bert_test_wordwise_decisions = pickle.load(handle)

# Lexicon (LEX) decisions
with open('../models/LEX/lexicon_decisions_test.pkl', 'rb') as handle:
    lexicon_test = pickle.load(handle)

# MLE decisions, "85" variant (presumably a confidence threshold -- confirm)
with open('../models/MLE/mle_85_decisions_test.pkl', 'rb') as handle:
    mle_85_test = pickle.load(handle)

# MLE decisions, "0" variant
with open('../models/MLE/mle_0_decisions_test.pkl', 'rb') as handle:
    mle_0_test = pickle.load(handle)


#### Preprocess BERT outputs
Our BERT model is limited to 20 tokens, which is enough for the large majority of fragments but too short for a few exceptionally long ones. We pad the BERT decisions with zeroes so that each decision array has the same length as the corresponding arrays from the other models.


In [None]:
#### TODO: check performance when padded to 30? (only lose around 150 tokens)
# Right-pad each BERT decision sequence with zeroes up to the number of word-level
# decisions the MLE output holds for the same fragment. A sequence that is already
# long enough is left unchanged ([0] * n is empty for n <= 0).
# list(...) makes the concatenation work for any sequence type, not only plain lists.
bert_test_padded_decisions = [
    list(bert_frag) + [0] * (len(reference['levels']) - len(bert_frag))
    for reference, bert_frag in zip(mle_0_test, bert_test_wordwise_decisions)
]

### Create default level vectors

In [None]:
# Constant "default" predictions: every word of every TEST fragment gets level 0, 3, 4 or 5.
# Each entry is stored as {'levels': [...]}, the same access pattern as the model outputs,
# because the later cells read these baselines with `entry['levels']`. A bare list there
# raises "TypeError: list indices must be integers or slices, not str".
levels_0 = []
levels_3 = []
levels_4 = []
levels_5 = []

for frag in mle_0_test:
    n_words = len(frag['levels'])
    levels_0.append({'levels': [0] * n_words})
    levels_3.append({'levels': [3] * n_words})
    levels_4.append({'levels': [4] * n_words})
    levels_5.append({'levels': [5] * n_words})


### Get ground truth
The ground truth is included with the output of every model.

In [None]:
# Per-fragment ground truth, read from the 'gts' field of the MLE output.
test_gt = [entry['gts'] for entry in mle_0_test]

# Flattened, word-level ground truth across all fragments.
wordwise_gt = np.concatenate(test_gt)

In [None]:
def level_keep0(l):
    """Round levels 1-2 (returned sometimes by LEX) up to 3, keeping 0 as 0.

    Returns 0 for any value that is not greater than 0, 3 for any value
    greater than 0 and below 3, and the value itself otherwise.
    """
    if not l > 0:
        return 0
    return 3 if l < 3 else l

In [None]:
# Word-level decision vectors: each source's per-fragment 'levels' are flattened into
# one sequence covering all fragments.
decisions = {
    name: np.concatenate([entry['levels'] for entry in per_fragment])
    for name, per_fragment in (
        ('levels_0', levels_0),
        ('levels_3', levels_3),
        ('levels_4', levels_4),
        ('levels_5', levels_5),
        ('levels_mle', mle_0_test),
        ('levels_mle_85', mle_85_test),
        ('levels_lexicon', lexicon_test),
    )
}
# Lexicon levels 1-2 are rounded up to 3 (0 stays 0).
decisions['levels_lexicon'] = [level_keep0(level) for level in decisions['levels_lexicon']]
# BERT outputs are shifted by +3 (presumably class indices 0-2 map to levels 3-5 -- confirm).
# Padding zeroes therefore become level 3.
decisions['levels_bert'] = [label + 3 for label in np.concatenate(bert_test_padded_decisions)]

### Layered Experiments

In [None]:
def comb_experiment_pipeline(decision_1, decision_2=None, decision_final=None):
    """Layer up to three word-level decision vectors.

    For every position, the value from ``decision_1`` is kept unless it is 0.
    A 0 is replaced by the value at the same position in ``decision_2``, and
    if that is also 0, by the value in ``decision_final``.

    Args:
        decision_1: Primary sequence of per-word levels.
        decision_2: Fallback sequence used where ``decision_1`` is 0.
            Defaults to ``decisions['levels_0']``.
        decision_final: Last-resort sequence used where the result is still 0.
            Defaults to ``decisions['levels_0']``.

    Returns:
        A list with one level per element of ``decision_1``.
    """
    # Defaults are resolved at call time, so re-running the cell that builds
    # `decisions` is picked up without re-defining this function.
    if decision_2 is None:
        decision_2 = decisions['levels_0']
    if decision_final is None:
        decision_final = decisions['levels_0']

    # First fallback layer.
    layered = [
        first if first != 0 else decision_2[i]
        for i, first in enumerate(decision_1)
    ]
    # Final fallback layer.
    return [
        value if value != 0 else decision_final[i]
        for i, value in enumerate(layered)
    ]

# Word-level layered experiments, given as (first layer, second layer, final layer).
exps = [
    comb_experiment_pipeline(
        decisions[first],
        decision_2=decisions[second],
        decision_final=decisions[last],
    )
    for first, second, last in (
        ('levels_mle_85', 'levels_lexicon', 'levels_bert'),
        ('levels_mle', 'levels_0', 'levels_bert'),
        ('levels_lexicon', 'levels_0', 'levels_bert'),
        ('levels_bert', 'levels_0', 'levels_3'),
    )
]


In [None]:
# Integer word-level ground truth for the reports below.
# `test_gt` is a plain list of per-fragment sequences and has no `.astype`; the
# flattened `wordwise_gt` array is the one to convert. `results_to_csv` reads the
# labels from `final_gt`, so that name is bound here as well.
wordwise_gt = wordwise_gt.astype(int)
final_gt = wordwise_gt
def results_to_csv(result_arr):
    """Build one row of word-level metrics per experiment.

    Each element of ``result_arr`` is scored against the module-level
    ``final_gt`` with ``classification_report``. A row holds, for the classes
    '3', '4' and '5' in turn, f1-score, precision and recall, followed by the
    overall accuracy and the macro-averaged f1-score (11 values).

    Returns:
        A list of 1-D numpy arrays, one per experiment.
    """
    def _metrics_row(predictions):
        report = classification_report(final_gt, predictions, output_dict=True)
        per_level = [
            report[level][metric]
            for level in ('3', '4', '5')
            for metric in ('f1-score', 'precision', 'recall')
        ]
        summary = [report['accuracy'], report['macro avg']['f1-score']]
        return np.array(per_level + summary)

    return [_metrics_row(resu) for resu in result_arr]

# One metrics row per word-level experiment.
all_rows = results_to_csv(exps)

# Column order follows the row layout built by results_to_csv:
# (f1, precision, recall) for levels 3, 4, 5, then accuracy and macro f1.
df_results_words = pd.DataFrame(
    data=all_rows,
    columns=[
        'f1_3', '3_prec', '3_recall',
        'f1_4', '4_prec', '4_recall',
        'f1_5', '5_prec', '5_recall',
        'accuracy', 'f1_macro',
    ],
)

In [None]:
# Display the word-level results table (one row per experiment in `exps`).
df_results_words

### Aggregation into Fragment Level Experiments

In [None]:
# Per-fragment decision vectors: one sequence of word levels per fragment.
frag_0 = [frag['levels'] for frag in levels_0]
frag_3 = [frag['levels'] for frag in levels_3]
frag_4 = [frag['levels'] for frag in levels_4]
frag_5 = [frag['levels'] for frag in levels_5]
frag_mle = [frag['levels'] for frag in mle_0_test]
frag_mle_85 = [frag['levels'] for frag in mle_85_test]
# Lexicon levels go through the same `level_keep0` rounding as the word-level
# experiments (levels 1-2 -> 3, 0 stays 0), so both evaluations agree.
frag_lexicon = [[level_keep0(l) for l in frag['levels']] for frag in lexicon_test]
# BERT outputs are shifted by +3, as in the word-level experiments.
frag_bert = [[label + 3 for label in frag] for frag in bert_test_padded_decisions]

In [None]:
def frag_exps_pipeline(decision_1, decision_2=None, decision_final=None, frags=None):
    """Layer per-fragment decisions and reduce each fragment to one level.

    For every fragment, the word-level values of ``decision_1`` are kept
    unless they are 0. Zeros are filled from ``decision_2`` and any remaining
    zeros from ``decision_final``. The fragment prediction is the maximum of
    the layered word levels, and the gold level is the maximum word label
    found in the fragment text.

    Args:
        decision_1: Per-fragment sequences of word levels (first layer).
        decision_2: Per-fragment fallback sequences. Defaults to ``frag_0``.
        decision_final: Per-fragment last-resort sequences. Defaults to
            ``frag_0``.
        frags: Iterable of fragment strings made of space-separated
            ``token#label`` items. Defaults to ``frag_test['0']``, the split
            every decision vector in this notebook is computed on (the
            previous default read the gold labels from the DEV split).

    Returns:
        A list of ``[gold_level, pred]`` pairs, one per fragment. Iteration
        stops at the shortest of the four inputs.
    """
    # Defaults are resolved at call time rather than at definition time.
    if decision_2 is None:
        decision_2 = frag_0
    if decision_final is None:
        decision_final = frag_0
    if frags is None:
        frags = frag_test['0']

    all_results = []
    for d1, d2, d3, frag in zip(decision_1, decision_2, decision_final, frags):
        # The label is the second '#'-separated field of each token.
        gold_level = max(int(tok.split('#')[1]) for tok in frag.split(' '))

        layered = [dec if dec != 0 else alt for dec, alt in zip(d1, d2)]
        layered = [dec if dec != 0 else alt for dec, alt in zip(layered, d3)]

        all_results.append([gold_level, max(layered)])
    return all_results


In [None]:
# Fragment-level layered experiments. Gold labels are read from the TEST fragments,
# passed explicitly so they come from the same split as the *_test decision vectors.
exps = [
    frag_exps_pipeline(frag_mle_85, decision_2=frag_lexicon, decision_final=frag_bert, frags=frag_test['0']),
    frag_exps_pipeline(frag_mle, decision_2=frag_0, decision_final=frag_bert, frags=frag_test['0']),
    frag_exps_pipeline(frag_lexicon, decision_2=frag_0, decision_final=frag_bert, frags=frag_test['0']),
    frag_exps_pipeline(frag_bert, decision_2=frag_0, decision_final=frag_3, frags=frag_test['0']),
]

In [None]:
def frag_results_to_csv(result_arr):
    """Build one row of fragment-level metrics per experiment.

    Each element of ``result_arr`` is a sequence of ``[gt, pred]`` pairs. The
    pairs are scored with ``classification_report``. A row holds, for the
    classes '3', '4' and '5' in turn, f1-score, precision and recall, followed
    by the overall accuracy and the macro-averaged f1-score (11 values).

    Returns:
        A list of 1-D numpy arrays, one per experiment.
    """
    def _metrics_row(pairs):
        table = pd.DataFrame(pairs, columns=['gt', 'pred'])
        report = classification_report(table['gt'], table['pred'], output_dict=True)
        per_level = [
            report[level][metric]
            for level in ('3', '4', '5')
            for metric in ('f1-score', 'precision', 'recall')
        ]
        summary = [report['accuracy'], report['macro avg']['f1-score']]
        return np.array(per_level + summary)

    return [_metrics_row(resu) for resu in result_arr]

In [None]:
# Fragment-level results table, labelled like the word-level table. The column order
# follows the rows built by frag_results_to_csv: (f1, precision, recall) for levels
# 3, 4, 5, then accuracy and macro f1.
frag_df = pd.DataFrame(
    frag_results_to_csv(exps),
    columns=[
        'f1_3', '3_prec', '3_recall',
        'f1_4', '4_prec', '4_recall',
        'f1_5', '5_prec', '5_recall',
        'accuracy', 'f1_macro',
    ],
)

In [None]:
# Display the fragment-level results table (one row per experiment in `exps`).
frag_df