# Objective
For the layered MLE (x, y) -> Lexicon -> BERT, cross validate x, y.

x: minimum confidence needed to trust the MLE for a token (i.e., the token's most frequent level must account for at least x% of its training occurrences)
y: minimum occurrences (the MLE is trusted for a token only if the token was seen at least y times in training)

In [None]:
import pandas as pd

from tqdm import tqdm
import re
from sklearn.metrics import classification_report
import numpy as np
import math
from tqdm import tqdm
import os


In [None]:
## train data
# Word-level splits; `words_train` must expose 'Word' and 'Label' columns
# (read by mle_training_pipeline_aligned).
words_train = pd.read_csv('../../data/train_wordwise_clean.csv')
words_dev = pd.read_csv('../../data/dev_wordwise_clean.csv')
words_test = pd.read_csv('../../data/test_wordwise_clean.csv')

## testing data in fragments
# Column '0' holds one fragment per row, written as space-separated
# "token#level" items (see levels_pipeline for the parsing).
frag_train = pd.read_csv('../../data/all_train_aligned.csv')
frag_dev = pd.read_csv('../../data/all_dev_aligned.csv')
frag_test = pd.read_csv('../../data/all_test_aligned.csv')


# Drop rows whose fragment is not a string (e.g. NaN read from an empty cell).
# The mask is built column-wise: a row-wise DataFrame.apply(axis=1) constructs
# a Series per row only to look at a single value. `.astype(bool)` keeps the
# mask boolean even when the frame is empty.
frag_train = frag_train[frag_train['0'].map(lambda v: isinstance(v, str)).astype(bool)]
frag_dev = frag_dev[frag_dev['0'].map(lambda v: isinstance(v, str)).astype(bool)]
frag_test = frag_test[frag_test['0'].map(lambda v: isinstance(v, str)).astype(bool)]

### Acquire other decisions
Load the precomputed dev-set decisions of the BERT and Lexicon models.

In [None]:
import pickle

# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this project.

# BERT decisions for the dev set. Later cells iterate this in lockstep with
# the dev fragments and take len() of each element, so it is expected to be
# one list of per-token decisions per fragment.
with open('../../models/BERT/pickled_results/dev_wordwise_decisions.pkl', 'rb') as f:
  bert_dev_wordwise_decisions = pickle.load(f)

# Lexicon decisions for the dev set. Later cells read element['levels'] for
# every element of this collection.
with open('../../models/LEX/lexicon_decisions_dev.pkl', 'rb') as f:
  lexicon_dev = pickle.load(f)

### MLE training pipeline

In [None]:
def get_mle_counts_aligned(words, levels):
  """Count, for every word, how often it was observed at each level.

  Args:
    words: iterable of hashable tokens.
    levels: iterable of labels aligned with `words`; each must be 3, 4 or 5.
      If the two iterables differ in length, the extra items are ignored.

  Returns:
    dict mapping word -> {3: count, 4: count, 5: count}.

  Raises:
    KeyError: if a label is not one of 3, 4, 5.
  """
  dict_levels = {}
  for word, level in zip(words, levels):
    # setdefault creates the zeroed counter only the first time a word is
    # seen. A bare `except` used for this would also fire on an unknown
    # label and wipe the counts already collected for the word.
    counts = dict_levels.setdefault(word, {3: 0, 4: 0, 5: 0})
    counts[level] += 1
  return dict_levels






In [None]:
# ugly, lazy global
# Every call to mle_training_pipeline_aligned appends one row
# [confidence, min_occurrences, number_of_rejected_tokens] to this list.
# It is never cleared by the pipeline, so re-running the training cell
# without re-running this one accumulates duplicate rows.
stats = []

def max_frequency_strategy(dict_levels, confidence = 0, min_occurrences = 1):
  """Reduce per-level counts to one level per token, filtering weak evidence.

  Args:
    dict_levels: mapping token -> {level: count}.
    confidence: minimum share of a token's occurrences that its most
      frequent level must reach (inclusive).
    min_occurrences: minimum total occurrences of the token (inclusive).

  Returns:
    (accepted, rejected): `accepted` maps each retained token to its most
    frequent level (the first such level in dict order on ties);
    `rejected` is the number of tokens that failed either threshold.
  """
  accepted = {}
  rejected = 0
  for token, level_counts in dict_levels.items():
    best_level, best_count = max(level_counts.items(), key = lambda pair: pair[1])
    total = sum(level_counts.values())
    share = best_count / total
    if share >= confidence and total >= min_occurrences:
      accepted[token] = best_level
    else:
      rejected += 1
  return accepted, rejected

def mle_training_pipeline_aligned(data, strategy, confidence = 0, min_occurrences = 1):
  """Train one MLE lookup table (token -> level) from word-level data.

  Args:
    data: table exposing aligned 'Word' and 'Label' columns.
    strategy: callable(counts, confidence=..., min_occurrences=...) returning
      (model, rejected_count), e.g. max_frequency_strategy.
    confidence: forwarded to `strategy`.
    min_occurrences: forwarded to `strategy`.

  Returns:
    The model returned by `strategy`.

  Side effects:
    Appends [confidence, min_occurrences, rejected_count] to the
    module-level `stats` list.
  """
  level_counts = get_mle_counts_aligned(data['Word'], data['Label'])
  model, n_rejected = strategy(
      level_counts,
      confidence = confidence,
      min_occurrences = min_occurrences,
  )
  stats.append([confidence, min_occurrences, n_rejected])
  return model



#### Cross validate min-confidence 30-100 (step 5), min-occurrences 1-7

In [None]:

# Train one MLE model per grid point: 7 min-occurrence values (1..7) times
# 15 confidence values (0.30, 0.35, ..., 1.00) = 105 models.
# Order is min_occurrences-major: index = (y - 1) * 15 + (x - 30) // 5.
# Each call also appends one row to the global `stats` list.
mle_thresholds = [mle_training_pipeline_aligned(
    words_train, 
    max_frequency_strategy, 
    confidence = x/100, 
    min_occurrences = y) 
    for y in range(1,8) 
    for x in range(30, 105, 5)]
# One decision function per trained MLE model, with the (token, analyses)
# call shape that levels_pipeline uses for its decision callbacks; tokens the
# model does not know get level 0.
# `mle=mle` binds each model when its lambda is created. Without it every
# lambda would look up `mle` at call time and all of them would query the
# last model in mle_thresholds.
mle_functions = [
    lambda token, analyses, mle=mle: mle.get(token, 0) for mle in mle_thresholds
]


In [None]:
# Number of tokens kept by the first model of the grid
# (confidence 0.30, min_occurrences 1).
len(mle_thresholds[0])

In [None]:
# One row per training run logged in `stats`; the 'threshold' column holds the
# min_occurrences value and 'tok_rejected' the number of tokens filtered out.
stats_df = pd.DataFrame(stats, columns =['confidence', 'threshold', 'tok_rejected'])

In [None]:
def get_rl_0(token, analyses, oov_level = 0):
  """Constant decision function: ignore token and analyses, return oov_level (0 by default)."""
  return oov_level

def get_rl_3(token, analyses, oov_level = 3):
  """Constant decision function: ignore token and analyses, return oov_level (3 by default)."""
  return oov_level

def get_rl_4(token, analyses, oov_level = 4):
  """Constant decision function: ignore token and analyses, return oov_level (4 by default)."""
  return oov_level

def get_rl_5(token, analyses, oov_level = 5):
  """Constant decision function: ignore token and analyses, return oov_level (5 by default)."""
  return oov_level

In [None]:
def levels_pipeline(fragment, decision_1, decision_2, requires_disambig = False):
  """Assign a level to every token of a fragment with a two-stage fallback.

  Args:
    fragment: space-separated "token#label" items.
    decision_1: callable(token, analysis) -> level; 0 means "undecided".
    decision_2: callable(token, analysis) -> level, consulted only for
      tokens that decision_1 left at 0.
    requires_disambig: when True, analyses come from the global
      `bert_disambig` (NOTE(review): not defined in this notebook as shown --
      confirm it exists before enabling). Otherwise the gold labels are
      passed as the analyses.

  Returns:
    {'levels': per-token levels, 'gts': per-token gold labels as strings}.
  """
  fields = [item.split('#') for item in fragment.split(' ')]
  tokens = [parts[0] for parts in fields]
  gt = [parts[1] for parts in fields]

  analyses = (
      [token.analyses for token in bert_disambig.disambiguate(tokens)]
      if requires_disambig
      else gt
  )

  # Round 1 runs over the whole fragment before any fallback is consulted.
  first_round = [decision_1(token, analysis) for token, analysis in zip(tokens, analyses)]

  # Round 2 only fills the positions that round 1 left undecided (0).
  levels = []
  for level, token, analysis in zip(first_round, tokens, analyses):
    if level != 0:
      levels.append(level)
    else:
      levels.append(decision_2(token, analysis))

  return {'levels': levels, 'gts': gt}

def mle_levels_pipeline(fragment, model):
  """Look up every token of a fragment in an MLE model.

  Args:
    fragment: space-separated items; the token is the text before the first
      '#' of each item. The label part is not needed here, so items without
      a label are accepted too.
    model: mapping token -> level.

  Returns:
    {'levels': per-token levels}, with 0 for tokens missing from the model.
  """
  tokens = [t.split('#')[0] for t in fragment.split(' ')]
  return {
      'levels': [model.get(token, 0) for token in tokens]
  }



In [None]:
# Constant baselines over the dev fragments: every token gets level 0, 3, 4
# or 5 respectively. levels_0 also serves as the source of gold labels
# ('gts') and of per-fragment token counts in later cells.
levels_0 = [levels_pipeline(f, get_rl_0, get_rl_0) for f in frag_dev['0']]
levels_3 = [levels_pipeline(f, get_rl_3, get_rl_3) for f in frag_dev['0']]
levels_4 = [levels_pipeline(f, get_rl_4, get_rl_4) for f in frag_dev['0']]
levels_5 = [levels_pipeline(f, get_rl_5, get_rl_5) for f in frag_dev['0']]

## The lexicon may assign level 1 or 2; those are raised to 3 here.

def level_keep0(l):
  """Clamp a lexicon level: non-positive -> 0, below 3 -> 3, otherwise unchanged."""
  if not l > 0:
    return 0
  return 3 if l < 3 else l


In [None]:
# levels_mles[m][f] is the {'levels': [...]} result of MLE model m on dev
# fragment f; the outer order follows mle_thresholds.
levels_mles = [
    [
        mle_levels_pipeline(f, model) for f in frag_dev['0']
    ]
    for model in mle_thresholds
]

#### We need to preprocess BERT outputs, as they are padded to 20.


In [None]:
# Give every BERT decision list exactly one entry per token of its fragment:
# missing positions are filled with 0 and surplus positions are dropped.
# Padding alone is not enough: a list longer than its fragment would shift
# every later token when the lists are concatenated for scoring.
bert_padded_decisions = []

for frag_levels, bert_levels in zip(levels_0, bert_dev_wordwise_decisions):
  n_tokens = len(frag_levels['levels'])
  aligned = list(bert_levels)[:n_tokens]
  aligned += [0] * (n_tokens - len(aligned))
  bert_padded_decisions.append(aligned)

#### Decisions setup

In [None]:
# Token-level decisions, flattened over all dev fragments so that position i
# refers to the same token in every entry.
decisions = {
    'levels_0': np.concatenate([e['levels'] for e in levels_0]),
    'levels_3': np.concatenate([e['levels'] for e in levels_3]),
    'levels_4': np.concatenate([e['levels'] for e in levels_4]),
    'levels_5': np.concatenate([e['levels'] for e in levels_5]),
    # Lexicon levels 1 and 2 are raised to 3; 0 (no decision) is kept.
    'levels_lexicon': [level_keep0(l) for l in np.concatenate([e['levels'] for e in lexicon_dev])],
    # Stored BERT values are shifted by +3, so a padded 0 becomes level 3.
    'levels_bert': [x+3 for x in np.concatenate(bert_padded_decisions)]
}

# One flattened decision array per MLE model, in mle_thresholds order.
mle_decisions = [
    np.concatenate([e['levels'] for e in d]) for d in levels_mles
]

In [None]:
# Flattened gold labels for all dev tokens; still strings at this point,
# exactly as parsed from the fragments.
final_gt = np.concatenate([e['gts'] for e in levels_0])

In [None]:
def comb_experiment_pipeline(decision_1, decision_2 = decisions['levels_0'], decision_final = decisions['levels_0']):
  """Combine three aligned token-level decision sequences with 0 as "undecided".

  For each position the first non-zero value among decision_1, decision_2
  and decision_final is used; if the first two are 0 the value of
  decision_final is taken as is. The defaults are bound to
  decisions['levels_0'] when the function is defined.

  Args:
    decision_1: primary decisions, one per token.
    decision_2: fallback consulted where decision_1 is 0.
    decision_final: last-resort fallback consulted where both are 0.

  Returns:
    list of final per-token levels, same length as decision_1.
  """
  # First fallback: fill the gaps of decision_1 from decision_2.
  after_second = [
      decision_2[pos] if level == 0 else level
      for pos, level in enumerate(decision_1)
  ]
  # Final fallback: whatever is still 0 comes from decision_final.
  return [
      decision_final[pos] if level == 0 else level
      for pos, level in enumerate(after_second)
  ]

# Token-level layered system MLE -> Lexicon -> BERT, evaluated once per MLE
# model (i.e. once per (confidence, min_occurrences) grid point).
exps = [
    comb_experiment_pipeline(mle_dec, decision_2 = decisions['levels_lexicon'], decision_final = decisions['levels_bert']) for mle_dec in mle_decisions
]


In [None]:
# Gold labels were parsed as strings; cast them to int so they are compared
# with the integer predictions.
final_gt = final_gt.astype(int)
def results_to_csv(result_arr):
  """Score each token-level prediction sequence against the global final_gt.

  Despite the name nothing is written to disk; the rows are returned.

  Args:
    result_arr: iterable of prediction sequences aligned with final_gt.

  Returns:
    list with one 11-element array per prediction sequence:
    (f1, precision, recall) for classes 3, 4 and 5, then accuracy, then
    macro-averaged f1.
  """
  all_rows = []
  for resu in result_arr:
    report = classification_report(final_gt, resu, output_dict = True)

    metrics = []
    for label in ['3', '4', '5']:
      scores = report[label]
      metrics += [scores['f1-score'], scores['precision'], scores['recall']]
    metrics += [report['accuracy'], report['macro avg']['f1-score']]

    all_rows.append(np.array(metrics))

  return all_rows





In [None]:
# One metrics row per token-level experiment, in the order of `exps`.
all_rows = results_to_csv(exps)

In [None]:
# Column names follow the order in which results_to_csv emits the metrics:
# (f1, precision, recall) per class 3/4/5, then accuracy and macro f1.
df_results_old = pd.DataFrame(all_rows, columns = ['f1_3','3_prec','3_recall','f1_4','4_prec','4_recall','f1_5','5_prec','5_recall','accuracy','f1_macro'])

### Aggregation Experiments

In [None]:
# Per-fragment (non-flattened) views of the decisions used above.
levels_frag_mles = [[e['levels'] for e in d] for d in levels_mles]
levels_frag_0 = [x['levels'] for x in levels_0]
levels_frag_3 = [x['levels'] for x in levels_3]


# Use level_keep0 so the fragment-level lexicon decisions are normalized
# exactly like the token-level decisions['levels_lexicon'] (levels 1 and 2
# become 3, non-positive values stay "undecided").
levels_frag_lexicon = [[level_keep0(l) for l in x['levels']] for x in lexicon_dev]
# Stored BERT values are shifted by +3, as in decisions['levels_bert'].
levels_frag_bert = [[w+3 for w in frag] for frag in bert_padded_decisions]

In [None]:

def frag_exps_pipeline(decision_1, decision_2 = levels_frag_0, decision_final = levels_frag_0, frags = frag_dev['0']):
  """Fragment-level evaluation of the layered decision cascade.

  For every fragment the per-token decisions are merged (first non-zero of
  decision_1, decision_2, decision_final; the decision_final value is used
  as is when the first two are 0) and the fragment is assigned the highest
  token level. The gold level is the highest gold token label.

  The defaults are bound to levels_frag_0 / frag_dev['0'] when the function
  is defined.

  Args:
    decision_1: per-fragment lists of primary token decisions.
    decision_2: per-fragment lists consulted where decision_1 is 0.
    decision_final: per-fragment lists consulted where both are 0.
    frags: fragments as space-separated "token#label" strings.

  Returns:
    list of [gold_level, predicted_level] pairs, one per fragment.
  """
  all_results = []
  for d1, d2, d3, f in zip(decision_1, decision_2, decision_final, frags):
    gold_level = max(int(t.split('#')[1]) for t in f.split(' '))

    decision = []
    for first, second, last in zip(d1, d2, d3):
      level = first if first != 0 else second
      if level == 0:
        level = last
      decision.append(level)

    pred = max(decision)

    all_results.append([
        gold_level,
        pred
    ])
  return all_results


In [None]:
# Fragment-level layered system MLE -> Lexicon -> BERT, evaluated once per
# MLE model; each entry is a list of [gold_level, predicted_level] pairs.
exps_frags = [
    frag_exps_pipeline(mle_dec, decision_2 = levels_frag_lexicon, decision_final = levels_frag_bert) for mle_dec in levels_frag_mles
]

In [None]:
def frag_results_to_csv(result_arr):
  """Score each fragment-level experiment.

  Despite the name nothing is written to disk; the rows are returned.

  Args:
    result_arr: iterable of experiments, each a sequence of
      [gold_level, predicted_level] pairs.

  Returns:
    list with one 11-element array per experiment:
    (f1, precision, recall) for classes 3, 4 and 5, then accuracy, then
    macro-averaged f1.
  """
  all_rows = []
  for resu in result_arr:
    pairs = pd.DataFrame(resu, columns = ['gt', 'pred'])
    report = classification_report(pairs['gt'], pairs['pred'], output_dict = True)

    metrics = []
    for label in ['3', '4', '5']:
      scores = report[label]
      metrics += [scores['f1-score'], scores['precision'], scores['recall']]
    metrics += [report['accuracy'], report['macro avg']['f1-score']]

    all_rows.append(np.array(metrics))

  return all_rows

In [None]:
# One row per MLE model (exps_frags order). Columns are left unnamed; their
# order is (f1, precision, recall) for classes 3, 4, 5, then accuracy and
# macro f1, i.e. the same layout as df_results_old.
frag_df = pd.DataFrame(frag_results_to_csv(exps_frags))

In [None]:
# Show the fragment-level results table as the cell output.
frag_df