In [43]:
import pandas as pd

from tqdm import tqdm
import re
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight


import math

from sklearn.metrics import f1_score, accuracy_score, classification_report

import matplotlib.pyplot as plt
from tqdm import tqdm
import os

!pip install camel_tools

from camel_tools.tokenizers.word import simple_word_tokenize

### Replace with the location of your own camel_tools data installation.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab Google
# Drive mount); presumably it has to be set before camel_tools resolves its
# data files in the next cell -- confirm.
os.environ['CAMELTOOLS_DATA'] = '/content/drive/MyDrive/camel_tools'



In [44]:
!pip install Levenshtein
import Levenshtein
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from pathlib import Path
# Path of the morphology database file used by the analyzer below.
S31_DB_PATH = Path('../data/disambig_db/calima-msa-s31.db')
# Open the database with flags 'a' (presumably "analysis" mode -- confirm
# against the camel_tools MorphologyDB documentation).
S31_DB = MorphologyDB(S31_DB_PATH, 'a')
# Analyzer over that database, created with 'NOAN_ALL' as its second argument
# (presumably the backoff mode -- confirm) and cache_size=100000.
S31_AN = Analyzer(S31_DB, 'NOAN_ALL', cache_size=100000)
# Load the pretrained 'msa' disambiguator, then overwrite its private
# `_analyzer` attribute so that it uses the analyzer built above.
bert_disambig = BERTUnfactoredDisambiguator.pretrained('msa', top=1000, pretrained_cache = False)
bert_disambig._analyzer = S31_AN



Some weights of the model checkpoint at /content/drive/MyDrive/camel_tools/data/disambig_bert_unfactored/msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Objective
Run all word-wise levelling techniques and vocabularies, with code that generates the results automatically.

Also produce summary statistics on the vocabularies (MLE, FREQ, SAMERLEX, etc.).

### Expected Training and Testing Data:
Punctuation is tokenized, and every token that consists only of punctuation or only of digits is removed.

In [45]:
## train data
import numpy as np
words_train = pd.read_csv('../data/train_wordwise_clean.csv')
words_dev = pd.read_csv('../data/dev_wordwise_clean.csv')
words_test = pd.read_csv('../data/test_wordwise_clean.csv')

## testing data in fragments
frag_train = pd.read_csv('../data/all_train_aligned.csv')
frag_dev = pd.read_csv('../data/all_dev_aligned.csv')
frag_test = pd.read_csv('../data/all_test_aligned.csv')

# Keep only the rows whose fragment text (column '0') is a string; non-string
# values (e.g. empty cells read as NaN) cannot be split into tokens later on.
# The mask is computed on the single column instead of applying a Python
# function to every row of the frame, which is the slow path in pandas.
frag_train = frag_train[frag_train['0'].map(lambda text: isinstance(text, str)).astype(bool)]
frag_dev = frag_dev[frag_dev['0'].map(lambda text: isinstance(text, str)).astype(bool)]
frag_test = frag_test[frag_test['0'].map(lambda text: isinstance(text, str)).astype(bool)]

### Acquire BERT Decisions from BERT Notebook
These are the decisions the BERT model made when levelling the dev set.

In [46]:
import pickle

### Load the word-wise decisions that were pickled earlier (this cell reads the file; it does not save).
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('../BERT/pickled_results/res_wordwise_decisions.pkl', 'rb') as f:
  res_wordwise_decisions = pickle.load(f)

### Acquire FREQ vocabulary

In [47]:
import pickle

# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
# Token -> level lookup used by get_rl_freq.
with open('../data/freq/freq_token_db.pkl', 'rb') as f:
  freq_backoff = pickle.load(f)

# Path fix: the previous path pointed *inside* `freq_token_db.pkl`, which the
# statement above opens as a regular file, so both opens could never succeed.
# NOTE(review): assumes the cumulative-binning pickle sits next to it in
# ../data/freq/ -- confirm the real location.
with open('../data/freq/all_levels_freq_binning_cumulative.pkl', 'rb') as f:
  words_and_levels_freq_binning_cum = pickle.load(f)

# Alternative frequency back-off used by get_rl_alt_freq: the first element of
# the entry stored under key 10000.
alt_backoff = words_and_levels_freq_binning_cum[10000][0]

### Acquire MLE vocabulary

In [48]:
def get_mle_counts_aligned(words, levels):
  """Count, for every word, how often it carries each readability level.

  Args:
    words: iterable of tokens.
    levels: iterable of labels aligned with `words`; each must be 3, 4 or 5.

  Returns:
    dict mapping word -> {3: count, 4: count, 5: count}.

  Raises:
    KeyError: if a label is not one of 3, 4, 5.
  """
  dict_levels = {}
  for word, level in zip(words, levels):
      # setdefault creates the zeroed counter only for unseen words. The former
      # bare `except:` also fired on an unknown label and reset the counts the
      # word had already accumulated before the error surfaced.
      counts = dict_levels.setdefault(word, {3: 0, 4: 0, 5: 0})
      counts[level] += 1
  return dict_levels






In [49]:
# Ugly, lazy global: mle_training_pipeline_aligned appends one
# [confidence, min_occurrences, rejected_count] row here on every call.
stats = []

def max_frequency_strategy(dict_levels, confidence = 0, min_occurrences = 1):
  """Assign each token its most frequent level, subject to two filters.

  A token is kept only when the share of its most frequent level is at least
  `confidence` and its total count is at least `min_occurrences`.

  Args:
    dict_levels: mapping token -> {level: count}.
    confidence: minimum share (top count / total count) required.
    min_occurrences: minimum total count required.

  Returns:
    (accepted, rejected): `accepted` maps token -> winning level (the first
    level in dict order on ties); `rejected` is the number of tokens that
    failed a filter.
  """
  accepted = {}
  rejected = 0
  for token, level_counts in dict_levels.items():
    top_count = max(level_counts.values())
    total = sum(level_counts.values())
    share = top_count/total
    if not (share >= confidence and total >= min_occurrences):
      rejected += 1
      continue
    winner, _ = max(level_counts.items(), key = lambda pair: pair[1])
    accepted[token] = winner
  return accepted, rejected

def mle_training_pipeline_aligned(data, strategy, confidence = 0, min_occurrences = 1):
  """Build an MLE lexicon from aligned word/label columns.

  Counts levels per word from data['Word'] and data['Label'], lets `strategy`
  turn the counts into a lexicon, and logs
  [confidence, min_occurrences, rejected_count] to the global `stats` list.

  Returns:
    The lexicon produced by `strategy` (its first return value).
  """
  level_counts = get_mle_counts_aligned(data['Word'], data['Label'])
  lexicon, n_rejected = strategy(
      level_counts,
      confidence = confidence,
      min_occurrences = min_occurrences,
  )
  stats.append([confidence, min_occurrences, n_rejected])
  return lexicon



In [50]:
### look at min number of things - cross the confidence intervals and the threshold appearance minima (1s and 2s, 3s) (cross this)
### look at para level disambig - add to method as tuning for the lexicon.
### send note to

### do as tuning
### Error Analysis
### select a sample of a hundred cases (fragment) - word levels might be lil bit more (report on percentages, and which errors were consequential - inconsequential)
### Automatic classification: look at who caused the error


In [51]:

# Start from an empty log so that re-running this cell does not append a
# second copy of every row to the global `stats` list.
stats.clear()

# One MLE lexicon per (min_occurrences, confidence) pair: min_occurrences runs
# over 1..7 (outer loop) and confidence over 0.30..1.00 in steps of 0.05.
mle_thresholds = [
    mle_training_pipeline_aligned(words_train, max_frequency_strategy, confidence = x/100, min_occurrences = y)
    for y in range(1,8)
    for x in range(30, 105, 5)
]

# Bind each lexicon through a default argument. A plain closure looks `mle` up
# when the lambda is *called*, so every function would have consulted the last
# lexicon of the sweep instead of its own.
mle_functions = [
    lambda token, analyses, mle = mle: mle.get(token, 0) for mle in mle_thresholds
]


In [52]:
len(mle_thresholds[0])

31924

In [53]:
# One row per lexicon built in the sweep: the confidence cut-off, the minimum
# occurrence count (called 'threshold' here) and the number of rejected tokens.
stats_df = pd.DataFrame(stats, columns =['confidence', 'threshold', 'tok_rejected'])

In [54]:

# configure lexicon
# Loads the lookup table that get_rl_single queries by lemma (analysis['lex']).
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
import pickle
with open('../data/lemma_db/quick_lemma_lookup.pkl', 'rb') as f:
    lemma_db = pickle.load(f)


def sort_score(list_of_analyses):
  """Return the analyses that share the highest `.score`.

  Side effect: `list_of_analyses` is sorted in place by descending score.
  Raises IndexError when the list is empty.
  """
  def score_of(entry):
    return entry.score

  list_of_analyses.sort(key = score_of, reverse = True)
  best = score_of(list_of_analyses[0])
  return [entry for entry in list_of_analyses if score_of(entry) == best]


def sort_lexlogprob(list_of_analyses):
  """Return the analyses that share the highest analysis['lex_logprob'].

  Side effect: `list_of_analyses` is sorted in place by descending
  'lex_logprob'. Raises IndexError when the list is empty.
  """
  def logprob_of(entry):
    return entry.analysis['lex_logprob']

  list_of_analyses.sort(key = logprob_of, reverse = True)
  best = logprob_of(list_of_analyses[0])
  return [entry for entry in list_of_analyses if logprob_of(entry) == best]

def default_level_oov(word, level, analysis):
  """Return `level` unchanged; `word` and `analysis` are accepted but ignored."""
  return level

def sort_level(list_of_analyses):
  """Return the analyses whose lexicon level is the lowest in the list.

  Levels come from get_rl_single, with 9999 standing in for lemmas it cannot
  resolve so that they sort last.

  Side effect: `list_of_analyses` is sorted in place by ascending level.
  Raises IndexError when the list is empty.
  """
  def level_of(entry):
    return get_rl_single(entry.analysis, 9999)

  list_of_analyses.sort(key = level_of)
  lowest_rl = level_of(list_of_analyses[0])
  return [entry for entry in list_of_analyses if level_of(entry) == lowest_rl]

def score_then_level_then_llp(list_of_analyses):
  """Pick one analysis dict by successive tie-breaking.

  Candidates are narrowed first by highest score, then by lowest level; as
  soon as a single candidate remains its `.analysis` is returned. If ties
  survive both steps, the first candidate with the highest 'lex_logprob' wins.
  """
  candidates = list_of_analyses
  for narrow in (sort_score, sort_level):
    candidates = narrow(candidates)
    if len(candidates) == 1:
      return candidates[0].analysis

  # still tied: lexical log-probability is the final criterion
  return sort_lexlogprob(candidates)[0].analysis







In [55]:
def get_rl_0(token, analyses, oov_level = 0):
  """Constant decision: ignore the inputs and return `oov_level` (0 by default, i.e. "undecided")."""
  return oov_level

def get_rl_3(token, analyses, oov_level = 3):
  """Constant decision: ignore the inputs and return `oov_level` (3 by default)."""
  return oov_level

def get_rl_4(token, analyses, oov_level = 4):
  """Constant decision: ignore the inputs and return `oov_level` (4 by default)."""
  return oov_level

def get_rl_5(token, analyses, oov_level = 5):
  """Constant decision: ignore the inputs and return `oov_level` (5 by default)."""
  return oov_level


def get_rl_alt_freq(token, analyses, oov_level = 0):
    """Look `token` up in the global `alt_backoff` vocabulary.

    Args:
      token: the word to look up.
      analyses: unused; present so all decision functions share one signature.
      oov_level: value returned when the token is not in the vocabulary.

    Returns:
      alt_backoff[token], or `oov_level` for an unknown token.
    """
    try:
        return alt_backoff[token]
    except KeyError:
        # Only a missing token means "out of vocabulary". The former bare
        # `except:` also hid real failures (e.g. `alt_backoff` never having
        # been loaded) by quietly labelling every token as OOV.
        return oov_level

def get_rl_freq(token, analyses, oov_level = 0):
    """Look `token` up in the global `freq_backoff` vocabulary.

    Args:
      token: the word to look up.
      analyses: unused; present so all decision functions share one signature.
      oov_level: value returned when the token is not in the vocabulary.

    Returns:
      freq_backoff[token], or `oov_level` for an unknown token.
    """
    try:
        return freq_backoff[token]
    except KeyError:
        # Only a missing token means "out of vocabulary". The former bare
        # `except:` also hid real failures (e.g. `freq_backoff` never having
        # been loaded) by quietly labelling every token as OOV.
        return oov_level

def get_rl_single(analysis, oov_level = 0):
  """Return the lexicon level for one morphological analysis.

  Args:
    analysis: dict with at least the keys 'lex' (lemma) and 'pos'.
    oov_level: value returned when the lemma has no (or an empty) entry in
      the global `lemma_db`.

  Returns:
    3 for proper nouns (pos == 'noun_prop'); otherwise the 'level' of the
    lemma's entry. When the lemma has several entries, the one whose 'pos' is
    most similar to the analysis pos (Levenshtein ratio, first entry on ties)
    is used.
  """
  lemma, tag = analysis['lex'], analysis['pos']

  # proper nouns never consult the lexicon
  if tag == 'noun_prop':
    return 3

  entries = lemma_db.get(lemma)
  if not entries:
    return oov_level

  if len(entries) == 1:
    return entries[0]['level']

  closest = max(entries, key = lambda entry: Levenshtein.ratio(tag, entry['pos']))
  return closest['level']


def get_rl_lexicon(token, analyses, oov_level = 0):
  """Level a token through the lexicon, using its disambiguated analyses.

  Args:
    token: unused; present so all decision functions share one signature.
    analyses: scored analyses for the token (objects with `.score` and
      `.analysis`), as consumed by score_then_level_then_llp.
    oov_level: value returned when the token cannot be levelled.

  Returns:
    The level of the selected analysis, or `oov_level` when there are no
    analyses or the selected lemma is not in the lexicon.
  """
  # No analyses at all: treat the token as out of vocabulary instead of
  # failing with an IndexError inside the tie-breaking helpers.
  if len(analyses) == 0:
    return oov_level

  analysis = score_then_level_then_llp(analyses)

  # Forward oov_level: it used to be dropped here, so a caller-supplied value
  # was silently replaced by get_rl_single's own default of 0.
  return get_rl_single(analysis, oov_level)


In [56]:
def levels_pipeline(fragment, decision_1, decision_2, requires_disambig = False):
  """Level every token of one fragment with a two-stage decision cascade.

  Args:
    fragment: space-separated 'token#label' items.
    decision_1: callable(token, analysis) -> level; 0 means "undecided".
    decision_2: callable(token, analysis) -> level, consulted only for tokens
      that decision_1 left at 0.
    requires_disambig: when True the second argument handed to the decision
      functions is the token's analyses from the global `bert_disambig`;
      otherwise it is the token's label string (used only as a placeholder).

  Returns:
    {'levels': per-token levels, 'gts': per-token label strings}.
  """
  pairs = [item.split('#') for item in fragment.split(' ')]
  tokens = [pair[0] for pair in pairs]
  gt = [pair[1] for pair in pairs]

  analyses = gt
  if requires_disambig:
    analyses = [tok.analyses for tok in bert_disambig.disambiguate(tokens)]

  # round 1: every token goes through decision_1
  first_pass = [decision_1(tok, ana) for tok, ana in zip(tokens, analyses)]

  # round 2: decision_2 fills in whatever round 1 left undecided
  levels = []
  for level, tok, ana in zip(first_pass, tokens, analyses):
    if level != 0:
      levels.append(level)
    else:
      levels.append(decision_2(tok, ana))

  return {'levels': levels, 'gts': gt}

def mle_levels_pipeline(fragment, model):
  """Level every token of one fragment with an MLE lexicon.

  Args:
    fragment: space-separated 'token#label' items; only the token part is used.
    model: mapping token -> level.

  Returns:
    {'levels': [...]} with 0 for tokens the lexicon does not contain.
  """
  # The labels were previously parsed into an unused local on every call; that
  # dead second pass over the fragment has been removed.
  tokens = [t.split('#')[0] for t in fragment.split(' ')]

  return {
      'levels': [model.get(token, 0) for token in tokens]
  }



In [57]:
### New approach: compute every technique's per-token decisions once, up front,
### and apply the fallback hierarchy to those stored decisions afterwards. Saves time!
def _level_dev_fragments(decision_1, decision_2, **pipeline_kwargs):
  """Run levels_pipeline over every dev fragment with the given decision functions."""
  return [levels_pipeline(f, decision_1, decision_2, **pipeline_kwargs) for f in frag_dev['0']]

# constant baselines
pure_0 = _level_dev_fragments(get_rl_0, get_rl_0)
pure_3 = _level_dev_fragments(get_rl_3, get_rl_3)
pure_4 = _level_dev_fragments(get_rl_4, get_rl_4)
pure_5 = _level_dev_fragments(get_rl_5, get_rl_5)

# frequency vocabularies, leaving unknown tokens at 0
pure_freq = _level_dev_fragments(get_rl_freq, get_rl_0)
pure_alt_freq = _level_dev_fragments(get_rl_alt_freq, get_rl_0)

# lexicon lookup; the only run that needs the BERT disambiguator
pure_lexicon = _level_dev_fragments(get_rl_lexicon, get_rl_0, requires_disambig = True)


def level_keep0(l):
  """Clamp a level to the 3+ scale while keeping 0 as the "undecided" marker.

  Non-positive values become 0, positive values below 3 become 3, and
  anything from 3 upward is returned unchanged.
  """
  if not l > 0:
    return 0
  return 3 if l < 3 else l

## Lexicon might level 1 or 2.



In [58]:
# One list of per-fragment results for each MLE lexicon in mle_thresholds
# (outer index = lexicon, inner index = dev fragment).
pure_mles = [
    [
        mle_levels_pipeline(f, model) for f in frag_dev['0']
    ]
    for model in mle_thresholds
]

#### We need to preprocess BERT outputs, as they are padded to 20.


In [59]:
#### Check performance when padded to 30? (only lose around 150 tokens)
# Right-pad each fragment's BERT decisions with 0s until there is one entry per
# token of that fragment (token counts are taken from the pure_alt_freq run).
# Decisions that are already long enough are left untouched.
bert_padded_decisions = [
    bert_row + [0] * (len(reference['levels']) - len(bert_row))
    for reference, bert_row in zip(pure_alt_freq, res_wordwise_decisions[0])
]

In [60]:
np.concatenate(bert_padded_decisions).shape

(22075,)

#### Decisions setup

In [61]:
def _flat_levels(results):
  """Concatenate the per-fragment 'levels' lists of one run into a flat array."""
  return np.concatenate([e['levels'] for e in results])

# Flat, token-aligned decisions of every technique on the dev set.
decisions = {
    name: _flat_levels(run)
    for name, run in [
        ('pure_0', pure_0),
        ('pure_3', pure_3),
        ('pure_4', pure_4),
        ('pure_5', pure_5),
        ('pure_freq', pure_freq),
        ('pure_alt_freq', pure_alt_freq),
    ]
}
# lexicon levels below 3 are lifted to 3; 0 ("undecided") is kept
decisions['pure_lexicon'] = [level_keep0(l) for l in _flat_levels(pure_lexicon)]
# BERT decisions are shifted by 3 onto the level scale
decisions['pure_bert'] = [x+3 for x in np.concatenate(bert_padded_decisions)]

# one flat decision array per MLE lexicon
mle_decisions = [_flat_levels(d) for d in pure_mles]

In [62]:
# Flat ground-truth labels for all dev tokens, taken from the pure_freq run
# (levels_pipeline derives 'gts' from the fragment text alone).
final_gt = np.concatenate([e['gts'] for e in pure_freq])

In [63]:
def comb_experiment_pipeline(decision_1, decision_2 = decisions['pure_0'], decision_final = decisions['pure_0']):
  """Combine three token-aligned decision sequences as a fallback cascade.

  For every position the first non-zero value among decision_1, decision_2
  and decision_final (in that order) is taken; the result is 0 only when
  decision_final is 0 there as well.

  Note: the default arguments are bound to decisions['pure_0'] when this
  function is defined.

  Returns:
    list with one level per element of decision_1.
  """
  # stage 2: fill the gaps of decision_1 from decision_2
  after_second = [
      first if first != 0 else decision_2[idx]
      for idx, first in enumerate(decision_1)
  ]

  # final stage: whatever is still 0 comes from decision_final
  return [
      level if level != 0 else decision_final[idx]
      for idx, level in enumerate(after_second)
  ]

# One token-level experiment per MLE lexicon: MLE first, then the lexicon,
# then BERT for whatever is still undecided.
exps = [
    comb_experiment_pipeline(mle_dec, decision_2 = decisions['pure_lexicon'], decision_final = decisions['pure_bert']) for mle_dec in mle_decisions
]


In [64]:
# The labels were parsed out of 'token#label' strings, so cast them to int
# before they are compared with the integer predictions.
final_gt = final_gt.astype(int)
def results_to_csv(result_arr):
  """Turn token-level predictions into one metrics row per experiment.

  Each prediction sequence in `result_arr` is scored against the global
  `final_gt` with classification_report. A row holds f1, precision and recall
  for the classes '3', '4' and '5' (in that order), followed by accuracy and
  macro-averaged f1.

  Returns:
    list of 1-D numpy arrays with 11 values each. Despite the name, nothing is
    written to disk.
  """
  metric_names = ('f1-score', 'precision', 'recall')
  all_rows = []
  for predictions in result_arr:
    report = classification_report(final_gt, predictions, output_dict = True)

    values = [report[label][metric] for label in ['3', '4', '5'] for metric in metric_names]
    values.append(report['accuracy'])
    values.append(report['macro avg']['f1-score'])

    all_rows.append(np.array(values))

  return all_rows





In [65]:
# One metrics row per token-level experiment in `exps`.
all_rows = results_to_csv(exps)

In [66]:
# Column order mirrors the rows built by results_to_csv: f1/precision/recall
# for levels 3, 4 and 5, then accuracy and macro f1.
df_results_old = pd.DataFrame(all_rows, columns = ['f1_3','3_prec','3_recall','f1_4','4_prec','4_recall','f1_5','5_prec','5_recall','accuracy','f1_macro'])

### Aggregation Experiments

In [None]:
# NOTE(review): these four assignments repeat, expression for expression, the
# ones made in the earlier "calculate decisions first" cell, so on a top-to-bottom
# run they only recompute the same values.
pure_0 = [levels_pipeline(f, get_rl_0, get_rl_0) for f in frag_dev['0']]


pure_3 = [levels_pipeline(f, get_rl_3, get_rl_3) for f in frag_dev['0']]

pure_freq = [levels_pipeline(f, get_rl_freq, get_rl_0) for f in frag_dev['0']]
pure_alt_freq = [levels_pipeline(f, get_rl_alt_freq, get_rl_0) for f in frag_dev['0']]


In [None]:
# Per lexicon, the per-fragment level lists (kept nested, not flattened).
pure_frag_mles = [[e['levels'] for e in d] for d in pure_mles]

In [None]:
len(pure_frag_mles[0])

2948

In [None]:
def _fragment_levels(results):
  """Extract the per-fragment 'levels' lists from one run's results."""
  return [res['levels'] for res in results]

def _clamp_lexicon_level(l):
  """Keep 0 and levels above 3 as they are; map every other value to 3."""
  return l if l > 3 or l == 0 else 3

pure_frag_0 = _fragment_levels(pure_0)
pure_frag_3 = _fragment_levels(pure_3)

# the lexicon may assign levels below 3; those are lifted to 3
pure_frag_lexicon = [
    [_clamp_lexicon_level(l) for l in levels]
    for levels in _fragment_levels(pure_lexicon)
]
# BERT decisions are shifted by 3 onto the level scale
pure_frag_bert = [[w+3 for w in frag] for frag in bert_padded_decisions]

In [None]:

def frag_exps_pipeline(decision_1, decision_2 = pure_frag_0, decision_final = pure_frag_0, frags = frag_dev['0']):
  """Aggregate token-level decisions into one predicted level per fragment.

  For every fragment the three token-aligned decision lists are combined as a
  fallback cascade (first non-zero value of decision_1, decision_2,
  decision_final). The fragment's prediction is the highest resulting token
  level; its gold level is the highest token label in the fragment text.

  Args:
    decision_1: per-fragment lists of token levels (0 = undecided).
    decision_2: fallback consulted where decision_1 is 0.
    decision_final: last fallback, consulted where both others are 0.
    frags: fragment strings made of space-separated 'token#label' items.

  Note: the default arguments are bound when this function is defined.

  Returns:
    list of [gold_level, predicted_level] pairs, one per fragment.
  """
  # The per-token error strings and the mle/lex/bert provenance tracking that
  # used to be computed here were never returned or stored, and one of those
  # locals shadowed the builtin `round`; that dead work has been removed.
  all_results = []
  for d1, d2, d3, f in zip(decision_1, decision_2, decision_final, frags):
    gold_level = max(int(t.split('#')[1]) for t in f.split(' '))

    # cascade: decision_1 -> decision_2 -> decision_final
    decision = [dec if dec != 0 else alt for dec, alt in zip(d1, d2)]
    decision = [dec if dec != 0 else alt for dec, alt in zip(decision, d3)]

    all_results.append([
        gold_level,
        max(decision)
    ])
  return all_results


In [None]:
# One fragment-level experiment per MLE lexicon: MLE first, then the lexicon,
# then BERT for whatever is still undecided.
exps_frags = [
    frag_exps_pipeline(mle_dec, decision_2 = pure_frag_lexicon, decision_final = pure_frag_bert) for mle_dec in pure_frag_mles
]

In [None]:
def frag_results_to_csv(result_arr):
  """Turn fragment-level [gold, prediction] pairs into one metrics row per experiment.

  Each element of `result_arr` is a list of [gt, pred] pairs. A row holds f1,
  precision and recall for the classes '3', '4' and '5' (in that order),
  followed by accuracy and macro-averaged f1.

  Returns:
    list of 1-D numpy arrays with 11 values each. Despite the name, nothing is
    written to disk.
  """
  metric_names = ('f1-score', 'precision', 'recall')
  all_rows = []
  for fragment_pairs in result_arr:
    pairs_df = pd.DataFrame(fragment_pairs, columns = ['gt', 'pred'])
    report = classification_report(pairs_df['gt'], pairs_df['pred'], output_dict = True)

    values = [report[label][metric] for label in ['3', '4', '5'] for metric in metric_names]
    values.append(report['accuracy'])
    values.append(report['macro avg']['f1-score'])

    all_rows.append(np.array(values))

  return all_rows

In [None]:
# One metrics row per fragment-level experiment. No column names are given, so
# the columns are 0..10 in the order produced by frag_results_to_csv
# (f1/precision/recall for levels 3, 4, 5, then accuracy and macro f1).
frag_df = pd.DataFrame(frag_results_to_csv(exps_frags))

In [70]:
frag_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.914784,0.910813,0.91879,0.863809,0.868048,0.859611,0.845752,0.846859,0.844648,0.880936,0.874782
1,0.914784,0.910813,0.91879,0.863809,0.868048,0.859611,0.845752,0.846859,0.844648,0.880936,0.874782
2,0.914784,0.910813,0.91879,0.863809,0.868048,0.859611,0.845752,0.846859,0.844648,0.880936,0.874782
3,0.914784,0.910813,0.91879,0.863809,0.868048,0.859611,0.845752,0.846859,0.844648,0.880936,0.874782
4,0.914784,0.910813,0.91879,0.863809,0.868048,0.859611,0.845752,0.846859,0.844648,0.880936,0.874782
5,0.914649,0.912114,0.917197,0.864425,0.868192,0.860691,0.845401,0.84485,0.845953,0.880936,0.874825
6,0.914649,0.912114,0.917197,0.864425,0.868192,0.860691,0.845401,0.84485,0.845953,0.880936,0.874825
7,0.915147,0.911532,0.91879,0.865364,0.870087,0.860691,0.847258,0.847258,0.847258,0.881954,0.875923
8,0.918254,0.915348,0.921178,0.868435,0.870793,0.866091,0.847613,0.849279,0.845953,0.884328,0.878101
9,0.91789,0.914625,0.921178,0.868435,0.870793,0.866091,0.846859,0.849081,0.844648,0.883989,0.877728
