In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

### Approach:

Divide the 11M-word corpus into equally sized word bins (10,000 bins at the finest setting). Within each bin, count the known words labelled 3, 4, and 5, then assign the bin its majority category.

Build a dict words `{word: bin}`

Build a dict bins `{bin: [l3, l4, l5]}`

Use the majority of each bin to label the words above.

In [None]:
# Word-frequency table; later cells read column '0' as the word and column '1' as its frequency.
cleaned_freqs = pd.read_csv('../../data/freq/all_camelbert_freqs.csv')

In [None]:
import pickle

- Get all our known, labelled words and place them into the bins

In [None]:
# Load the already-labelled words; later code iterates this object with .items() as (word, level) pairs.
# NOTE(review): pickle.load can execute arbitrary code — only load this file from a trusted source.
with open('../../data/levels_db/mle_max_aligned_model.pkl', 'rb') as f:
  known_levels = pickle.load(f)

- Do this for bins of equal size in terms of cumulative frequency

In [None]:
# Sum of the frequency column ('1') over the whole table; divided by the bin count to get each bin's frequency budget.
total_words = sum(cleaned_freqs['1'])

In [None]:
def level_full_pipeline_eq_sized(n_bins, oov_level = 5, freqs = None, levels = None):
  """Assign a level to every word by majority vote inside cumulative-frequency bins.

  Rows of the frequency table are walked in table order and placed into the
  current bin; once the running frequency total exceeds ``total / n_bins`` the
  next row starts a new bin. Each bin is then labelled with the most common
  level (3, 4 or 5) among the known words it contains, and every word inherits
  the label of its bin.

  Args:
    n_bins: Number of bins the total frequency mass is divided by.
    oov_level: Level given to bins that contain no known word of level 3/4/5.
    freqs: Optional table with a word column ``'0'`` and a frequency column
      ``'1'`` (anything indexable by those two keys). Defaults to the
      notebook-level ``cleaned_freqs`` / ``total_words``.
    levels: Optional ``{word: level}`` mapping. Defaults to the notebook-level
      ``known_levels``.

  Returns:
    A tuple ``(levelled_words, bin_words)``: ``levelled_words`` maps each word
    to its assigned level, and ``bin_words[i]`` is the number of table rows
    that fell into bin ``i`` (only bins that received rows are listed).
  """
  if freqs is None:
    freqs = cleaned_freqs
    total = total_words
  else:
    total = sum(freqs['1'])
  if levels is None:
    levels = known_levels

  bin_size = total / n_bins
  word_to_bin = {}
  bin_words = []

  current_sum = 0
  current_bin = 0
  words_in_bin = 0
  for word, freq in zip(freqs['0'], freqs['1']):
    word_to_bin[word] = current_bin
    current_sum += freq
    words_in_bin += 1
    if current_sum > bin_size:
      # Carry the overshoot into the next bin. A word more frequent than a
      # whole bin still advances the bin index by exactly one.
      current_sum = current_sum % bin_size
      current_bin += 1
      bin_words.append(words_in_bin)
      words_in_bin = 0
  if words_in_bin:
    # The last bin usually never overflows; record its row count as well so
    # bin_words accounts for every row of the table.
    bin_words.append(words_in_bin)

  bins = {i: {3: 0, 4: 0, 5: 0} for i in range(n_bins)}
  for word, level in levels.items():
    counts = bins.get(word_to_bin.get(word))
    # Skip known words missing from the frequency table and levels other than 3/4/5.
    if counts is None or level not in counts:
      continue
    counts[level] += 1

  bins_levelled = {}
  for bin_idx, counts in bins.items():
    if any(counts.values()):
      # max() keeps the first maximal entry, so ties resolve to the lowest level.
      bins_levelled[bin_idx] = max(counts.items(), key = lambda kv: kv[1])[0]
    else:
      bins_levelled[bin_idx] = oov_level

  levelled_words = {word: bins_levelled[bin_idx] for word, bin_idx in word_to_bin.items()}

  return levelled_words, bin_words


In [None]:
# Bin counts to sweep, from the largest number of bins down to the smallest.
bincounts = [10000, 5000, 2000, 1000, 500, 200, 100]
# One (levelled_words, bin_words) result per bin count, keyed by that count.
level_sets = dict(
    (n_bins, level_full_pipeline_eq_sized(n_bins))
    for n_bins in bincounts
)

### Benchmarking

In [None]:
# Persist every bin-count run ({bin count: (levelled_words, bin_words)}) to disk.
with open('../../data/freq/all_levels_freq_binning_cumulative.pkl', 'wb') as f:
  pickle.dump(level_sets, f)

def get_rl(token, model, oov_level = 3):
  """Look up the level of ``token`` in ``model``.

  Args:
    token: Key to look up.
    model: Subscriptable object mapping tokens to levels.
    oov_level: Value returned when ``token`` cannot be looked up.

  Returns:
    ``model[token]`` when the lookup succeeds, otherwise ``oov_level``.
  """
  try:
    return model[token]
  except (LookupError, TypeError):
    # Only failed lookups fall back to the default; a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    return oov_level

## testing data in fragments
def _load_fragments(path):
  """Read a fragment CSV and keep only the rows whose '0' column holds a ``str``."""
  fragments = pd.read_csv(path)
  # Check the column directly instead of a row-wise ``apply``: the same rows
  # are kept (e.g. NaN cells are dropped) without building a Series per row.
  is_text = fragments['0'].map(lambda value: type(value) == str).astype(bool)
  return fragments[is_text]

frag_train = _load_fragments('../../data/all_train_aligned.csv')
frag_dev = _load_fragments('../../data/all_dev_aligned.csv')
frag_test = _load_fragments('../../data/all_test_aligned.csv')

def levels_pipeline(fragment, model):
  """Split a space-separated fragment and predict a level for each token.

  Every space-separated piece of ``fragment`` is split on ``'#'``: the first
  part is the token, the second part its ground-truth label.

  Returns:
    A dict with ``'levels'`` (the ``get_rl`` prediction for each token) and
    ``'gts'`` (the ground-truth label strings), both in token order.
  """
  tokens = []
  gold = []
  for piece in fragment.split(' '):
    parts = piece.split('#')
    tokens.append(parts[0])
    gold.append(parts[1])

  # decision round 1: plain lookup of each token in the model
  predicted = []
  for token in tokens:
    predicted.append(get_rl(token, model))

  return dict(levels = predicted, gts = gold)

# Bin counts in the insertion order of ``level_sets``; the result lists built later follow this order.
keys = list(level_sets)

In [None]:
# For each bin-count model (in ``keys`` order), run every dev fragment through the lookup pipeline.
results = [
    [levels_pipeline(fragment, model) for fragment in frag_dev['0']]
    for model in (level_sets[bin_count][0] for bin_count in keys)
]

# Flatten the per-fragment predictions into one array per model.
res = [
    np.concatenate([entry['levels'] for entry in per_model])
    for per_model in results
]

# The ground truth comes from the fragments themselves, so the first model's run is enough.
final_gt = np.concatenate([entry['gts'] for entry in results[0]]).astype(int)

def results_to_csv(result_arr):
  """Build one row of evaluation metrics per prediction array.

  Each entry of ``result_arr`` is scored against the module-level ``final_gt``
  with ``classification_report`` and paired, in order, with the matching entry
  of the module-level ``keys``. Despite the name, nothing is written to disk;
  the rows are only returned.

  Returns:
    A list of 1-D arrays laid out as f1/precision/recall for the classes
    '3', '4' and '5', then accuracy, macro-average f1, and finally the key.
  """
  def build_row(predictions, key):
    report = classification_report(final_gt, predictions, output_dict = True)
    metrics = [
        report[label][metric]
        for label in ('3', '4', '5')
        for metric in ('f1-score', 'precision', 'recall')
    ]
    metrics.append(report['accuracy'])
    metrics.append(report['macro avg']['f1-score'])
    # The key goes last so each row carries the setting that produced it.
    return np.append(np.asarray(metrics), key)

  return [build_row(predictions, key) for predictions, key in zip(result_arr, keys)]

# Tabulate the per-model metric rows (one row per entry of `keys`).
# NOTE(review): the 'binsize' column holds the `keys` value, i.e. the number of bins used for the run, not the size of a bin.
pd.DataFrame(results_to_csv(res), columns= ['f1_3','3_prec','3_recall','f1_4','4_prec','4_recall','f1_5','5_prec','5_recall','accuracy','f1_macro', 'binsize'])