In [1]:
from google.colab import drive
drive.mount('/content/drive')
from tqdm import tqdm
import pandas as pd

Mounted at /content/drive


### Approach:

Divide the 11M word corpus into equally sized 10000 word bins. Count, within these bins, the number of 3, 4, and 5s. Assign to each bin the majority category.

Build a dict words `{word: bin}`

Build a dict bins `{bin: [l3, l4, l5]}`

Use the majority of each bin to label the words above.

In [2]:
cleaned_freqs = pd.read_csv('/content/drive/My Drive/capstone_data/bashar_data/capstone_data/splits/all_camelbert_freqs.csv')

In [3]:
cleaned_freqs['0']

0                     .
1                    في
2                     ،
3                    من
4                     "
               ...     
11724677    أبريسديانتو
11724678     أبروسكاتوا
11724679       إبداعتكن
11724680         أبجديو
11724681          أإشرف
Name: 0, Length: 11724682, dtype: object

In [4]:
import pickle

### Now
- Get all our known, labelled words and place them into the bins

In [9]:
with open('/content/drive/My Drive/capstone_data/disambig_db/mle_max_aligned_model.pkl', 'rb') as f:
  known_levels = pickle.load(f)

In [None]:
def get_levels_for_bins(n_bins):
  words = {}
  bins = {i: {3: 0, 4: 0, 5: 0} for i in range((len(cleaned_freqs['0'])+1) // n_bins)}
  not_in_corpus = 0

  for i in range(len(cleaned_freqs['0'])):
    words[cleaned_freqs['0'][i]] = i // n_bins

  for word, level in known_levels.items():
    try:
      bin = words[word]
      bins[bin][level] += 1
    except:
      not_in_corpus += 1
      pass
  return bins



In [None]:
## level according to the bins
def level_full_pipeline(binsize, oov_level = 5):
  words = {}
  bins = {i: {3: 0, 4: 0, 5: 0} for i in range((len(cleaned_freqs['0']) // binsize) + 1)}
  not_in_corpus = 0

  for i in range(len(cleaned_freqs['0'])):
    words[cleaned_freqs['0'][i]] = i // binsize

  for word, level in known_levels.items():
    try:
      bin = words[word]
      bins[bin][level] += 1
    except:
      not_in_corpus += 1
      pass

  no_hits_bins = 0

  bins_levelled = {}
  for bin in bins.keys():
    if list(bins[bin].values()).count(0) == 3:
      bins_levelled[bin] = oov_level
      no_hits_bins += 1
    else:
      bins_levelled[bin] = max(bins[bin].items(), key = lambda x: x[1])[0]

  levelled_words = {}
  for word in words:
    levelled_words[word] = bins_levelled[words[word]]

  print(no_hits_bins)
  return levelled_words




In [None]:
lw_2000 = level_full_pipeline(2000)
lw_5000 = level_full_pipeline(5000)
lw_10000 = level_full_pipeline(10000)
lw_20000 = level_full_pipeline(20000)
lw_50000 = level_full_pipeline(50000)
lw_100000 = level_full_pipeline(100000)

4156
1345
517
178
29
6


In [None]:
lw_2000_3 = level_full_pipeline(2000, 3)
lw_5000_3 = level_full_pipeline(5000, 3)
lw_10000_3 = level_full_pipeline(10000, 3)
lw_20000_3 = level_full_pipeline(20000, 3)
lw_50000_3 = level_full_pipeline(50000, 3)
lw_100000_3 = level_full_pipeline(100000, 3)

4156
1345
517
178
29
6


In [None]:
11000000 / 20000

In [None]:
11724682 // 20000

In [None]:
pd.DataFrame(lw_2000.values()).value_counts()

5    9058682
3    2216000
4     450000
dtype: int64

In [19]:
def print_counts(counts):
  print(pd.DataFrame(counts.values()).value_counts())

In [None]:
print_counts(lw_2000)
print_counts(lw_5000)
print_counts(lw_10000)
print_counts(lw_20000)
print_counts(lw_50000)
print_counts(lw_100000)
print_counts(lw_2000_3)
print_counts(lw_5000_3)
print_counts(lw_10000_3)
print_counts(lw_20000_3)
print_counts(lw_50000_3)
print_counts(lw_100000_3)

5    9058682
3    2216000
4     450000
dtype: int64
5    7894682
3    3195000
4     635000
dtype: int64
5    6784682
3    4100000
4     840000
dtype: int64
5    5384682
3    5220000
4    1120000
dtype: int64
3    6950000
5    3574682
4    1200000
dtype: int64
3    7500000
5    3124682
4    1100000
dtype: int64
3    10526682
5      748000
4      450000
dtype: int64
3    9919682
5    1170000
4     635000
dtype: int64
3    9264682
5    1620000
4     840000
dtype: int64
3    8764682
5    1840000
4    1120000
dtype: int64
3    8374682
5    2150000
4    1200000
dtype: int64
3    8024682
5    2600000
4    1100000
dtype: int64


In [None]:
#### save all levels for testing
all_things = {
  '2000': lw_2000,
  '5000': lw_5000,
  '10000': lw_10000,
  '20000': lw_20000,
  '50000': lw_50000,
  '100000': lw_100000,
  '2000_3': lw_2000_3,
  '5000_3': lw_5000_3,
  '10000_3': lw_10000_3,
  '20000_3': lw_20000_3,
  '50000_3': lw_50000_3,
  '100000_3': lw_100000_3
}



In [None]:
with open('/content/drive/My Drive/capstone_data/disambig_db/all_levels_freq_binning.pkl', 'wb') as f:
  pickle.dump(all_things, f)

### now do this but for bins of equal frequency cumulative

In [7]:
total_words = sum(cleaned_freqs['1'])

In [34]:
def level_full_pipeline_eq_sized(n_bins, oov_level = 5):

  bin_size = total_words/n_bins
  words = {}
  bins = {i: {3: 0, 4: 0, 5: 0} for i in range(0, n_bins)}

  bin_words = []

  current_sum = 0
  current_bin = 0
  words_in_bin = 0
  for word, freq in zip(cleaned_freqs['0'], cleaned_freqs['1']):
    words[word] = current_bin
    current_sum += freq
    words_in_bin += 1
    if current_sum > bin_size:
      current_sum = current_sum % bin_size
      current_bin += 1
      bin_words.append(words_in_bin)
      words_in_bin = 0

  for word, level in known_levels.items():
    try:
      bin = words[word]
      bins[bin][level] += 1
    except:
      pass


  no_hits_bins = 0

  bins_levelled = {}
  for bin in bins.keys():
    if list(bins[bin].values()).count(0) == 3:
      bins_levelled[bin] = oov_level
      no_hits_bins += 1
    else:
      bins_levelled[bin] = max(bins[bin].items(), key = lambda x: x[1])[0]

  levelled_words = {}
  for word in words:
    levelled_words[word] = bins_levelled[words[word]]

  print(no_hits_bins)
  return levelled_words, bin_words


In [35]:
bincounts = [10000, 5000, 2000, 1000, 500, 200, 100]
level_sets = {bc: level_full_pipeline_eq_sized(bc) for bc in bincounts}

4765
1934
603
255
105
32
10


In [36]:
with open('/content/drive/My Drive/capstone_data/disambig_db/all_levels_freq_binning_cumulative.pkl', 'wb') as f:
  pickle.dump(level_sets, f)

In [37]:
for s in bincounts:
  print_counts(level_sets[s])

AttributeError: 'tuple' object has no attribute 'values'

In [44]:
cleaned_freqs['0'][1:100]

1          في
2           ،
3          من
4           "
5         على
       ...   
95        وهي
96    الحرمين
97        فيه
98        مثل
99    السعودي
Name: 0, Length: 99, dtype: object