In [1]:
import pandas as pd
import os
import re
import itertools
import string
import multiprocessing

from string import ascii_lowercase
from nltk.tokenize import word_tokenize
from multiprocessing.pool import ThreadPool
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def get_count(c,norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()

def preProcess(s):
    s = s.lower()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ' , s)
    s = word_tokenize(s)
    return ' '.join(s)

# (1) Prob tables generation -- between page #20 up to page #25

In [3]:
COCA = pd.DataFrame([['deet',420], ['deft',1240], ['defer', 2237], ['defeat',21940], ['defect',3972]], columns=['word','frequency'])
COCA_pop = 1001610938
COCA['P(w)'] = COCA['frequency']/COCA_pop
COCA['rank'] = COCA['frequency'].rank(ascending=False, method='min').astype(int)
COCA

Unnamed: 0,word,frequency,P(w),rank
0,deet,420,4.193245e-07,5
1,deft,1240,1.238006e-06,4
2,defer,2237,2.233402e-06,3
3,defeat,21940,2.190471e-05,1
4,defect,3972,3.965612e-06,2


In [4]:
WIKI = pd.DataFrame([['deet',124], ['deft',814], ['defer', 1416], ['defeat',121408], ['defect',7793]], columns=['word','frequency'])
WIKI_pop = 1.9e9
WIKI['P(w)'] = WIKI['frequency']/WIKI_pop
WIKI['rank'] = WIKI['frequency'].rank(ascending=False, method='min').astype(int)
WIKI

Unnamed: 0,word,frequency,P(w),rank
0,deet,124,6.526316e-08,5
1,deft,814,4.284211e-07,4
2,defer,1416,7.452632e-07,3
3,defeat,121408,6.389895e-05,1
4,defect,7793,4.101579e-06,2


In [5]:
topdir = '../resource/iula'
all_content = []
for dirpath, dirnames, filename in os.walk(topdir):
    for name in filename:
        if name.endswith('plain.txt'):
            with open(os.path.join(dirpath, name)) as f:
                all_content.append(f.read())

processed_content = [preProcess(s) for s in all_content]

In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(processed_content)
freq_iula = vectorizer.transform(processed_content)
freq_iula = pd.DataFrame(freq_iula.todense(), columns=vectorizer.get_feature_names_out()).sum()

print(freq_iula)

aa          34
aaa          2
aaaaaa       1
aalborg      2
aarhus       1
            ..
zvi          1
zygosity     1
zygote      10
zygotes      5
zygotic      2
Length: 34019, dtype: int64


In [7]:
query = ['deet', 'deft', 'defer', 'defect', 'defeat']
tranformed_query = [vectorizer.inverse_transform(vectorizer.transform([q])) for q in query]
query_freq = pd.Series([freq_iula.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0 for tq in tranformed_query], index= query)

print(query_freq)

deet       0
deft       0
defer      1
defect    59
defeat     7
dtype: int64


In [8]:
IULA = pd.DataFrame(query_freq, columns=['frequency'])
IULA_pop = len(processed_content)
IULA['P(w)'] = IULA['frequency']/IULA_pop
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)

In [9]:
norvig = pd.read_csv('https://norvig.com/ngrams/count_1edit.txt',sep='\t',encoding="ISO-8859-1",header=None)
norvig.columns = ['term', 'edit']
norvig = norvig.set_index('term')
print(norvig.head())

      edit
term      
e|i    917
a|e    856
i|e    771
e|a    749
a|i    559


In [10]:
norvig_orig = pd.read_csv('https://norvig.com/ngrams/count_big.txt',sep='\t',encoding="ISO-8859-1",header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns=['term','freq']
print(norvig_orig.head())

    term   freq
0      a  21160
1    aah      1
2  aaron      5
3     ab      2
4  aback      3


# (2) Update the tables with Norvig and calculate the final prob -- page #28 - page #33

In [11]:
character_set = list(map(''.join, itertools.product(ascii_lowercase, repeat=1))) + list (map(''.join, itertools.product(ascii_lowercase, repeat=2)))

pool = ThreadPool(8) 
freq_list = pool.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')

In [12]:
COCA['P(x|w)'] = [
    (0 / freq_df.loc['f'].values)[0], #deet
    (norvig.loc['e| '].values / freq_df.loc['e'].values)[0], #deft
    (norvig.loc['t|r'].values / freq_df.loc['r'].values)[0], #defer
    (norvig.loc['e|ea'].values / freq_df.loc['ea'].values)[0], #defeat
    (norvig.loc['e|ec'].values / freq_df.loc['ec'].values)[0] #defect
]
COCA['109 P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']
COCA

Unnamed: 0,word,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
0,deet,420,4.193245e-07,5,0.0,0.0
1,deft,1240,1.238006e-06,4,3e-06,0.003912
2,defer,2237,2.233402e-06,3,3.6e-05,0.079366
3,defeat,21940,2.190471e-05,1,0.012834,281.124909
4,defect,3972,3.965612e-06,2,0.003167,12.558705


In [13]:
IULA['P(x|w)'] = COCA['P(x|w)']
IULA['109 P(x|w)P(w)'] = 1e9 * IULA['P(w)'] * IULA['P(x|w)']
IULA

Unnamed: 0,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
deet,0,0.0,4,,
deft,0,0.0,4,,
defer,1,0.007812,3,,
defect,59,0.460938,1,,
defeat,7,0.054688,2,,
