### Created By Dusit Chunviset (642115017)

In [1]:
import pandas as pd
import os
import re
import itertools
import string
import multiprocessing

from string import ascii_lowercase
from nltk.tokenize import word_tokenize
from multiprocessing.pool import ThreadPool
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def get_count(c,norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()

def preProcess(s):
    s = s.lower()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ' , s)
    s = word_tokenize(s)
    return ' '.join(s)

# Spell Correction
Mispelled word: shell </br>
words 1: shall </br>
words 2: shull </br>
words 3: shil </br>
words 4: shout </br>
words 5: shalled </br>

In [3]:
COCA = pd.DataFrame([['shall',55200], ['shull',12140], ['shil', 2237], ['shout',21520], ['shale',3972]], columns=['word','frequency'])
COCA_pop = 1001610938
COCA['P(w)'] = COCA['frequency']/COCA_pop
COCA['rank'] = COCA['frequency'].rank(ascending=False, method='min').astype(int)
COCA

Unnamed: 0,word,frequency,P(w),rank
0,shall,55200,5.5e-05,1
1,shull,12140,1.2e-05,3
2,shil,2237,2e-06,5
3,shout,21520,2.1e-05,2
4,shale,3972,4e-06,4


In [4]:
WIKI = pd.DataFrame([['shall',55200], ['shull',12140], ['shil', 2237], ['shout',21520], ['shale',3972]], columns=['word','frequency'])
WIKI_pop = 1.9e9
WIKI['P(w)'] = WIKI['frequency']/WIKI_pop
WIKI['rank'] = WIKI['frequency'].rank(ascending=False, method='min').astype(int)
WIKI

Unnamed: 0,word,frequency,P(w),rank
0,shall,55200,2.9e-05,1
1,shull,12140,6e-06,3
2,shil,2237,1e-06,5
3,shout,21520,1.1e-05,2
4,shale,3972,2e-06,4


In [5]:
topdir = '../resource/iula'
all_content = []
for dirpath, dirnames, filename in os.walk(topdir):
    for name in filename:
        if name.endswith('plain.txt'):
            with open(os.path.join(dirpath, name)) as f:
                all_content.append(f.read())

processed_content = [preProcess(s) for s in all_content]

In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(processed_content)
freq_iula = vectorizer.transform(processed_content)
freq_iula = pd.DataFrame(freq_iula.todense(), columns=vectorizer.get_feature_names_out()).sum()

print(freq_iula)

aa          34
aaa          2
aaaaaa       1
aalborg      2
aarhus       1
            ..
zvi          1
zygosity     1
zygote      10
zygotes      5
zygotic      2
Length: 34019, dtype: int64


In [7]:
query = ['shall', 'shull', 'shil', 'shout', 'shale']
tranformed_query = [vectorizer.inverse_transform(vectorizer.transform([q])) for q in query]
query_freq = pd.Series([freq_iula.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0 for tq in tranformed_query], index= query)

print(query_freq)

shall    910
shull      0
shil       0
shout      1
shale      0
dtype: int64


In [8]:
IULA = pd.DataFrame(query_freq, columns=['frequency'])
IULA_pop = len(processed_content)
IULA['P(w)'] = IULA['frequency']/IULA_pop
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)
IULA

In [9]:
norvig = pd.read_csv('https://norvig.com/ngrams/count_1edit.txt',sep='\t',encoding="ISO-8859-1",header=None)
norvig.columns = ['term', 'edit']
norvig = norvig.set_index('term')
print(norvig.head())

      edit
term      
e|i    917
a|e    856
i|e    771
e|a    749
a|i    559


In [10]:
norvig_orig = pd.read_csv('https://norvig.com/ngrams/count_big.txt',sep='\t',encoding="ISO-8859-1",header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns=['term','freq']
print(norvig_orig.head())

    term   freq
0      a  21160
1    aah      1
2  aaron      5
3     ab      2
4  aback      3


In [11]:
character_set = list(map(''.join, itertools.product(ascii_lowercase, repeat=1))) + list (map(''.join, itertools.product(ascii_lowercase, repeat=2)))

pool = ThreadPool(8) 
freq_list = pool.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')

In [12]:
COCA['P(x|w)'] = [
    (norvig.loc['s|a'].values / freq_df.loc['a'].values)[0], #shall
    (norvig.loc['s|u'].values / freq_df.loc['u'].values)[0], #shull
    (norvig.loc['s|i'].values / freq_df.loc['i'].values)[0], #shil
    (norvig.loc['s|t'].values / freq_df.loc['ut'].values)[0], #shout
    (norvig.loc['s|e'].values / freq_df.loc['e'].values)[0] #shale
]
COCA

Unnamed: 0,word,frequency,P(w),rank,P(x|w)
0,shall,55200,5.5e-05,1,3.2e-05
1,shull,12140,1.2e-05,3,3.6e-05
2,shil,2237,2e-06,5,1.6e-05
3,shout,21520,2.1e-05,2,0.003091
4,shale,3972,4e-06,4,1.9e-05


In [13]:
COCA['109 P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']
COCA

Unnamed: 0,word,frequency,P(w),rank,P(x|w),109 P(x|w)P(w)
0,shall,55200,5.5e-05,1,3.2e-05,1.758801
1,shull,12140,1.2e-05,3,3.6e-05,0.436881
2,shil,2237,2e-06,5,1.6e-05,0.036651
3,shout,21520,2.1e-05,2,0.003091,66.401736
4,shale,3972,4e-06,4,1.9e-05,0.075178
