In [1]:
import os
from collections import Counter

import pandas as pd
import seaborn as sns

In [2]:
# Corpus directory, resolved relative to the notebook's working directory.
folder = os.path.join(os.getcwd(), '..', 'data', 'korpus')

# One output directory per language-model flavour, all under <corpus>/ngram.
vanilla, laplace, unk = (
    os.path.join(folder, 'ngram', flavour)
    for flavour in ('vanilla', 'laplace', 'unk')
)

### Generating Vanilla Language Model

In [3]:
# Cleaned corpus; its 'Word' column is the token stream fed to the n-gram builders.
# NOTE(review): read_csv's default NA parsing turns cells such as 'NA' or 'null'
# into NaN — confirm that is acceptable for a token column.
corpus_path = os.path.join(folder, 'korpus_clean.csv')
df = pd.read_csv(corpus_path)
words = df['Word'].to_numpy()

# Denominator for the unigram probabilities: the sum of the 'Frequency' column
# of the frequency table.
frequency_path = os.path.join(folder, 'korpus_frequency.csv')
df_frequency = pd.read_csv(frequency_path)
word_count = df_frequency['Frequency'].sum()


#### Unigram

In [5]:
%%time

#Calculating frequencies

def calcualte_unigram(words, word_count: int):
    """Build the unigram table (token, frequency, probability) for a corpus.

    Parameters
    ----------
    words : iterable
        Tokens in corpus order. Each token is formatted to text before it is
        counted, so a non-string entry (e.g. a NaN) is counted under its
        textual form.
    word_count : int
        Denominator applied to every frequency to obtain its probability.

    Returns
    -------
    pandas.DataFrame
        Columns 'Unigram', 'Frequency' and 'Probability', one row per distinct
        token, in order of first appearance.
    """
    # Counter is a dict subclass, so rows keep first-seen order.
    frequencies = Counter(f'{word}' for word in words)

    df_unigram = pd.DataFrame({
        'Unigram': list(frequencies.keys()),
        'Frequency': list(frequencies.values()),
    })

    #Calculating probability.
    df_unigram['Probability'] = [freq / word_count for freq in frequencies.values()]

    # The former `df_unigram.dropna()` call was removed: its result was
    # discarded, and the table cannot contain missing values at this point.
    print('Finished!')
    return df_unigram


# Build the vanilla unigram table from the full token stream.
df_unigram = calcualte_unigram(words, word_count)

Finished!
Wall time: 18.3 s


In [6]:
#Saving unigram
# Lookup consumed by calculate_bigram: token -> list of the remaining column
# values, frequency first. Built row-wise; the previous
# `set_index(...).T.to_dict('list')` produced the same mapping but first
# transposed the table into one column per token, which is needlessly slow and
# memory hungry for a large vocabulary.
unigram_values = df_unigram.drop(columns='Unigram').to_numpy().tolist()
dict_unigram = dict(zip(df_unigram['Unigram'], unigram_values))
df_unigram.to_csv(os.path.join(vanilla,'unigram.csv'), index=False)

#### Bigram

In [8]:
#Bigram
def calculate_bigram(words, unigram: dict): 
    """Build the bigram table (pair, frequency, conditional probability).

    Parameters
    ----------
    words : sequence
        Tokens in corpus order; must support slicing. Each token is formatted
        to text before it is counted.
    unigram : dict
        Maps a token to a list whose first element is that token's frequency.

    Returns
    -------
    pandas.DataFrame
        Columns 'Bigram' ('first;second'), 'Frequency' and 'Probability'
        (pair frequency divided by the frequency of the first token), one row
        per distinct adjacent pair in order of first appearance.

    Raises
    ------
    KeyError
        If the first token of a pair is missing from ``unigram``.
    """
    # Count on (first, second) tuples rather than on the joined string: the
    # denominator then comes from the real first token instead of
    # `key.split(';')[0]`, which returned the wrong token (or raised KeyError)
    # whenever a token itself contained ';'.
    pair_counts = Counter(
        (f'{first}', f'{second}')
        for first, second in zip(words, words[1:])
    )

    df_bigram = pd.DataFrame({
        'Bigram': [f'{first};{second}' for first, second in pair_counts],
        'Frequency': list(pair_counts.values()),
    })

    #Calculate probability.
    df_bigram['Probability'] = [
        pair_freq / unigram[first][0]
        for (first, _), pair_freq in pair_counts.items()
    ]

    print('Finished!')
    return df_bigram
    
# Build the vanilla bigram table; dict_unigram supplies the per-token frequencies used as denominators.
df_bigram = calculate_bigram(words, dict_unigram)
    

247476
247492
5005824
6923575
10709077
10752974
11385818
11398858
11398905
11480965
11826323
13520878
16285476
18018972
19455483
21127389
23454399
23454944
23612612
23738662
24080878
24132454
24257829
24425637
24427680
24429024
24445612
24454137
24709079
24764620
24765165
24771316
24778355
25019100
25019122
25101952
25115318
25603987
25710555
25778520
25878773
25963473
25964027
25965653
25965880
26068338
26068395
26259796
26305135
26390757
26475252
26504390
26815499
27058218
27066567
27066603
27126275
27126820
27240716
27281456
27307428
27319638
27320884
27324910
27374356
27396405
27397258
27619524
27760496
28058607
28129631
30664382
30939600
31036660
33455477
33614350
34213530
34779342
37702230
40473885
40552274
40580412
40612656
41329120
41899429
43214087
43214293
45050682
45051550
47050431
50085951
50086941
50087360
50155998
50551186
51894838
52032415
53418056
54829634
57766346
59805878
60250921
60250973
60251222
60251561
61079802
62061333
Finished!


In [9]:
df_bigram

Unnamed: 0,Bigram,Frequency
0,<s>;L-,161367
1,L-;għan,1909
2,għan;prinċipali,203
3,prinċipali;ta',888
4,ta';Conectando,1
...,...,...
6724643,nipproduċu;meta,1
6724644,kejl;juri,2
6724645,tnaqqas;miċ-,1
6724646,qed;tkej,1


In [10]:
#Saving bigram
dict_bigram = df_bigram.set_index('Bigram').T.to_dict('list')
df_bigram.to_csv(os.path.join(vanilla,'bigram.csv'), index=False)

#### Trigram

In [11]:
%%time

def calculate_trigram(words, bigram: dict): 
    """Build the trigram table (triple, frequency, conditional probability).

    Parameters
    ----------
    words : sequence
        Tokens in corpus order; must support slicing. Each token is formatted
        to text before it is counted.
    bigram : dict
        Maps 'first;second' to a list whose first element is that bigram's
        frequency.

    Returns
    -------
    pandas.DataFrame
        Columns 'Trigram' ('first;second;third'), 'Frequency' and
        'Probability' (triple frequency divided by the frequency of its
        leading bigram), one row per distinct triple in order of first
        appearance.

    Raises
    ------
    KeyError
        If the leading bigram of a triple is missing from ``bigram``.
    """
    #Calculate frequency.
    # Counting on tuples keeps the three tokens separate, so the leading
    # bigram never has to be recovered by splitting the joined key on ';'.
    triple_counts = Counter(
        (f'{first}', f'{second}', f'{third}')
        for first, second, third in zip(words, words[1:], words[2:])
    )

    df_trigram = pd.DataFrame({
        'Trigram': [';'.join(triple) for triple in triple_counts],
        'Frequency': list(triple_counts.values()),
    })

    #Calculate probability.
    # Use the `bigram` argument: the previous body read the notebook-level
    # `dict_bigram` instead, silently ignoring whatever the caller passed in.
    df_trigram['Probability'] = [
        tri_freq / bigram[f'{first};{second}'][0]
        for (first, second, _), tri_freq in triple_counts.items()
    ]

    print('Finished!')
    return df_trigram

# Build the vanilla trigram table, passing the bigram lookup built when the bigram table was saved.
df_trigram = calculate_trigram(words, dict_bigram)

Finished!
Wall time: 3min 44s


In [12]:
df_trigram

Unnamed: 0,Trigram,Frequency,Probability
0,<s>;L-;għan,1757,0.010888
1,L-;għan;prinċipali,78,0.040859
2,għan;prinċipali;ta',50,0.246305
3,prinċipali;ta';Conectando,1,0.001126
4,ta';Conectando;Mundos,1,1.000000
...,...,...,...
21266594,juri;żieda;dan,1,0.025641
21266595,żieda;dan;ifis,1,0.100000
21266596,ekonomika;tkun;kibret,1,0.111111
21266597,Jekk;jonqos;ikun,1,0.066667


In [13]:
#Saving trigram: write the vanilla trigram table to CSV without the index column.
df_trigram.to_csv(os.path.join(vanilla,'trigram.csv'), index=False)

### Generate UNK Model

#### Unigram

In [14]:
#Collapse every word seen fewer than UNK_THRESHOLD times into one <UNK> token.
UNK_THRESHOLD = 3
UNK_TOKEN = '<UNK>'

#Load vanilla unigram.
df_unigram = pd.read_csv(os.path.join(vanilla,'unigram.csv'))

#Calculate Frequency and Probability of the <UNK> token.
condition = df_unigram['Frequency'] < UNK_THRESHOLD

unk_frequency = df_unigram.loc[condition, 'Frequency'].sum()
unk_probability = unk_frequency/word_count

#Remove the words that occur fewer than UNK_THRESHOLD times.
df_unigram = df_unigram.loc[~condition]

#Add the <UNK> token.
df_unk = pd.DataFrame({'Unigram': UNK_TOKEN, 'Frequency': unk_frequency, 'Probability': unk_probability}
                   ,index=[0])
df_unigram = pd.concat([df_unk,df_unigram], ignore_index = True)

#Save model.
df_unigram.to_csv(os.path.join(unk,'unigram.csv'), index=False)
dict_unigram = df_unigram.set_index('Unigram').T.to_dict('list')

#Replace low frequency words with <UNK>.
#A word missing from the UNK unigram table was dropped above, so it maps to <UNK>.
#`words` is rebound to a new array instead of being overwritten element by
#element: the array it came from (df['Word'].to_numpy()) stays untouched, the
#cell can be re-run safely, and nothing is written into an array that pandas
#may hand out as read-only.
token_series = pd.Series(words, dtype=object)
is_known = token_series.isin(df_unigram['Unigram'])
words = token_series.where(is_known, UNK_TOKEN).to_numpy()


df_unigram

Unnamed: 0,Unigram,Frequency,Probability
0,<UNK>,378799,6.068782e-03
1,<s>,2545588,4.078316e-02
2,L-,189976,3.043627e-03
3,għan,22124,3.544511e-04
4,prinċipali,7469,1.196617e-04
...,...,...,...
205664,Formers,3,4.806334e-08
205665,THETIS,3,4.806334e-08
205666,Neurone,4,6.408446e-08
205667,PEEP,10,1.602111e-07


#### Bigram

In [15]:
# Re-count bigrams over the <UNK>-substituted token stream; dict_unigram now holds the UNK unigram table.
df_bigram = calculate_bigram(words, dict_unigram)

247476
247492
5005824
6923575
10709077
10752974
11385818
11398858
11398905
11480965
11826323
13520878
16285476
18018972
19455483
21127389
23454399
23454944
23612612
23738662
24080878
24132454
24257829
24425637
24427680
24429024
24445612
24454137
24709079
24764620
24765165
24771316
24778355
25019100
25019122
25101952
25115318
25603987
25710555
25778520
25878773
25963473
25964027
25965653
25965880
26068338
26068395
26259796
26305135
26390757
26475252
26504390
26815499
27058218
27066567
27066603
27126275
27126820
27240716
27281456
27307428
27319638
27320884
27324910
27374356
27396405
27397258
27619524
27760496
28058607
28129631
30664382
30939600
31036660
33455477
33614350
34213530
34779342
37702230
40473885
40552274
40580412
40612656
41329120
41899429
43214087
43214293
45050682
45051550
47050431
50085951
50086941
50087360
50155998
50551186
51894838
52032415
53418056
54829634
57766346
59805878
60250921
60250973
60251222
60251561
61079802
62061333
Finished!


In [16]:
# Persist the UNK bigram table as CSV without the index column.
df_bigram.to_csv(os.path.join(unk,'bigram.csv'), index=False)
# NOTE(review): with the rebuild below disabled, dict_bigram keeps whatever an
# earlier cell assigned (the vanilla bigram lookup). Re-enable it before
# running calculate_trigram for the UNK model.
# dict_bigram = df_bigram.set_index('Bigram').T.to_dict('list')

df_bigram

Unnamed: 0,Bigram,Frequency
0,<s>;L-,161367
1,L-;għan,1909
2,għan;prinċipali,203
3,prinċipali;ta',888
4,ta';Conectando,1
...,...,...
6129580,nipproduċu;meta,1
6129581,kejl;juri,2
6129582,tnaqqas;miċ-,1
6129583,qed;tkej,1


#### Trigram

In [17]:
# %%time

# df_trigram = calculate_trigram(words, dict_bigram)

Finished!
Wall time: 4min 58s


In [18]:
# df_trigram.to_csv(os.path.join(unk,'trigram.csv'), index=False)

# df_trigram

Unnamed: 0,Trigram,Frequency,Probability
0,<s>;L-;għan,1757,0.010888
1,L-;għan;prinċipali,78,0.040859
2,għan;prinċipali;ta',50,0.246305
3,prinċipali;ta';Conectando,1,0.001126
4,ta';Conectando;Mundos,1,1.000000
...,...,...,...
20754968,juri;żieda;dan,1,0.025641
20754969,żieda;dan;ifis,1,0.100000
20754970,ekonomika;tkun;kibret,1,0.111111
20754971,Jekk;jonqos;ikun,1,0.066667
