In [46]:
import os
import pandas as pd
import seaborn as sns

In [47]:
folder = os.path.join(os.getcwd(),'..','data','korpus')

vanilla = os.path.join(os.getcwd(),'..','data','korpus','ngram','vanilla')
laplace = os.path.join(os.getcwd(),'..','data','korpus','ngram','laplace')
unk = os.path.join(os.getcwd(),'..','data','korpus','ngram','unk')

### Generating Vanilla Language Model

In [84]:
df = pd.read_csv(os.path.join(folder,'norm_korpus_clean.csv'))

word_count = pd.read_csv(os.path.join(folder,'norm_korpus_frequency.csv'))['Frequency'].sum()
words = df['Word']
words

0                 <s>
1                  L-
2                għan
3          prinċipali
4                 ta'
              ...    
4031890        ġimgħa
4031891     intlagħbu
4031892            l-
4031893       partiti
4031894          mir-
Name: Word, Length: 4031895, dtype: object

#### Unigram

In [58]:
%%time

#Calculating frequencies

def calcualte_unigram(words: list, word_count: int):
    unigram = {}
    
    #Calculating frequency.
    for i in range(len(words)):
        first  = words[i]

        if f'{first}' in unigram:
            unigram[f'{first}'] += 1
        else:
            unigram[f'{first}'] = 1
            
    df_unigram = pd.DataFrame(unigram.items(), columns=['Unigram', 'Frequency'])
    
    #Calculating probability.
    df_unigram['Probability'] = [freq/word_count for freq in unigram.values()]
    
    print('Finished!')
    return df_unigram


df_unigram = calcualte_unigram(words, word_count)

Finished!
Wall time: 9.9 s


In [59]:
df_unigram

Unnamed: 0,Unigram,Frequency,Probability
0,<s>,179269,4.446273e-02
1,L-,11342,2.813070e-03
2,għan,872,2.162755e-04
3,prinċipali,535,1.326920e-04
4,ta',95215,2.361545e-02
...,...,...,...
125418,Deċiżi,1,2.480224e-07
125419,Tistqarr,1,2.480224e-07
125420,tgħini,1,2.480224e-07
125421,Tirringrazzja,1,2.480224e-07


In [60]:
#Saving unigram
dict_unigram = df_unigram.set_index('Unigram').T.to_dict('list')
df_unigram.to_csv(os.path.join(vanilla,'unigram.csv'), index=False)

#### Bigram

In [61]:
%%time

#Bigram
def calculate_bigram(words: list, unigram: dict): 
    bigram = {}

    #Calculate frequency.
    for i in range(len(words)-1):
        first  = words[i]
        second = words[i+1]
        
        if f'{first};{second}' in bigram:
            bigram[f'{first};{second}'] += 1
        else:
            bigram[f'{first};{second}'] = 1        

    df_bigram = pd.DataFrame(bigram.items(), columns=['Bigram', 'Frequency'])
    
    #Calculate probability.
    df_bigram['Probability'] = [bi_freq/unigram[bi.split(';')[0]][0] for bi,bi_freq in bigram.items()]
    
    print('Finished!')
    return df_bigram
    
df_bigram = calculate_bigram(words, dict_unigram)
    

Finished!
Wall time: 22.1 s


In [62]:
df_bigram

Unnamed: 0,Bigram,Frequency,Probability
0,<s>;L-,9397,0.052418
1,L-;għan,105,0.009258
2,għan;prinċipali,22,0.025229
3,prinċipali;ta',55,0.102804
4,ta';Conectando,1,0.000011
...,...,...,...
1074685,ħolm;x',1,0.014085
1074686,x';iwettqu,1,0.000226
1074687,iwettqu;fihom,1,0.020408
1074688,ġimgħa;intlagħbu,1,0.000697


In [63]:
#Saving bigram
dict_bigram = df_bigram.set_index('Bigram').T.to_dict('list')
df_bigram.to_csv(os.path.join(vanilla,'bigram.csv'), index=False)

#### Trigram

In [77]:
%%time

def calculate_trigram(words: list, bigram: dict): 
    trigram = {}

    #Calculate frequency.
    for i in range(len(words)-2):
        first  = words[i]
        second = words[i+1]
        third = words[i+2]

        if f'{first};{second};{third}' in trigram:
            trigram[f'{first};{second};{third}'] += 1
        else:
            trigram[f'{first};{second};{third}'] = 1
            
            
    df_trigram = pd.DataFrame(trigram.items(), columns=['Trigram', 'Frequency'])
    
    #Calculate probability.
    probability = []

    for tri,tri_freq in trigram.items():
        first,second = tri.split(';')[:2]
        bi_freq = dict_bigram[f'{first};{second}'][0]
        probability.append(tri_freq/bi_freq)

    df_trigram['Probability'] = probability
    
    print('Finished!')
    return df_trigram

df_trigram = calculate_trigram(words, dict_bigram)

Finished!
Wall time: 34 s


In [81]:
df_trigram

Unnamed: 0,Trigram,Frequency,Probability
0,<s>;L-;għan,105,0.011174
1,L-;għan;prinċipali,10,0.095238
2,għan;prinċipali;ta',6,0.272727
3,prinċipali;ta';Conectando,1,0.018182
4,ta';Conectando;Mundos,1,1.000000
...,...,...,...
2263818,iwettqu;fihom;</s>,1,1.000000
2263819,il-;ġimgħa;intlagħbu,1,0.001848
2263820,ġimgħa;intlagħbu;l-,1,1.000000
2263821,intlagħbu;l-;partiti,1,0.500000


In [67]:
#Saving trigram
df_trigram.to_csv(os.path.join(vanilla,'trigram.csv'), index=False)

### Generate UNK Model

#### Unigram

In [71]:
#Sum frequencies for all word with frequency less than 3.

#Load vanilla unigram.
df_unigram = pd.read_csv(os.path.join(vanilla,'unigram.csv'))

#Calculate Frequency and Probability.
condition = df_unigram['Frequency'] < 3

unk_frequency = df_unigram[condition].Frequency.sum()
unk_probability = unk_frequency/word_count

#Remove the words that occure less thann 3 times.
df = df_unigram.drop(df_unigram[condition].index)

#Add the <UNK> token.
df_unk = pd.DataFrame({'Unigram': '<UNK>', 'Frequency': unk_frequency, 'Probability': unk_probability}
                   ,index=[0])
df_unigram = pd.concat([df_unk,df_unigram], ignore_index = True)

#Save model.
df_unigram.to_csv(os.path.join(unk,'unigram.csv'))
dict_unigram = df_unigram.set_index('Unigram').T.to_dict('list')

df_unigram

Unnamed: 0,Unigram,Frequency,Probability
0,<UNK>,96085,2.383123e-02
1,<s>,179269,4.446273e-02
2,L-,11342,2.813070e-03
3,għan,872,2.162755e-04
4,prinċipali,535,1.326920e-04
...,...,...,...
125419,Deċiżi,1,2.480224e-07
125420,Tistqarr,1,2.480224e-07
125421,tgħini,1,2.480224e-07
125422,Tirringrazzja,1,2.480224e-07


#### Bigram

In [69]:
df_bigram = calculate_bigram(words, dict_unigram)

Finished!


In [82]:
df_bigram.to_csv(os.path.join(unk,'bigram.csv'))
dict_bigram = df_bigram.set_index('Bigram').T.to_dict('list')

df_bigram

Unnamed: 0,Bigram,Frequency,Probability
0,<s>;L-,9397,0.052418
1,L-;għan,105,0.009258
2,għan;prinċipali,22,0.025229
3,prinċipali;ta',55,0.102804
4,ta';Conectando,1,0.000011
...,...,...,...
1074685,ħolm;x',1,0.014085
1074686,x';iwettqu,1,0.000226
1074687,iwettqu;fihom,1,0.020408
1074688,ġimgħa;intlagħbu,1,0.000697


#### Trigram

In [85]:
%%time

df_trigram = calculate_trigram(words, dict_bigram)

Finished!
Wall time: 35.1 s


Unnamed: 0,Trigram,Frequency,Probability
0,<s>;L-;għan,105,0.011174
1,L-;għan;prinċipali,10,0.095238
2,għan;prinċipali;ta',6,0.272727
3,prinċipali;ta';Conectando,1,0.018182
4,ta';Conectando;Mundos,1,1.000000
...,...,...,...
2263818,iwettqu;fihom;</s>,1,1.000000
2263819,il-;ġimgħa;intlagħbu,1,0.001848
2263820,ġimgħa;intlagħbu;l-,1,1.000000
2263821,intlagħbu;l-;partiti,1,0.500000


In [86]:
df_trigram.to_csv(os.path.join(unk,'trigram.csv'))

df_trigram

Unnamed: 0,Trigram,Frequency,Probability
0,<s>;L-;għan,105,0.011174
1,L-;għan;prinċipali,10,0.095238
2,għan;prinċipali;ta',6,0.272727
3,prinċipali;ta';Conectando,1,0.018182
4,ta';Conectando;Mundos,1,1.000000
...,...,...,...
2263818,iwettqu;fihom;</s>,1,1.000000
2263819,il-;ġimgħa;intlagħbu,1,0.001848
2263820,ġimgħa;intlagħbu;l-,1,1.000000
2263821,intlagħbu;l-;partiti,1,0.500000
