In [371]:
import os
from collections import Counter

import pandas as pd
import seaborn as sns

In [372]:
# Corpus directory, resolved relative to the notebook's working directory.
folder = os.path.join(
    os.getcwd(),
    '..',
    'data',
    'korpus',
)

### Generating Unigrams, Bigrams and Trigrams

In [401]:
# Cleaned corpus: the 'Word' column holds the token stream, one token per row.
# NOTE(review): read_csv's default NA handling turns cells such as "NA" or
# "null" into NaN; confirm no real tokens are affected.
df = pd.read_csv(os.path.join(folder, 'norm_korpus_clean.csv'))

# Total token count, taken as the sum of the per-word frequency table.
df_frequency = pd.read_csv(os.path.join(folder, 'norm_korpus_frequency.csv'))
word_count = df_frequency['Frequency'].sum()

words = df['Word']
words

0                 <s>
1                  L-
2                għan
3          prinċipali
4                 ta'
              ...    
4031890        ġimgħa
4031891     intlagħbu
4031892            l-
4031893       partiti
4031894          mir-
Name: Word, Length: 4031895, dtype: object

#### Unigram

In [402]:
%%time

# Count how often each token occurs in the corpus.
#
# Tokens are iterated by value instead of being fetched one at a time with
# `words[i]`: per-element Series lookup is label-based (it only works while the
# index is the default 0..n-1 range) and is much slower than plain iteration.
# Every token is formatted to text first, so a missing value is counted under
# the key 'nan', exactly as before.
# `dict(Counter(...))` yields a plain dict whose keys are in first-occurrence
# order, so frames built from `unigram.items()` keep the same row order.
unigram = dict(Counter(f'{word}' for word in words))


Wall time: 10.9 s


In [403]:
# One row per distinct token with its raw count, in the dict's own key order.
df_unigram = pd.DataFrame(
    {
        'Unigram': list(unigram.keys()),
        'Frequency': list(unigram.values()),
    }
)

In [413]:
%%time

# Unigram probability = token count / `word_count` (the corpus-wide total).
# Computed from the frame's own 'Frequency' column so every probability is
# tied to its row, instead of assuming `unigram.values()` still lines up
# positionally with the rows of `df_unigram`.
df_unigram['Probability'] = df_unigram['Frequency'] / word_count

Wall time: 48.8 ms


In [414]:
df_unigram

Unnamed: 0,Unigram,Frequency,Probability
0,<s>,179269,4.446273e-02
1,L-,11342,2.813070e-03
2,għan,872,2.162755e-04
3,prinċipali,535,1.326920e-04
4,ta',95215,2.361545e-02
...,...,...,...
125418,Deċiżi,1,2.480224e-07
125419,Tistqarr,1,2.480224e-07
125420,tgħini,1,2.480224e-07
125421,Tirringrazzja,1,2.480224e-07


In [398]:
# Build an in-memory lookup, then write the unigram table to disk.
# The lookup maps each token to the list of its other column values, so
# element 0 of an entry is that token's 'Frequency'.
unigram_by_token = df_unigram.set_index('Unigram')
dict_unigram = unigram_by_token.transpose().to_dict(orient='list')

unigram_csv_path = os.path.join(folder, 'ngram', 'unigram.csv')
df_unigram.to_csv(unigram_csv_path, index=False)

#### Bigram

In [415]:
%%time

# Count every pair of adjacent tokens in the corpus.
#
# The tokens are materialised once as text and paired with zip(), replacing
# the per-element `words[i]` / `words[i+1]` Series lookups, which are
# label-based and slow over millions of rows.
# Keys keep the 'first;second' format. The probability cell recovers the first
# token by splitting on ';', so this assumes tokens contain no ';' themselves
# (TODO confirm against the corpus).
# `dict(Counter(...))` gives a plain dict in first-occurrence order.
tokens = [f'{word}' for word in words]
bigram = dict(
    Counter(f'{first};{second}' for first, second in zip(tokens, tokens[1:]))
)

Wall time: 21.6 s


In [416]:
# One row per distinct 'first;second' pair with its raw count.
df_bigram = pd.DataFrame(
    {
        'Bigram': list(bigram.keys()),
        'Frequency': list(bigram.values()),
    }
)

In [417]:
# Conditional probability of each bigram:
#   P(second | first) = count('first;second') / count(first)
# The first token is the text before the first ';' of the key, and
# `dict_unigram[token][0]` is that token's unigram frequency.
probability = [
    pair_count / dict_unigram[pair.partition(';')[0]][0]
    for pair, pair_count in bigram.items()
]

print('Finished!')
df_bigram['Probability'] = probability

Finished!


In [419]:
df_bigram

Unnamed: 0,Bigram,Frequency,Probability
0,<s>;L-,9397,0.052418
1,L-;għan,105,0.009258
2,għan;prinċipali,22,0.025229
3,prinċipali;ta',55,0.102804
4,ta';Conectando,1,0.000011
...,...,...,...
1074685,ħolm;x',1,0.014085
1074686,x';iwettqu,1,0.000226
1074687,iwettqu;fihom,1,0.020408
1074688,ġimgħa;intlagħbu,1,0.000697


In [418]:
# Build an in-memory lookup, then write the bigram table to disk.
# The lookup maps each 'first;second' key to the list of its other column values.
bigram_by_pair = df_bigram.set_index('Bigram')
dict_bigram = bigram_by_pair.transpose().to_dict(orient='list')

bigram_csv_path = os.path.join(folder, 'ngram', 'bigram.csv')
df_bigram.to_csv(bigram_csv_path, index=False)

#### Trigram

In [6]:
%%time

# Count every run of three adjacent tokens in the corpus.
#
# The tokens are materialised once as text and grouped with zip(), replacing
# the per-element `words[i]` .. `words[i+2]` Series lookups, which are
# label-based and slow over millions of rows.
# Keys keep the 'first,second,third' format.
# NOTE(review): the joiner here is ',' while the bigram keys use ';'. It is
# left unchanged because readers of trigram.csv may depend on it.
# `dict(Counter(...))` gives a plain dict in first-occurrence order.
tokens = [f'{word}' for word in words]
trigram = dict(
    Counter(
        f'{first},{second},{third}'
        for first, second, third in zip(tokens, tokens[1:], tokens[2:])
    )
)

Wall time: 4.35 s


In [7]:
# Tabulate the trigram counts, most frequent first, and write them to disk.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# corpus frame read from norm_korpus_clean.csv.
trigram_counts = pd.DataFrame(trigram.items(), columns=['Trigram', 'Frequency'])
df = trigram_counts.sort_values(by=['Frequency'], ascending=False)

trigram_csv_path = os.path.join(folder, 'ngram', 'trigram.csv')
df.to_csv(trigram_csv_path, index=False)

### Generate Vanilla Language Model