In [1]:
import os
import pandas as pd
import seaborn as sns

In [2]:
folder = os.path.join(os.getcwd(),'..','data','korpus')
vanilla = os.path.join(os.getcwd(),'..','data','korpus','ngram','vanilla')
unk = os.path.join(os.getcwd(),'..','data','korpus','ngram','unk')

In [5]:
df_test = pd.read_csv(os.path.join(folder,'Test','Test.csv'))
df_test

Unnamed: 0,Word,POS,Lemma,Root
0,<s>,START,,
1,Meħtieġa,NOUN-PROP,meħtieġa,
2,passi,NOUN,passa,p-s-j
3,konkreti,ADJ,konkret,
4,dwar,PREP,dwar,
...,...,...,...,...
360,biżżej,ADJ,,
361,­,X-ENG,­,
362,jed,VERB,,
363,.,X-PUN,.,


In [169]:
%%time

def linear_interpolation(sentence:str, model:str):

    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk': 
        model_path = unk
    else: raise Exception('Model does not exist!')
        
    #Load ngrams for the model chosen
    print('Loading models: ', end='')
    uni = pd.read_csv(os.path.join(model_path,'unigram.csv'))
    bi = pd.read_csv(os.path.join(model_path,'bigram.csv'))
    tri = pd.read_csv(os.path.join(model_path,'trigram.csv'))
    print('[OK]')
    
    #Calculate size of korpus. (Number of words)
    if model != 'vanilla':
        path = os.path.join(folder,'norm_korpus_clean.csv')
        korpus_size = sum(1 for _ in open(path, encoding='utf-8'))    
    
    words = sentence.split(' ')
    word_count = len(words)
    
    if word_count < 3:
        raise Exception('Sentence must have at least 3 words')
    
    #Get a numpy array of n-grams
    uni_words = uni['Unigram'].to_numpy()
    bi_words  = bi ['Bigram' ].to_numpy()
    tri_words = tri['Trigram'].to_numpy()
    
    P = 1
    
    V = len(uni) #Size of the vocabulary
    
    λ1,λ2,λ3 = 0.1,0.3,0.6
      
    for i in range(2,word_count):        
        uni_word = f'{words[i]}'
        bi_word  = f'{words[i-1]};{words[i]}'
        tri_word = f'{words[i-2]};{words[i-1]};{words[i]}'
        
        #Get the probabilities. If the model is vanilla, we don't have to get the laplace smoothed value.
        #The function will just run into an IndexError.
        P1 = uni[uni['Unigram'] == uni_word].iat[0,2] if model == 'vanilla' or uni_word in uni_words else 1/len(uni_words)
        P2 = bi [bi ['Bigram']  == bi_word ].iat[0,2] if model == 'vanilla' or bi_word  in bi_words  else 1/len(bi_words)
        P3 = tri[tri['Trigram'] == tri_word].iat[0,2] if model == 'vanilla' or tri_word in tri_words else 1/len(tri_words)
        
        #If the model is laplace smoothed (UNK or laplace), 
        #add 1 to the count of the ngram and 
        #add V to the denominator.
        
        #The laplace smoothed probability makes use of the normal vanilla probability. 
        #This is done in order to reduce code complexity and the number of nested if statements.
        
        if model != 'vanilla':
            N = korpus_size
            P1 = P1*((N+V)/N) + (1/(N+V))
            
            #The frequency of the previous word
            history = f'{words[i-1]}'
            N = uni[uni['Unigram']==history].iat[0,1] if history in uni_words else 1
            P2 = P2*((N+V)/N) + (1/(N+V))
            
            #The frequency of the two previous words
            history = f'{words[i-2]};{words[i-1]}'
            N = bi[bi['Bigram']==history].iat[0,1] if history in bi_words else 1
            P3 = P3*((N+V)/N) + (1/(N+V))
        
        #Update the current frequency of the sentenece.
        P *= λ1*P1 + λ2*P2 + λ3*P3

    return P
    

linear_interpolation('Darba f\' għalqa sibt teżor', 'unk')

Loading models: [OK]


1.6688119602932157e-07

### Perplexity

In [104]:
def get_probability(sentence:list, model:str, n:int):

    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk': 
        model_path = unk
    else: raise Exception('Model does not exist!')
    
    if n in [1,2,3]:
        ngram_path = ['unigram.csv','bigram.csv','trigram.csv'][n-1]
        ngram_type = ['Unigram','Bigram','Trigram'][n-1]

        print('Loading models: ', end='')
        df_ngram = pd.read_csv(os.path.join(model_path, ngram_path))
        ngram_words = df_ngram[ngram_type].to_numpy() #Get a numpy array of words for ngram

        if n != 1:
            prev_df_ngram = pd.read_csv(os.path.join(model_path, ['unigram.csv','bigram.csv','trigram.csv'][n-2]))
            prev_ngram_type = ['Unigram','Bigram','Trigram'][n-2]
            prev_ngram_words = prev_df_ngram[prev_ngram_type].to_numpy() #Get a numpy array of words for n-1gram
        print('[OK]')
        
    else: raise Exception('Choose Unigram, Bigram or Trigram!')
    
    if model != 'vanilla':
        path = os.path.join(folder,'korpus_clean.csv')
        korpus_size = sum(1 for _ in open(path, encoding='utf-8'))#Size of korpus. (Number of words)
        V = len(pd.read_csv(os.path.join(vanilla,'unigram.csv'))) #Size of the vocabulary
    
    P = 1 #Probability of sentences
  
    for i,s in enumerate(sentence):
        words = s.split(' ')
        word_count = len(words)
    
        if word_count < n:
            raise Exception(f'Sentence must have at least {n} words')
    
        print(f'Sentence {i} finished..')
        
        for i in range(n-1,word_count):

            if n == 1: word = f'{words[i]}'
            if n == 2: word = f'{words[i-1]};{words[i]}'
            if n == 3: word = f'{words[i-2]};{words[i-1]};{words[i]}' 


            #Get the probabilities. If the model is vanilla, we don't have to get the laplace smoothed value.
            #The function will just run into an IndexError.
            Pi = df_ngram[df_ngram[ngram_type] == word].iat[0,2] if model == 'vanilla' or word in ngram_words else 1/len(ngram_words)

            #If the model is laplace smoothed (UNK or laplace), 
            #add 1 to the count of the ngram and 
            #add V to the denominator.

            #The laplace smoothed probability makes use of the normal vanilla probability. 
            #This is done in order to reduce code complexity and the number of nested if statements.

            if model != 'vanilla':

                if n == 1:
                    N = korpus_size

                else:
                    if n == 2: history = f'{words[i-1]}' #The previous word
                    if n == 3: history = f'{words[i-2]};{words[i-1]}' #The 2 previous words

                    #The frequency of the previous word/s.
                    N = prev_df_ngram[prev_df_ngram[prev_ngram_type]==history].iat[0,1] if history in prev_ngram_words else 1

                    #The probability of the current word
                    Pi = Pi*((N+V)/N) + (1/(N+V))

            #Update the current frequency of the sentenece.
            P *= Pi

    return P

In [105]:
def calc_perplexity(model:str, n:int):
    sentences = []
    current_sentence = []
        
    for word in df_test['Word']:
        current_sentence.append(word)
        
        if word == '</s>':
            sentences.append(' '.join(current_sentence))
            current_sentence = []

    P = get_probability(sentences[0:1],model,n)
    return P

In [106]:
%%time
calc_perplexity(model='unk',n=1)

Loading models: [OK]
Sentence 0 finished..
0.0407831575915836
5.28696788530689e-07
0.0001002601364431
1.2576575121108814e-05
0.0027410525318095
0.0173651261248645
5.142777852071248e-05
0.0046188873979817
0.0001266949758697
0.000100436368706
9.869006719239528e-05
0.0407831415704688
Wall time: 9.91 s


1.5743930286565555e-41

In [86]:
get_probability(['<s> Meħtieġa passi konkreti dwar il- qagħda fl- Isptar Mater Dei </s>'],'unk',2)

Loading models: [OK]
Sentence 0 finished..


7.686912360132718e+50