In [165]:
import os
import pandas as pd
import seaborn as sns

In [166]:
folder = os.path.join(os.getcwd(),'..','data','korpus')
vanilla = os.path.join(os.getcwd(),'..','data','korpus','ngram','vanilla')
unk = os.path.join(os.getcwd(),'..','data','korpus','ngram','unk')

In [169]:
%%time

def linear_interpolation(sentence:str, model:str):

    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk': 
        model_path = unk
    else: raise Exception('Model does not exist!')
        
    #Load ngrams for the model chosen
    print('Loading models: ', end='')
    uni = pd.read_csv(os.path.join(model_path,'unigram.csv'))
    bi = pd.read_csv(os.path.join(model_path,'bigram.csv'))
    tri = pd.read_csv(os.path.join(model_path,'trigram.csv'))
    print('[OK]')
    
    #Calculate size of korpus. (Number of words)
    if model != 'vanilla':
        path = os.path.join(folder,'norm_korpus_clean.csv')
        korpus_size = sum(1 for _ in open(path, encoding='utf-8'))    
    
    words = sentence.split(' ')
    word_count = len(words)
    
    if word_count < 3:
        raise Exception('Sentence must have at least 3 words')
    
    #Get a numpy array of n-grams
    uni_words = uni['Unigram'].to_numpy()
    bi_words  = bi ['Bigram' ].to_numpy()
    tri_words = tri['Trigram'].to_numpy()
    
    P = 1
    
    V = len(uni) #Size of the vocabulary
    
    λ1,λ2,λ3 = 0.1,0.3,0.6
      
    for i in range(2,word_count):        
        uni_word = f'{words[i]}'
        bi_word  = f'{words[i-1]};{words[i]}'
        tri_word = f'{words[i-2]};{words[i-1]};{words[i]}'
        
        #Get the probabilities. If the model is vanilla, we don't have to get the laplace smoothed value.
        #The function will just run into an IndexError.
        P1 = uni[uni['Unigram'] == uni_word].iat[0,2] if model == 'vanilla' or uni_word in uni_words else 1/len(uni_words)
        P2 = bi [bi ['Bigram']  == bi_word ].iat[0,2] if model == 'vanilla' or bi_word  in bi_words  else 1/len(bi_words)
        P3 = tri[tri['Trigram'] == tri_word].iat[0,2] if model == 'vanilla' or tri_word in tri_words else 1/len(tri_words)
        
        #If the model is laplace smoothed (UNK or laplace), 
        #add 1 to the count of the ngram and 
        #add V to the denominator.
        
        #The laplace smoothed probability makes use of the normal vanilla probability. 
        #This is done in order to reduce code complexity and the number of nested if statements.
        
        if model != 'vanilla':
            N = korpus_size
            P1 = P1*((N+V)/N) + (1/(N+V))
            
            #The frequency of the previous word
            history = f'{words[i-1]}'
            N = uni[uni['Unigram']==history].iat[0,1] if history in uni_words else 1
            P2 = P2*((N+V)/N) + (1/(N+V))
            
            #The frequency of the two previous words
            history = f'{words[i-2]};{words[i-1]}'
            N = bi[bi['Bigram']==history].iat[0,1] if history in bi_words else 1
            P3 = P3*((N+V)/N) + (1/(N+V))
        
        #Update the current frequency of the sentenece.
        P *= λ1*P1 + λ2*P2 + λ3*P3

    return P
    

linear_interpolation('Darba f\' għalqa sibt teżor', 'unk')

Loading models: [OK]


1.6688119602932157e-07