# Perplexity

The following functions calculate linear interpolation and the perplexity on a list of sentences.

In [2]:
import os
import pandas as pd
import seaborn as sns

In [3]:
folder = os.path.join(os.getcwd(),'..','data','korpus')
vanilla = os.path.join(os.getcwd(),'..','data','korpus','ngram','vanilla')
unk = os.path.join(os.getcwd(),'..','data','korpus','ngram','unk')

In [4]:
df_test = pd.read_csv(os.path.join(folder,'Test','Test.csv'))
df_test = df_test.dropna()

In [5]:
%%time

def linear_interpolation(sentences:list, model:str):

    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk': 
        model_path = unk
    else: raise Exception('Model does not exist!')
        
    #Load ngrams for the model chosen
    print('Loading models: ', end='')
    uni = pd.read_csv(os.path.join(model_path,'unigram.csv'))
    bi = pd.read_csv(os.path.join(model_path,'bigram.csv'))
    tri = pd.read_csv(os.path.join(model_path,'trigram.csv'))
    print('[OK]')
    
    #Calculate size of korpus. (Number of words)
    if model != 'vanilla':
        path = os.path.join(folder,'norm_korpus_clean.csv')
        korpus_size = sum(1 for _ in open(path, encoding='utf-8'))    
    
    
    #Get a numpy array of n-grams
    uni_words = uni['Unigram'].to_numpy()
    bi_words  = bi ['Bigram' ].to_numpy()
    tri_words = tri['Trigram'].to_numpy()

    P = 1

    V = len(uni) #Size of the vocabulary

    λ1,λ2,λ3 = 0.1,0.3,0.6

    for s in sentences:
        words = s.split(' ')
        word_count = len(words)

        if word_count < 3:
            print(f'Sentence {s} must have at least 3 words. Returning probability calculated till now.')
            return P

        
        for i in range(2,word_count):    
            uni_word = f'{words[i]}'
            bi_word  = f'{words[i-1]};{words[i]}'
            tri_word = f'{words[i-2]};{words[i-1]};{words[i]}'

            #Get the probabilities. If the model is vanilla, we don't have to get the laplace smoothed value.
            #The function will just run into an IndexError.
            P1 = uni[uni['Unigram'] == uni_word].iat[0,2] if model == 'vanilla' or uni_word in uni_words else 1/len(uni_words)
            P2 = bi [bi ['Bigram']  == bi_word ].iat[0,2] if model == 'vanilla' or bi_word  in bi_words  else 1/len(bi_words)
            P3 = tri[tri['Trigram'] == tri_word].iat[0,2] if model == 'vanilla' or tri_word in tri_words else 1/len(tri_words)

            #If the model is laplace smoothed (UNK or laplace), 
            #add 1 to the count of the ngram and 
            #add V to the denominator.

            #The laplace smoothed probability makes use of the normal vanilla probability. 
            #This is done in order to reduce code complexity and the number of nested if statements.

            if model != 'vanilla':
                N = korpus_size
                P1 = P1*((N+V)/N) + (1/(N+V))

                #The frequency of the previous word
                history = f'{words[i-1]}'
                N = uni[uni['Unigram']==history].iat[0,1] if history in uni_words else 1
                P2 = P2*((N+V)/N) + (1/(N+V))

                #The frequency of the two previous words
                history = f'{words[i-2]};{words[i-1]}'
                N = bi[bi['Bigram']==history].iat[0,1] if history in bi_words else 1
                P3 = P3*((N+V)/N) + (1/(N+V))

            #Update the current frequency of the sentenece.
            P *= λ1*P1 + λ2*P2 + λ3*P3

    return P
    

linear_interpolation(['Darba f\' għalqa sibt teżor'], 'unk')

Loading models: [OK]
Wall time: 12.5 s


1.9477697621004495e-08

### Testing linear interpolation

We can dry run through an example of linear interpolation to test it. Let's consider the sentence 'Malta hija gżira'.

First we will find the ngram probabilities of the phrase.

In [None]:
# print('Loading models: ', end='')
# uni = pd.read_csv(os.path.join(model_path,'unigram.csv'))
# bi = pd.read_csv(os.path.join(model_path,'bigram.csv'))
# tri = pd.read_csv(os.path.join(model_path,'trigram.csv'))
# print('[OK]')

### Perplexity

In [6]:
def get_probability(sentence:list, model:str, n:int):

    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk': 
        model_path = unk
    else: raise Exception('Model does not exist!')
    
    if n in [1,2,3]:
        ngram_path = ['unigram.csv','bigram.csv','trigram.csv'][n-1]
        ngram_type = ['Unigram','Bigram','Trigram'][n-1]

        print('Loading models: ', end='')
        df_ngram = pd.read_csv(os.path.join(model_path, ngram_path))
        ngram_words = df_ngram[ngram_type].to_numpy() #Get a numpy array of words for ngram

        if n != 1:
            prev_df_ngram = pd.read_csv(os.path.join(model_path, ['unigram.csv','bigram.csv','trigram.csv'][n-2]))
            prev_ngram_type = ['Unigram','Bigram','Trigram'][n-2]
            prev_ngram_words = prev_df_ngram[prev_ngram_type].to_numpy() #Get a numpy array of words for n-1gram
        print('[OK]')
        
    else: raise Exception('Choose Unigram, Bigram or Trigram!')
    
    if model != 'vanilla':
        path = os.path.join(folder,'korpus_clean.csv')
        korpus_size = sum(1 for _ in open(path, encoding='utf-8'))#Size of korpus. (Number of words)
        V = len(pd.read_csv(os.path.join(vanilla,'unigram.csv'))) #Size of the vocabulary
    
    P = 1 #Probability of sentences
  
    for sen_num,s in enumerate(sentence):
        words = s.split(' ')
        word_count = len(words)
    
        if word_count < n:
            raise Exception(f'Sentence must have at least {n} words')
    
#         print(f'Sentence {sen_num} finished..')
        
        for i in range(n-1,word_count):

            if n == 1: word = f'{words[i]}'
            if n == 2: word = f'{words[i-1]};{words[i]}'
            if n == 3: word = f'{words[i-2]};{words[i-1]};{words[i]}' 


            #Get the probabilities. If the model is vanilla, we don't have to get the laplace smoothed value.
            #The function will just run into an IndexError.
            Pi = df_ngram[df_ngram[ngram_type] == word].iat[0,2] if model == 'vanilla' or word in ngram_words else 1/len(ngram_words)

            #If the model is laplace smoothed (UNK or laplace), 
            #add 1 to the count of the ngram and 
            #add V to the denominator.

            #The laplace smoothed probability makes use of the normal vanilla probability. 
            #This is done in order to reduce code complexity and the number of nested if statements.

            if model != 'vanilla':

                if n == 1:
                    N = korpus_size

                else:
                    if n == 2: history = f'{words[i-1]}' #The previous word
                    if n == 3: history = f'{words[i-2]};{words[i-1]}' #The 2 previous words

                    #The frequency of the previous word/s.
                    N = prev_df_ngram[prev_df_ngram[prev_ngram_type]==history].iat[0,1] if history in prev_ngram_words else 1

                    #The probability of the current word
                    Pi = Pi*((N+V)/N) + (1/(N+V))

            #Update the current probability of the sentenece.
            P *= Pi

    return P

In [7]:
def calc_perplexity(model:str, n:int):
    sentences = []
    current_sentence = []
        
    for word in df_test['Word']:
        current_sentence.append(word)
        
        if word == '</s>':
            sentences.append(' '.join(current_sentence))
            current_sentence = []
    
#     for s in sentences: print(s)
    if n < 4:
        P = get_probability(sentences, model,n)
    else:
        P = linear_interpolation(sentences, model)
    print(P)
    
    P **= -1/len(sentences)
    return P

## Perplexity evaluation

We will calculate the perplexity of the test corpus across all variations of model an ngram. Note that n=4 represents linear inteprolation.

###### Vanilla

In [8]:
%%time

v1,v2,v3,v4 = 0,0,0,0

try: v1 = calc_perplexity(model='vanilla',n=1)
except Exception: pass
try: v2 = calc_perplexity(model='vanilla',n=2)
except Exception: pass
try: v3 = calc_perplexity(model='vanilla',n=3)
except Exception: pass
try: v4 = calc_perplexity(model='vanilla',n=4)
except Exception: pass

Loading models: [OK]
Loading models: [OK]
Loading models: [OK]
Loading models: [OK]
Wall time: 28.2 s


###### Laplace

In [9]:
%%time
l1 = calc_perplexity(model='laplace',n=1)
l2 = calc_perplexity(model='laplace',n=2)
l3 = calc_perplexity(model='laplace',n=3)
l4 = calc_perplexity(model='laplace',n=4)

Loading models: [OK]
0.0
Loading models: 

  P **= -1/len(sentences)


[OK]
3.538130414718423e-226
Loading models: [OK]
9.293100482130701e-158
Loading models: [OK]
2.4721668065992436e-90
Wall time: 4min 12s


###### UNK

In [10]:
%%time
u1 = calc_perplexity(model='unk',n=1)
u2 = calc_perplexity(model='unk',n=2)
u3 = calc_perplexity(model='unk',n=3)
u4 = calc_perplexity(model='unk',n=4)

Loading models: [OK]
0.0
Loading models: 

  P **= -1/len(sentences)


[OK]
1.3539623190605953e-223
Loading models: [OK]
3.669375254943107e-156
Loading models: [OK]
1.9106911591162646e-148
Wall time: 3min 33s


In [22]:
df = pd.DataFrame([[v1,l1,u1],
                   [v2,l2,u2],
                   [v3,l3,u3],
                   [v4,l4,u4]],
                   columns = ['Vanilla', 'Laplace', 'UNK',],
                   index =['Unigram','Bigram','Trigram','Linear Interpolation'])
df

Unnamed: 0,Vanilla,Laplace,UNK
Unigram,0,inf,inf
Bigram,0,2.3057200000000003e+56,5.213128e+55
Trigram,0,1.811173e+39,7.225235e+38
Linear Interpolation,0,2.521916e+22,8.50555e+36


From the above perplexity table, we can see that the longer the ngram the lower its perplexity. The Vanilla perplexity couldn't be calculated because the model encountered words/phrases which it did not see before. The unigram perplexity for Laplace and UNK blew up and hence were represented as infinity.