# Building a Language Model

In [54]:
import mpmath as mp
import os
import io
import sys
import pandas as pd
import psutil

from tabulate import tabulate
from nltk.tokenize import word_tokenize
from nltk.collocations import *
from datetime import datetime
from collections import Counter

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np

Creating a variable which stores the corpus. This makes the code relatively modular since only the value of the variable has to be changed in order to test with different corpora.

The corpus used for testing is the academic1 corpus in the Maltese set.

In [19]:
path = 'Corpus/academic1.txt'

The below function upon called returns the memory currently being used by python.exe in GBs

In [20]:
def RAMusage():
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0]/2.**30
    print('Memory Use: ', memoryUse, 'GB')

Removing Symbols from the Corpus in order for the Language Models, Perplexities and Sentence Generators to focus solely on only words

In [31]:
def RemoveSymbols(corpus):
    arr = []
    symbols = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
    
    for i in corpus:
        if i not in symbols:
            arr.append(i)
            
    return arr

## Other attempts to read corpus in a more efficient manner:

rawcorpus_dir = 'E:\Corpus\\'

output_dir = 'Corpus\KorpusMalti.csv'

csvout = pd.DataFrame()

for filename in os.listdir('Corpus'):

    data = pd.read_csv(filename, sep = ':', index_col = 0, header = None)
    csvout.csvout.append(data)
        
csvout.to_csv(output_dir)

In [21]:
extraction_start = datetime.now()

file = open(path)
corpus = file.read()

tokenize = word_tokenize(corpus)
tokens = RemoveSymbols(tokenize)

extraction_end = datetime.now()

extraction_time = dict()
extraction_time['extraction_time'] = extraction_end - extraction_start
RAMusage()
print('Extraction Time(HH::MM:SS:ms) - {}\n\n'.format(extraction_time['extraction_time']))

Memory Use:  0.1607208251953125 GB
Extraction Time(HH::MM:SS:ms) - 0:00:01.345009




In [23]:
def NGrams(words, n):
    ngrams = []
    
    for i in range(0, len(words)):
        ngram = ' '.join(words[i:i + n])
        ngrams.append(ngram)
        
    return ngrams

bigram = NGrams(tokens, 5)
#freqdist = nltk.FreqDist(bigrams)

#for i,j in freqdist.items():
    #print(i,j)

In [24]:
def Split(corpus):
    
    file = open(path)
    corpus = file.read()
    words = []
    
    for line in corpus:
        
        words.append(line)
        
    train, test = train_test_split(words, test_size = 0.66, train_size = 0.34, shuffle = False)
    
    return train, test

x, y = Split(path)

In [32]:
def Perplexity(test, model):
    
    perp = mp.mpf(1)
    
    N = mp.mpf(0)
    
    for line in test:
        N += len(line)
        line = ' '.join(line)
        
        if model[line] > 0:
            perp = perp * (1/model[line])
        else:
            perp = perp * sys.maxsize
            
    perp = pow(perp, 1/float(N))
    return perp

# Part 2 - Building a Language Model

## Vanilla

In [25]:
def VanillaUnigram(train):
    
    model = Counter(train)
    
    for word in model:
        model[word] = model[word]/len(train)
        
    return model

def VanillaBigram(train):
    
    model = Counter([(word, train[i + 1]) for i, word in enumerate(train[:-1])])
    counter = Counter(train)
    
    for word in model:
        model[word] = model[word]/counter[word[0]]
        
    return model

def VanillaTrigram(train):
    
    bigram = Counter([(word, train[i + 1]) for i, word in enumerate(train[:-1])])
    trigram = Counter([(word, train[i + 1], train[i + 2]) for i, word in enumerate(train[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]/bigram[(word[0], word[1])]
        
    return trigram

In [48]:
a = VanillaUnigram(x)
p1 = (Perplexity(y, a))

b = VanillaBigram(x)
p2 = (Perplexity(y, b))

c = VanillaTrigram(x)
p3 = (Perplexity(y, c))

i1 = (Interpolation(a, b, c, ["<s>", "</s>"], "il-"))

## Laplace 

In [27]:
def LaplaceUnigram(train):
    
    model = Counter(train)
    
    for word in model:
        model[word] = (model[word]+1)/len(train)
        
    return model

def LaplaceBigram(train):
    
    model = Counter([(word, train[i + 1]) for i, word in enumerate(train[:-1])])
    counter = Counter(train)
    
    for word in model:
        model[word] = model[word] + 1/counter[word[0], word[1]]
        
    return model

def LaplaceTrigram(train):
    
    bigram = Counter([(word, train[i + 1]) for i, word in enumerate(train[:-1])])
    trigram = Counter([(word, train[i + 1], train[i + 2]) for i, word in enumerate(train[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word] + 1 /bigram[(word[0], word[1], word[2])]
        
    return trigram

In [49]:
a2 = LaplaceUnigram(x)
p4 = (Perplexity(y, a2))

#b2 = LaplaceBigram(x)
#p5 = (Perplexity(y, b2))

#c2 = LaplaceTrigram(x)
#p6 = (Perplexity(y, c2))

#i2 = (Interpolation(a2, b2, c2, ["<s>", "</s>"], "il"))

## UNK

In [29]:
def UNKUnigram(train):
    
    counter = Counter(train)
    model = {}
    model["<UNK>"] = 0
    
    for word in counter:
        if counter[word] == 1:
            model["<UNK>"] += 1
            
        else:
            model[word] = counter[word]
            
    for word in model:
        model[word] = model[word]/len(train)
        
    return model

def UNKBigram(train):
    
    unigram = UNKUnigram(train)
    
    for i, word in enumerate(train):
        if not (word in unigram):
            train[i] = "<UNK>"
            
    return VanillaBigram(train)

def UNKTrigram(train):
    
    unigram = UNKUnigram(train)
    
    for i, word in enumerate(train):
        if not (word in unigram):
            train[i] = "<UNK>"
            
    return VanillaTrigram(train)

In [50]:
a3 = UNKUnigram(x)
#p7 = (Perplexity(y, a3))

b3 = UNKBigram(x)
p8 = Perplexity(y, b3)

c3 = UNKTrigram(x)
p9 = (Perplexity(y, c3))

#i3 = (Interpolation(a3, b3, c3, ["<s>", "</s>"], "il"))

In [65]:
head = ["Model", "Unigram", "Bigram", "Trigram", "Interpolation"]

In [66]:
data = [{"Vanilla", p1, p2, p3, i1},
        {"Laplace", p4, 0, 0, 0},
        {"UNK", 0, p8, p9 , 0}]

print("Vanilla Model: ", "Unigram: ", p1, "Bigram: ", p2, "Trigram: ", p3, "Interpolation: ", i1)
print("\n")
print("Laplace Model: ", "Unigram: ", p4, "Bigram: ", p2, "Trigram: ", p3, "Interpolation: ", i1)

+----------------------+-------------+------------------+-----------+
| Model                |     Unigram | Bigram           | Trigram   |
| 9.22337203685476e+18 | 0           | 34.9366920882137 | Vanilla   |
+----------------------+-------------+------------------+-----------+
| Laplace              | 0           | 34.9201727634248 |           |
+----------------------+-------------+------------------+-----------+
| 0                    | 9.22337e+18 | UNK              |           |
+----------------------+-------------+------------------+-----------+


## Probability

In [15]:
def UnigramProbability(unigram, sentence, word):
    return unigram[word]

def BigramProbability(bigram, sentence, word):
    if (sentence[-1], word) in bigram:
        return bigram[sentence[-1],word]
    
    else:
        return 0
    
def TrigramProbability(trigram, sentence, word): 
    if (sentence[-2],sentence[-1], word) in trigram:
        return trigram[sentence[-2],sentence[-1],word]
    
    else:
        return 0
    
def Interpolation(unigram, bigram, trigram, sentence, word):
    Unigram = 0.1*(unigram[word])
    Bigram = 0.3*(bigram[sentence[-1], word])
    Trigram = 0.6*(trigram[sentence[-2], sentence[-1], word])
    
    return Unigram+Bigram+Trigram

## Generate

In [None]:
def UnigramGenerate(unigram, sentence, last = "", count = None):
    
    if(count != 0 and sentence[-1] != last):
        
        weights = np.array(list(unigram.values()))
        norm = weights/np.sum(weights)
        
        resample = np.random.multinomial(1, norm)
        key = list(resample).index(1)
        value = list(unigram.keys())[key]
        
        sentence.append(value)
        if count != None:
            UnigramGenerate(unigram, sentence, last, count-1)
        else:
            UnigramGenerate(unigram, sentence, last)
            
    return sentence

#print(UnigramGenerate(a, ["<s>"], "</s>"))

In [34]:
def BigramGenerate(bigram, sentence, last, count = None):
    
    if(count != 0 and sentence != last):
        bigrams = []
        
    for word in bigram:
        if word[0] == sentence[-1]:
            bigrams[word] = bigram[word]
            
    if(bigrams == []):
        return sentence 
    
    weights = np.array(list(bigrams.values()))
    norm = weights / np.sum(weights)
    resample = np.random.multinomial(1, norm)
    key = list(resample).index(1)
    value = list(bigrams.keys())[key]
    
    sentence.append(value)
    
    if count != None:
        BigramGenerate(bigram, sentence, last, count-1)
    else:
        BigramGenerate(bigram, sentence, last)
        
    return sentence

In [36]:
def TrigramGenerate(bigram, trigram, sentence, last = "", count = None):
    if(len(sentence) == 1):
        sentence = BigramGenerate(bigram, sentence, last, count=1)
        
    if(count != 0 and sentence[-1] != last):
        trigrams = []
        
        for word in trigram:
            
            if(word[0] == sentence[-2] and word[1] == sentence[-1]):
                trigrams[word] = trigram[word]
                
        if(trigrams == []):
            return sentence
        
        weights = np.array(list(bigrams.values()))
        norm = weights / np.sum(weights)
        resample = np.random.multinomial(1, norm)
        key = list(resample).index(1)
        value = list(bigrams.keys())[key] 
        
        sentence.append(value[2])
        if count != None:
            TrigramGenerate(bigram, trigram, sentence, last, count-1)
            
        else:
            TrigramGenerate(bigram, trigram, sentence, last)
            
            
    return sentence