# N-grams



In [9]:
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize
import os

if os.path.isfile('The Eye of the World - Robert Jordan.txt'): 
    f = open('The Eye of the World - Robert Jordan.txt', encoding="utf8")
    content = f.read()
    content = " ".join(content.lower().split())
    sent_tokenize_list = sent_tokenize(content)
    content = nltk.word_tokenize(content)
    print("Number of Sentences:", len(sent_tokenize_list))
    print("Number of Tokens:", len(content))

Number of Sentences: 22180
Number of Tokens: 367891


In [13]:
from collections import defaultdict

def createUnigramCount(content):
    unigram = ngrams(content, 1)
    model = defaultdict(lambda: 0)
    for i in content:
        for word1 in unigram:
            #print(word1)
            model[word1] += 1
    return model

def createBigramCount(content):
    bigram = ngrams(content, 2)
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for i in content:
        for word1, word2 in bigram:
            model[word1][word2] += 1
    return model

def createTrigramCount(content):
    trigram = ngrams(content, 3)
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for i in content:
        for word1, word2, word3 in trigram:
            model[(word1, word2)][word3] += 1
    return model

def createFourgramCount(content):
    fourgram = ngrams(content, 4)
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for i in content:
        for word1, word2, word3, word4 in fourgram:
            model[(word1, word2, word3)][word4] += 1
    return model

unigramCount = createUnigramCount(content)
bigramCount = createBigramCount(content)
trigramCount = createTrigramCount(content)
fourgramCount = createFourgramCount(content)

In [21]:
import numpy as np

def unigramPredict(content, numOfSentences):
    unigramKeys = []
    for i in list(unigramCount.keys()):
        unigramKeys.append(i[0])
    sentence = []
    count = 0
    punctuations = ['.', '!', '?', '"']
    while count < numOfSentences:
        randomToken = np.random.choice(unigramKeys, 1, p = [float(i)/(sum(unigramCount.values())) for i in unigramCount.values()])[0]
        if randomToken in punctuations:
            count += 1
        print(randomToken)
        sentence.append(randomToken)
    return sentence

def bigramPredict(content, seed, numOfSentences):
    sentence = seed.split()
    count = 0
    punctuations = ['.', '!', '?', '"']
    while count < numOfSentences:
        randomToken = np.random.choice(list(bigramCount[sentence[-1]]), 1, p = [float(i)/(sum(bigramCount[sentence[-1]].values())) for i in bigramCount[sentence[-1]].values()])[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

def trigramPredict(content, seed, numOfSentences):
    sentence = seed.split()
    randomToken = np.random.choice(list(bigramCount[sentence[-1]]), 1, p = [float(i)/(sum(bigramCount[sentence[-1]].values())) for i in bigramCount[sentence[-1]].values()])[0]
    sentence.append(randomToken)
    count = 0
    punctuations = ['.', '!', '?', '"']
    while count < numOfSentences:
        if len(trigramModel[(sentence[-2], sentence[-1])]) == 0:
            randomToken = np.random.choice(list(bigramCount[sentence[-1]]), 1, p = [float(i)/(sum(bigramCount[sentence[-1]].values())) for i in bigramCount[sentence[-1]].values()])[0]
        else:
            randomToken = np.random.choice(list(trigramCount[(sentence[-2], sentence[-1])]), 1, p = [float(i)/(sum(trigramCount[(sentence[-2], sentence[-1])].values())) for i in trigramCount[(sentence[-2], sentence[-1])].values()])[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

def fourgramPredict(content, seed, numOfSentences):
    sentence = seed.split()
    randomToken = np.random.choice(list(bigramCount[sentence[-1]]), 1, p = [float(i)/(sum(bigramCount[sentence[-1]].values())) for i in bigramCount[sentence[-1]].values()])[0]
    sentence.append(randomToken)
    randomToken = np.random.choice(list(trigramCount[(sentence[-2], sentence[-1])]), 1, p = [float(i)/(sum(trigramCount[(sentence[-2], sentence[-1])].values())) for i in trigramCount[(sentence[-2], sentence[-1])].values()])[0]
    sentence.append(randomToken)
    count = 0
    punctuations = ['.', '!', '?', '"']
    while count < numOfSentences:
        if len(fourgramCount[(sentence[-3], sentence[-2], sentence[-1])]) == 0:
            if len(trigramCount[(sentence[-2], sentence[-1])]) == 0:
                randomToken = np.random.choice(list(bigramCount[sentence[-1]]), 1, p = [float(i)/(sum(bigramCount[sentence[-1]].values())) for i in bigramCount[sentence[-1]].values()])[0]
            else:
                randomToken = np.random.choice(list(trigramCount[(sentence[-2], sentence[-1])]), 1, p = [float(i)/(sum(trigramCount[(sentence[-2], sentence[-1])].values())) for i in trigramCount[(sentence[-2], sentence[-1])].values()])[0]
        else:
            randomToken = np.random.choice(list(fourgramCount[(sentence[-3], sentence[-2], sentence[-1])]), 1, p = [float(i)/(sum(fourgramCount[(sentence[-3], sentence[-2], sentence[-1])].values())) for i in fourgramCount[(sentence[-3], sentence[-2], sentence[-1])].values()])[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

def generate(ls):
    output = ls[0].capitalize()
    for token in ls[1:]:
        if output[-1] in ['.', '!', '?'] or token == 'i':
            output += " " + token.capitalize()
        elif token in [',', "’", '”', ':', '.', '!', '?']:
            output += token
        else:
            output += " " + token
    return output

print("Bigram:")
print(generate(bigramPredict(content, 'the', 10)))
print()
print("Trigram:")
print(generate(trigramPredict(content, 'the', 10)))
print()
print("Four-gram:")
print(generate(fourgramPredict(content, 'the', 10)))

Bigram:
The weather or have trollocs had shouted as she realized what did not decide. With his gaze, the room. If by tapping a large space taken an apple cider, he stood in. A round his knees. . He was crisply dark one’ s eye is rarely did teach you. ” the great lord captain domon isn’ alzamon. ” egwene and moiraine sedai are the boy, and her head ; her eyes regarded them. She stood open. It, rand remained whole day or daughter was coming looking at midday. The size again and aching, as if he smiled. Nynaeve’ ll never heard them after all right here. ” moiraine did not more than the inn was gone back there were just the cabinetmaker—would see about that set off her fervently he said his head.

Trigram:
The big gray felt wolves tearing at their first step another strike at them even with an anxious cluster. The warder. “ lan will be little rest for privacy, he did not appear to expect an answer. One of the arinelle. The ground all about them! ” moiraine replied. “ all things. Lews theri

# Perplexity


In [18]:
import random
np.random.seed(1)
random.shuffle(sent_tokenize_list)

trainSet = sent_tokenize_list[: int(len(sent_tokenize_list) * 0.9)] # 90%
testSet = sent_tokenize_list[int(len(sent_tokenize_list) * 0.9):] # 10%
flatten = ' '.join(trainSet)
flatten = nltk.word_tokenize(flatten)

In [19]:
import math
unigramModel = createUnigramCount(flatten)
bigramModel = createBigramCount(flatten)
trigramModel = createTrigramCount(flatten)
fourgramModel = createFourgramCount(flatten)

def perplexityUnigram(testSet):
    summation = 0
    words = 0
    for s in testSet:
        tokens = nltk.word_tokenize(s)
        words += len(tokens)
        for index, word in enumerate(tokens):
            p = (unigramModel[(tokens[index],)] + 0.1)/(sum(unigramModel.values()) + (0.1 * len(unigramModel.values())))
            summation += math.log(p, 2)
    average = summation/words
    return pow(2, -average)

def perplexityBigram(testSet):
    summation = 0
    words = 0
    for s in testSet:
        tokens = nltk.word_tokenize(s)
        words += len(tokens)
        for index, word in enumerate(tokens):
            if index > 0:
                if bigramModel[tokens[index - 1]][tokens[index]] == 0:
                    p = (unigramModel[(tokens[index],)] + 0.1)/(sum(unigramModel.values()) + (0.1 * len(unigramModel.values())))
                else:
                    p = (bigramModel[tokens[index - 1]][tokens[index]])/(sum(bigramModel[tokens[index - 1]].values()))                
                summation += math.log(p, 2)
    average = summation/words
    return pow(2, -average)

def perplexityTrigram(testSet):
    summation = 0
    words = 0
    for s in testSet:
        tokens = nltk.word_tokenize(s)
        words += len(tokens)
        for index, word in enumerate(tokens):
            if index > 1:
                if trigramModel[(tokens[index - 2], tokens[index - 1])][tokens[index]] == 0:
                    if bigramModel[tokens[index - 1]][tokens[index]] == 0:
                        p = (unigramModel[(tokens[index],)] + 0.1)/(sum(unigramModel.values()) + (0.1 * len(unigramModel.values())))
                    else:
                        p = (bigramModel[tokens[index - 1]][tokens[index]])/(sum(bigramModel[tokens[index - 1]].values()))
                else:
                    p = (trigramModel[(tokens[index - 2], tokens[index - 1])][tokens[index]])/(sum(trigramModel[(tokens[index - 2], tokens[index - 1])].values()))                
                summation += math.log(p, 2)
    average = summation/words
    return pow(2, -average)

def perplexityFourgram(testSet):
    summation = 0
    words = 0
    for s in testSet:
        tokens = nltk.word_tokenize(s)
        words += len(tokens)
        for index, word in enumerate(tokens):
            if index > 2:
                if fourgramModel[(tokens[index - 3], tokens[index - 2], tokens[index - 1])][tokens[index]] == 0:
                    if trigramModel[(tokens[index - 2], tokens[index - 1])][tokens[index]] == 0:
                        if bigramModel[tokens[index - 1]][tokens[index]] == 0:
                            p = (unigramModel[(tokens[index],)] + 0.1)/(sum(unigramModel.values()) + (0.1 * len(unigramModel.values())))
                        else:
                            p = (bigramModel[tokens[index - 1]][tokens[index]])/(sum(bigramModel[tokens[index - 1]].values()))
                    else:
                        p = (trigramModel[(tokens[index - 2], tokens[index - 1])][tokens[index]])/(sum(trigramModel[(tokens[index - 2], tokens[index - 1])].values()))
                else:
                    p = (fourgramModel[(tokens[index - 3], tokens[index - 2], tokens[index - 1])][tokens[index]])/sum(fourgramModel[(tokens[index - 3], tokens[index - 2], tokens[index - 1])].values())
                summation += math.log(p, 2)
    average = summation/words
    return pow(2, -average)

print(perplexityUnigram(testSet))
print(perplexityBigram(testSet))
print(perplexityTrigram(testSet))
print(perplexityFourgram(testSet))

515.399937964938
83.58977846088527
45.045136485957954
32.7965663761686
