In [82]:
def preprocess(filename, N, is_train):
    with open(filename, 'r') as f:
        sentences = [line.strip() for line in f.readlines()]  # Get list of sentences from given file

    for i in range(len(sentences)):
        sentences[i] = "<s> " * max(1, N - 1) + sentences[i].replace("\n", "") + " </s>" # Add start and end of sentence tag

    tokens = ' '.join(sentences).split(" ")  # Get list of individual tokens
    if not is_train: # if it is test set, no need to obtain dictionary or replace with <UNK>
        return {}, tokens

    dictionary = dict() # store dictionary of all words along with their counts
    for token in tokens:  # Count number of occurences of each token
        if token in dictionary:
            dictionary[token] += 1
        else:
            dictionary[token] = 1

    for i in range(len(tokens)):  # Mark tokens with single occurrence with <UNK>
        if dictionary[tokens[i]] == 1 and tokens[i] != "<UNK>":
            del dictionary[tokens[i]]
            tokens[i] = "<UNK>"
            if "<UNK>" in dictionary:
                dictionary["<UNK>"] += 1
            else:
                dictionary["<UNK>"] = 1

    return dictionary, tokens


In [2]:
def get_N_gram_counts(tokens, N):
    N_gram_counts = dict() # stores N-grams with their counts
   
    for i in range(len(tokens)-N+1):
        N_gram = tuple(tokens[i:i+N]) # the tuple of N tokens is used as key in the dictionary of N-grams
        if N_gram in N_gram_counts:
            N_gram_counts[N_gram] += 1
        else:
            N_gram_counts[N_gram] = 1

    return N_gram_counts


In [3]:
def laplace_smoothing(tokens, N, dictionary):
    N_gram_counts = get_N_gram_counts(tokens, N) # stores N-grams with their counts
    N_1_gram_counts = get_N_gram_counts(tokens, N-1) # stores (N-1)-grams with their counts

    distribution = {} # stores probability distribution of N-grams in the corpus
    for key,value in N_gram_counts.items():
        distribution[key] = (value+1)/(N_1_gram_counts[key[:-1]] + len(dictionary))

    return distribution


In [66]:
def build_model(tokens, N, dictionary):
    distribution = None
    if N == 1:
        distribution = {(key,): value / len(tokens)
                        for key, value in dictionary.items()}
    else:
        distribution = laplace_smoothing(tokens, N, dictionary)

    return distribution


In [98]:
from math import log,exp
from itertools import product

def handle_oov_tokens(N_gram, model, N):
    masks  = list(reversed(list(product((0,1), repeat=N))))
    for mask in masks:
        modified_N_gram = []
        for i in range(N):
            if mask[i]==1:
                modified_N_gram.append(N_gram[i])
            else:
                modified_N_gram.append("<UNK>")
        modified_N_gram = tuple(modified_N_gram)
        if modified_N_gram in model:
            return modified_N_gram


def compute_perplexity(N, dictionary, model):
    _,test_tokens = preprocess("test.txt", N, False) # ignore dictionary of test corpus and get the tokens
    f=open("out.txt", "w")

    N_grams = get_N_gram_counts(test_tokens, N) # get list of all N-grams from test set
    perplexity = 0
    for N_gram,_ in N_grams.items():
        modified_N_gram = handle_oov_tokens(N_gram, model, N)
        probability = model[modified_N_gram]
        f.writelines(str(modified_N_gram)+str(probability)+"\n")
        perplexity -= log(probability)
    perplexity *= 1/len(test_tokens)
    perplexity = exp(perplexity)
    return perplexity


In [99]:
print("Waiting for N to be entered ...")
N = int(input("Enter N of model : "))

print("Preprocessing of training file started ...")
dictionary, tokens = preprocess("train.txt", N, True)
print("Preprcessing of training file completed", len(dictionary))

print("Model training started for N =", N, " ...")
model = build_model(tokens, N, dictionary)
print("Model training completed")

print("Computing perplexity for test set ...")
perplexity = compute_perplexity(N, dictionary, model)
print("Perplexity of test set =", perplexity)


Waiting for N to be entered ...
Preprocessing of training file started ...
Preprcessing of training file completed 23505
Model training started for N = 1  ...
Model training completed
Computing perplexity for test set ...
Perplexity of test set = 1.7305446871062764


In [7]:
from re import T


def generate_sentence(sentence, N, dictionary, tokens):
    sentence = "<s> " * max(1, N-1) + sentence
    sentence = sentence.split(' ')
    sentence = sentence[-N+1:]

    N_1_gram_counts = get_N_gram_counts(tokens,N-1)
    probabilities = dict()
    for word in dictionary:
        if word == '<UNK>':
            continue
        sentence.append(word)
        probability = 0
        if tuple(sentence) in model:
            probability = model[tuple(sentence)]
        else:
            probability = 1/(N_1_gram_counts[tuple(sentence[:-1])] + len(dictionary))
        probabilities[tuple(sentence)] = probability
        sentence = sentence[:-1]
    
    probabilities = list(probabilities.items())
    probabilities.sort(key=lambda x: x[1], reverse=True)
    f = open("out.txt", "w")
    for key in probabilities:
        f.writelines(str(key[0])+" : "+str(key[1])+"\n")


In [8]:
generate_sentence("abcd man", 2,dictionary,tokens)

In [9]:
import nltk
vocab  = nltk.FreqDist(tokens)

In [10]:
f=open("out.txt","w")
for key,value in vocab.items():
    f.writelines(key+" : "+str(value)+"\n")

In [11]:
def smooth(n):
    vocab_size = len(vocab)

    n_grams = nltk.ngrams(tokens, n)
    n_vocab = nltk.FreqDist(n_grams)

    m_grams = nltk.ngrams(tokens, n-1)
    m_vocab = nltk.FreqDist(m_grams)
    print(len(n_vocab),len(m_vocab),vocab_size)

    def smoothed_count(n_gram, n_count):
        m_gram = n_gram[:-1]
        m_count = m_vocab[m_gram]
        return (n_count + 1) / (m_count + 1 * vocab_size)

    return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }

In [12]:
smooth(3)

812980 362772 23505


{('<s>', 'liberty', 'all'): 0.00012758356723653993,
 ('liberty', 'all', 'star'): 0.00012762155953545753,
 ('all', 'star', 'usa'): 8.508103969030502e-05,
 ('star', 'usa', 'sets'): 8.508465923593976e-05,
 ('usa', 'sets', 'initial'): 8.508465923593976e-05,
 ('sets', 'initial', 'payout'): 0.0002550586634926033,
 ('initial', 'payout', '</s>'): 0.0004677268475210477,
 ('payout', '</s>', '<s>'): 0.007683863885839737,
 ('</s>', '<s>', 'we'): 0.005915884269017053,
 ('<s>', 'we', 'are'): 0.003541961830152513,
 ('we', 'are', 'being'): 8.433125316242199e-05,
 ('are', 'being', 'accused'): 8.482483671218933e-05,
 ('being', 'accused', 'of'): 8.508465923593976e-05,
 ('accused', 'of', 'not'): 8.507018290089324e-05,
 ('of', 'not', 'implementing'): 8.504124500382685e-05,
 ('not', 'implementing', 'this'): 8.508465923593976e-05,
 ('implementing', 'this', 'agreement'): 8.508465923593976e-05,
 ('this', 'agreement', '</s>'): 0.00012759984688018373,
 ('agreement', '</s>', '<s>'): 0.005963205751744555,
 ('</s>'

In [13]:
from math import log

In [14]:
from math import e,exp


print(log(e))
print(log(exp(1)))

1.0
1.0


In [15]:
import argparse
from itertools import product
import math
import nltk
from pathlib import Path

def _convert_oov(ngram, n):
    """Convert, if necessary, a given n-gram to one which is known by the model.
    Starting with the unmodified ngram, check each possible permutation of the n-gram
    with each index of the n-gram containing either the original token or <UNK>. Stop
    when the model contains an entry for that permutation.
    This is achieved by creating a 'bitmask' for the n-gram tuple, and swapping out
    each flagged token for <UNK>. Thus, in the worst case, this function checks 2^n
    possible n-grams before returning.
    Returns:
        The n-gram with <UNK> tokens in certain positions such that the model
        contains an entry for it.
    """
    masks  = list(reversed(list(product((0,1), repeat=n))))
    mask = lambda ngram, bitmask: tuple((token if flag == 1 else "<UNK>" for token,flag in zip(ngram, bitmask)))

    ngram = (ngram,) if type(ngram) is str else ngram
    for possible_known in [mask(ngram, bitmask) for bitmask in masks]:
        if possible_known in model:
            return possible_known

def perplexity( filename,n):
    """Calculate the perplexity of the model against a given test corpus.
    
    Args:
        test_data (list of str): sentences comprising the training corpus.
    Returns:
        The perplexity of the model as a float.
    
    """
    _,test_tokens = preprocess(filename, n, False)
    test_ngrams = nltk.ngrams(test_tokens, n)
    N = len(test_tokens)

    known_ngrams  = (_convert_oov(ngram,n) for ngram in test_ngrams)
    probabilities = [model[ngram] for ngram in known_ngrams]

    return math.exp((-1/N) * sum(map(math.log, probabilities)))

In [16]:
N=4
print("Preprocessing of training file started ...")
dictionary, tokens = preprocess("train.txt", N, True)
print("Preprcessing of training file completed")

print("Model training started for N =", N, " ...")
model = build_model(tokens, N, dictionary)
print("Model training completed")
model[tuple(["<UNK>" for _ in range(0,N)])]=1/len(dictionary)
perplexity("test.txt", N)

Preprocessing of training file started ...
Preprcessing of training file completed
Model training started for N = 4  ...
Model training completed


4372.836089515262