In [192]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
import string
import numpy as np
from collections import Counter, defaultdict
import re
import math
import random

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [193]:
#Preprocess the text and eliminate words belonging to the corresponding tags mentioned (Task a)
def pre_process(text):
    """Lower-case a tagged corpus, drop unwanted tags and split it into train/test.

    Args:
        text: Iterable of ``(word, tag)`` pairs.

    Returns:
        ``(trainset, testset)``: two lists of ``(lowercased_word, tag)`` pairs.
        The first ``(len // 10) * 8`` kept pairs form the train set and the
        remainder forms the test set.

    Side effects:
        Prints a 250-item preview of the filtered corpus and the sizes of the
        corpus, the train set and the test set.
    """
    excluded_tags = {'NN', ',', '.', 'CD', 'LS', 'SENT', 'SYM', '#', '$',
                     '-LRB-', '-RRB-', ':', '-NONE-', '"', '“', '”'}
    # Read from the `text` argument rather than the notebook-global `corp`,
    # so the function works on whatever corpus the caller passes in.
    pro_words = [(word.lower(), tag) for (word, tag) in text if tag not in excluded_tags]
    print('Pre-processed Corpus: {}\n'.format(pro_words[:250]))
    print('Length of the corpus: {}'.format(len(pro_words)))
    #Split the corpus into trainset (80%) and testset (20%)
    split_at = (len(pro_words) // 10) * 8
    trainset = pro_words[:split_at]
    testset = pro_words[split_at:]
    print('Length of Trainset: {}'.format(len(trainset)))
    print('Length of Testset: {}\n'.format(len(testset)))
    return trainset, testset

#Functions to generate Unigrams, Bigrams and corresponding probabilities
def total_count(ngramcount):
    """Return the sum of all counts stored in an n-gram count mapping.

    Args:
        ngramcount: Mapping from n-gram to its count.

    Returns:
        The total of all values; ``0`` for an empty mapping.
    """
    return sum(ngramcount[key] for key in ngramcount)

def get_word_count(words):
    """Count occurrences of each item in ``words``, excluding the final item.

    Args:
        words: Sequence of hashable items (the driver passes ``(word, tag)`` pairs).

    Returns:
        Dict mapping each item to the number of times it occurs among all but
        the last element of ``words``.
    """
    # NOTE(review): the last element is never counted. Presumably this keeps
    # the counts aligned with the number of bigrams each item starts -- confirm.
    tally = {}
    for item in words[:-1]:
        tally[item] = tally.get(item, 0) + 1
    return tally
    
def get_word_unigram_count(words):
    """Count word forms (element 0 of each pair), excluding the final pair.

    Args:
        words: Sequence of ``(word, tag)`` pairs.

    Returns:
        Dict mapping each word to its number of occurrences among all but the
        last element of ``words``.
    """
    # NOTE(review): the last pair is never counted. Presumably this matches
    # the number of bigram histories -- confirm.
    tally = {}
    for pair in words[:-1]:
        token = pair[0]
        tally[token] = tally.get(token, 0) + 1
    return tally

def get_word_bigram_count(words):
    """Count adjacent word pairs in a tagged sequence.

    Args:
        words: Sequence of ``(word, tag)`` pairs.

    Returns:
        Dict mapping ``(word_i, word_i+1)`` to the number of times that pair of
        neighbouring words occurs.
    """
    tally = {}
    for left, right in zip(words, words[1:]):
        pair = (left[0], right[0])
        tally[pair] = tally.get(pair, 0) + 1
    return tally

def get_pos_unigram_count(words):
    """Count tags (element 1 of each pair), excluding the final pair.

    Args:
        words: Sequence of ``(word, tag)`` pairs.

    Returns:
        Dict mapping each tag to its number of occurrences among all but the
        last element of ``words``.
    """
    # NOTE(review): the last pair is never counted. Presumably this matches
    # the number of bigram histories -- confirm.
    tally = {}
    for pair in words[:-1]:
        tag = pair[1]
        tally[tag] = tally.get(tag, 0) + 1
    return tally

def get_pos_bigram_count(words):
    """Count adjacent tag pairs in a tagged sequence.

    Args:
        words: Sequence of ``(word, tag)`` pairs.

    Returns:
        Dict mapping ``(tag_i, tag_i+1)`` to the number of times that pair of
        neighbouring tags occurs.
    """
    tally = {}
    for left, right in zip(words, words[1:]):
        pair = (left[1], right[1])
        tally[pair] = tally.get(pair, 0) + 1
    return tally


def get_pos_unigram_probability(posUnigramCounts, totalUnigramCount, posUnigram, d):
    """Discounted unigram probability of a tag, backing off to a uniform distribution.

    Computes ``max(N(tag) - d, 0) / N + (d * T / N) * (1 / T)``, where ``T`` is
    the number of distinct tags in ``posUnigramCounts`` and ``N`` is
    ``totalUnigramCount``.

    Args:
        posUnigramCounts: Dict mapping tag to count.
        totalUnigramCount: Normaliser ``N``.
        posUnigram: Tag to score; a tag missing from the dict counts as 0.
        d: Discount subtracted from the tag's count.

    Returns:
        The smoothed probability as a float.
    """
    tag_count = posUnigramCounts.get(posUnigram, 0)
    n_types = len(posUnigramCounts)
    backoff_weight = d * n_types / totalUnigramCount
    uniform = 1 / n_types
    discounted = max(tag_count - d, 0) / totalUnigramCount
    return discounted + backoff_weight * uniform
    

def get_pos_bigram_probability(posUnigramCounts, totalUnigramCount, posBigramCounts, posBigram, d):
    """Discounted probability of a tag given the preceding tag.

    Computes
    ``max(N(history, pos) - d, 0) / N(history) + lambda(history) * P_abs(pos)``
    with ``lambda(history) = d * N1+(history, .) / N(history)``. Here
    ``N1+(history, .)`` is the number of distinct tag bigrams that start with
    ``history`` and ``P_abs`` is ``get_pos_unigram_probability``. When the
    history tag has no count, the result is just ``P_abs(pos)``.

    Args:
        posUnigramCounts: Dict mapping tag to count.
        totalUnigramCount: Normaliser forwarded to the unigram back-off.
        posBigramCounts: Dict keyed by ``(earlier_tag, later_tag)``, as built
            by ``get_pos_bigram_count``.
        posBigram: ``(pos, history_pos)``: the tag being predicted first, then
            the tag that precedes it.
        d: Discount.

    Returns:
        The smoothed conditional probability as a float.
    """
    pos, history_pos = posBigram[0], posBigram[1]

    # The count table is keyed in sequence order (earlier, later), whereas
    # `posBigram` is (predicted, history), so the key must be flipped.
    N_pos_Bigram = posBigramCounts.get((history_pos, pos), 0)
    N_history = posUnigramCounts.get(history_pos, 0)

    P_abs = get_pos_unigram_probability(posUnigramCounts, totalUnigramCount, pos, d)
    if N_history <= 0:
        # Unseen history: nothing to condition on, use the back-off alone.
        return P_abs

    # Compare whole tags: `bigram[0]` is the first tag of the key
    # (`bigram[0][0]` would only be that tag's first character).
    N_plus_history = sum(1 for bigram in posBigramCounts if bigram[0] == history_pos)
    lambda_history = d * N_plus_history / N_history
    return (max(N_pos_Bigram - d, 0) / N_history) + lambda_history * P_abs


def get_tagged_word_unigram_probability(unigramCounts, totalUnigramCount, taggedWord, d):
    """Discounted unigram probability of a word, backing off to a uniform distribution.

    Only the word part (``taggedWord[0]``) is used. Computes
    ``max(N(word) - d, 0) / N + (d * V / N) * (1 / V)``, where ``V`` is the
    number of distinct words in ``unigramCounts`` and ``N`` is
    ``totalUnigramCount``.

    Args:
        unigramCounts: Dict mapping word to count.
        totalUnigramCount: Normaliser ``N``.
        taggedWord: ``(word, tag)`` pair; a word missing from the dict counts as 0.
        d: Discount subtracted from the word's count.

    Returns:
        The smoothed probability as a float.
    """
    word_count = unigramCounts.get(taggedWord[0], 0)
    vocab_size = len(unigramCounts)
    uniform = 1 / vocab_size
    backoff_weight = d * vocab_size / totalUnigramCount
    discounted = max(word_count - d, 0) / totalUnigramCount
    return discounted + backoff_weight * uniform
    
    
def get_tagged_word_bigram_probability(unigramCounts, totalUnigramCount, posUnigramCounts, posBigramCounts, taggedWordCount, taggedWord, d):
    """Discounted probability of a word given its tag.

    Computes ``max(N(word, pos) - d, 0) / N(pos) + lambda(pos) * P_abs(word)``
    with ``lambda(pos) = d * N1+(pos) / N(pos)``. Here ``N1+(pos)`` is the
    number of distinct ``(word, pos)`` entries in ``taggedWordCount`` and
    ``P_abs`` is ``get_tagged_word_unigram_probability``. When the tag has no
    count, the result is just ``P_abs(word)``.

    Args:
        unigramCounts: Dict mapping word to count (forwarded to the back-off).
        totalUnigramCount: Normaliser forwarded to the back-off.
        posUnigramCounts: Dict mapping tag to count.
        posBigramCounts: Unused; kept so existing callers keep working.
        taggedWordCount: Dict mapping ``(word, tag)`` to count.
        taggedWord: ``(word, tag)`` pair to score.
        d: Discount.

    Returns:
        The smoothed conditional probability as a float.
    """
    pos = taggedWord[1]
    N_w_pos = taggedWordCount.get(taggedWord, 0)
    N_pos = posUnigramCounts.get(pos, 0)

    P_abs = get_tagged_word_unigram_probability(unigramCounts, totalUnigramCount, taggedWord, d)
    if N_pos <= 0:
        # Unseen tag: check before dividing by N(pos) and use the back-off alone.
        return P_abs

    # The mass removed by discounting is d for every distinct word seen with
    # this tag, so that is the type count that must scale the back-off weight.
    # This is a linear scan per call; precompute per-tag type counts if it
    # becomes a bottleneck.
    N_plus_pos = sum(1 for key in taggedWordCount if key[1] == pos)
    lambda_pos = d * N_plus_pos / N_pos
    return (max(N_w_pos - d, 0) / N_pos) + lambda_pos * P_abs

def get_pos_list(taggedWords):
    """Return the distinct tags of a tagged sequence in order of first appearance.

    Args:
        taggedWords: Iterable of ``(word, tag)`` pairs.

    Returns:
        List of unique tags (element 1 of each pair).
    """
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(pair[1] for pair in taggedWords))

def get_pos(word, posBigramCounts):
    """Return ``key[0][1]`` of the first key whose ``key[0][0]`` equals ``word``.

    Args:
        word: Value compared against ``key[0][0]`` of each key.
        posBigramCounts: Mapping (or iterable) of keys to scan in iteration order.

    Returns:
        ``key[0][1]`` of the first matching key, or ``None`` when no key matches.
    """
    # NOTE(review): this indexing only yields a word's tag if each key starts
    # with a (word, tag) pair. With the (tag, tag) keys produced by
    # get_pos_bigram_count it compares `word` with the first character of a
    # tag -- confirm which table callers are meant to pass.
    matches = (key[0][1] for key in posBigramCounts if key[0][0] == word)
    return next(matches, None)

def get_class_based_bigram_probability(bigram, pos_list, unigramCounts, totalUnigramCount, posUnigramCounts, posBigramCounts, taggedWordCount, d, totalPosUnigramCount):
    """Class-based bigram probability ``P(w2 | w1)`` of a word bigram.

    Sums, over every tag in ``pos_list``, the product of
    ``P(w2 | tag)`` (``get_tagged_word_bigram_probability``) and
    ``P(tag | tag(w1))`` (``get_pos_bigram_probability``). ``tag(w1)`` is the
    tag most frequently recorded for ``w1`` in ``taggedWordCount``.

    Args:
        bigram: ``(w1, w2)`` tuple of words; ``w1`` is the history and ``w2``
            is the predicted word.
        pos_list: Tags to sum over.
        unigramCounts: Dict mapping word to count.
        totalUnigramCount: Total word count used by the word back-off.
        posUnigramCounts: Dict mapping tag to count.
        posBigramCounts: Dict mapping ``(tag, tag)`` to count.
        taggedWordCount: Dict mapping ``(word, tag)`` to count.
        d: Discount.
        totalPosUnigramCount: Total tag count used by the tag back-off.

    Returns:
        The summed probability as a float (``0`` when ``pos_list`` is empty).
    """
    history_word, word = bigram[0], bigram[1]

    # A word's tag cannot be read from the (tag, tag) bigram table. Use the
    # (word, tag) counts and keep the most frequent tag of the history word.
    # `history_pos` stays None when the history word was never seen.
    history_pos = None
    best_count = 0
    for key, count in taggedWordCount.items():
        if key[0] == history_word and count > best_count:
            history_pos, best_count = key[1], count

    prob = 0
    for pos in pos_list:
        # The emission term must score the predicted word, not the history
        # word; otherwise the result does not depend on w2 at all.
        taggedWord = (word, pos)
        posBigram = (pos, history_pos)
        taggedWordBigramProbability = get_tagged_word_bigram_probability(unigramCounts, totalUnigramCount, posUnigramCounts, posBigramCounts, taggedWordCount, taggedWord, d)
        posBigramProbability = get_pos_bigram_probability(posUnigramCounts, totalPosUnigramCount, posBigramCounts, posBigram, d)
        prob = prob + (taggedWordBigramProbability * posBigramProbability)
    return prob

"""
def compute_perplexity(interp = True):
     sumlog = 0
    for bigram in testBigrams:
        word1 = bigram[0]
         word2 = bigram[1]
         if interp:
             bigram_prob = estimate_interpolated_bigram_prob(word1, word2)
         else:
             bigram_prob = calculcate_bigram_prob(word1, word2)
         log_bigram_prob = math.log(bigram_prob, 2)
         sumlog = sumlog + log_bigram_prob
     perplexity = pow(2, (-1) * sumlog/len(testBigrams))  
     return perplexity 
""" 

def perplexity(testBigrams, pos_list, unigramCounts, totalUnigramCount, totalBigramCount, posUnigramCounts, posBigramCounts, taggedWordCount, d, totalPosUnigramCount):
    """Perplexity of the class-based bigram model on a set of test bigrams.

    Computes ``2 ** H`` with
    ``H = -sum(rel_freq(bigram) * log2(P_model(bigram)))``, where
    ``rel_freq(bigram) = testBigrams[bigram] / totalBigramCount`` and
    ``P_model`` is ``get_class_based_bigram_probability``.

    Args:
        testBigrams: Dict mapping ``(w1, w2)`` to its count in the test data.
        pos_list: Tags to sum over in the class-based model.
        unigramCounts: Dict mapping word to count.
        totalUnigramCount: Total word count used by the word back-off.
        totalBigramCount: Normaliser for the test relative frequencies. For
            the weights to sum to one it must equal the sum of the values of
            ``testBigrams``.
        posUnigramCounts: Dict mapping tag to count.
        posBigramCounts: Dict mapping ``(tag, tag)`` to count.
        taggedWordCount: Dict mapping ``(word, tag)`` to count.
        d: Discount.
        totalPosUnigramCount: Total tag count used by the tag back-off.

    Returns:
        The perplexity as a float; ``math.inf`` if the model assigns zero
        probability to any test bigram.
    """
    cross_entropy = 0.0
    for bigram in testBigrams:
        rel_prob = testBigrams[bigram] / totalBigramCount
        smoothed_prob = get_class_based_bigram_probability(bigram, pos_list, unigramCounts, totalUnigramCount, posUnigramCounts, posBigramCounts, taggedWordCount, d, totalPosUnigramCount)
        if smoothed_prob <= 0:
            # log2(0) is undefined; a zero-probability event means infinite perplexity.
            return math.inf
        # Weight the base-2 log-probability, not the raw probability, so the
        # accumulated value is a cross-entropy in bits.
        cross_entropy -= rel_prob * math.log2(smoothed_prob)
    return math.pow(2, cross_entropy)

In [None]:
#Driver Code
DISCOUNT = 0.9      # discount d used by every smoothed estimate below
PREVIEW_SIZE = 20   # number of items shown when previewing large collections

corp = treebank.tagged_words()
trainset, testset = pre_process(corp)
# Show previews only: printing the full sets floods the notebook output.
print('Train Content: {}'.format(trainset[:PREVIEW_SIZE]))
print('Test Content: {}\n'.format(testset[:PREVIEW_SIZE]))

#Trainset & Testset Word Counts
trainWordCount = get_word_count(trainset)
testWordCount = get_word_count(testset)

#Trainset & Testset Word Unigrams
trainWordUnigrams = get_word_unigram_count(trainset)
testWordUnigrams = get_word_unigram_count(testset)

#Trainset & Testset Word Bigrams
trainWordBigrams = get_word_bigram_count(trainset)
testWordBigrams = get_word_bigram_count(testset)

#Trainset & Testset Pos Unigrams
trainPosUnigrams = get_pos_unigram_count(trainset)
testPosUnigrams = get_pos_unigram_count(testset)

#Trainset & Testset Pos Bigrams
trainPosBigrams = get_pos_bigram_count(trainset)
testPosBigrams = get_pos_bigram_count(testset)

print('Train Word Counts: {}'.format(list(trainWordCount.items())[:PREVIEW_SIZE]))
print('Train Word Unigram Counts: {}'.format(list(trainWordUnigrams.items())[:PREVIEW_SIZE]))
print('Train Word Bigram Counts: {}\n'.format(list(trainWordBigrams.items())[:PREVIEW_SIZE]))

#Total Unigram Word & Pos counts
trainTotalPosUnigramCount = total_count(trainPosUnigrams)
trainTotalWordUnigramCount = total_count(trainWordUnigrams)

#Total Bigram Word & Pos counts
trainTotalPosBigramCount = total_count(trainPosBigrams)
trainTotalWordBigramCount = total_count(trainWordBigrams)
testTotalWordBigramCount = total_count(testWordBigrams)
trainPosList = get_pos_list(trainset)

#Bigram probabilities for task B and D
BigramProb = []
for bigram in trainWordBigrams:
    # Pass the whole (w1, w2) key; `bigram[0]` is only the first word.
    prob = get_class_based_bigram_probability(bigram, trainPosList, trainWordUnigrams, trainTotalWordUnigramCount, trainPosUnigrams, trainPosBigrams, trainWordCount, DISCOUNT, trainTotalPosUnigramCount)
    BigramProb.append((bigram, prob))
print(BigramProb[:PREVIEW_SIZE])
print('Number of bigram probabilities: {}'.format(len(BigramProb)))

#Perplexities for task C & D
# Test relative frequencies are normalised by the *test* bigram total, and the
# result gets its own name so the `perplexity` function is not overwritten
# (which would break re-running this cell).
test_perplexity = perplexity(testWordBigrams, trainPosList, trainWordUnigrams, trainTotalWordUnigramCount, testTotalWordBigramCount, trainPosUnigrams, trainPosBigrams, trainWordCount, DISCOUNT, trainTotalPosUnigramCount)
print(test_perplexity)

**Results for task b, c & d are printed on the fly in the above code cell. Thank you!**

# Inference for task e:
The perplexity of the language model that uses POS tags is lower than that of the model that ignores them. Lower perplexity means the model assigns higher probability to the held-out test data, so the POS-based (former) model is the better predictor of the sample space.