In [66]:
from collections import Counter

### 1. Write a function that calculates the unigram probability of a word appearing in some corpus.

In [67]:
def unigramProbability(word, count_table):
  count = count_table[word]
  N = len(count_table)
  return count / N

### Importing all the required corpus

In [68]:
import nltk
nltk.download("brown")
nltk.download("webtext")
nltk.download("reuters")
nltk.download("punkt_tab")
from nltk.corpus import brown, webtext, reuters
brown_corpus = brown.sents()
brown_corpus = [" ".join(sentence) for sentence in brown_corpus]
brown_corpus = ["<s> " + sentence + " </s>" for sentence in brown_corpus][:5000]
webtext_corpus = webtext.sents()
webtext_corpus = [" ".join(sentence) for sentence in webtext_corpus]
webtext_corpus = ["<s> " + sentence + " </s>" for sentence in webtext_corpus][:5000]
reuters_corpus = reuters.sents()
reuters_corpus = [" ".join(sentence) for sentence in reuters_corpus]
reuters_corpus = ["<s> " + sentence + " </s>" for sentence in reuters_corpus][:5000]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Calculate V : unique words in vocabulary

In [69]:
def getUniqueWordsCountInVocab(corpus):
  tokens = set()

  for i in range(len(corpus)):
    temp = corpus[i].split()
    for t in temp:
      tokens.add(t)

  V = len(tokens)
  return(V)

In [70]:
V_brown = getUniqueWordsCountInVocab(brown_corpus)
print(V_brown)
V_webtext = getUniqueWordsCountInVocab(webtext_corpus)
print(V_webtext)

15086
11379


### Tokenize words of given corpus and store word count

In [71]:
def tokenizeCorpus(corpus):
  tokens = []
  for i in range(len(corpus)):
    temp = corpus[i].split()
    for t in temp:
      tokens.append(t)
  return tokens

In [72]:
brown_tokens = tokenizeCorpus(brown_corpus)
brown_wc = Counter(brown_tokens) # this the word count of brown corpus

print(brown_tokens)
print(brown_wc)

webtext_tokens = tokenizeCorpus(webtext_corpus)
webtext_wc = Counter(webtext_tokens) # this the word count of webtext corpus

print(webtext_tokens)
print(webtext_wc)



### Create Bigrams: Write a function that finds all bigrams in a sentence and returns them as a list of tuples.

In [73]:
def create_bigrams(corpus):
  bigrams = []
  corpus = list(corpus)
  for i in range(len(corpus)-1):
    bigrams.append((corpus[i], corpus[i+1])) # appends tuples to list

  # print(bigrams)
  return bigrams

### Conditional Probability (BIGRAMS):

Write a function that calculates the conditional probability of a particular word given some previous word and a list of bigram tuples. This should be done using Laplace Smoothing


I have maintained a map (cp) to store calculated values of conditional probabilities for later use


In [74]:
def conditional_probability(word1, word2, bigrams_wc, corpus_wc, V):
  count_w1w2 = bigrams_wc[(word1, word2)]
  count_w1 = corpus_wc[word1]
  return (count_w1w2+1) / (count_w1 + V) #Laplace smoothing

In [75]:
#For brown corpus
bgrams_brown = create_bigrams(brown_tokens)
bgrams_brown_wc = Counter(bgrams_brown)

In [76]:
cp_brown = {} # cp_brown -> map of conditional probabilities of bigrams

for i in bgrams_brown:
  cp_brown[i] = conditional_probability(i[0], i[1], bgrams_brown_wc, brown_wc, V_brown)

In [77]:
cp_brown

{('<s>', 'The'): 0.04027680971821169,
 ('The', 'Fulton'): 0.000125203455615375,
 ('Fulton', 'County'): 0.00046357615894039735,
 ('County', 'Grand'): 0.0001322314049586777,
 ('Grand', 'Jury'): 0.00019878081102570898,
 ('Jury', 'said'): 0.0001325556733828208,
 ('said', 'Friday'): 0.00032270556344391377,
 ('Friday', 'an'): 0.00013221392212599986,
 ('an', 'investigation'): 0.0003243593902043464,
 ('investigation', 'of'): 0.0003312355084465055,
 ('of', "Atlanta's"): 0.00010980564401010212,
 ("Atlanta's", 'recent'): 0.0001325381047051027,
 ('recent', 'primary'): 0.00013236267372600927,
 ('primary', 'election'): 0.00019863603257630935,
 ('election', 'produced'): 0.00013221392212599986,
 ('produced', '``'): 0.00013252054068380598,
 ('``', 'no'): 0.00012615908660821296,
 ('no', 'evidence'): 0.00019729054320662896,
 ('evidence', "''"): 0.00019859658413875282,
 ("''", 'that'): 0.0002528125395019593,
 ('that', 'any'): 0.00031314586334314526,
 ('any', 'irregularities'): 0.00013167423793534794,
 ('i

In [78]:
#For webtext corpus
bgrams_webtext = create_bigrams(webtext_tokens)
bgrams_webtext_wc = Counter(bgrams_webtext)

In [79]:
cp_webtext = {} # cp_webtext -> conditional probabilities of bigrams

for i in bgrams_webtext:
  cp_webtext[i] = conditional_probability(i[0], i[1], bgrams_webtext_wc, webtext_wc, V_webtext)

In [80]:
cp_webtext

{('<s>', 'Cookie'): 0.00018316136516270836,
 ('Cookie', 'Manager'): 0.0008771160424524164,
 ('Manager', ':'): 0.00025956047759127874,
 (':', '"'): 0.0022407394440165256,
 ('"', 'Don'): 0.0003726337755254136,
 ('Don', "'"): 0.0035905070496540853,
 ("'", 't'): 0.09080005644137153,
 ('t', 'allow'): 0.000552224676554118,
 ('allow', 'sites'): 0.0002624212736179146,
 ('sites', 'that'): 0.00043698654081454294,
 ('that', 'set'): 0.00025397900440230273,
 ('set', 'removed'): 0.0002610966057441253,
 ('removed', 'cookies'): 0.00026313481273572496,
 ('cookies', 'to'): 0.00026232948583420777,
 ('to', 'set'): 0.0013562709686630024,
 ('set', 'future'): 0.0002610966057441253,
 ('future', 'cookies'): 0.00026352775825720307,
 ('cookies', '"'): 0.0004372158097236796,
 ('"', 'should'): 0.0014160083469965717,
 ('should', 'stay'): 0.00033701238520515627,
 ('stay', 'checked'): 0.0005259006047856955,
 ('checked', 'When'): 0.00017540782318891423,
 ('When', 'in'): 0.0003471017007983339,
 ('in', 'full'): 0.000507

### Predict next word (BIGRAMS):

Write a function that predicts the next word in a sequence (i.e., find the bigram which yields the highest
conditional probability given a set initial word).

In [81]:
def predict_next_word(word, bigrams, cp):
  max_prob = 0
  next_word = ""

  for i in bigrams:
    if i[0] == word:
      if cp[i] > max_prob:
        max_prob = cp[i]
        next_word = i[1]

  return next_word

In [82]:
predict_next_word("took", bgrams_brown, cp_brown)

'over'

In [83]:
predict_next_word("took", bgrams_webtext, cp_webtext)

'to'

### Predict sentence (BIGRAMS):

Write a function that predicts an entire sentence by continuously finding the most probable next word given a list of bigrams and some initial sequence. Only do this up to some specified limit in length (assume the initial sequence len ≥ 1, and use a limit of 5-10 for testing).


In [84]:
def predict_sentence(initial_sentence, bgrams, limit, cp):
  sentence = initial_sentence.split()
  while len(sentence) < limit:
    next_word = predict_next_word(sentence[-1], bgrams, cp)
    sentence.append(next_word)

  return " ".join(sentence)

In [85]:
sent_b_brown = predict_sentence("<s> Fools are ", bgrams_brown, 9, cp_brown)
print(sent_b_brown)

<s> Fools are not be a year . </s>


In [86]:
sent_b_webtext = predict_sentence("<s> Fools are ", bgrams_webtext, 9, cp_webtext)
print(sent_b_webtext)

<s> Fools are not work in the page is


### Create Trigrams: Write a function that finds all trigrams in a sentence and returns them as a list of tuples.

In [87]:
def create_trigrams(corpus):
  trigrams = []
  corpus = list(corpus)
  for i in range(len(corpus)-2):
    trigrams.append((corpus[i], corpus[i+1], corpus[i+2]))

  # print(trigrams)
  return trigrams

### Calculate conditional probabilities (TRIGRAMS):

In [88]:
def conditional_probability_trigrams(word1, word2, word3, trigrams_wc, bigrams_wc, V):
  count_w1w2w3 = trigrams_wc[(word1, word2, word3)]
  count_w1w2 = bigrams_wc[(word1, word2)]
  return (count_w1w2w3+1) / (count_w1w2 + V)

In [89]:
#For brown corpus
trigrams_brown = create_trigrams(brown_tokens)
trigrams_brown_wc = Counter(trigrams_brown)

In [90]:
cp_t_brown = {} # cp_t -> conditional probabilities of trigrams

for i in trigrams_brown:
  cp_t_brown[i] = conditional_probability_trigrams(i[0], i[1],i[2], trigrams_brown_wc, bgrams_brown_wc, V_brown)

In [91]:
cp_t_brown

{('<s>', 'The', 'Fulton'): 0.00012583364791745311,
 ('The', 'Fulton', 'County'): 0.00013256445946841652,
 ('Fulton', 'County', 'Grand'): 0.00013252054068380598,
 ('County', 'Grand', 'Jury'): 0.00013256445946841652,
 ('Grand', 'Jury', 'said'): 0.0001325556733828208,
 ('Jury', 'said', 'Friday'): 0.00013256445946841652,
 ('said', 'Friday', 'an'): 0.0001325381047051027,
 ('Friday', 'an', 'investigation'): 0.00013256445946841652,
 ('an', 'investigation', 'of'): 0.0002650762094102054,
 ('investigation', 'of', "Atlanta's"): 0.0001325381047051027,
 ('of', "Atlanta's", 'recent'): 0.00013256445946841652,
 ("Atlanta's", 'recent', 'primary'): 0.00013256445946841652,
 ('recent', 'primary', 'election'): 0.00013256445946841652,
 ('primary', 'election', 'produced'): 0.0001325556733828208,
 ('election', 'produced', '``'): 0.00013256445946841652,
 ('produced', '``', 'no'): 0.00013256445946841652,
 ('``', 'no', 'evidence'): 0.00013256445946841652,
 ('no', 'evidence', "''"): 0.0001325556733828208,
 ('evid

In [92]:
#For webtext corpus
trigrams_webtext = create_trigrams(webtext_tokens)
trigrams_webtext_wc = Counter(trigrams_webtext)

In [93]:
cp_t_webtext = {} # cp_t -> conditional probabilities of trigrams

for i in trigrams_webtext:
  cp_t_webtext[i] = conditional_probability_trigrams(i[0], i[1],i[2], trigrams_webtext_wc, bgrams_webtext_wc, V_webtext)

In [94]:
cp_t_webtext

{('<s>', 'Cookie', 'Manager'): 0.00017573148229505317,
 ('Cookie', 'Manager', ':'): 0.00017562346329469617,
 ('Manager', ':', '"'): 0.00017573148229505317,
 (':', '"', 'Don'): 0.00017528483786152498,
 ('"', 'Don', "'"): 0.0004392515154177282,
 ('Don', "'", 't'): 0.0035029337069795953,
 ("'", 't', 'allow'): 0.0005527043031977891,
 ('t', 'allow', 'sites'): 0.00026350461133069827,
 ('allow', 'sites', 'that'): 0.0002635972234425797,
 ('sites', 'that', 'set'): 0.0002635509092506369,
 ('that', 'set', 'removed'): 0.0002635972234425797,
 ('set', 'removed', 'cookies'): 0.0002635972234425797,
 ('removed', 'cookies', 'to'): 0.0002635972234425797,
 ('cookies', 'to', 'set'): 0.0002635972234425797,
 ('to', 'set', 'future'): 0.00026322716504343247,
 ('set', 'future', 'cookies'): 0.0002635972234425797,
 ('future', 'cookies', '"'): 0.00017573148229505317,
 ('cookies', '"', 'should'): 0.0002635509092506369,
 ('"', 'should', 'stay'): 0.00035096955339124333,
 ('should', 'stay', 'checked'): 0.0003514320857

### Predict next words (TRIGRAMS)

In [95]:
def predict_next_word_trigrams(word, trigrams, cp_t):
  max_prob = 0
  next_word = ""

  for i in trigrams:
    if i[0] == word:
      if cp_t[i] > max_prob:
        max_prob = cp_t[i]
        next_word = i[1]

  return next_word

In [96]:
predict_next_word_trigrams("took", trigrams_brown, cp_t_brown)

'place'

In [97]:
predict_next_word_trigrams("took", trigrams_webtext, cp_t_webtext)

'to'

### Predict sentences (TRIGRAMS):

In [98]:
def predict_sentence_trigrams(initial_sentence, trigrams, limit, cp_t):
  sentence = initial_sentence.split()
  while len(sentence) < limit:
    next_word = predict_next_word_trigrams(sentence[-1], trigrams, cp_t)
    sentence.append(next_word)

  return " ".join(sentence)

In [99]:
sent_t_brown = predict_sentence_trigrams("<s> Fools are ", trigrams_brown, 9, cp_t_brown)
print(sent_t_brown)

<s> Fools are expected to attend the United Nations


In [100]:
sent_t_webtext = predict_sentence_trigrams("<s> Fools are ", trigrams_webtext, 9, cp_t_webtext)
print(sent_t_webtext)

<s> Fools are . </s> <s> ARTHUR : Oh


### Calculate Perplexity using BIGRAM models

In [101]:
import math
def perplexity_bigrams(sentence, ct, bgrams_wc, corpus_wc, V):
    bgrams_s = create_bigrams(sentence.split())

    log_prob_sum = 0  # Using log sum of probabilities instead of multiplying (multiplication was giving division by zero error)
    for i in bgrams_s:
        if i in ct:
            prob = ct[i]
        else:
            prob = conditional_probability(i[0], i[1], bgrams_wc, corpus_wc, V)

        log_prob_sum += math.log2(prob)

    perplexity = 2 ** (-log_prob_sum / len(bgrams_s))
    return perplexity

In [102]:
perplexity_bigrams(sent_b_brown, cp_brown, bgrams_brown_wc, brown_wc, V_brown)

756.9730396235207

In [103]:
perplexity_bigrams(sent_b_webtext, cp_webtext, bgrams_webtext_wc, webtext_wc, V_webtext)

452.73677982528073

### Calculate perplexity using TRIGRAM models

In [104]:
def perplexity_trigrams(sentence, cp_t, trigrams_wc, bgrams_wc, V):
    trigrams_s = create_trigrams(sentence.split())

    log_prob_sum = 0
    for i in trigrams_s:
        if i in cp_t:
            prob = cp_t[i]
        else:
            prob = conditional_probability_trigrams(i[0], i[1], i[2], trigrams_wc, bgrams_wc, V)

        log_prob_sum += math.log2(prob)

    perplexity = 2 ** (-log_prob_sum / len(trigrams_s))
    return perplexity

In [105]:
perplexity_trigrams(sent_t_brown, cp_t_brown, trigrams_brown_wc, bgrams_brown_wc, V_brown)

5858.908232448942

In [106]:
perplexity_trigrams(sent_t_webtext, cp_t_webtext, trigrams_webtext_wc, bgrams_webtext_wc, V_webtext)

530.2337982680406

### bigrams_brown

In [107]:
bgrams_brown

[('<s>', 'The'),
 ('The', 'Fulton'),
 ('Fulton', 'County'),
 ('County', 'Grand'),
 ('Grand', 'Jury'),
 ('Jury', 'said'),
 ('said', 'Friday'),
 ('Friday', 'an'),
 ('an', 'investigation'),
 ('investigation', 'of'),
 ('of', "Atlanta's"),
 ("Atlanta's", 'recent'),
 ('recent', 'primary'),
 ('primary', 'election'),
 ('election', 'produced'),
 ('produced', '``'),
 ('``', 'no'),
 ('no', 'evidence'),
 ('evidence', "''"),
 ("''", 'that'),
 ('that', 'any'),
 ('any', 'irregularities'),
 ('irregularities', 'took'),
 ('took', 'place'),
 ('place', '.'),
 ('.', '</s>'),
 ('</s>', '<s>'),
 ('<s>', 'The'),
 ('The', 'jury'),
 ('jury', 'further'),
 ('further', 'said'),
 ('said', 'in'),
 ('in', 'term-end'),
 ('term-end', 'presentments'),
 ('presentments', 'that'),
 ('that', 'the'),
 ('the', 'City'),
 ('City', 'Executive'),
 ('Executive', 'Committee'),
 ('Committee', ','),
 (',', 'which'),
 ('which', 'had'),
 ('had', 'over-all'),
 ('over-all', 'charge'),
 ('charge', 'of'),
 ('of', 'the'),
 ('the', 'election

### trigrams_brown

In [108]:
trigrams_brown

[('<s>', 'The', 'Fulton'),
 ('The', 'Fulton', 'County'),
 ('Fulton', 'County', 'Grand'),
 ('County', 'Grand', 'Jury'),
 ('Grand', 'Jury', 'said'),
 ('Jury', 'said', 'Friday'),
 ('said', 'Friday', 'an'),
 ('Friday', 'an', 'investigation'),
 ('an', 'investigation', 'of'),
 ('investigation', 'of', "Atlanta's"),
 ('of', "Atlanta's", 'recent'),
 ("Atlanta's", 'recent', 'primary'),
 ('recent', 'primary', 'election'),
 ('primary', 'election', 'produced'),
 ('election', 'produced', '``'),
 ('produced', '``', 'no'),
 ('``', 'no', 'evidence'),
 ('no', 'evidence', "''"),
 ('evidence', "''", 'that'),
 ("''", 'that', 'any'),
 ('that', 'any', 'irregularities'),
 ('any', 'irregularities', 'took'),
 ('irregularities', 'took', 'place'),
 ('took', 'place', '.'),
 ('place', '.', '</s>'),
 ('.', '</s>', '<s>'),
 ('</s>', '<s>', 'The'),
 ('<s>', 'The', 'jury'),
 ('The', 'jury', 'further'),
 ('jury', 'further', 'said'),
 ('further', 'said', 'in'),
 ('said', 'in', 'term-end'),
 ('in', 'term-end', 'presentmen

### bigrams_webtext

In [109]:
bgrams_webtext

[('<s>', 'Cookie'),
 ('Cookie', 'Manager'),
 ('Manager', ':'),
 (':', '"'),
 ('"', 'Don'),
 ('Don', "'"),
 ("'", 't'),
 ('t', 'allow'),
 ('allow', 'sites'),
 ('sites', 'that'),
 ('that', 'set'),
 ('set', 'removed'),
 ('removed', 'cookies'),
 ('cookies', 'to'),
 ('to', 'set'),
 ('set', 'future'),
 ('future', 'cookies'),
 ('cookies', '"'),
 ('"', 'should'),
 ('should', 'stay'),
 ('stay', 'checked'),
 ('checked', 'When'),
 ('When', 'in'),
 ('in', 'full'),
 ('full', 'screen'),
 ('screen', 'mode'),
 ('mode', 'Pressing'),
 ('Pressing', 'Ctrl'),
 ('Ctrl', '-'),
 ('-', 'N'),
 ('N', 'should'),
 ('should', 'open'),
 ('open', 'a'),
 ('a', 'new'),
 ('new', 'browser'),
 ('browser', 'when'),
 ('when', 'only'),
 ('only', 'download'),
 ('download', 'dialog'),
 ('dialog', 'is'),
 ('is', 'left'),
 ('left', 'open'),
 ('open', 'add'),
 ('add', 'icons'),
 ('icons', 'to'),
 ('to', 'context'),
 ('context', 'menu'),
 ('menu', 'So'),
 ('So', 'called'),
 ('called', '"'),
 ('"', 'tab'),
 ('tab', 'bar'),
 ('bar',

### trigrams_webtext

In [110]:
trigrams_webtext

[('<s>', 'Cookie', 'Manager'),
 ('Cookie', 'Manager', ':'),
 ('Manager', ':', '"'),
 (':', '"', 'Don'),
 ('"', 'Don', "'"),
 ('Don', "'", 't'),
 ("'", 't', 'allow'),
 ('t', 'allow', 'sites'),
 ('allow', 'sites', 'that'),
 ('sites', 'that', 'set'),
 ('that', 'set', 'removed'),
 ('set', 'removed', 'cookies'),
 ('removed', 'cookies', 'to'),
 ('cookies', 'to', 'set'),
 ('to', 'set', 'future'),
 ('set', 'future', 'cookies'),
 ('future', 'cookies', '"'),
 ('cookies', '"', 'should'),
 ('"', 'should', 'stay'),
 ('should', 'stay', 'checked'),
 ('stay', 'checked', 'When'),
 ('checked', 'When', 'in'),
 ('When', 'in', 'full'),
 ('in', 'full', 'screen'),
 ('full', 'screen', 'mode'),
 ('screen', 'mode', 'Pressing'),
 ('mode', 'Pressing', 'Ctrl'),
 ('Pressing', 'Ctrl', '-'),
 ('Ctrl', '-', 'N'),
 ('-', 'N', 'should'),
 ('N', 'should', 'open'),
 ('should', 'open', 'a'),
 ('open', 'a', 'new'),
 ('a', 'new', 'browser'),
 ('new', 'browser', 'when'),
 ('browser', 'when', 'only'),
 ('when', 'only', 'downlo

### Testing trained data using Brown, Webtext and Reuters corpus

In [111]:
from random import sample

In [112]:
#Brown corpus

sentence_brown = ""

# for i in range(3):
#   sentence_brown += brown_corpus[i]

sentence_brown = sample(brown_corpus, 5)


In [113]:
#Webtext corpus

sentence_webtext = ""

sentence_webtext = sample(webtext_corpus, 5)

In [114]:
perplexity_bigrams(sentence_brown[0], cp_brown, bgrams_brown_wc, brown_wc, V_brown)

2027.6832677329162

In [115]:
perplexity_bigrams(sentence_webtext[0], cp_webtext, bgrams_webtext_wc, webtext_wc, V_webtext)

324.2753109594715

In [116]:
perplexity_trigrams(sentence_brown[0], cp_t_brown, trigrams_brown_wc, bgrams_brown_wc, V_brown)

5907.899592583485

In [117]:
perplexity_trigrams(sentence_webtext[0], cp_t_webtext, trigrams_webtext, bgrams_webtext, V_webtext)

1703.6000234506382

In [118]:
#Reuters corpus

sentence_reuters = ""

sentence_reuters = sample(webtext_corpus, 25)

In [119]:
#Bigrams

perplexity_reuters_brown = []
perplexity_reuters_webtext = []

for sent in sentence_reuters:
  perplexity_reuters_brown.append(perplexity_bigrams(sent, cp_brown, bgrams_brown_wc, brown_wc, V_brown))
  perplexity_reuters_webtext.append(perplexity_bigrams(sent, cp_webtext, bgrams_webtext_wc, webtext_wc, V_webtext))

print("BIGRAMS")
print("For brown corpus - ", sum(perplexity_reuters_brown)/len(perplexity_reuters_brown))
print("For webtext corpus - ", sum(perplexity_reuters_webtext)/len(perplexity_reuters_webtext))

BIGRAMS
For brown corpus -  7184.086475265947
For webtext corpus -  874.4615084977543


In [120]:
#Trigrams

perplexity_reuters_brown = []
perplexity_reuters_webtext = []

for sent in sentence_reuters:
  perplexity_reuters_brown.append(perplexity_trigrams(sent, cp_t_brown, trigrams_brown_wc, bgrams_brown_wc, V_brown))
  perplexity_reuters_webtext.append(perplexity_trigrams(sent, cp_t_webtext, trigrams_webtext_wc, bgrams_webtext_wc, V_webtext))

print("TRIGRAMS")
print("For brown corpus - ", sum(perplexity_reuters_brown)/len(perplexity_reuters_brown))
print("For webtext corpus - ", sum(perplexity_reuters_webtext)/len(perplexity_reuters_webtext))

TRIGRAMS
For brown corpus -  14132.009282947682
For webtext corpus -  2793.3643862436434
