In [None]:
import nltk
nltk.download('reuters') # one time execution
nltk.download('punkt')  # one time execution

from nltk.corpus import reuters
import math
import random

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Peek at the data: the first 260 characters of the raw corpus text
reuters.raw()[:260]

"ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n  Mounting trade friction between the\n  U.S. And Japan has raised fears among many of Asia's exporting\n  nations that the row could inflict far-reaching economic\n  damage, businessmen and officials said.\n      T"

In [None]:
# get the corpus as a sequence of sentences (each sentence is a list of tokens)
dataset_sents = reuters.sents()
# show the sentence at index 1, i.e. the second sentence of the corpus
dataset_sents[1]

['They',
 'told',
 'Reuter',
 'correspondents',
 'in',
 'Asian',
 'capitals',
 'a',
 'U',
 '.',
 'S',
 '.',
 'Move',
 'against',
 'Japan',
 'might',
 'boost',
 'protectionist',
 'sentiment',
 'in',
 'the',
 'U',
 '.',
 'S',
 '.',
 'And',
 'lead',
 'to',
 'curbs',
 'on',
 'American',
 'imports',
 'of',
 'their',
 'products',
 '.']

In [None]:
# total number of sentences in the corpus
len(dataset_sents)

54716

In [None]:
# get the corpus as one flat sequence of tokens
dataset_words = reuters.words()
# show the first 20 tokens
dataset_words[:20]

['ASIAN',
 'EXPORTERS',
 'FEAR',
 'DAMAGE',
 'FROM',
 'U',
 '.',
 'S',
 '.-',
 'JAPAN',
 'RIFT',
 'Mounting',
 'trade',
 'friction',
 'between',
 'the',
 'U',
 '.',
 'S',
 '.']

In [None]:
# total number of tokens in the corpus (repeats included)
len(dataset_words)

1720901

In [None]:
# size of vocabulary: number of distinct tokens (comparison is case-sensitive)
len(set(dataset_words))

41600

In [None]:
# Positional hold-out split: the first 40000 sentences are the training set,
# every sentence after that is the test set
data_sents = dataset_sents[:40000]
data_sents_test = dataset_sents[40000:]

In [None]:
# number of words in train data: token count summed over all training sentences
num_words = sum(len(sentence) for sentence in data_sents)
num_words

1262448

In [None]:
# create two lists containing words: cut the flat token stream after the first
# `num_words` tokens so it lines up with the sentence-level train/test split
# NOTE(review): assumes the tokens of the training sentences are exactly the
# first `num_words` entries of `dataset_words` - confirm against the corpus reader
data_words_train = dataset_words[:num_words]
data_words_test = dataset_words[num_words:]

In [None]:
def createBigram(data):
    """Collect adjacent-token pairs and frequency tables from a token sequence.

    Args:
        data: sliceable sequence of tokens (e.g. a list of strings).

    Returns:
        A 3-tuple ``(listOfBigrams, unigramCounts, bigramCounts)``:
        * listOfBigrams - every adjacent pair ``(data[i], data[i + 1])`` in
          order of appearance, duplicates included;
        * unigramCounts - dict mapping each token to its number of occurrences;
        * bigramCounts - dict mapping each pair to its number of occurrences.
    """
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}

    # Pair every token with its successor; the last token has no successor,
    # so zip() stops one element short of the end.
    for pair in zip(data, data[1:]):
        listOfBigrams.append(pair)
        bigramCounts[pair] = bigramCounts.get(pair, 0) + 1

    # Every token, including the last one, contributes to the unigram table.
    for token in data:
        unigramCounts[token] = unigramCounts.get(token, 0) + 1

    return listOfBigrams, unigramCounts, bigramCounts

In [None]:
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Compute unsmoothed bigram probabilities and dump them to a text file.

    The probability of a bigram ``(w1, w2)`` is ``count(w1, w2) / count(w1)``.

    Args:
        listOfBigrams: iterable of ``(w1, w2)`` tuples; duplicates are allowed.
        unigramCounts: dict mapping each token to its count.
        bigramCounts: dict mapping each bigram to its count.

    Returns:
        dict mapping each bigram in ``listOfBigrams`` to its probability.

    Raises:
        KeyError: if a bigram, or its first word, is missing from the count
            tables.

    Side effects:
        Overwrites ``bigramProb.txt`` in the current working directory with a
        header line followed by one line per entry of ``listOfBigrams``
        (repeated bigrams therefore produce repeated lines).
    """
    listOfProb = {}
    for bigram in listOfBigrams:
        listOfProb[bigram] = bigramCounts[bigram] / unigramCounts[bigram[0]]

    # The context manager closes the file even if a write raises part-way.
    with open('bigramProb.txt', 'w') as out:
        out.write('Bigram' + '\t\t\t' + 'Count' + '\t' + 'Probability' + '\n')
        for bigram in listOfBigrams:
            out.write(str(bigram) + ' : ' + str(bigramCounts[bigram]) + ' : ' + str(listOfProb[bigram]) + '\n')

    return listOfProb

In [None]:
def bigramWithAddOneSmoothing(listOfBigrams, unigramCounts, bigramCounts):
    """Compute add-one (Laplace) smoothed bigram probabilities and counts.

    With ``V = len(unigramCounts)`` the smoothed probability of ``(w1, w2)`` is
    ``(count(w1, w2) + 1) / (count(w1) + V)`` and the reconstructed count is
    ``(count(w1, w2) + 1) * count(w1) / (count(w1) + V)``.

    Args:
        listOfBigrams: iterable of ``(w1, w2)`` tuples; duplicates are allowed.
        unigramCounts: dict mapping each token to its count.
        bigramCounts: dict mapping each bigram to its count.

    Returns:
        A 2-tuple ``(listOfProb, cStar)`` of dicts keyed by bigram: the
        smoothed probabilities and the reconstructed counts.

    Raises:
        KeyError: if a bigram, or its first word, is missing from the count
            tables.

    Side effects:
        Overwrites ``addOneSmoothing.txt`` in the current working directory
        with a header line followed by one line per entry of
        ``listOfBigrams``; the "Count" column holds the raw bigram count.
    """
    listOfProb = {}
    cStar = {}

    # The vocabulary size does not change inside the loop, so compute it once.
    vocabSize = len(unigramCounts)

    for bigram in listOfBigrams:
        contextCount = unigramCounts[bigram[0]]
        smoothedCount = bigramCounts[bigram] + 1
        listOfProb[bigram] = smoothedCount / (contextCount + vocabSize)
        cStar[bigram] = smoothedCount * contextCount / (contextCount + vocabSize)

    # The context manager closes the file even if a write raises part-way.
    with open('addOneSmoothing.txt', 'w') as out:
        out.write('Bigram' + '\t\t\t' + 'Count' + '\t' + 'Probability' + '\n')
        for bigram in listOfBigrams:
            out.write(str(bigram) + ' : ' + str(bigramCounts[bigram])
                      + ' : ' + str(listOfProb[bigram]) + '\n')

    return listOfProb, cStar

In [None]:
# Main Program: build both bigram models from the training tokens

# Create a list of bigrams and get frequencies of unigrams and bigrams
listOfBigrams, unigramCounts, bigramCounts = createBigram(data_words_train)

# Calculate unsmoothed bigram probabilities (this also writes bigramProb.txt)
bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

# Apply Add-1 Smoothing and calculate probabilities and get reconstructed count of bigrams
# (this also writes addOneSmoothing.txt)
bigramAddOne, addOneCstar = bigramWithAddOneSmoothing(listOfBigrams, unigramCounts, bigramCounts)

In [None]:
# Sentence to score (note: the name `input` shadows the Python builtin)
input = 'we must be very careful'

# list to store bigrams: each whitespace-separated word paired with its successor
tokens = input.split()
inputList = list(zip(tokens, tokens[1:]))
inputList

[('we', 'must'), ('must', 'be'), ('be', 'very'), ('very', 'careful')]

In [None]:
# Score the input sentence with the unsmoothed bigram model and write a
# per-bigram report to bigramProb-OUTPUT.txt

# initial probability of a sentence
outputProb1 = 1

# Open a file to write output; the `with` block guarantees the report is
# flushed and closed, even if a lookup fails part-way through
with open('bigramProb-OUTPUT.txt', 'w') as output1:
    output1.write('Bigram\t\t\t\t' + 'Count\t\t\t\t' + 'Probability\n\n')

    for bigram in inputList:
        if bigram in bigramProb:
            # bigram is present in the model: write it with its count and
            # probability, then multiply the probability into the running product
            output1.write(str(bigram) + '\t\t' + str(bigramCounts[bigram]) + '\t\t' + str(bigramProb[bigram]) + '\n')
            outputProb1 *= bigramProb[bigram]
        else:
            # bigram is not present in the model: the sentence probability is zero
            output1.write(str(bigram) + '\t\t\t' + str(0) + '\t\t\t' + str(0) + '\n')
            outputProb1 *= 0

    output1.write('\n' + 'Probablility = ' + str(outputProb1))

outputProb1

2.8076159793430567e-07

In [None]:
# Score the input sentence with the add-one smoothed bigram model and write a
# per-bigram report to addOneSmoothing-OUTPUT.txt

# initial probability of a sentence
outputProb2 = 1

# vocabulary size used by add-one smoothing; scoring must not change the model,
# so unseen words are looked up with a default count instead of being inserted
vocabSize = len(unigramCounts)

# Open a file to write output; the `with` block guarantees the report is
# flushed and closed, even if a lookup fails part-way through
with open('addOneSmoothing-OUTPUT.txt', 'w') as output2:
    output2.write('Bigram\t\t\t\t' + 'Count\t\t\t\t' + 'Probability\n\n')

    for bigram in inputList:
        if bigram in bigramAddOne:
            # bigram was seen in training: use the precomputed smoothed values
            prob = bigramAddOne[bigram]
            reconstructedCount = addOneCstar[bigram]
            separator = '\t\t'
        else:
            # unseen bigram: its raw count is 0, so add-one smoothing gives
            # (0 + 1) / (count(first word) + V); an unseen first word counts as 0
            contextCount = unigramCounts.get(bigram[0], 0)
            prob = 1 / (contextCount + vocabSize)
            # reconstructed count for the bigram
            reconstructedCount = 1 * contextCount / (contextCount + vocabSize)
            separator = '\t'

        # Update probability of the sentence
        outputProb2 *= prob

        output2.write(str(bigram) + separator + str(reconstructedCount) + separator + str(prob) + '\n')

    output2.write('\n' + 'Probablility = ' + str(outputProb2))

outputProb2

4.2254340301928457e-14

In [None]:
# Side-by-side summary of the two models for the same sentence

# input sentence
print(input)

# list of bigrams extracted from the input sentence
print(inputList)

# sentence probability given by the simple (unsmoothed) bigram model
print ('Bigram Model: ', outputProb1)

# sentence probability given by the bigram model with add-1 smoothing
print ('Add One: ', outputProb2)

we must be very careful
[('we', 'must'), ('must', 'be'), ('be', 'very'), ('very', 'careful')]
Bigram Model:  2.8076159793430567e-07
Add One:  4.2254340301928457e-14


In [None]:
def sentence_prob_with_next_word(next_word):
    """Return the add-one smoothed probability of `next_word` following the
    last word of the notebook-global sentence `input`.

    Reads the globals `input`, `bigramAddOne` and `unigramCounts`; none of
    them is modified, so repeated calls return the same value.

    Args:
        next_word: candidate word to append to the sentence.

    Returns:
        The precomputed smoothed probability when the bigram
        ``(last word, next_word)`` was seen in training; otherwise
        ``1 / (count(last word) + V)`` with ``V = len(unigramCounts)``,
        where a last word never seen in training has count 0.

    Raises:
        IndexError: if `input` contains no words.
    """
    context = input.split()[-1]
    new_bigram = (context, next_word)

    if new_bigram in bigramAddOne:
        return bigramAddOne[new_bigram]

    # Unseen bigram: raw count 0, so add-one smoothing yields 1 / (c(w1) + V).
    return 1 / (unigramCounts.get(context, 0) + len(unigramCounts))

In [None]:
# Sentence prefix to extend and the candidate next words
input = 'the investors are'
possible_words = ['cheated', 'happy', 'smart', 'afraid']

# bigrams of the prefix: each word paired with its successor
tokens = input.split()
inputList = list(zip(tokens, tokens[1:]))

# add-one smoothed probability of the prefix; the model tables are only read,
# never modified, so re-running this cell gives the same result
outputProb = 1
vocabSize = len(unigramCounts)

for bigram in inputList:
    if bigram in bigramAddOne:
        outputProb *= bigramAddOne[bigram]
    else:
        # unseen bigram: 1 / (count(first word) + V); an unseen first word counts as 0
        outputProb *= 1 / (unigramCounts.get(bigram[0], 0) + vocabSize)

In [None]:
# Choose the candidate word that gives the highest probability for the extended sentence
max_prob = 0
# stays -1 when no candidate scores above 0; indexing with -1 below then
# selects the last entry of possible_words
index_of_next_word = -1
for i, word in enumerate(possible_words):
  # probability of the prefix multiplied by the probability of `word` following it
  final_prob = outputProb * sentence_prob_with_next_word(word)
  # strict comparison: on a tie the earlier candidate is kept
  if final_prob > max_prob:
    max_prob = final_prob
    index_of_next_word = i

print('Next Word:', possible_words[index_of_next_word])
print('Output Sentece:', input, possible_words[index_of_next_word])

Next Word: happy
Output Sentece: the investors are happy


In [None]:
# Compare a well-formed sentence with a scrambled ordering of the same words
input1 = 'the market is very happy these days'
input2 = 'market is the happy these very days'


def _add_one_bigram_product(bigrams):
    """Multiply the add-one smoothed probabilities of the given bigrams.

    Only reads `bigramAddOne` and `unigramCounts`; the model is not modified.
    """
    vocab_size = len(unigramCounts)
    product = 1
    for bigram in bigrams:
        if bigram in bigramAddOne:
            product *= bigramAddOne[bigram]
        else:
            # unseen bigram: 1 / (count(first word) + V); an unseen first word counts as 0
            product *= 1 / (unigramCounts.get(bigram[0], 0) + vocab_size)
    return product


# bigrams of each sentence: every word paired with its successor
tokens1 = input1.split()
tokens2 = input2.split()
inputList1 = list(zip(tokens1, tokens1[1:]))
inputList2 = list(zip(tokens2, tokens2[1:]))

outputProb1 = _add_one_bigram_product(inputList1)
outputProb2 = _add_one_bigram_product(inputList2)

print (input1, ':', outputProb1)
print (input2, ':', outputProb2)

the market is very happy these days : 3.0787233025784113e-22
market is the happy these very days : 2.5840603259051406e-24


In [None]:
def calculate_bigram_sentence_probability(input):
    """Return the add-one smoothed bigram probability of a tokenised sentence.

    Reads the globals `bigramAddOne` and `unigramCounts` without modifying
    them, so the result does not depend on which sentences were scored before.

    Args:
        input: sliceable sequence of tokens (e.g. a list of strings). Passing
            a plain string would pair up its characters, not its words.

    Returns:
        The product over all adjacent token pairs of the smoothed bigram
        probability. Pairs seen in training use the precomputed value; unseen
        pairs use ``1 / (count(first word) + V)`` with
        ``V = len(unigramCounts)``, where an unseen first word has count 0.
        Sequences with fewer than two tokens yield 1. For long sentences the
        product can underflow to 0.0.
    """
    vocab_size = len(unigramCounts)
    outputProb = 1

    for bigram in zip(input, input[1:]):
        if bigram in bigramAddOne:
            outputProb *= bigramAddOne[bigram]
        else:
            outputProb *= 1 / (unigramCounts.get(bigram[0], 0) + vocab_size)

    return outputProb

In [None]:
def calculate_number_of_bigrams(sentences):
    """Return the total number of adjacent-token pairs across `sentences`.

    Each sentence contributes ``len(sentence) - 1`` to the total (so an empty
    sentence contributes -1).
    """
    return sum(len(sentence) - 1 for sentence in sentences)

In [None]:
def calculate_bigram_perplexity(model, sentences):
    """Return the per-bigram perplexity of `sentences`.

    Perplexity is ``2 ** (-(1 / N) * sum(log2 p(sentence)))`` where ``N`` is
    the total number of bigrams. The logarithm is taken in base 2 so that it
    matches the base of the final exponentiation.

    Args:
        model: unused; kept so existing calls keep working. Sentence
            probabilities come from `calculate_bigram_sentence_probability`.
        sentences: sequence of tokenised sentences; it is traversed twice
            (once to count bigrams, once to score), so a one-shot iterator
            will not work.

    Returns:
        The perplexity as a float.

    Raises:
        ZeroDivisionError: if `sentences` contains no bigrams in total.
    """
    number_of_bigrams = calculate_number_of_bigrams(sentences)
    negative_log2_sum = 0
    for sentence in sentences:
        p = calculate_bigram_sentence_probability(sentence)
        # A probability of exactly 0.0 has no finite logarithm; such sentences
        # add nothing to the sum, although their bigrams still count towards
        # the denominator.
        if p != 0.0:
            negative_log2_sum -= math.log2(p)
    return math.pow(2, negative_log2_sum / number_of_bigrams)

In [None]:
# Report the perplexity of the add-one smoothed model on the training and test sentences
print("PERPLEXITY over Training Data:", calculate_bigram_perplexity(bigramAddOne, data_sents))
print("PERPLEXITY over Test Data:", calculate_bigram_perplexity(bigramAddOne, data_sents_test))

PERPLEXITY over Training Data: 137.07939217258775
PERPLEXITY over Test Data: 168.93416630740222


In [None]:
# Generate text by repeatedly sampling the next word from the unsmoothed
# bigram distribution of the current last word

# initial word
text = ["there"]

sentence_finished = False

while not sentence_finished:
    # draw a random threshold; the first successor whose cumulative
    # probability reaches it is the sampled word
    r = random.random()
    accumulator = 0.0
    next_word = None
    last_candidate = None

    for pair in bigramProb.keys():
        if pair[0] == text[-1]:
            last_candidate = pair[1]
            accumulator += bigramProb[pair]
            if accumulator >= r:
                next_word = pair[1]
                break

    # floating-point rounding can leave the cumulative sum just below the
    # threshold; in that case fall back to the last successor seen
    if next_word is None:
        next_word = last_candidate

    if next_word is None:
        # the current word has no successor in the model, so no word can ever
        # be appended; stop here instead of looping forever
        sentence_finished = True
    else:
        text.append(next_word)

    if text[-1] == 'None':
        sentence_finished = True
    # cap the generated text at 21 tokens
    if len(text) > 20:
        sentence_finished = True

print (' '.join([t for t in text if t]))

there are liberalised 20 Record April 7 mln vs loss of a special meeting in December 1986 net loss 7 ,
