In [4]:
import re
import numpy as np
import math
from collections import Counter, defaultdict

In [5]:
#loading dataset
with open('/content/train.txt') as f:
  train_data = f.read()

In [15]:
# train_data

In [6]:
train_data = [re.sub(r'[^a-zA-Z]+', ' ', line).lower() for line in train_data.split('\n') if len(line) > 0]

# Add start and end tokens, and split into words
train_list = [['$start'] + line.split() + ['$end'] for line in train_data]

# Create unigram list
train_unigram = [word for sentence in train_list for word in sentence]


In [7]:
#unknown word handling train
#replace words that appear less than 5 times with $unk
count_unigrams = Counter(train_unigram)
for i in range(len(train_unigram)):
  if count_unigrams[train_unigram[i]]<=5:
    train_unigram[i] = '$unk'

In [8]:
train_bigram = [train_unigram[i] + '_' + train_unigram[i+1] for i in range(len(train_unigram)-1)]
train_bigram = list(filter(lambda x: x!='$end_$start', train_bigram))

In [9]:
#unigram and bigram counts from train set
count_unigrams = Counter(train_unigram)
count_unigrams = defaultdict(lambda: 0, count_unigrams)
count_bigrams = Counter(train_bigram)
count_bigrams = defaultdict(lambda: 0, count_bigrams)

In [10]:
#Vocabulary size
V = len(count_unigrams.keys())
print("The vocabulary size ",V)
print("The length of train_unigrams ",len(train_unigram))
print("The length of train_bigrams ",len(train_bigram))
count_unigrams['$unk']

The vocabulary size  1262
The length of train_unigrams  80376
The length of train_bigrams  79864


7814

In [11]:
#unsmoothed probabilities
unsmoothed_prob_unigrams = dict(map(lambda x: (x[0], x[1]/len(train_unigram)), count_unigrams.items()))
unsmoothed_prob_bigrams = dict(map(lambda x: (x[0], x[1]/count_unigrams[x[0].split("_")[0]]), count_bigrams.items()))

In [13]:
# unsmoothed_prob_unigrams

In [14]:
# unsmoothed_prob_bigrams

In [None]:
def smoothed_uni_probability(word, k=1):
    # Count of the word + smoothing factor k
    cw = count_unigrams.get(word, 0) + k
    # Total number of unigrams in the corpus + smoothing for vocabulary size
    c = len(train_unigram) + k * V
    # Return smoothed probability
    return cw / c

# Function to calculate perplexity for unigrams
def perplexity_unigrams(x, k=1):
    p = 0
    n = len(x)  # Total number of unigrams in the input x

    for word in x:
        prob = smoothed_uni_probability(word, k)
        if prob > 0:
            p += np.log2(prob)

    p = p / n
    pp = math.pow(2, -p)
    return pp

In [None]:
# Function to calculate smoothed probability for bigrams
def smoothed_bi_probability(bigram, k=1):
    # Count of the bigram + smoothing factor k
    cw = count_bigrams.get(bigram, 0) + k
    # Get the first word of the bigram
    first_word = bigram.split("_")[0]
    # Count of the first word + smoothing for vocabulary size
    c = count_unigrams.get(first_word, 0) + k * V
    # Return smoothed probability
    return cw / c

# Function to calculate perplexity for bigrams
def perplexity_bigrams(x, k=1):
    p = 0
    n = len(x)  # Total number of bigrams in the input x

    for bigram in x:
        prob = smoothed_bi_probability(bigram, k)
        if prob > 0:
            p += np.log2(prob)

    p = p / n
    pp = math.pow(2, -p)
    return pp

In [None]:
print(smoothed_uni_probability('we', k=1))
print(smoothed_bi_probability('we',  k=1))

0.013712743775297437
0.00044033465433729633


In [None]:
print(perplexity_unigrams(train_unigram, k=1))
print(perplexity_bigrams(train_bigram, k=1))


209.16740676392953
148.26608661608694


In [None]:
with open('/content/val.txt') as f:
  val_data = f.read()

In [None]:
val_data = [re.sub(r'[^a-zA-Z]+', ' ', line).lower() for line in val_data.split('\n') if len(line) > 0]

# Add start and end tokens, and split into words
val_list = [['$start'] + line.split() + ['$end'] for line in val_data]

# Create unigram list
val_unigram = [word for sentence in val_list for word in sentence]

In [None]:
#unknown word handling in val
for i in range(len(val_unigram)):
  if val_unigram[i] not in train_unigram:
    val_unigram[i] = '$unk'

val_bigram = [val_unigram[i] + '_' + val_unigram[i+1] for i in range(len(val_unigram)-1)]
val_bigram = list(filter(lambda x: x!='$end_$start', val_bigram))

In [None]:
pp_unigram_val = []
pp_bigram_val = []

# Validation set perplexities
for k in [1, 0.5, 0.8]:
    unigram_pp = perplexity_unigrams(val_unigram, k=k)
    bigram_pp = perplexity_bigrams(val_bigram, k=k)

    # Append results
    pp_unigram_val.append(unigram_pp)
    pp_bigram_val.append(bigram_pp)

    # Print the results
    print(f"The Perplexity of Unigram-Validation for k={k}: {unigram_pp}")
    print(f"The Perplexity of Bigram-Validation for k={k}: {bigram_pp}")

The Perplexity of Unigram-Validation for k=1: 205.56688506739232
The Perplexity of Bigram-Validation for k=1: 177.64155153350805
The Perplexity of Unigram-Validation for k=0.5: 205.4374918102955
The Perplexity of Bigram-Validation for k=0.5: 141.2661179880533
The Perplexity of Unigram-Validation for k=0.8: 205.51184093893193
The Perplexity of Bigram-Validation for k=0.8: 164.38273828454047


In [None]:
pp_unigram_train = []
pp_bigram_train = []

# Train set perplexities
for k in [1, 0.5, 0.8]:
    # Compute perplexities
    unigram_pp = perplexity_unigrams(train_unigram, k=k)
    bigram_pp = perplexity_bigrams(train_bigram, k=k)

    # Append results to lists
    pp_unigram_train.append(unigram_pp)
    pp_bigram_train.append(bigram_pp)

    # Print the perplexity for the current smoothing factor
    print(f"Perplexity of Unigram-Train for k={k}: {unigram_pp}")
    print(f"Perplexity of Bigram-Train for k={k}: {bigram_pp}")

Perplexity of Unigram-Train for k=1: 209.16740676392953
Perplexity of Bigram-Train for k=1: 148.26608661608694
Perplexity of Unigram-Train for k=0.5: 209.11499725767476
Perplexity of Bigram-Train for k=0.5: 108.4268781205447
Perplexity of Unigram-Train for k=0.8: 209.14259196505088
Perplexity of Bigram-Train for k=0.8: 133.81461000623858


In [None]:
pp_unigram_train

[209.16740676392953, 209.11499725767476, 209.14259196505088]

In [None]:
pp_unigram_val

[205.56688506739232, 205.4374918102955, 205.51184093893193]

In [None]:
pp_bigram_train

[148.26608661608694, 108.4268781205447, 133.81461000623858]

In [None]:
pp_bigram_val

[177.64155153350805, 141.2661179880533, 164.38273828454047]