In [1]:
import math
import nltk
from nltk.tokenize import word_tokenize
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re

## Basic File Paths
file_path = "D:\program files\hin_wikipedia_2021_10K\hin_wikipedia_2021_10K\hin_wikipedia_2021_10K-sentences.txt"


## Opening the File in read mode ("r")
with open(file_path,encoding="utf-8") as f:
    data = f.read()

def preprocess_pipeline(data):

    # Split by newline character
    sentences = np.array(data.split('\n'))
    
    # Remove leading and trailing spaces
    sentences = np.array([s.strip() for s in sentences])
    
    # Drop Empty Sentences
    sentences = np.array([s for s in sentences if len(s) > 0])
    
    # Empty List to hold Tokenized Sentences
    tokenized = []
    
    # Iterate through sentences
    for sentence in sentences:
        # Convert to a list of words
        token = word_tokenize(sentence)
        for i,tokens in enumerate(token):
            if re.search("[a-zA-Z0-9]",tokens):
                token.pop(i)
        
        # Append to list
        tokenized.append(token)
        
    return np.array(tokenized,dtype=object)


## Pass our data to this function    
tokenized_sentences = preprocess_pipeline(data)
print("done")

done


In [2]:
## Obtain Train and Test Split 
train, test = train_test_split(tokenized_sentences, test_size=0.2, random_state=42)

## Obtain Train and Validation Split 
train, val = train_test_split(train, test_size=0.25, random_state=42)


In [3]:
def count_the_words(sentences) -> 'dict':
    
  # Creating a Dictionary of counts
  word_counts = {}

  # Iterating over sentences
  for sentence in sentences:
    
    # Iterating over Tokens
    for token in sentence:
      if re.search("[!@#$%&a-zA-Z0-9]",token) or token in ["के","में","की","है।","को","का","है","लिए","घाट"]:
        continue
      # Add count for new word
      if token not in word_counts.keys():
        word_counts[token] = 1
        
      # Increase count by one
      else:
        word_counts[token] += 1
        
  return word_counts

In [4]:
def handling_oov(tokenized_sentences, count_threshold) -> 'list':

  # Empty list for closed vocabulary
  closed_vocabulary = []

  # Obtain frequency dictionary using previously defined function
  words_count = count_the_words(tokenized_sentences)
    
  # Iterate over words and counts 
  for word, count in words_count.items():
    
    # Append if it's more(or equal) to the threshold 
    if count >= count_threshold :
      closed_vocabulary.append(word)

  return closed_vocabulary


In [5]:
def unk_tokenize(tokenized_sentences, vocabulary, unknown_token = "<unk>") -> 'list':

  # Convert Vocabulary into a set
  vocabulary = set(vocabulary)

  # Create empty list for sentences
  new_tokenized_sentences = []
  
  # Iterate over sentences
  for sentence in tokenized_sentences:

    # Iterate over sentence and add <unk> 
    # if the token is absent from the vocabulary
    new_sentence = []
    for token in sentence:
      if token in vocabulary:
        new_sentence.append(token)
      else:
        new_sentence.append(unknown_token)
    
    # Append sentece to the new list
    new_tokenized_sentences.append(new_sentence)

  return new_tokenized_sentences

In [6]:
def cleansing(train_data, test_data, count_threshold):
    
  # Get closed Vocabulary
  vocabulary = handling_oov(train_data, count_threshold)
    
  # Updated Training Dataset
  new_train_data = unk_tokenize(train_data, vocabulary)
    
  # Updated Test Dataset
  new_test_data = unk_tokenize(test_data, vocabulary)

  return new_train_data, new_test_data, vocabulary

In [7]:
min_freq = 6
final_train, final_test, vocabulary = cleansing(train, test, min_freq)

In [8]:
def count_n_grams(data, n, start_token = "<s>", end_token = "<e>") -> 'dict':

  # Empty dict for n-grams
  n_grams = {}
 
  # Iterate over all sentences in the dataset
  for sentence in data:
        
    # Append n start tokens and a single end token to the sentence
    sentence = [start_token]*n + sentence + [end_token]
    
    # Convert the sentence into a tuple
    sentence = tuple(sentence)

    # Temp var to store length from start of n-gram to end
    m = len(sentence) if n==1 else len(sentence)-1
    
    # Iterate over this length
    for i in range(m):
        
      # Get the n-gram
      n_gram = sentence[i:i+n]
    
      # Add the count of n-gram as value to our dictionary
      # IF n-gram is already present
      if n_gram in n_grams.keys():
        n_grams[n_gram] += 1
      # Add n-gram count
      else:
        n_grams[n_gram] = 1
        
  return n_grams

In [9]:
def prob_for_single_word(word, previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary_size, k = 1.0) -> 'float':

  # Convert the previous_n_gram into a tuple 
  previous_n_gram = tuple(previous_n_gram)
    
  # Calculating the count, if exists from our freq dictionary otherwise zero
  previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
  
  # The Denominator
  denom = previous_n_gram_count + k * vocabulary_size

  # previous n-gram plus the current word as a tuple
  nplus1_gram = previous_n_gram + (word,)

  # Calculating the nplus1 count, if exists from our freq dictionary otherwise zero 
  nplus1_gram_count = nplus1_gram_counts[nplus1_gram] if nplus1_gram in nplus1_gram_counts else 0

  # Numerator
  num = nplus1_gram_count + k

  # Final Fraction
  prob = num / denom
  return prob


In [10]:
def probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0) -> 'dict':

  # Convert to Tuple
  previous_n_gram = tuple(previous_n_gram)

  # Add end and unknown tokens to the vocabulary
  vocabulary = vocabulary + ["<e>", "<unk>"]

  # Calculate the size of the vocabulary
  vocabulary_size = len(vocabulary)

  # Empty dict for probabilites
  probabilities = {}

  # Iterate over words 
  for word in vocabulary:
    
    # Calculate probability
    probability = prob_for_single_word(word, previous_n_gram, 
                                           n_gram_counts, nplus1_gram_counts, 
                                           vocabulary_size, k=k)
    # Create mapping: word -> probability
    probabilities[word] = probability

  return probabilities

In [11]:
def auto_complete(previous_tokens, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0, start_with=None):

    
    # length of previous words
    n = len(list(n_gram_counts.keys())[0]) 
    
    # most recent 'n' words
    previous_n_gram = previous_tokens[-n:]
    
    # Calculate probabilty for all words
    probabilities = probs(previous_n_gram,n_gram_counts, nplus1_gram_counts,vocabulary, k=k)

    # Intialize the suggestion and max probability
    suggestion = None
    max_prob = 0

    # Iterate over all words and probabilites, returning the max.
    # We also add a check if the start_with parameter is provided
    for word, prob in probabilities.items():
        if word!="<unk>":
            if start_with != None: 
            
                if not word.startswith(start_with):
                    continue 

            if prob > max_prob: 

                suggestion = word
                max_prob = prob

    return suggestion

In [12]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):

    # See how many models we have
    count = len(n_gram_counts_list)
    
    # Empty list for suggestions
    suggestions = []
    
    # IMP: Earlier "-1"
    
    # Loop over counts
    for i in range(count-1):
        
        # get n and nplus1 counts
        n_gram_counts = n_gram_counts_list[i]
        nplus1_gram_counts = n_gram_counts_list[i+1]
        
        # get suggestions 
        suggestion = auto_complete(previous_tokens, n_gram_counts,
                                    nplus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        # Append to list
        suggestions.append(suggestion)
        
    return suggestions

In [13]:
n_gram_counts_list = []
for n in range(1, 6):
    n_model_counts = count_n_grams(final_train, n)
    n_gram_counts_list.append(n_model_counts)

In [14]:
def generating_word(sentence):
    previous_tokens = word_tokenize(sentence)
    suggestion = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
    return suggestion

In [15]:
sentence = "हमसे भरे हुए तलैया देखना"
suggestion = generating_word(sentence)
display(suggestion)

['बाद', 'बाद', 'बाद', 'बाद']

In [16]:
def estimate_probability(word: str,
                         previous_n_gram: tuple, 
                         n_gram_counts: dict,
                         n_plus1_gram_counts: dict,
                         vocabulary_size: int,
                         k: float=1.0) -> float:
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing

    Args:
       word: next word
       previous_n_gram: A sequence of words of length n
       n_gram_counts: Dictionary of counts of n-grams
       n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
       vocabulary_size: number of words in the vocabulary
       k: positive constant, smoothing parameter

    Returns:
       A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)

    n_plus1_gram = previous_n_gram + (word,)  
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)       
    return (n_plus1_gram_count + k)/(previous_n_gram_count + k * vocabulary_size)

In [17]:
def calculate_perplexity(sentence: list,
                         n_gram_counts: dict,
                         n_plus1_gram_counts: dict,
                         vocabulary_size: int,
                         k: float=1.0):
    """
    Calculate perplexity for a list of sentences

    Args:
       sentence: List of strings
       n_gram_counts: Dictionary of counts of (n+1)-grams
       n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
       vocabulary_size: number of unique words in the vocabulary
       k: Positive smoothing constant

    Returns:
       Perplexity score
    """
    # length of previous words
    n = len(list(n_gram_counts.keys())[0]) 

    # prepend <s> and append <e>
    sentence = ["<s>"] * n + sentence + ["<e>"]

    # Cast the sentence from a list to a tuple
    sentence = tuple(sentence)

    # length of sentence (after adding <s> and <e> tokens)
    N = len(sentence)

    # The variable p will hold the product
    # that is calculated inside the n-root
    # Update this in the code below
    product_pi = 1.0

    ### START CODE HERE (Replace instances of 'None' with your code) ###

    # Index t ranges from n to N - 1, inclusive on both ends
    for t in range(n, N): # complete this line

        # get the n-gram preceding the word at position t
        n_gram = sentence[t - n: t]

        # get the word at position t
        word = sentence[t]

        # Estimate the probability of the word given the n-gram
        # using the n-gram counts, n-plus1-gram counts,
        # vocabulary size, and smoothing constant
        probability = estimate_probability(
            word=word, previous_n_gram=n_gram,
            vocabulary_size=vocabulary_size,
            n_gram_counts=n_gram_counts,
            n_plus1_gram_counts=n_plus1_gram_counts, k=k)

        # Update the product of the probabilities
        # This 'product_pi' is a cumulative product 
        # of the (1/P) factors that are calculated in the loop
        product_pi *= 1/probability

    # Take the Nth root of the product
    perplexity = product_pi**(1/N)

    ### END CODE HERE ### 
    return perplexity


In [47]:
print(calculate_perplexity(final_test[90],n_gram_counts_list[0],n_gram_counts_list[1],len(vocabulary)))

67.52378170703766


In [45]:
total_perplexity = 0
for sentence in final_test:
    total_perplexity+=calculate_perplexity(sentence,n_gram_counts_list[2],n_gram_counts_list[3],len(vocabulary))
print(total_perplexity/len(final_test))

325.9235292052117
