# This file is for LM_1_corpus

# Data Loading

Importing all required Libraries

In [103]:
import re
import pandas as pd
import numpy as np
import random
import zipfile
import nltk
nltk.data.path.append('.')

Loading data from LM_corpus and storing in document named variable

In [96]:
import os
import PyPDF2

dataset_dir = 'LM_1_Corpus'

def read_pdf_file(filename):
    with open(filename, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ''
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
        return text

documents = []
for filename in os.listdir(dataset_dir):
    if filename.endswith('.pdf'):
        filepath = os.path.join(dataset_dir, filename)
        text = read_pdf_file(filepath)
        documents.append(text)
print("Total Number of Documents:", len(documents))

Total Number of Documents: 6


In [97]:
print("Data type:", type(documents))
print(documents)

Data type: <class 'list'>
['T urk J Elec Eng & Comp Sci\n(2019) 27: 3665 – 3681\n© TÜBİT AK\ndoi:10.3906/elk-1806-132\nTurkish Journal of Electrical Engineering & Computer Sciences \nhttp://journals.tubitak.gov.tr/elektrik/\nResearch Article \nIncremental author name disambiguation using author profile models and\nself-citations\nIjaz HUSSAIN∗, Sohail ASGHAR\nDepartment of Computer Science, COMSA TS University Islamabad, Islamabad, Pakistan\nReceived: 18.06.2018 • Accepted/Published Online: 05.06.2019 • Final V ersion: 18.09.2019\nAbstract: Author name ambiguity in bibliographic databases (BDs) such as DBLP is a challenging problem that\ndegrades the information retrieval quality , citation analysis, and proper attribution to the authors. It occurs when\nseveral authors have the same name (homonym) or when an author publishes under several name variants (synonym).\nT raditionally , much research has been conducted to disambiguate whole bibliographic database at once whenever some\nnew 

In [98]:
# concatenate all strings in documents to create a single string
LM_corpus = ' '.join(documents)
type(LM_corpus)


str

# Data Pre-Processing 

### Function for spliting data into sentences

This function will split data into senetences on the basis on new line i.e '\n'

In [109]:
def split_to_sentences(Data):
    sentences = Data.split('\n')
    # Removing leading and trailing spaces dropping empty strings
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences  

### Function for tokenizing the sentences

This function will tokenize the above splited sentences.

In [113]:
def tokenize_sentences(sentences_list):
    # Tokenizes a list of sentences into a list of lists of tokens
    # Initialize an empty list to hold tokenized sentences
    tokenized_sentences = []
    
    # Loop through each sentence in the list of sentences
    for sentence in sentences_list:
        # Convert the sentence to lowercase
        lowercase_sentence = sentence.lower()
        # Tokenize the lowercase sentence into a list of tokens
        tokenized_sentence = nltk.word_tokenize(lowercase_sentence)
        # Append the tokenized sentence to the list of tokenized sentences
        tokenized_sentences.append(tokenized_sentence)
        
    return tokenized_sentences


Combining both above functions into one function and then testing the whole function by givivg a test string

In [114]:
def get_tokenized_data(data):
    
    # Split the input data into sentences
    sentences = split_to_sentences(data)
    
    # Tokenize each sentence into a list of words
    tokenized_sentences = tokenize_sentences(sentences)
    
    # Return the list of tokenized sentences
    return tokenized_sentences

In [115]:
x = """ My name is Ahmad Faraz.\n I am from Lahore. \n I love Cricket."""
print(x)
verify = get_tokenized_data(x)
print(verify)

 My name is Ahmad Faraz.
 I am from Lahore. 
 I love Cricket.
[['my', 'name', 'is', 'ahmad', 'faraz', '.'], ['i', 'am', 'from', 'lahore', '.'], ['i', 'love', 'cricket', '.']]


### Splitining the data


In [116]:
# Get tokenized data from the LM corpus
tokenized_data = get_tokenized_data(LM_corpus)

# Shuffle the data and split into train and test sets
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8) # Set the size of the training set to be 80% of the data
train_data = tokenized_data[0:train_size] # Select the first 80% of the data for training
test_data = tokenized_data[train_size:] # Select the remaining 20% of the data for testing

In [119]:
# Print the total number of sentences in the tokenized data along with
# the number of sentences in the training and test sets.
print("Total sentences: {}\nTrain set: {}\nTest set: {}".format(len(tokenized_data), len(train_data), len(test_data)))

# Print the first training sample.
print("First training sample:")
print(train_data[0])

# Print the first test sample.
print("First test sample:")
print(test_data[0])

Total sentences: 4902
Train set: 3921
Test set: 981
First training sample:
['in', 'digital', 'library', '.', 'ieee', 'transactions', 'on', 'knowledge', 'and']
First test sample:
['conﬂictory', 'names', 'from', 'the', 'results', 'of', 'the', 'previous', 'stage', '.', 'in', 'the', 'last', ',', 'post-processing', 'stage', ',', 'the', 'uncon']


### Function for word count

This function will loop through each tokenized sentence and check if the word is not in the word count array then set its value to 1 esle add 1 to its count

In [120]:
def count_words(tokenized_sentences):
    
    # Create an empty dictionary to store the word counts
    word_counts = {}
    
    # Loop through each sentence in the tokenized sentences
    for sentence in tokenized_sentences:
        
        # Loop through each word in the sentence
        for word in sentence:

            # If the word is not already in the dictionary, add it with a count of 1
            if word not in word_counts.keys(): 
                word_counts[word] = 1
            
            # If the word is already in the dictionary, increment its count by 1
            else:
                word_counts[word] += 1
    
    return word_counts


In [121]:
# test your code
count_words(verify)

{'my': 1,
 'name': 1,
 'is': 1,
 'ahmad': 1,
 'faraz': 1,
 '.': 3,
 'i': 2,
 'am': 1,
 'from': 1,
 'lahore': 1,
 'love': 1,
 'cricket': 1}

### Function for threshold count

In [122]:
def Nplus_freq_words(tokenized_sentences, count_threshold):
    
    # Initialize an empty list to contain the words that appear at least 'count_threshold' times.
    closed_vocab = []
    
    # Get the word counts of the tokenized sentences using the count_words function.
    word_counts = count_words(tokenized_sentences)
    
    # Iterate over each word and its count in the word_counts dictionary.
    for word, count in word_counts.items():
        
        # Check that the word's count is at least as great as the minimum count threshold.
        if count >= count_threshold:
            
            # Append the word to the closed_vocab list.
            closed_vocab.append(word)
    
    return closed_vocab


In [123]:
# test your code
tmp_closed_vocab = Nplus_freq_words(verify, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'i']


### Function for replacing OOV with unk token 

In [48]:
def OOV_words_replcaement(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    
    # Convert the vocabulary into a set for faster search
    vocabulary_set = set(vocabulary)
    
    replaced_sentences = []
    
    # Go through each sentence
    for sentence in tokenized_sentences:
        
        replaced_sentence = []
        
        # for each token in the sentence
        for token in sentence:
            
            # Check if the token is in the vocabulary
            if token in vocabulary_set: 
                
                # If yes , add  word to the replaced_sentence
                replaced_sentence.append(token)
            
            else:
                # otherwise, add unknown token instead
                replaced_sentence.append(unknown_token)
        
        # Append the list of tokens to the list of replaced sentences
        replaced_sentences.append(replaced_sentence)
        
    return replaced_sentences


### Function for Pre-Process Data 

In [124]:
def preprocess_data(train_data, test_data, count_threshold):

    # Get the closed vocabulary using the train data
    closed_vocab = Nplus_freq_words(train_data, count_threshold)
    
    # For the train data, replace less frequent words with "<unk>"
    train_data_replaced = OOV_words_replcaement(train_data, closed_vocab)
    
    # For the test data, replace less frequent words with "<unk>"
    test_data_replaced = OOV_words_replcaement(test_data, closed_vocab)
    
    # Return the preprocessed train and test data, as well as the closed vocabulary
    return train_data_replaced, test_data_replaced, closed_vocab


In [53]:
# Set the minimum frequency count for words to be included in the vocabulary
min_word_freq = 2

# Preprocess the train and test data
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        min_word_freq)

In [127]:
print(f"First preprocessed training sample: {train_data_processed[0]}\n")
print(f"First preprocessed test sample: {test_data_processed[0]}\n")
print(f"First 10 vocabulary: {vocabulary[:10]}\n")
print(f"Size of vocabulary: {len(vocabulary)}")


First preprocessed training sample: ['in', 'digital', 'library', '.', 'ieee', 'transactions', 'on', 'knowledge', 'and']

First preprocessed test sample: ['<unk>', 'names', 'from', 'the', 'results', 'of', 'the', 'previous', 'stage', '.', 'in', 'the', 'last', ',', '<unk>', 'stage', ',', 'the', '<unk>']

First 10 vocabulary: ['in', 'digital', 'library', '.', 'ieee', 'transactions', 'on', 'knowledge', 'and', 'shannon']

Size of vocabulary: 2613


# Building n-gram Model

### Function for Counting n-grams 

In [138]:
def N_Grams_Count(data, n, start_token='<s>', end_token = '<e>'):

    # Initialize an empty dictionary to hold the n-grams and their counts
    n_grams = {}

    # Iterate over each sentence in the data
    for sentence in data:
        
        # Add the start token n times to the beginning of the sentence and the end token once to the end
        sentence = [start_token] * n + sentence + [end_token]
        
        # Convert the list of words into a tuple so that it can be used as a key in the dictionary
        sentence = tuple(sentence)
        
        # Use 'i' to indicate the start of the n-gram from index 0 to the last index
        # where the end of the n-gram is within the sentence.
        # If n is 1, then we only need to iterate over each word in the sentence.
        m = len(sentence) if n==1 else len(sentence)-1
        for i in range(m):

            # Get the n-gram from the sentence
            n_gram = sentence[i:i+n]

            # Check if the n-gram is in the dictionary
            if n_gram in n_grams.keys():
                
                # If the n-gram already exists in the dictionary, increment its count
                n_grams[n_gram] += 1
            else:
                # If the n-gram is not yet in the dictionary, add it and set its count to 1
                n_grams[n_gram] = 1
    
    return n_grams


In [133]:
# testing the code
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(N_Grams_Count(sentences, 1))
print("Bi-gram:")
print(N_Grams_Count(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


### Function for estimating probability of each sentence 

In [141]:
def Probablity_estimation(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    # Convert previous n-gram to a tuple to use as a dictionary key
    previous_n_gram = tuple(previous_n_gram)

    # Get the count of the previous n-gram from the n-gram counts dictionary, or set it to 0 if not found
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)

    # Calculate the denominator using the count of the previous n-gram and applying k-smoothing
    denominator = previous_n_gram_count + k * vocabulary_size

    # Create an n plus 1 gram by concatenating the previous n-gram and the current word
    n_plus1_gram = previous_n_gram + (word,)

    # Get the count of the n plus 1 gram from the n plus 1 gram counts dictionary, or set it to 0 if not found
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)

    # Calculate the numerator using the count of the n plus 1 gram and applying smoothing
    numerator = n_plus1_gram_count + k

    # Calculate the probability as the numerator divided by the denominator
    probability = numerator / denominator

    return probability


In [203]:
sentences = [['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
             ['the', 'lazy', 'dog', 'is', 'owned', 'by', 'john']]

unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = N_Grams_Count(sentences, 1)
bigram_counts = N_Grams_Count(sentences, 2)

# Estimate the probability of the word 'dog' given the previous n-gram 'the lazy'
tmp_prob = Probablity_estimation("dog", "the lazy", unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'dog' given the previous n-gram 'the lazy' is: {tmp_prob:.4f}")


The estimated probability of word 'dog' given the previous n-gram 'the lazy' is: 0.0833


### Function for estimating probavilities of all words 

In [146]:
def All_Word_Probalities_Estimation(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):

    # Convert list to tuple to use it as a dictionary key
    previous_n_gram = tuple(previous_n_gram)

    # Add <e> and <unk> to the vocabulary
    vocabulary += ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)

    probabilities = {}
    for word in vocabulary:
        # Estimate the probability of the word given the previous n-gram
        probability = Probablity_estimation(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0)
        probabilities[word] = probability

    return probabilities


In [227]:
sentences = [['i', 'love', 'chocolate', 'cake'],
             ['she', 'likes', 'vanilla', 'ice', 'cream'],
             ['he', 'hates', 'spicy', 'food']]
unique_words = list(set(sentences[0] + sentences[1] + sentences[2]))
unigram_counts = N_Grams_Count(sentences, 1)
bigram_counts = N_Grams_Count(sentences, 2)
All_Word_Probalities_Estimation("likes", unigram_counts, bigram_counts, unique_words, k=1)


{'cream': 0.06666666666666667,
 'cake': 0.06666666666666667,
 'food': 0.06666666666666667,
 'hates': 0.06666666666666667,
 'love': 0.06666666666666667,
 'she': 0.06666666666666667,
 'spicy': 0.06666666666666667,
 'he': 0.06666666666666667,
 'i': 0.06666666666666667,
 'likes': 0.06666666666666667,
 'ice': 0.06666666666666667,
 'vanilla': 0.06666666666666667,
 'chocolate': 0.06666666666666667,
 '<e>': 0.06666666666666667,
 '<unk>': 0.06666666666666667}

# Sentence Generation Part 

### Function for suggeting a word 

In [154]:
def Word_suggestion(Previous_words, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    # Determine the n-gram order by getting the length of the first key in the n-gram counts dictionary
    n = len(list(n_gram_counts.keys())[0]) 
    # Get the last n tokens in the Previous_words list to form the previous_n_gram
    previous_n_gram = Previous_words[-n:]
    # Estimate the probability of each word that follows the previous_n_gram using n-gram counts and add-k smoothing
    probabilities = All_Word_Probalities_Estimation(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)
    # Initialize variables to store the suggestion and its probability
    suggestion = None
    max_prob = 0
    # Iterate through the probability dictionary to find the word with the highest probability
    for word, prob in probabilities.items():
        # If start_with is not None and the word does not start with start_with, skip to the next word
        if start_with != None:
            if not word.startswith(start_with):
                continue
        # If the probability of the current word is higher than the previous max probability, update the suggestion and max_prob variables
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    # Return the suggested word and its probability
    return suggestion, max_prob


### Function for getting multiple suggestions

In [158]:
def Multiple_Suggestions(Previous_words, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    # Determine the number of models in the n-gram counts list
    model_counts = len(n_gram_counts_list)
    # Initialize a list to store the suggestions from each model
    suggestions = []
    # Iterate through the models and get suggestions from each one
    for i in range(model_counts-1):
        # Get the n-gram counts and n+1-gram counts for the current model
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        # Get a word suggestion from the current model using the previous tokens, n-gram and n+1-gram counts, vocabulary, k value, and start_with constraint
        suggestion = Word_suggestion(Previous_words, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k, start_with=start_with)
        # Add the suggestion to the list of suggestions
        suggestions.append(suggestion)
    # Return the list of suggestions from each model
    return suggestions

In [159]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = N_Grams_Count(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


### Testing word generation 

In [194]:
Previous_words = ["Author", "name" ]
Suggested_Words = Multiple_Suggestions(Previous_words, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {Previous_words}, the suggestions are:")
display(Suggested_Words)

The previous words are ['Author', 'name'], the suggestions are:


[('disambiguation', 0.055014146494812954),
 ('in', 0.0003621876131836291),
 ('in', 0.0003619254433586681),
 ('in', 0.0003616636528028933)]

In [174]:
Previous_words = ["ﬁnding", "similar", "names" , "the",  "geodesic"]
Suggested_words = Multiple_Suggestions(Previous_words, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {Previous_words}, the suggestions are:")
display(Suggested_words)

The previous words are ['ﬁnding', 'similar', 'names', 'the', 'geodesic'], the suggestions are:


[('distance', 0.0025678650036683784),
 ('distance', 0.001834189288334556),
 ('distance', 0.0007342143906020558),
 ('distance', 0.0007336757153338225)]

In [175]:
Previous_words = ["ﬁnding", "similar", "names" , "the"]
Suggested_words = Multiple_Suggestions(Previous_words, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {Previous_words}, the suggestions are:")
display(Suggested_words)

The previous words are ['ﬁnding', 'similar', 'names', 'the'], the suggestions are:


[('<e>', 0.022871376811594204),
 ('geodesic', 0.0007323324789454412),
 ('geodesic', 0.0007320644216691069),
 ('geodesic', 0.000731528895391368)]

In [176]:
Previous_words = ["ﬁnding", "similar"]
Suggested_words = Multiple_Suggestions(Previous_words, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {Previous_words}, the suggestions are:")
display(Suggested_words)

The previous words are ['ﬁnding', 'similar'], the suggestions are:


[(')', 0.003588087549336204),
 ('names', 0.0007304601899196494),
 ('in', 0.0003650967506389193),
 ('in', 0.00036483035388544326)]

# Testing the model by Calculating the Perplexity 

In [152]:
def Perplexity_Calculation(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    # n-gram order
    n = len(list(n_gram_counts.keys())[0]) 
    # Pad the sentence with start and end tokens 
    sentence = ["<s>"] * n + sentence + ["<e>"]
    # Convert sentence to a tuple
    sentence = tuple(sentence)
    # Length of the sentence
    N = len(sentence)
    # Initialize product of probabilities to 1.0
    product_prob = 1.0
    # Iterate over each word in the sentence
    for t in range(n, N):
        # Extract the n-gram preceding the current word
        n_gram = sentence[t-n:t]
        # Current word
        word = sentence[t]
        # Estimate the probability of the current word given the n-gram
        probability = Probablity_estimation(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k)
        # Multiply the product of probabilities by the inverse of the probability of the current word
        product_prob *= 1 / probability
    # Calculate perplexity as the Nth root of the product of probabilities
    perplexity = product_prob**(1/float(N))
    return perplexity


In [202]:
unique_words = list(set(documents[0] + documents[1]+ documents[2] + documents[3] + documents[4] + documents[5]))

bigram_counts = N_Grams_Count(tokenized_data, 2)
trigram_counts = N_Grams_Count(tokenized_data, 3)

# For Sample 1

perplexity_train1 = Perplexity_Calculation(train_data_processed[0],
                                         bigram_counts, trigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

perplexity_test1 = Perplexity_Calculation(test_data_processed[0],
                                       bigram_counts, trigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test1:.4f}")

# For S 2

perplexity_train2 = Perplexity_Calculation(train_data_processed[1],
                                         bigram_counts, trigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for Second train sample: {perplexity_train2:.4f}")

perplexity_test2 = Perplexity_Calculation(test_data_processed[1],
                                       bigram_counts, trigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test2:.4f}")

Perplexity for first train sample: 21.9161
Perplexity for test sample: 62.2042
Perplexity for Second train sample: 22.2904
Perplexity for test sample: 54.1756
