Imports

In [None]:
from wiki_dump_reader import Cleaner , iterate
from tqdm import tqdm


In [None]:
import time

def calculate_time(func):
    """
    A decorator function to calculate the execution time of another function.

    The returned wrapper keeps the metadata of ``func`` (``__name__``,
    ``__doc__``, ...) so that introspection and further stacked decorators
    see the original function instead of a generic ``wrapper``.

    Args:
        func (function): The function whose execution time is to be measured.

    Returns:
        function: A wrapped function that calculates and prints the execution time of the original function.

    """
    # Local import so this cell does not depend on any other import cell.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Time taken by {func.__name__}: {end_time - start_time} seconds")
        return result
    return wrapper



In [None]:
def memory_location(func):
    """
    A decorator function to print the memory location of a wrapped function.

    The returned wrapper keeps the metadata of ``func`` (``__name__``,
    ``__doc__``, ...). This matters when another decorator is stacked on top:
    it then reports the real function name instead of ``wrapper``.

    Args:
        func (function): The function whose memory location is to be printed.

    Returns:
        function: A wrapped function that prints the memory location of the original function and then calls it.

    """
    # Local import so this cell does not depend on any other import cell.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # id() is the object's identity; in CPython this is its memory address.
        print(f"Memory location of {func.__name__}: {id(func)}")
        return func(*args, **kwargs)
    return wrapper


Corpus Creation

In [None]:
@calculate_time
@memory_location
def create_corpus():
    """
    Create a corpus from a Wikipedia dump file in Hindi.

    This function iterates over pages in a file named 'hiwiki-latest-pages-articles.xml',
    cleans the text, and writes it along with the title to a new file named 'Hindi_Corpus.txt'.
    It also updates a progress bar using tqdm to show the processing progress.

    Args:
        None

    Returns:
        None

    Raises:
        OSError: If the output file cannot be opened for writing.

    """
    corpus_file = 'Hindi_Corpus.txt'
    corpus_limit = 232729  # expected page count; only used as the progress-bar total
    update_every = 1000    # refresh the bar in batches rather than once per page
    page_count = 0
    cleaner = Cleaner()

    # Both the file and the progress bar are context-managed, so they are
    # closed even if cleaning or writing raises part-way through the dump.
    with open(corpus_file, 'w', encoding='utf-8') as output, \
            tqdm(total=corpus_limit) as pg_bar:
        for title, text in iterate('hiwiki-latest-pages-articles.xml'):
            text = cleaner.clean_text(text)
            cleaned_text, _ = cleaner.build_links(text)
            # One line for the title, then the cleaned article text.
            output.write(title + '\n' + cleaned_text + '\n')
            page_count += 1
            if page_count % update_every == 0:
                pg_bar.update(update_every)
        # Report the final partial batch so the bar matches page_count.
        pg_bar.update(page_count % update_every)
    print(f"\nPage count = {page_count}")  # Print total page count after processing



Preprocessing

In [None]:
def create_stop_words():
    """
    Create a list of stop words from a file named 'stopwords.txt'.

    Every whitespace-separated token in the file becomes one stop word.
    Blank lines and repeated or trailing spaces do not produce empty-string
    entries.

    Args:
        None

    Returns:
        list: A list of stop words, in file order.

    Raises:
        OSError: If 'stopwords.txt' cannot be opened.

    """
    words = []
    with open('stopwords.txt', 'r', encoding='utf-8') as stop:
        for line in stop:
            # split() with no argument drops the newline and never yields ''.
            words.extend(line.split())
    return words


stop_words=create_stop_words()
# Hash-based snapshot of stop_words for O(1) membership tests. It is built
# once here; rebuild it if stop_words is ever reassigned or mutated.
_stop_word_set = frozenset(stop_words)

def remove_stop_words(string):
    """
    Remove stop words from a given string.

    The string is split on whitespace, every token that appears in the
    module-level stop-word set is dropped, and the remaining tokens are
    joined with single spaces.

    Args:
        string (str): The input string from which stop words are to be removed.

    Returns:
        str: The input string with stop words removed.

    Raises:
        None

    """
    return ' '.join(word for word in string.split() if word not in _stop_word_set)


import re

# Characters kept by remove_foreign. Compiled once at import time instead of
# once per word inside the function.
_DEVANAGARI_CHARS = re.compile(r'[\u0901-\u0939\u093C-\u094D\u0950-\u0954\u0958-\u0963\u097B-\u097F]')


def remove_foreign(x):
    """
    Remove foreign characters from a given string.

    The string is split on single spaces; within each piece only characters
    matching the Devanagari ranges of ``_DEVANAGARI_CHARS`` are kept. Pieces
    are re-joined with single spaces, so a piece that loses all of its
    characters leaves an empty slot (two adjacent spaces) in the result.

    Args:
        x (str): The input string from which foreign characters are to be removed.

    Returns:
        str: The input string with foreign characters removed.

    Raises:
        None

    """
    keep = _DEVANAGARI_CHARS.findall
    return ' '.join(''.join(keep(piece)) for piece in x.split(' '))


from tqdm import tqdm
@calculate_time
@memory_location
def pre(source, destination):
    """
    Preprocess a source text file and save the result to a destination file.

    Each source line is passed through ``remove_foreign`` and then
    ``remove_stop_words``. Non-empty results are written to the destination
    followed by a single space, so the last word of one line cannot fuse with
    the first word of the next. A space rather than a newline is used so the
    output stays one continuous line, as it was before.

    Args:
        source (str): The path to the source text file.
        destination (str): The path to save the preprocessed text file.

    Returns:
        None

    Raises:
        OSError: If either file cannot be opened.

    """
    expected_lines = 5000000  # rough guess; only used as the progress-bar total
    update_every = 10000      # refresh the bar in batches of this many lines
    line_count = 0
    with open(source, 'r', encoding='utf-8') as src, \
            open(destination, 'w', encoding='utf-8') as dst, \
            tqdm(total=expected_lines) as bar:
        for line in src:
            string = line.replace('\n', '')
            string = remove_foreign(string)
            string = remove_stop_words(string)
            if string:
                dst.write(string + ' ')
            line_count += 1
            if line_count % update_every == 0:
                bar.update(update_every)
        # Report the final partial batch so the bar matches line_count.
        bar.update(line_count % update_every)



In [None]:
# Build 'PreProcessed_Corpus.txt' from the raw corpus via pre()
# (foreign characters and stop words removed).
pre('Hindi_Corpus.txt', 'PreProcessed_Corpus.txt')

Working on reduced Corpus

In [None]:
def minimize_pre(so, de, factor=10):
    """
    Write a reduced copy of a text file, keeping the start of every line.

    For each line of ``so`` only the first ``len(line) // factor`` characters
    are written to ``de``. The line length includes the trailing newline, and
    the newline itself is never copied, so truncated lines are written back to
    back.

    Args:
        so (str): Path of the file to read.
        de (str): Path of the file to write (overwritten).
        factor (int): Positive integer divisor; each line is cut to
            1/factor of its length. Defaults to 10.

    Returns:
        None

    Raises:
        ValueError: If ``factor`` is not positive.
        OSError: If either file cannot be opened.

    """
    if factor <= 0:
        raise ValueError("factor must be a positive integer")
    with open(so, 'r', encoding="utf-8") as sor, \
            open(de, 'w', encoding="utf-8") as des:
        for line in sor:
            des.write(line[:len(line) // factor])

# Overwrite 'PreProcessed_Corpus.txt' with a reduced version of the file
# 'PreProcessed_Corpus - Copy.txt', which must already exist.
minimize_pre("PreProcessed_Corpus - Copy.txt", "PreProcessed_Corpus.txt")

In [None]:
from collections import Counter

const_vocab = 100  # a word is kept only if it occurs more than this many times


def vocab():
    """
    Read 'PreProcessed_Corpus.txt' and return its frequent words in corpus order.

    The file is split on whitespace. Every occurrence of a word whose total
    count is greater than ``const_vocab`` is kept; all other tokens are
    dropped. Duplicates are preserved.

    Returns:
        list: The filtered sequence of words from 'PreProcessed_Corpus.txt'.

    """
    with open('PreProcessed_Corpus.txt', 'r', encoding='utf-8') as corpus:
        all_tokens = corpus.read().split()
    frequency = Counter(all_tokens)
    kept = []
    for token in tqdm(all_tokens):
        if frequency[token] > const_vocab:
            kept.append(token)
    return kept

def gen_distinct_vocab():
    """
    Read 'PreProcessed_Corpus.txt' and return its distinct frequent words.

    A word is kept when it occurs more than ``const_vocab`` times. Words are
    returned in order of first appearance in the file. The previous version
    went through a ``set``, whose iteration order for strings changes with
    hash randomization, so word indices could differ between runs.

    As a side effect the number of returned words is stored in the global
    variable ``vocabulary_size``.

    Returns:
        list: The distinct words that pass the frequency filter.

    """
    global vocabulary_size
    with open('PreProcessed_Corpus.txt', 'r', encoding='utf-8') as r:
        counter = Counter(r.read().split())
    # Counter keys are already unique and keep insertion order.
    distinct_words = [word for word, count in counter.items() if count > const_vocab]
    vocabulary_size = len(distinct_words)
    return distinct_words


In [None]:
import numpy as np

# NOTE: a `global` statement at module level has no effect; the assignment
# below already creates a module-level name.
global words_list
# Token sequence returned by vocab().
words_list=vocab()

Printing the total number of tokens and the vocabulary size


In [None]:
# Distinct words returned by gen_distinct_vocab(); their list positions are
# used as row/column indices by the matrix code in later cells.
distinct_vocab=gen_distinct_vocab()

In [None]:
# Number of distinct words kept in distinct_vocab.
vocabulary_size = len(distinct_vocab)
print((vocabulary_size))

In [None]:
from collections import Counter, defaultdict
from nltk import ngrams, word_tokenize

def gen_grams():
    """
    Tokenize the filtered corpus and prepare its 6-grams.

    Joins ``words_list`` with spaces, tokenizes the result with
    ``word_tokenize`` and stores the outcome in the global ``tokens``. The
    global ``grams`` is set to ``ngrams(tokens, 6)``.

    Returns:
        None

    """
    global tokens, grams
    joined_text = ' '.join(words_list)
    tokens = word_tokenize(joined_text)
    grams = ngrams(tokens, 6)

In [None]:
# Populate the globals `tokens` and `grams`.
gen_grams()

In [None]:
# Summary statistics for the filtered corpus.
print("Number of tokens is: ", len(tokens))
print("Size of the vocabulary: ", vocabulary_size)
# NOTE(review): words_list is the filtered token sequence (duplicates
# included), not the distinct vocabulary - confirm whether distinct_vocab
# was intended here.
print("First 1500 words of the Vocabulary: ", words_list[:1500])


In [None]:
from tqdm import tqdm
import numpy as np 

# Square co-occurrence matrix; row/column k corresponds to distinct_vocab[k].
matrix = np.zeros((len(distinct_vocab),len(distinct_vocab)),dtype=np.int64)

@calculate_time
@memory_location
def create_matrix():
    """
    Fill the global ``matrix`` with distance-weighted co-occurrence counts.

    For every 6-gram in the global ``grams``:

    * the first word's row gets +5, +4, +3, +2, +1 in the columns of the
      words at offsets 1..5;
    * the last word's row gets +5, +4, +3, +2, +1 in the columns of the
      words at offsets 4..0 (nearest neighbour first).

    Words that are not in ``distinct_vocab`` are skipped. Iterating ``grams``
    consumes it.

    Returns:
        None

    """
    # One dict replaces the repeated list scans (`in` and `.index`) per
    # n-gram. setdefault keeps the first position, matching list.index.
    index_of = {}
    for position, word in enumerate(distinct_vocab):
        index_of.setdefault(word, position)
    lookup = index_of.get

    for gram in tqdm(grams):
        ids = [lookup(token) for token in gram]
        first, last = ids[0], ids[5]
        if first is not None:
            for offset in range(1, 6):
                neighbour = ids[offset]
                if neighbour is not None:
                    matrix[first, neighbour] += 6 - offset
        if last is not None:
            for offset in range(1, 6):
                neighbour = ids[5 - offset]
                if neighbour is not None:
                    matrix[last, neighbour] += 6 - offset


create_matrix()


In [None]:
import math
from tqdm import tqdm

@calculate_time
@memory_location
def calculate_probability(matrix):
    """
    Compute the row and column marginal probabilities of a count matrix.

    The marginals are stored in the globals ``pi`` (row sums divided by the
    grand total) and ``pj`` (column sums divided by the grand total).

    Args:
        matrix: Co-occurrence counts accepted by ``np.sum``.

    Returns:
        tuple or None: ``(0, 0)`` when the matrix sums to zero (the globals
        are then left untouched), otherwise ``None``.

    """
    global pi, pj

    per_row = np.sum(matrix, axis=1)
    per_col = np.sum(matrix, axis=0)
    grand_total = np.sum(matrix)

    # Nothing to normalise by: bail out before dividing.
    if grand_total == 0:
        return 0, 0

    pi = per_row / grand_total
    pj = per_col / grand_total
 
 
@calculate_time
@memory_location
def gen_ppmi_matrix(matrix):
    """
    Build the positive pointwise mutual information (PPMI) matrix.

    For counts c_ij with grand total N::

        p_ij = c_ij / N
        p_i  = sum_j c_ij / N
        p_j  = sum_i c_ij / N
        ppmi_ij = max(0, log2(p_ij / (p_i * p_j)))

    Cells with a zero count get 0. The previous implementation divided the
    count by the row sum (a conditional probability) while still dividing by
    both marginals, which inflated every value by -log2(p_i). The marginals
    are computed here from ``matrix`` itself and the whole computation is
    vectorised.

    Args:
        matrix: Square array of non-negative co-occurrence counts.

    Returns:
        numpy.ndarray: Float array of the same shape holding the PPMI values.

    """
    counts = np.asarray(matrix, dtype=np.float64)
    ppmi = np.zeros(counts.shape)

    total = counts.sum()
    if total == 0:
        return ppmi

    p_row = counts.sum(axis=1, keepdims=True) / total
    p_col = counts.sum(axis=0, keepdims=True) / total
    expected = p_row * p_col  # p_i * p_j for every cell

    # Only cells with a positive count have a defined logarithm; a positive
    # count also guarantees both of its marginals are positive.
    observed = counts > 0
    ppmi[observed] = np.log2((counts[observed] / total) / expected[observed])
    np.maximum(ppmi, 0, out=ppmi)  # clip negative PMI to zero
    return ppmi


In [None]:
# Set the global marginals pi/pj, then build the PPMI matrix from the
# co-occurrence counts.
calculate_probability(matrix)
ppmi_matrix=gen_ppmi_matrix(matrix)


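Finding the 10 most common nouns (approximated here by the 10 most frequent vocabulary words; no part-of-speech filtering is applied)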

In [None]:
from collections import Counter

# Re-read the filtered token sequence and keep its 10 most frequent words.
my_list = vocab()
counter = Counter(my_list)
common_elements = counter.most_common(10)  # list of (word, count) pairs
most_common_elements=[a for (a,_) in common_elements]  # the words only

Implementing and printing the top-ten nearest-neighbour dictionary (without multiprocessing)

In [None]:
import math

def insert_into_sorted_list(sorted_list, element):
    """
    Insert ``element`` into a list ordered by descending score and drop the tail.

    The score is the second item of each entry. ``element`` is placed in
    front of the first entry whose score is not greater than its own, or at
    the end if there is none. ``sorted_list`` is modified in place by the
    insertion; the returned list is a copy without the last entry.

    Args:
        sorted_list (list): Entries ordered by descending ``entry[1]``.
        element: The entry to insert.

    Returns:
        list: A new list with the same length ``sorted_list`` had on entry.

    """
    position = len(sorted_list)
    for idx, entry in enumerate(sorted_list):
        if not (entry[1] > element[1]):
            position = idx
            break
    sorted_list.insert(position, element)
    return sorted_list[:-1]

def cosine(a,b):
    """
    Return the cosine similarity of two numeric sequences.

    Args:
        a: First sequence of numbers.
        b: Second sequence of numbers.

    Returns:
        The dot product divided by the product of the two magnitudes, or 0
        when either magnitude is zero.

    """
    norm_a = math.sqrt(sum(value ** 2 for value in a))
    norm_b = math.sqrt(sum(value ** 2 for value in b))
    inner = sum(left * right for left, right in zip(a, b))
    if norm_a != 0 and norm_b != 0:
        return inner / (norm_a * norm_b)
    return 0

def find_nearest_neighbor_of_noun(index):
    """
    Return the words whose PPMI rows are most similar to a given word's row.

    Every other word in ``distinct_vocab`` is scored once with ``cosine``
    against row ``index`` of ``ppmi_matrix``, and the best-scoring words are
    returned. The previous version seeded its result list with 11 copies of
    word 0, so the output could contain duplicates, could never include words
    scoring below word 0, and returned only the word itself when ``index``
    was 0.

    Args:
        index (int): Position of the query word in ``distinct_vocab``.

    Returns:
        list: Up to 11 ``(word, similarity)`` tuples in descending order of
        similarity (the list length the original code produced). The query
        word itself is excluded.

    """
    import heapq  # local import keeps this cell self-contained

    result_size = 11
    target_row = ppmi_matrix[index]
    scored = (
        (distinct_vocab[i], cosine(target_row, ppmi_matrix[i]))
        for i in tqdm(range(len(distinct_vocab)))
        if i != index
    )
    return heapq.nlargest(result_size, scored, key=lambda pair: pair[1])

# Maps each of the most common words to its nearest-neighbour list.
nearest_neighbour_dict = {}


@calculate_time
@memory_location
def nearest_seq():
    """
    Fill ``nearest_neighbour_dict`` sequentially.

    For every word in ``most_common_elements`` the neighbour list is computed
    with ``find_nearest_neighbor_of_noun`` and stored under that word.

    Returns:
        None

    """
    for word in tqdm(most_common_elements):
        row = distinct_vocab.index(word)
        nearest_neighbour_dict[word] = find_nearest_neighbor_of_noun(row)


nearest_seq()

print(nearest_neighbour_dict)


The above output cell shows the time taken by the sequential process.

Assignment 6 

Computing Co-occurrence matrix with Multiprocessing


In [None]:
import multiprocessing
from tqdm import tqdm

# Side length of the square co-occurrence matrix.
n = vocabulary_size
# Flat shared array of n*n doubles (typecode 'd'); compile_grams addresses
# cell (row, col) as row * len(distinct_vocab) + col.
p_mat = multiprocessing.Array('d', n*n)

def compile_grams(x,p_mat):
    """
    Add the weighted co-occurrences of one 6-gram to a flat shared matrix.

    Cell (row, col) lives at ``row * len(distinct_vocab) + col``. The first
    word's row gets +5..+1 for the words at offsets 1..5; the last word's row
    gets +5..+1 for the words at offsets 4..0. Words missing from
    ``distinct_vocab`` are skipped.

    ``+=`` on a shared array is a separate read and write, so concurrent
    workers can lose updates. When ``p_mat`` provides ``get_lock()`` (as
    ``multiprocessing.Array`` does by default), all updates for this n-gram
    are made while holding that lock.

    Args:
        x: Sequence of six words.
        p_mat: Flat, indexable, mutable array with room for
            ``len(distinct_vocab) ** 2`` values.

    Returns:
        None

    """
    from contextlib import nullcontext  # local import keeps this cell self-contained

    def position(word):
        # One scan per word instead of separate `in` and `.index` scans.
        try:
            return distinct_vocab.index(word)
        except ValueError:
            return None

    leng = len(distinct_vocab)
    ids = [position(x[k]) for k in range(6)]
    first, last = ids[0], ids[5]

    get_lock = getattr(p_mat, 'get_lock', None)
    guard = get_lock() if get_lock is not None else nullcontext()
    with guard:
        if first is not None:
            for offset in range(1, 6):
                neighbour = ids[offset]
                if neighbour is not None:
                    p_mat[first * leng + neighbour] += 6 - offset
        if last is not None:
            for offset in range(1, 6):
                neighbour = ids[5 - offset]
                if neighbour is not None:
                    p_mat[last * leng + neighbour] += 6 - offset

def _compile_gram_batch(token_slice, p_mat):
    """Accumulate every 6-gram of ``token_slice`` into ``p_mat`` via compile_grams."""
    for gram in ngrams(token_slice, 6):
        compile_grams(gram, p_mat)


@calculate_time
@memory_location
def create_matrix_p(p_mat):
    """
    Fill the shared co-occurrence array using a bounded number of processes.

    The 6-grams of the global ``tokens`` are divided into contiguous batches,
    one per worker process, with at most one worker per CPU. The previous
    version started a separate process for every single 6-gram before joining
    any of them.

    Args:
        p_mat: Shared flat array that ``compile_grams`` accumulates into.

    Returns:
        None

    """
    token_seq = tokens if isinstance(tokens, list) else list(tokens)
    gram_total = len(token_seq) - 5  # number of 6-grams in the sequence
    if gram_total <= 0:
        return

    worker_count = max(1, min(multiprocessing.cpu_count(), gram_total))
    batch_size = -(-gram_total // worker_count)  # ceiling division

    jobs = []
    for start in range(0, gram_total, batch_size):
        stop = min(start + batch_size, gram_total)
        # The 6-grams starting at positions start..stop-1 need the tokens up
        # to index stop+4, hence the 5-token overlap between batches.
        p = multiprocessing.Process(
            target=_compile_gram_batch,
            args=(token_seq[start:stop + 5], p_mat),
        )
        p.start()
        jobs.append(p)

    for job in tqdm(jobs):
        job.join()

create_matrix_p(p_mat)

# View the flat shared buffer as an n x n array and copy it into an int64
# matrix in one vectorised step. All workers have been joined by now, so no
# process is still writing to p_mat.
p_matrix = (
    np.frombuffer(p_mat.get_obj(), dtype=np.float64)
    .reshape(n, n)
    .astype(np.int64)
)



The above output cell shows the time taken by multiprocessing to create the co-occurrence matrix.

Using Multiprocessing to compute the nearest neighbours

In [None]:
import math
import multiprocessing

# Manager-backed dict that the worker processes store their neighbour lists in.
neigh = multiprocessing.Manager().dict()

def p_insert_into_sorted_list(sorted_list, element):
    """
    Insert ``element`` into a descending-by-score list and trim the last entry.

    Entries are compared on their second item. The new entry goes in front of
    the first existing entry whose score is not greater than its own, or at
    the end if there is none. The insertion mutates ``sorted_list``; the
    return value is a copy with the final entry removed.

    Args:
        sorted_list (list): Entries ordered by descending ``entry[1]``.
        element: The entry to insert.

    Returns:
        list: A new list with the same length ``sorted_list`` had on entry.

    """
    slot = next(
        (pos for pos, item in enumerate(sorted_list) if not (item[1] > element[1])),
        len(sorted_list),
    )
    sorted_list.insert(slot, element)
    trimmed = sorted_list[:-1]
    return trimmed

def p_cosine(a,b):
    """
    Return the cosine similarity of two numeric sequences.

    Args:
        a: First sequence of numbers.
        b: Second sequence of numbers.

    Returns:
        The dot product divided by the product of the magnitudes, or 0 when
        either magnitude is zero.

    """
    squares_a = (component ** 2 for component in a)
    squares_b = (component ** 2 for component in b)
    length_a = math.sqrt(sum(squares_a))
    length_b = math.sqrt(sum(squares_b))
    dot = sum(ai * bi for ai, bi in zip(a, b))
    # A zero vector has no direction; report zero similarity.
    if length_a == 0 or length_b == 0:
        return 0
    return dot / (length_a * length_b)

def p_find_nearest_neighbor_of_noun(index,neigh):
    """
    Compute the nearest neighbours of one word and store them in ``neigh``.

    Every other word among the first ``n`` entries of ``distinct_vocab`` is
    scored once with ``p_cosine`` against row ``index`` of ``ppmi_matrix``.
    The best-scoring words are stored under ``neigh[distinct_vocab[index]]``.
    The previous version seeded its result list with 11 copies of word 0, so
    the output could contain duplicates, could never include words scoring
    below word 0, and held only the word itself when ``index`` was 0.

    Args:
        index (int): Position of the query word in ``distinct_vocab``.
        neigh: Mutable mapping that receives the result: up to 11
            ``(word, similarity)`` tuples in descending order of similarity,
            excluding the query word.

    Returns:
        None

    """
    import heapq  # local import keeps this cell self-contained

    result_size = 11
    target_row = ppmi_matrix[index]
    scored = (
        (distinct_vocab[i], p_cosine(target_row, ppmi_matrix[i]))
        for i in tqdm(range(n))
        if i != index
    )
    neigh[distinct_vocab[index]] = heapq.nlargest(
        result_size, scored, key=lambda pair: pair[1]
    )

@calculate_time
@memory_location
def parallel_nearest():
    """
    Compute the neighbour lists of the most common words in parallel.

    One process per word in ``most_common_elements`` is started with
    ``p_find_nearest_neighbor_of_noun`` as its target; each receives the
    word's position in ``distinct_vocab`` and the shared ``neigh`` mapping.
    The function returns after all processes have been joined.

    Returns:
        None

    """
    started = []
    for word in most_common_elements:
        row = distinct_vocab.index(word)
        worker = multiprocessing.Process(
            target=p_find_nearest_neighbor_of_noun,
            args=(row, neigh),
        )
        worker.start()
        started.append(worker)

    for worker in started:
        worker.join()


parallel_nearest()

print(neigh)

The above output cell shows the time taken to create the nearest-neighbours dictionary using multiprocessing.