In [1]:
import multiprocessing
import math
from wiki_dump_reader import Cleaner , iterate
from tqdm import tqdm
import numpy as np
import multiprocessing
import nltk
from collections import Counter, defaultdict
from nltk import ngrams, word_tokenize
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/anurag/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import time

def calculate_time(func):
    """
    A decorator function to calculate the execution time of another function.

    Args:
        func (function): The function whose execution time is to be measured.

    Returns:
        function: A wrapped function that calculates and prints the execution time of the original function.

    """
    def wrapper(*args, **kwargs):
        """
        Calculate the execution time of the decorated function.

        Args:
            *args: Positional arguments to be passed to the decorated function.
            **kwargs: Keyword arguments to be passed to the decorated function.

        Returns:
            Any: The result of the decorated function.
        
        """
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Time taken by {func.__name__}: {end_time - start_time} seconds")
        return result
    return wrapper



In [3]:
def memory_location(func):
    """
    A decorator function to print the memory location of a wrapped function.

    Args:
        func (function): The function whose memory location is to be printed.

    Returns:
        function: A wrapped function that prints the memory location of the original function and then calls it.

    """
    def wrapper(*args, **kwargs):
        """
        Print the memory location of the decorated function and then call it.

        Args:
            *args: Positional arguments to be passed to the decorated function.
            **kwargs: Keyword arguments to be passed to the decorated function.

        Returns:
            Any: The result of the decorated function.
        
        """
        print(f"Memory location of {func.__name__}: {id(func)}")
        return func(*args, **kwargs)
    return wrapper


In [4]:
@calculate_time
@memory_location
def create_corpus():
    """
    Create a corpus from a Wikipedia dump file in Hindi.

    This function iterates over pages in a file named 'hiwiki-latest-pages-articles.xml',
    cleans the text, and writes it along with the title to a new file named 'Hindi_Corpus.txt'.
    It also updates a progress bar using tqdm to show the processing progress.
    
    Args:
        None
    
    Returns:
        None

    Raises:
        None
    
    """
    corpus_file = 'Hindi_Corpus.txt'
    corpus_limit = 232729
    page_count = 0
    cleaner = Cleaner()  # Assuming Cleaner class is imported or defined elsewhere.
    
    # Open the output file for writing
    with open(corpus_file, 'w', encoding='utf-8') as output:
        pg_bar = tqdm(total=corpus_limit)  # Initialize the progress bar
        # Iterate over pages in 'hiwiki-latest-pages-articles.xml'
        for title, text in iterate('hiwiki-latest-pages-articles.xml'):  # Assuming iterate function is defined elsewhere.
            # Clean the text
            text = cleaner.clean_text(text)
            cleaned_text, _ = cleaner.build_links(text)
            # Write title and cleaned text to the corpus file
            output.write(title + '\n' + cleaned_text + '\n')
            page_count += 1
            if page_count % 1000 == 0:
                pg_bar.update(1000)  # Update progress bar every 1000 pages
        pg_bar.close()  # Close the progress bar
        output.close()  # Close the output file
    print(f"\nPage count = {page_count}")  # Print total page count after processing



In [5]:

def remove_stop_words(string):
	
	with open('stopwords.txt', 'r', encoding='utf-8') as stop:
		y = stop.readlines()
	stop_words = []
	for element in y:
		element = element.strip('\n')
		stop_words.extend(element.split(' '))
		
	l = string.split()  
	return_list = []
	for x in l:
		if x not in stop_words:  
			return_list.append(x) 
	return ' '.join(return_list) 


import re

def remove_foreign(x):
	string = x.split(' ')  
	s1 = [(re.compile(r'[\u0901-\u0939\u093C-\u094D\u0950-\u0954\u0958-\u0963\u097B-\u097F]')).findall(s) for s in string]
	s2 = [''.join(s) for s in s1] 
	s3 = ' '.join(s2) 
	return s3  


@calculate_time
@memory_location
def pre(source, destination):
	line_count = 0
	with open(source, 'r', encoding='utf-8') as input:
		with open(destination, 'w', encoding='utf-8') as output:
			bar = tqdm(total=5000000) 
			for line in input:
				string = line.replace('\n', '')
				string = remove_foreign(string)
				string = remove_stop_words(string)
				output.write(string)
				line_count += 1
				if line_count % 10000 == 0:
					bar.update(10000)  
			bar.close()



In [None]:
pre('Hindi_Corpus.txt', 'PreProcessed_Corpus.txt')

In [12]:
from collections import Counter

const_vocab=0

def gen_vocab():
    """
    Reads the text from 'trialfile.txt' and returns it as a list of words.

    Returns:
        list: A list of words extracted from the file 'trialfile.txt'.
    
    """
    with open('PreProcessed_Corpus.txt', 'r', encoding='utf-8') as r:
        text = r.read().split()
    word_counts = Counter(text)
    text = [word for word in tqdm(text) if word_counts[word] > const_vocab]  
    return text

def gen_distinct_vocab():
    """
    Reads the text from 'trialfile.txt', extracts distinct words, and returns them as a list.

    This function computes the vocabulary size and stores it in a global variable 'vocabulary_size'.

    Returns:
        list: A list of distinct words extracted from the file 'trialfile.txt'.
    
    """
    with open('PreProcessed_Corpus.txt', 'r', encoding='utf-8') as r:
        text = r.read().split()
        counter = Counter(text)
        reduced_vocab = [item for item, count in counter.items() if count > const_vocab]
        text_set = set(reduced_vocab)
        global vocabulary_size
        vocabulary_size = len(text_set)
    return list(text_set)


In [13]:

global words_list
words_list=gen_vocab()
distinct_vocab=gen_distinct_vocab()
vocabulary_size = len(distinct_vocab)
print((vocabulary_size))

100%|██████████| 3944/3944 [00:00<00:00, 3979392.59it/s]

1774





In [14]:

def gen_grams():            
    global tokens
    global grams
    tokens = word_tokenize(' '.join(words_list))
    grams = ngrams(tokens, 6)
    return None 

gen_grams()

In [15]:
print("Number of tokens is: ", len(tokens))
print("Size of the vocabulary: ", vocabulary_size)
print("First 1500 words of the Vocabulary: ", words_list[:1500])


Number of tokens is:  3944
Size of the vocabulary:  1774
First 1500 words of the Vocabulary:  ['मुख्य', 'पृष्ठमुखपृष्ठमुखपृष्ठहोंगे', 'कामयाबहोंगे', 'कामयाब', 'गिरिजा', 'कुमार', 'माथुर', 'हिंदी', 'भावानुवाद', 'प्रतिरोध', 'गीत', 'गीत', 'बीसवीं', 'सदी', 'नागरिक', 'अधिकार', 'आंदोलन', 'प्रधान', 'स्वर', 'गीत', 'आमतौर', 'आई', 'विल', 'ओवरकम', 'सम', 'डे', 'काव्यावतरित', 'माना', 'चार्ल्स', 'अल्बर्ट', 'टिंडले', 'गाया', 'पहली', 'प्रकाशितसन्दर्भनागरिक', 'अधिकार', 'आंदोलनदेशभक्ति', 'गीतआधारदैनिक', 'पूजादैनिक', 'पूजा', 'विधि', 'नित्य', 'पूजा', 'सनातन', 'हिन्दू', 'धर्म', 'उपासना', 'पद्धतियों', 'दैनिक', 'कर्म', 'विभिन्न', 'देवताओं', 'प्रसन्न', 'मन्त्र', 'बताये', 'पूजा', 'तीन', 'प्रकार', 'मंत्र', 'बताये', 'नाम', 'मंत्र', 'पौराणिक', 'मंत्र', 'वैदिक', 'मंत्रनाम', 'मंत्र', 'देवता', 'नाम', 'प्रणव', 'पीछे', 'नमः', 'लगापौराणिक', 'मंत्र', 'मंत्र', 'पुराणों', 'वर्णितवैदिक', 'मंत्र', 'मंत्र', 'वेदों', 'वर्णितपूजा', 'मुख्यतः', 'छः', 'प्रकारपञ्चोपचार', 'पांच', 'उपचार', 'पूर्वक', 'वालीदशोपचार', 'दश', 'उपचार', 'पूर

In [16]:

cocrmat = np.zeros((len(distinct_vocab),len(distinct_vocab)),dtype=np.int64)

@calculate_time
@memory_location
def create_matrix():
	
	ramp=[0,1,2,3,4,5]

	for gram in tqdm(grams):

		x=list(gram)

		if x[0] in distinct_vocab:
			for i in range(1,len(x)):
				if x[i] in distinct_vocab:
					cocrmat[distinct_vocab.index(x[0])][distinct_vocab.index(x[i])] += 5-ramp[i]
					
		x.reverse()
	
		if x[0] in distinct_vocab:
			for i in range(1,len(x)):
				if x[i] in distinct_vocab:
					cocrmat[distinct_vocab.index(x[0])][distinct_vocab.index(x[i])] += 5-ramp[i]
		
create_matrix()


Memory location of create_matrix: 129525608273280


3939it [00:00, 6536.01it/s]

Time taken by wrapper: 0.604438304901123 seconds





In [17]:

@calculate_time
@memory_location
def calculate_probability(matrix):
    
    # Calculate row sums and column sums
    row_sums = np.sum(matrix, axis=1)
    col_sums = np.sum(matrix, axis=0)

    # Total co-occurrences
    N = np.sum(matrix)

    if(N==0):
        return 0,0

    # Calculate pi and pj
    pI = row_sums / N
    pJ = col_sums / N

    global pi,pj

    pi,pj=pI,pJ
 
 
@calculate_time
@memory_location
def gen_ppmi_matrix(matrix):
    co_occurrence_matrix=matrix
 
    ppmi = np.zeros((len(distinct_vocab), len(distinct_vocab)))

    row_sums = np.sum(co_occurrence_matrix, axis=1)
    for i in tqdm(range(len(distinct_vocab))):
        for j in range(len(distinct_vocab)):

            if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
                ppmi[i][j] = 0
                continue
            pij = ((co_occurrence_matrix[i][j]))/row_sums[i]
            ppmi[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

    return ppmi


In [18]:
calculate_probability(cocrmat)
ppmi_matrix=gen_ppmi_matrix(cocrmat)


Memory location of calculate_probability: 129525608274080
Time taken by wrapper: 0.005644083023071289 seconds
Memory location of gen_ppmi_matrix: 129525608274400


100%|██████████| 1774/1774 [00:00<00:00, 1790.18it/s]

Time taken by wrapper: 0.9937396049499512 seconds





In [19]:
from collections import Counter

my_list = gen_vocab()
counter = Counter(my_list)
common_elements = counter.most_common(10)
most_common_elements=[a for (a,_) in common_elements]

100%|██████████| 3944/3944 [00:00<00:00, 7888571.76it/s]


In [20]:
import math

def insert_into_sorted_list(sorted_list, element):
	index = 0
	while index < len(sorted_list) and sorted_list[index][1] > element[1]:
		index += 1
	sorted_list.insert(index, element)
	return sorted_list[:-1]

def cosine(a,b):
		mag_a = math.sqrt(sum(component ** 2 for component in a))
		mag_b = math.sqrt(sum(component ** 2 for component in b))
		dot_product = sum(ai * bi for ai, bi in zip(a, b))
		if mag_a == 0 or mag_b ==0:
			return 0
		else:
			return dot_product/(mag_a*mag_b)

def find_nearest_neighbor_of_noun(index):
	l=[(distinct_vocab[0],(cosine(ppmi_matrix[index], ppmi_matrix[0]))) for _ in range(11)]
	for i in tqdm(range(1,len(distinct_vocab))):
		if i!=index and cosine(ppmi_matrix[index],ppmi_matrix[i])>l[10][1]:
				l=insert_into_sorted_list(l,(distinct_vocab[i],(cosine(ppmi_matrix[index], ppmi_matrix[i]))))
	return l

nearest_neighbour_dict={}

@calculate_time
@memory_location
def nearest_seq():
	for x in tqdm(most_common_elements):
		nearest_neighbour_dict[x]=find_nearest_neighbor_of_noun(distinct_vocab.index(x))

nearest_seq()

for i in nearest_neighbour_dict:
	print("Words similar to ",i," :")
	for j in nearest_neighbour_dict[i]:
		print ("  ",j[0]," : ",j[1])


Memory location of nearest_seq: 129525608275520


100%|██████████| 1773/1773 [00:00<00:00, 1900.09it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1925.78it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1765.20it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1747.27it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1837.46it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1789.24it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1865.98it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1882.01it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1894.28it/s]
100%|██████████| 1773/1773 [00:00<00:00, 1930.27it/s]
100%|██████████| 10/10 [00:09<00:00,  1.04it/s]

Time taken by wrapper: 9.65730905532837 seconds
Words similar to  हिन्दी  :
   भाषा  :  0.3472396399601132
   भारत  :  0.33943060869648284
   रूप  :  0.305484092600656
   उर्दू  :  0.300078744109324
   भारतीय  :  0.2878374008275373
   प्रयोग  :  0.2618181347445307
   भाषाओं  :  0.2580177115873142
   राजभाषा  :  0.25079511388536113
   लोगों  :  0.2445474797528446
   संख्या  :  0.24381054880907813
   आधुनिक  :  0.2380191261055757
Words similar to  ॐ  :
   नमः  :  0.8331027179229085
   सिद्धि  :  0.6230561905579797
   विनायकाय  :  0.6204863570166662
   समर्पयामि  :  0.5894912204663979
   श्री  :  0.4178855280895735
   पूजयामि  :  0.38100113427242527
   महा  :  0.3682594160198308
   विघ्न  :  0.34146986321010675
   पुष्पं  :  0.3383239111668996
   दन्ताय  :  0.33063627048575234
   राजाय  :  0.31982842044205584
Words similar to  भाषा  :
   रूप  :  0.3920582355010139
   हिन्दी  :  0.3472396399601132
   अरबी  :  0.26047903134533157
   उर्दू  :  0.25425497320377005
   भारत  :  0.25274238623553




In [21]:

n = vocabulary_size
p_mat = multiprocessing.Array('d', n*n)

def compile_grams(gram,p_mat):

	ramp=[0,1,2,3,4,5]

	leng=len(distinct_vocab)

	x=list(gram)

	if x[0] in distinct_vocab:
		for i in range(1,len(x)):
			if x[i] in distinct_vocab:
				p_mat[distinct_vocab.index(x[0])*leng+distinct_vocab.index(x[i])] += 5-ramp[i]
					
	x.reverse()
	
	if x[0] in distinct_vocab:
		for i in range(1,len(x)):
			if x[i] in distinct_vocab:
				p_mat[distinct_vocab.index(x[0])*leng+distinct_vocab.index(x[i])] += 5-ramp[i]
	

@calculate_time
@memory_location
def create_matrix_p(p_mat):
	grams = ngrams(tokens,6)

	jobs=[]

	for gram in tqdm(grams):
		p=multiprocessing.Process(target=compile_grams, args=(gram,p_mat,))
		p.start()
		jobs.append(p)

	for job in jobs:
		job.join()

create_matrix_p(p_mat)

p_matrix = np.zeros((n,n),dtype=np.int64)

for i in range(n):
	for j in range(n):
		p_matrix[i][j]=p_mat[i*n+j]



Memory location of create_matrix_p: 129525600781312


3939it [00:12, 326.10it/s]


Time taken by wrapper: 12.100571155548096 seconds


In [22]:
import math
import multiprocessing

neigh = multiprocessing.Manager().dict()

def p_insert_into_sorted_list(sorted_list, element):
    index = 0
    while index < len(sorted_list) and sorted_list[index][1] > element[1]:
        index += 1
    sorted_list.insert(index, element)
    return sorted_list[:-1]

def p_cosine(a,b):
        mag_a = math.sqrt(sum(component ** 2 for component in a))
        mag_b = math.sqrt(sum(component ** 2 for component in b))
        dot_product = sum(ai * bi for ai, bi in zip(a, b))
        if mag_a == 0 or mag_b ==0:
            return 0
        else:
            return dot_product/(mag_a*mag_b)

def p_find_nearest_neighbor_of_noun(index,neigh):
    l=[(distinct_vocab[0],(p_cosine(ppmi_matrix[index], ppmi_matrix[0]))) for _ in range(11)]
    for i in tqdm(range(1,n)):
        if i!=index:
            if p_cosine(ppmi_matrix[index],ppmi_matrix[i])>l[10][1]:
                l=p_insert_into_sorted_list(l,(distinct_vocab[i],(p_cosine(ppmi_matrix[index], ppmi_matrix[i]))))

    neigh[distinct_vocab[index]]=l

@calculate_time
@memory_location
def parallel_nearest():
    jobs=[]
    for x in (most_common_elements):
        p=multiprocessing.Process(target=p_find_nearest_neighbor_of_noun, args=(distinct_vocab.index(x),neigh,))
        p.start()
        jobs.append(p)

    for job in jobs:
        job.join()
        
parallel_nearest()

print(neigh)

Memory location of parallel_nearest: 129525592722784


100%|██████████| 1773/1773 [00:01<00:00, 1233.36it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1173.61it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1123.49it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1067.81it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1057.06it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1051.30it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1020.88it/s]
100%|██████████| 1773/1773 [00:01<00:00, 992.03it/s] 
100%|██████████| 1773/1773 [00:01<00:00, 962.06it/s] 
100%|██████████| 1773/1773 [00:01<00:00, 945.98it/s] 


Time taken by wrapper: 1.9032878875732422 seconds
{'हिन्दी': [('भाषा', 0.3472396399601132), ('भारत', 0.33943060869648284), ('रूप', 0.305484092600656), ('उर्दू', 0.300078744109324), ('भारतीय', 0.2878374008275373), ('प्रयोग', 0.2618181347445307), ('भाषाओं', 0.2580177115873142), ('राजभाषा', 0.25079511388536113), ('लोगों', 0.2445474797528446), ('संख्या', 0.24381054880907813), ('आधुनिक', 0.2380191261055757)], 'भाषा': [('रूप', 0.3920582355010139), ('हिन्दी', 0.3472396399601132), ('अरबी', 0.26047903134533157), ('उर्दू', 0.25425497320377005), ('भारत', 0.25274238623553785), ('विश्व', 0.24686043157160134), ('प्रयोग', 0.24655441427577637), ('वाली', 0.24629187347461845), ('शब्द', 0.24609328764723643), ('फ़ारसी', 0.24508262713913223), ('हिन्दुस्तानी', 0.239469607498854)], 'शब्द': [('संस्कृत', 0.32248638257810386), ('अर्थ', 0.30888007859276173), ('बना', 0.2796397865042576), ('फ़ारसीअरबी', 0.26363738838590045), ('भाषा', 0.24609328764723643), ('अरबी', 0.24494908147634054), ('देशज', 0.24399477157058094