Imports

In [1]:
import multiprocessing
from wiki_dump_reader import Cleaner , iterate
from tqdm import tqdm
import numpy as np
import multiprocessing
import nltk
from collections import Counter, defaultdict
from nltk import ngrams, word_tokenize
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/anurag/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import time

def runtime(func):
	def wrapper(*args, **kwargs):
		start_time = time.time()
		result = func(*args, **kwargs)
		end_time = time.time()
		print(f"Time taken by {func.__name__}: {end_time - start_time} seconds")
		return result
	return wrapper



In [3]:
def memory_location(func):
	def wrapper(*args, **kwargs):
		print(f"Location of {func.__name__}: {id(func)}")
		return func(*args, **kwargs)
	return wrapper


Corpus Create

In [4]:
@runtime
@memory_location
def create_corpus():
	corpus_file = 'Hindi_Corpus.txt'
	corpus_limit = 232729
	page_count = 0
	cleaner = Cleaner()  # Assuming Cleaner class is imported or defined elsewhere.
	
	# Open the output file for writing
	with open(corpus_file, 'w', encoding='utf-8') as output:
		bar = tqdm(total=corpus_limit)  # Initialize the progress bar
		for title, text in iterate('hiwiki-latest-pages-articles.xml'):  
			text = cleaner.clean_text(text)
			cleaned_text, _ = cleaner.build_links(text)
			output.write(title + '\n' + cleaned_text + '\n')
			page_count += 1
			if page_count % 1000 == 0:
				bar.update(1000)  # Update progress bar every 1000 pages
		bar.close() 
		output.close()  # Close the output file
	print(f"\nPage count = {page_count}")  # Print total page count after processing



Preprocessing

In [5]:

def remove_stop_words(string):
	
	with open('stopwords.txt', 'r', encoding='utf-8') as stop:
		y = stop.readlines()
	stop_words = []
	for element in y:
		element = element.strip('\n')
		stop_words.extend(element.split(' '))
		
	l = string.split()  
	return_list = []
	for x in l:
		if x not in stop_words:  
			return_list.append(x) 
	return ' '.join(return_list) 


import re

def remove_foreign(x):
	string = x.split(' ')  
	s1 = [(re.compile(r'[\u0901-\u0939\u093C-\u094D\u0950-\u0954\u0958-\u0963\u097B-\u097F]')).findall(s) for s in string]
	s2 = [''.join(s) for s in s1] 
	s3 = ' '.join(s2) 
	return s3  


@runtime
@memory_location
def pre(source, destination):
	line_count = 0
	with open(source, 'r', encoding='utf-8') as input:
		with open(destination, 'w', encoding='utf-8') as output:
			bar = tqdm(total=5000000) 
			for line in input:
				string = line.replace('\n', '')
				string = remove_foreign(string)
				string = remove_stop_words(string)
				output.write(string)
				line_count += 1
				if line_count % 10000 == 0:
					bar.update(10000)  
			bar.close()



In [None]:
pre('Hindi_Corpus.txt', 'ProcessedCorpus.txt')

Working on reduced Corpus

In [6]:
def minimize_pre(so,de):
	i=10000
	with open(so, 'r',encoding="utf-8") as sor:
		with open(de, 'w',encoding="utf-8") as des:
				for line in sor:
					des.write(line[:(len(line))//i])

minimize_pre("ProcessedCorpus (copy).txt", "ProcessedCorpus.txt")

In [7]:
from collections import Counter

min_vocab=0

def list_of_words():
	with open('ProcessedCorpus.txt', 'r', encoding='utf-8') as r:
		text = r.read().split()
	word_counts = Counter(text)
	text = [word for word in tqdm(text) if word_counts[word] > min_vocab]  
	return text

def gen_distinct_vocab():
	with open('ProcessedCorpus.txt', 'r', encoding='utf-8') as r:
		text = r.read().split()
		counter = Counter(text)
		reduced_vocab = [item for item, count in counter.items() if count > min_vocab]
		text_set = set(reduced_vocab)
		global vocabulary_size
		vocabulary_size = len(text_set)
	return list(text_set)


In [8]:

global words_list
words_list=list_of_words()

100%|██████████| 3944/3944 [00:00<00:00, 3387046.47it/s]


Printing total number of tokens and vocabulary


In [9]:
vocab=gen_distinct_vocab()
vocabulary_size = len(vocab)
tokens = word_tokenize(' '.join(words_list))
grams = ngrams(tokens, 6)

In [10]:
print("Number of tokens is: ", len(tokens))
print("Size of the vocabulary: ", vocabulary_size)


Number of tokens is:  3944
Size of the vocabulary:  1774


In [13]:

cocrmat = np.zeros((len(vocab),len(vocab)),dtype=np.int64)

@runtime
@memory_location
def create_matrix():
	
	ramp=[0,1,2,3,4,5]

	for gram in tqdm(grams):

		x=list(gram)

		if x[0] in vocab:
			for i in range(1,len(x)):
				if x[i] in vocab:
					cocrmat[vocab.index(x[0])][vocab.index(x[i])] += 5-ramp[i]
					
		x.reverse()
	
		if x[0] in vocab:
			for i in range(1,len(x)):
				if x[i] in vocab:
					cocrmat[vocab.index(x[0])][vocab.index(x[i])] += 5-ramp[i]
		
create_matrix()


Location of create_matrix: 129247443228352


3939it [00:00, 5422.43it/s]

Time taken by wrapper: 0.7289161682128906 seconds





In [14]:
import math
from tqdm import tqdm

@runtime
@memory_location
def calculate_probability(matrix):
	
	row_sums = np.sum(matrix, axis=1)
	col_sums = np.sum(matrix, axis=0)

	N = np.sum(matrix)

	if(N==0):
		return 0,0

	global pi,pj

	pi = row_sums / N
	pj = col_sums / N

 
 
@runtime
@memory_location
def gen_ppmi_matrix(matrix):
	co_occurrence_matrix=matrix
 
	ppmi = np.zeros((len(vocab), len(vocab)))

	row_sums = np.sum(co_occurrence_matrix, axis=1)
	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):

			if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
				ppmi[i][j] = 0
				continue
			pij = ((co_occurrence_matrix[i][j]))/row_sums[i]
			ppmi[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

	return ppmi


In [15]:
calculate_probability(cocrmat)
ppmi_matrix=gen_ppmi_matrix(cocrmat)


Location of calculate_probability: 129247443227872
Time taken by wrapper: 0.009210586547851562 seconds
Location of gen_ppmi_matrix: 129247584307296


100%|██████████| 1774/1774 [00:01<00:00, 1659.90it/s]

Time taken by wrapper: 1.0741322040557861 seconds





Finding 10 most common nouns

In [22]:
from collections import Counter

counter = Counter(list_of_words())
common_elements = counter.most_common(10)
most_common_elements=[a for (a,_) in common_elements]
print(most_common_elements)

100%|██████████| 3944/3944 [00:00<00:00, 3489207.97it/s]

['हिन्दी', 'ॐ', 'भाषा', 'नमः', 'लिपि', 'देवनागरी', 'रूप', 'शब्द', 'संस्कृत', 'विनायकाय']





Implementing and printing the top ten nearest neighbour dictionary. (Without Multiprocessing)

In [26]:
import math

def insert_into_sorted_list(sorted_list, element):
	index = 0
	while index < len(sorted_list) and sorted_list[index][1] > element[1]:
		index += 1
	sorted_list.insert(index, element)
	return sorted_list[:-1]

def cosine(a,b):
		mag_a = math.sqrt(sum(component ** 2 for component in a))
		mag_b = math.sqrt(sum(component ** 2 for component in b))
		dot_product = sum(ai * bi for ai, bi in zip(a, b))
		if mag_a == 0 or mag_b ==0:
			return 0
		else:
			return dot_product/(mag_a*mag_b)

def find_nearest_neighbor_of_noun(index):
	l=[(vocab[0],(cosine(ppmi_matrix[index], ppmi_matrix[0]))) for _ in range(11)]
	for i in tqdm(range(1,len(vocab))):
		if i!=index and cosine(ppmi_matrix[index],ppmi_matrix[i])>l[10][1]:
				l=insert_into_sorted_list(l,(vocab[i],(cosine(ppmi_matrix[index], ppmi_matrix[i]))))
	return l

nearest_neighbour_dict={}

@runtime
@memory_location
def nearest_seq():
	for x in tqdm(most_common_elements):
		nearest_neighbour_dict[x]=find_nearest_neighbor_of_noun(vocab.index(x))

nearest_seq()

for i in nearest_neighbour_dict:
	print("Words similar to ",i," :")
	for j in nearest_neighbour_dict[i]:
		print ("  ",j[0]," : ",j[1])


Location of nearest_seq: 129247384780000


100%|██████████| 1773/1773 [00:01<00:00, 1505.06it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1670.54it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1647.37it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1588.97it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1651.83it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1664.42it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1658.14it/s]
100%|██████████| 1773/1773 [00:01<00:00, 999.76it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1162.80it/s]
100%|██████████| 1773/1773 [00:01<00:00, 1307.76it/s]
100%|██████████| 10/10 [00:12<00:00,  1.24s/it]

Time taken by wrapper: 12.395121335983276 seconds
Words similar to  हिन्दी  :
   भाषा  :  0.34723963996011337
   भारत  :  0.339430608696483
   रूप  :  0.30548409260065607
   उर्दू  :  0.300078744109324
   भारतीय  :  0.28783740082753745
   प्रयोग  :  0.26181813474453075
   भाषाओं  :  0.2580177115873143
   राजभाषा  :  0.2507951138853613
   लोगों  :  0.24454747975284472
   संख्या  :  0.24381054880907818
   आधुनिक  :  0.23801912610557582
Words similar to  ॐ  :
   नमः  :  0.8331027179229086
   सिद्धि  :  0.6230561905579799
   विनायकाय  :  0.6204863570166664
   समर्पयामि  :  0.5894912204663977
   श्री  :  0.4178855280895735
   पूजयामि  :  0.3810011342724255
   महा  :  0.36825941601983075
   विघ्न  :  0.3414698632101067
   पुष्पं  :  0.3383239111668998
   दन्ताय  :  0.3306362704857525
   राजाय  :  0.31982842044205595
Words similar to  भाषा  :
   रूप  :  0.3920582355010141
   हिन्दी  :  0.34723963996011337
   अरबी  :  0.2604790313453316
   उर्दू  :  0.25425497320377005
   भारत  :  0.2527423862




The above output cell shows the time taken by the sequential process.

Assignment 6 

Computing Co-occurrence matrix with Multiprocessing


In [18]:

n = vocabulary_size
p_mat = multiprocessing.Array('d', n*n)

def compile_grams(gram,p_mat):

	ramp=[0,1,2,3,4,5]

	leng=len(vocab)

	x=list(gram)

	if x[0] in vocab:
		for i in range(1,len(x)):
			if x[i] in vocab:
				p_mat[vocab.index(x[0])*leng+vocab.index(x[i])] += 5-ramp[i]
					
	x.reverse()
	
	if x[0] in vocab:
		for i in range(1,len(x)):
			if x[i] in vocab:
				p_mat[vocab.index(x[0])*leng+vocab.index(x[i])] += 5-ramp[i]
	

@runtime
@memory_location
def create_matrix_p(p_mat):
	grams = ngrams(tokens,6)

	jobs=[]

	for gram in tqdm(grams):
		p=multiprocessing.Process(target=compile_grams, args=(gram,p_mat,))
		p.start()
		jobs.append(p)

	for job in jobs:
		job.join()

create_matrix_p(p_mat)

p_matrix = np.zeros((n,n),dtype=np.int64)

for i in range(n):
	for j in range(n):
		p_matrix[i][j]=p_mat[i*n+j]



Location of create_matrix_p: 129247384554528


0it [00:00, ?it/s]

3939it [00:17, 226.87it/s]


Time taken by wrapper: 17.387744188308716 seconds


The above output cell shows the time taken by multiprocessing to create the co-occurrence matrix

Using Multiprocessing to compute the nearest neighbours

In [29]:


neigh = multiprocessing.Manager().dict()

def p_insert_into_sorted_list(sorted_list, element):
	index = 0
	while index < len(sorted_list) and sorted_list[index][1] > element[1]:
		index += 1
	sorted_list.insert(index, element)
	return sorted_list[:-1]

def p_cosine(a,b):
		mag_a = math.sqrt(sum(component ** 2 for component in a))
		mag_b = math.sqrt(sum(component ** 2 for component in b))
		dot_product = sum(ai * bi for ai, bi in zip(a, b))
		if mag_a == 0 or mag_b ==0:
			return 0
		else:
			return dot_product/(mag_a*mag_b)

def p_find_nearest_neighbor_of_noun(index,neigh):
	l=[(vocab[0],(p_cosine(ppmi_matrix[index], ppmi_matrix[0]))) for _ in range(11)]
	for i in tqdm(range(1,n)):
		if i!=index:
			if p_cosine(ppmi_matrix[index],ppmi_matrix[i])>l[10][1]:
				l=p_insert_into_sorted_list(l,(vocab[i],(p_cosine(ppmi_matrix[index], ppmi_matrix[i]))))

	neigh[vocab[index]]=l

@runtime
@memory_location
def parallel_nearest():
	jobs=[]
	for x in (most_common_elements):
		p=multiprocessing.Process(target=p_find_nearest_neighbor_of_noun, args=(vocab.index(x),neigh,))
		p.start()
		jobs.append(p)

	for job in jobs:
		job.join()
		
parallel_nearest()

neighbour={}

for elem in most_common_elements:
	neighbour[elem]=neigh[elem]

for i in neighbour:
	print("Words similar to ",i," :")
	for j in neighbour[i]:
		print ("  ",j[0]," : ",j[1])

Location of parallel_nearest: 129247583799552


100%|██████████| 1773/1773 [00:05<00:00, 349.73it/s]
100%|██████████| 1773/1773 [00:06<00:00, 290.30it/s]
100%|██████████| 1773/1773 [00:05<00:00, 296.10it/s]
100%|██████████| 1773/1773 [00:06<00:00, 292.32it/s]
100%|██████████| 1773/1773 [00:06<00:00, 287.42it/s]
100%|██████████| 1773/1773 [00:06<00:00, 283.14it/s]
100%|██████████| 1773/1773 [00:06<00:00, 263.23it/s]
100%|██████████| 1773/1773 [00:07<00:00, 242.77it/s]
100%|██████████| 1773/1773 [00:07<00:00, 239.49it/s]
100%|██████████| 1773/1773 [00:07<00:00, 244.52it/s]


Time taken by wrapper: 7.504637718200684 seconds
Words similar to  हिन्दी  :
   भाषा  :  0.34723963996011337
   भारत  :  0.339430608696483
   रूप  :  0.30548409260065607
   उर्दू  :  0.300078744109324
   भारतीय  :  0.28783740082753745
   प्रयोग  :  0.26181813474453075
   भाषाओं  :  0.2580177115873143
   राजभाषा  :  0.2507951138853613
   लोगों  :  0.24454747975284472
   संख्या  :  0.24381054880907818
   आधुनिक  :  0.23801912610557582
Words similar to  ॐ  :
   नमः  :  0.8331027179229086
   सिद्धि  :  0.6230561905579799
   विनायकाय  :  0.6204863570166664
   समर्पयामि  :  0.5894912204663977
   श्री  :  0.4178855280895735
   पूजयामि  :  0.3810011342724255
   महा  :  0.36825941601983075
   विघ्न  :  0.3414698632101067
   पुष्पं  :  0.3383239111668998
   दन्ताय  :  0.3306362704857525
   राजाय  :  0.31982842044205595
Words similar to  भाषा  :
   रूप  :  0.3920582355010141
   हिन्दी  :  0.34723963996011337
   अरबी  :  0.2604790313453316
   उर्दू  :  0.25425497320377005
   भारत  :  0.25274238623