Importing Local File Paths

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

corpus_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/HindiCorpus.txt"
preprocessed_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/ProcessedCorpus.txt"
xml_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/hiwiki-latest-pages-articles.xml"
stop_words_file="/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/stopwords.txt"

corpus_limit=232729
preprocess_limit=4232433
gramms_limit=38735257

Importing necessary libraries

In [None]:
from collections import Counter, defaultdict
!pip install wiki_dump_reader
from wiki_dump_reader import Cleaner, iterate

import string
import psutil
import os

import multiprocessing
from multiprocessing import Pool
import time

from nltk import ngrams, word_tokenize
import nltk
nltk.download('punkt')

import numpy as np
from tqdm import tqdm
import math

Decorator Function - Display Time needed to run

In [2]:

def record_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print("\n\n>>> Elapsed time = ",execution_time," seconds")
        
    return wrapper



Decorator Function - Display Memory used to run

In [3]:

def process_memory():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss

def record_memory(func):
    def wrapper(*args, **kwargs):

        mem_before = process_memory()
        func(*args, **kwargs)
        mem_after = process_memory()
        print("\n\n>>> Memory consumed by ", func.__name__, "() = ",mem_after - mem_before)

    return wrapper

Function to create the corpus from the Wiki Dump Reader

In [4]:
@record_time
@record_memory
def write_corpus():
	corpus=corpus_file
	page_count = 0
	cleaner = Cleaner()
	with open(corpus, 'w', encoding='utf -8') as output:
		pg_bar=tqdm(total=corpus_limit)
		#progress bar
		for title, text in iterate(xml_file):
			text = cleaner.clean_text(text)
			cleaned_text, _ = cleaner.build_links(text)
			output.write(title + '\n' + cleaned_text + '\n ')
			#write to the corpus file
			page_count += 1
			if page_count % 1000 == 0:
				pg_bar.update(1000)
				#update progress bar every 1000 lines
		pg_bar.close()
		output.close()

	print(f"\nPage count = {page_count}")




In [5]:
write_corpus()

100%|█████████▉| 232000/232729 [04:39<00:00, 830.07it/s] 


Page count = 232728


>>> Memory consumed by  write_corpus () =  12820480


>>> Elapsed time =  279.5014267789993  seconds





Functions needed for Pre Processing the corpus

In [6]:
stop_words = []

start = int('0x0900', 16)
end = int('0x097F', 16)

num_start = int('0x0966', 16)
num_end = int('0x096F', 16)

hindi_letters = [chr(code) for code in range(start, end + 1)]
hindi_letters.remove('॥')
hindi_letters.remove('।')
hindi_letters.remove('ॽ')

# hindi_letters contains only hindi letters

hindi_numbers = [chr(code) for code in range(num_start, num_end + 1)]

# hindi_numbers contains hindi numbers and hindi punctuations


def create_stop_words(file_path):
	try:
		with open(file_path, 'r', encoding="utf-8") as file:
			for line in file:
				line=line.strip()
				words=line.split(" ")
				# generates the stop_words file as list of words
				for word in words:
					stop_words.append(word)
	except:
		print()


def remove_numbers(text):
	tokens = list(text)
	filtered_tokens = []

	for token in tokens:
		if token not in hindi_numbers:
			# removes hindi numbers
			filtered_tokens.append(token)

	result = ''.join(filtered_tokens)
	return result


def remove_punctuation(text):
	translator = str.maketrans('', '', string.punctuation)
	return text.translate(translator)


def remove_whitespace(text):
	return " ".join(text.split())


create_stop_words(stop_words_file)

Function for Pre Processing the corpus for further analysis

In [7]:
def preprocess_line(text):
	text = remove_whitespace(remove_punctuation(remove_numbers(text)))

	words = text.split(" ")

	# Filter out foreign words
	filtered_tokens = []
	for word in words:
		if word in stop_words:
			continue

		breaker = False

		# if the the word contains a foreign charachter that does not count as a hindi word
		for letter in word:
			if letter not in hindi_letters:
				breaker = True
				break

		if breaker:
			continue

		filtered_tokens.append(word)

	# Reassemble the text
	processed_text = ' '.join(filtered_tokens)

	return processed_text


@record_time
@record_memory
def preprocess_corpus():

	source_file=corpus_file
	destination_file=preprocessed_file

	line_count = 0
	with open(source_file, 'r', encoding="utf-8") as source:
		with open(destination_file, 'w', encoding="utf-8") as destination:
			pg_bar=tqdm(total=preprocess_limit)
			for line in source:
				# read each line from corpus and preprocess it to another file
				destination.write(preprocess_line(line))
				line_count += 1
				if line_count % 10000 == 0:
					pg_bar.update(10000)
					#update every 10000 lines processed
			pg_bar.close()

	print(f'\nLines processed = {line_count}')



In [8]:
preprocess_corpus()

100%|█████████▉| 4230000/4232433 [04:21<00:00, 16175.46it/s]


Lines processed = 4232432


>>> Memory consumed by  preprocess_corpus () =  0


>>> Elapsed time =  261.52774737100117  seconds





Functions for generating number of tokens and vocabulary

In [9]:

def gen_token_count(file):
	token_len = 0
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			# remove '\n' statements if any
			line=line.replace('\n','')
			space_count = 0
			# just count the number of spaces in each line to get number of tokens
			for char in line:
				if char == ' ':
					space_count += 1

			token_len += space_count+1

	return token_len


def gen_token(file):
	tokens = []

	with open(file, 'r', encoding="utf-8") as f:
		# tokens generation by splitting the lines
		for line in f:
			tokens += (line.split(' '))

	return tokens


def gen_vocabulary(file):
	word_list = []
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			line=line.replace('\n','')
			word_list += (line.split(' '))

	# generate set from tokens
	word_counts = Counter(word_list)
	return word_counts

def initialize_vocabulary():
	global vocab_list
	vocab_list = list(vocab)
	# reduce vocabulary to words which have occured more than 100 times to reduced size of co-occurance matrix
	global vocab_pos
	vocab_pos = {vocab_list[i]: i for i in range(len(vocab_list))}
	global vocab_idx
	vocab_idx = vocab_pos.copy()
	vocab_idx.update({i: w for i, w in enumerate(vocab_list)})
	# global variables to determine position of token and vice versa when called

Displaying Token Count and Vocabulary Count

In [10]:
corpus_token_count = gen_token_count(corpus_file)
print("Number of Corpus Tokens :", corpus_token_count)

preprocess_token_count = gen_token_count(preprocessed_file)
print("Number of Pre Processed Corpus Tokens :", preprocess_token_count)

vocab_counts = gen_vocabulary(preprocessed_file)
print("Size of Vocabulary :", len(vocab_counts))

Number of Corpus Tokens : 86699267
Number of Pre Processed Corpus Tokens : 38735251
Size of Vocabulary : 2108503


Reducing the vocabulary to words which occur more than 100 times since the total vocabulary count cannot be allocated as a 2D matrix

In [11]:
vocab = {x for x, count in vocab_counts.items() if count >= 100}
print("Size of reduced Vocabulary :", len(vocab))
print("Vocabulary : ",vocab)
initialize_vocabulary()

Size of reduced Vocabulary : 18706
Vocabulary :  {'दिखाने', 'निर्वाचन', 'भूखंड', 'सोचना', 'तत्काल', 'प्रान्तीय', 'गुणगान', 'मलाया', 'ग्रेटर', 'आक्सीकरण', 'नेविस', 'खोलना', 'वेबर', 'रिकॉर्ड', 'स्केल्डिक', 'अनिल', 'पंचोली', 'शुमार', 'सम्पन्न', 'कोणों', 'सूक्ष्मता', 'लघुकथा', 'यूनिकोड', 'सरोवर', 'परिभाषाओं', 'बबूल', 'देखें', 'विद्यालयकेन्द्रीय', 'सपने', 'ट्रांसमिशन', 'वेल्लूर', 'जारी', 'मेघा', 'पुनरुत्पादन', 'दृश्यमान', 'सर्वांगीण', 'सोनम', 'अठारहवीं', 'दरबारी', 'नारीवादी', 'अधिकारी', 'छुपे', 'गाँवबाहरी', 'कौगनिटिव', 'धोखे', 'समझाने', 'उपनगर', 'गोंडवाना', 'पैग़म्बर', 'विसंगतियों', 'अधिवर्षघटनाएँजनवरीमार्चअप्रैलजूनजुलाईसितंबरअक्टूबरदिसम्बरअज्ञात', 'प्रभावशीलता', 'अनुपस्थिति', 'ब्लेयर', 'तुग़लक़', 'स्पीलबर्ग', 'गणना', 'जनित', 'दृष्टिगोचर', 'वाह', 'सदस्यजन्मे', 'साइरस', 'सर्किल', 'चचेरा', 'दांत', 'हिला', 'वाकर', 'भेज', 'हेलेना', 'आचार्यों', 'दीपिका', 'अफ़गानिस्तान', 'शिक्षक', 'संक्रमणों', 'परिणामस्वरुप', 'रोकने', 'इक्कीस', 'संकटग्रस्त', 'बर्डाक', 'पहलू', 'सीपी', 'अनुशासन', 'तमाशा', 'मर्सिडीज

Function to create the Co-Occurance matrix - Sequential Processing

In [12]:
@record_time
@record_memory
def co_occurances():
	file = preprocessed_file
	window = 5

	# Using a ramp of window 5
	ramp = [0] + [*range(window, 0, -1)]

	#initialize xo-occurance matrix
	occurances = np.zeros((len(vocab), len(vocab)), dtype=np.int64)

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	pg_bar=tqdm(total=gramms_limit)

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)
	#generate n-grams

	for grams in all_grams:
	#fill up the matrix from the n-grams

		if grams[0] in vocab:
			for idx, gram in enumerate(grams):
				if gram in vocab:
					occurances[vocab_idx[grams[0]]][vocab_idx[gram]] += ramp[idx]

		# doing the same as above with the gram reversed
		grams_rev = grams[::-1]

		if grams_rev[0] in vocab:
			for idx, gram in enumerate(grams_rev):
				if gram in vocab:
					occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] += ramp[idx]

		gram_count+=1

		if gram_count % 100000 == 0:
			pg_bar.update(100000)
			#display progress every 100000 grams

	pg_bar.close()

	print(f'\nGrams processed = {gram_count}')

	global cocrmat
	cocrmat= occurances


In [13]:
co_occurances()

100%|█████████▉| 38700000/38735257 [02:41<00:00, 240331.56it/s]


Grams processed = 38735256


>>> Memory consumed by  co_occurances () =  2781347840


>>> Elapsed time =  162.22181566400104  seconds





Function to create the Co-Occurance matrix - Parallel Processing

In [14]:
def parse_grams(gramss):
	window=5
	ramp = [0] + [*range(window, 0, -1)]
	occurances=[[None for _ in range(len(vocab))] for _ in range(len(vocab))]

	for grams in gramss:
	#fill matrix for designated n-grams
			if grams[0] in vocab:
				for idx, gram in enumerate(grams):
					if gram in vocab:
						if occurances[vocab_idx[grams[0]]][vocab_idx[gram]] != None:
							occurances[vocab_idx[grams[0]]][vocab_idx[gram]] += ramp[idx]
						else:
							occurances[vocab_idx[grams[0]]][vocab_idx[gram]] = ramp[idx]


			# doing the same as above with the gram reversed
			grams_rev = grams[::-1]

			if grams_rev[0] in vocab:
				for idx, gram in enumerate(grams_rev):
					if gram in vocab:
						if occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] !=None:
							occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] += ramp[idx]
						else:
							occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]]=ramp[idx]
	
	return occurances



@record_time
@record_memory
def parallel_co_occurances():
	file=preprocessed_file
	window=5

	#initialize matrix to store the final matrix
	global cocrmat_parallel
	cocrmat_parallel=[[0 for _ in range(len(vocab))] for _ in range(len(vocab))]

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)
	#num_processes = multiprocessing.cpu_count()
	num_processes=4
	# number of processes is reduced due to limitation of RAM
	process_count=(gramms_limit-1)//num_processes
	gram_parallel=[[] for _ in range(num_processes+1)]
	#grams are equally divided among the processes

	for grams in all_grams:
		gram_count+=1
		gram_parallel[gram_count//process_count].append(grams)

	all_occurances=[]
	with Pool(num_processes) as p:
		all_occurances=p.map(parse_grams,gram_parallel)
	#each gram is pooled together


	for occur in tqdm(all_occurances):
		for i in range(len(vocab)):
			for j in range(len(vocab)):
				if occur[i][j]!=None:
					cocrmat_parallel[i][j]+=occur[i][j]
	#each matrix from each multiprocessed core is added together to create the comple Co-Occurance matrix

	print(f'\nGrams processed = {gram_count}\n')


In [15]:
parallel_co_occurances()

100%|██████████| 5/5 [00:44<00:00,  8.82s/it]



Grams processed = 38735256



>>> Memory consumed by  parallel_co_occurances () =  87261184


>>> Elapsed time =  209.8120386590017  seconds


Difference between Co-Occurance Matrix created by Sequential Processing and Parallel Processing

In [32]:
def compute_difference(cocrmat,cocrmat_parallel):
	avg=0
	maxs=0

	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):
			diff=abs(cocrmat[i][j]-cocrmat_parallel[i][j])
			if (diff>maxs):
				maxs=diff
			avg+=diff
			
	print("\nAverage Difference : ",avg / (len(vocab)*len(vocab)))
	print("Maximum Difference : ",maxs)

compute_difference(cocrmat,cocrmat_parallel)

100%|██████████| 18706/18706 [01:03<00:00, 294.49it/s]


Average Difference :  0.0
Maximum Difference :  0





Function to create Pi and Pj values

In [17]:

@record_time
@record_memory
def calculate_probability():
	co_occurrence_matrix=cocrmat
	# Calculate row sums and column sums
	row_sums = np.sum(co_occurrence_matrix, axis=1)
	col_sums = np.sum(co_occurrence_matrix, axis=0)

	# Total co-occurrences
	N = np.sum(co_occurrence_matrix)

	if(N==0):
		return 0,0

	# Calculate pi and pj

	global pi,pj

	pi,pj=row_sums / N,col_sums / N



In [18]:
calculate_probability()



>>> Memory consumed by  calculate_probability () =  1572864


>>> Elapsed time =  0.238933705000818  seconds


Function to generate the Positive Pointwise Mutual Information (PPMI) matrix from the Co-Occurance matrix

In [19]:

@record_time
@record_memory
def gen_ppmi_matrix():
	co_occurrence_matrix=cocrmat
	ppmi = np.zeros((len(vocab), len(vocab)))

	row_sums = np.sum(co_occurrence_matrix, axis=1)
	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):
			#for null values in the matrix, the way to bypass division by 0 error
			if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
				ppmi[i][j] = 0
				continue
			#calculate pij values
			pij = (co_occurrence_matrix[i][j])/row_sums[i]
			ppmi[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

	global ppmi_matrix
	ppmi_matrix=ppmi


In [20]:
gen_ppmi_matrix()

100%|██████████| 18706/18706 [01:38<00:00, 189.31it/s]



>>> Memory consumed by  gen_ppmi_matrix () =  2799566848


>>> Elapsed time =  98.87386335499832  seconds





Generating the Top 15 words and choosing the Top 10 nouns from them

In [25]:
def top_10_words():
	top_15 = vocab_counts.most_common(15)
	top_15 = [i for i, _ in top_15]
	print("Top 15 popular words : ", top_15)

	global nouns
	nouns = [top_15[0], top_15[1], top_15[2], top_15[4], top_15[5],
			top_15[6], top_15[7], top_15[8], top_15[9], top_15[11]]
	print("Top 10 popular nouns : ", nouns)

top_10_words()

Top 15 popular words :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'नहीं', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'नाम', 'ईसाई', 'विशेष', 'प्रकार', 'क्रिसमस']
Top 10 popular nouns :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'ईसाई']


Functions for calculating similar words

In [26]:
def calc_norm(vec):
	#calculate norm of a vector
	sum_of_squares = sum(x**2 for x in vec)
	return math.sqrt(sum_of_squares)


def cosine_similarity(vec1, vec2):
	#find the cosine distance between two vectors
	v1 = list(vec1)
	v2 = list(vec2)

	dot_product = (0.0)

	for i in range(len(v1)):
		dot_product += (v1[i]*v2[i])
	# print(dot_product)
	norm_vec1 = calc_norm(v1)
	norm_vec2 = calc_norm(v2)
	return dot_product / (norm_vec1 * norm_vec2)



Function to display Similar words - Sequentaial Processing

In [27]:
def find_similar_words(target_word):
	if target_word not in vocab:
		print("Target word not found in the vocabulary.")
		return

	target_index = vocab_pos[target_word]
	target_vector = ppmi_matrix[target_index]

	similarities = []

	#since the vocabulary is not large enough, it is more efficient to calculate all values
	#as a list than storing only the top 10
	for i, vector in enumerate(ppmi_matrix):
		sim = cosine_similarity(target_vector, vector)
		similarities.append((vocab_idx[i], sim))

	similarities = sorted(similarities, key=lambda x: x[1])

	similarities.reverse()

	#return the top 10 words other than the word itself
	return similarities[1:11]

@record_time
@record_memory
def similar_sequential():

	#run for each noun
	for noun in nouns:
		target_word = noun
		similar_words = find_similar_words(target_word)

		print(f"Words similar to '{target_word}':")
		for word, similarity in similar_words:
			print(f"	{word}: {similarity}")

In [28]:
similar_sequential()

Words similar to 'रूप':
	नहीं: 0.948707221861126
	उन्होंने: 0.9300621013389426
	सबसे: 0.9295931282106785
	नाम: 0.9277516621623078
	कारण: 0.9259441237081197
	शामिल: 0.9186821937048162
	काम: 0.9028323314003919
	प्राप्त: 0.9000391196345234
	मुख्य: 0.8995005017423204
	प्रमुख: 0.8914432497554496
Words similar to 'हिप्पोकैम्पस':
	विभेदस्थानिक: 0.7207213286524591
	भूमिकाहिप्पोकैम्पस: 0.7162927728740851
	दिशानिर्देशन: 0.6929263481764719
	भूमिकाा: 0.6427220666189856
	भूमिकाे: 0.6323230290634048
	भूमिकां: 0.6242357862046933
	भूमिकाी: 0.5855576984290629
	स्मृति: 0.3232409766622333
	मेडियल: 0.3204077314809079
	भूमिकारूप: 0.31689994782205255
Words similar to 'उपहार':
	उत्पत्तिउपहार: 0.5085022245806062
	सिंटरक्लास: 0.3055436794275973
	उत्पत्तिर: 0.3023470627281464
	मिलतेआधुनिक: 0.28301821826856083
	दिन: 0.2691051745029472
	देने: 0.268129770620607
	क्रिसमस: 0.2673538038063329
	घर: 0.26472965886513705
	बच्चों: 0.2641047079927661
	लोगों: 0.26390195090873325
Words similar to 'राज्य':
	भारत: 0.8729305925

Function to display Similar words - Parallel Processing

In [29]:

def compute_cosine(data):
	return cosine_similarity(data[0], ppmi_matrix[data[1]])

def p_find_similar_words(target_word):
	if target_word not in vocab:
		print("Target word not found in the vocabulary.")
		return

	target_index = vocab_pos[target_word]
	target_vector = ppmi_matrix[target_index]

	cpu_count=multiprocessing.cpu_count()

	similarities=[]
	#mapper stores the needed variables as a tuple
	mapper=[(target_vector,i) for i in range(len(vocab))]


	with Pool(cpu_count) as p:
		similarities=p.map(compute_cosine,mapper)
	#pool all the processes over the cores to get list of similarities

	similars=[]

	for i in range(len(vocab)):
		similars.append((vocab_idx[i],similarities[i]))
	#map each similarity to its designated word to be displayed

	similars = sorted(similars, key=lambda x: x[1])

	similars.reverse()

	#return top 10 words other than itself
	return similars[1:11]

@record_time
@record_memory
def similar_parallel():
	#run for all nouns
	for noun in nouns:
		target_word = noun
		similar_words = p_find_similar_words(target_word)

		print(f"Words similar to '{target_word}':")
		for word, similarity in similar_words:
			print(f"	{word}: {similarity}")




In [30]:
similar_parallel()

Words similar to 'रूप':
	नहीं: 0.948707221861126
	उन्होंने: 0.9300621013389426
	सबसे: 0.9295931282106785
	नाम: 0.9277516621623078
	कारण: 0.9259441237081197
	शामिल: 0.9186821937048162
	काम: 0.9028323314003919
	प्राप्त: 0.9000391196345234
	मुख्य: 0.8995005017423204
	प्रमुख: 0.8914432497554496
Words similar to 'हिप्पोकैम्पस':
	विभेदस्थानिक: 0.7207213286524591
	भूमिकाहिप्पोकैम्पस: 0.7162927728740851
	दिशानिर्देशन: 0.6929263481764719
	भूमिकाा: 0.6427220666189856
	भूमिकाे: 0.6323230290634048
	भूमिकां: 0.6242357862046933
	भूमिकाी: 0.5855576984290629
	स्मृति: 0.3232409766622333
	मेडियल: 0.3204077314809079
	भूमिकारूप: 0.31689994782205255
Words similar to 'उपहार':
	उत्पत्तिउपहार: 0.5085022245806062
	सिंटरक्लास: 0.3055436794275973
	उत्पत्तिर: 0.3023470627281464
	मिलतेआधुनिक: 0.28301821826856083
	दिन: 0.2691051745029472
	देने: 0.268129770620607
	क्रिसमस: 0.2673538038063329
	घर: 0.26472965886513705
	बच्चों: 0.2641047079927661
	लोगों: 0.26390195090873325
Words similar to 'राज्य':
	भारत: 0.8729305925

The Parallel Processing of Co-Occurance matrix seems to run slower than the Sequential Processing by an extra 48 seconds, owing to the fact that adding the matrices from the output of the multiprocessing units takes a lot of time and I could not use more than 5 processes of Pool as I had limited RAM.

The Parallel Processing of Cosine distance was very effective in comparison to the Sequential Processing. It was 5.8 times faster than sequential processing.

Conclusion -- When large variables are used Parallel Processing doesn't prove effective if there isn't RAM to spare but otherwise it decreases runtime significantly.