Importing necessary files and Initialization from Assignment 4



In [2]:
# %% [markdown]
# Importing Local File Paths

# %%
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

corpus_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/HindiCorpus.txt"
preprocessed_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/ProcessedCorpus.txt"
xml_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/hiwiki-latest-pages-articles.xml"
stop_words_file="/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/stopwords.txt"

corpus_limit=232729
preprocess_limit=4232433
gramms_limit=38735257

# %% [markdown]
# Importing necessary libraries

# %%
import multiprocessing
import time


from collections import Counter, defaultdict
!pip install wiki_dump_reader
from wiki_dump_reader import Cleaner, iterate

import string

from nltk import ngrams, word_tokenize
import nltk
nltk.download('punkt')

import numpy as np
from tqdm import tqdm
import math


Mounted at /content/drive/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Common Functions

In [11]:
# %% [markdown]
# Functions for generating number of tokens and vocabulary

# %%
def gen_vocabulary(file):
	word_list = []
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			word_list += (line.split(' '))

	word_counts = Counter(word_list)
	return word_counts

# %% [markdown]
# Displaying Token Count and Vocabulary Count

# %%

vocab_counts = gen_vocabulary(preprocessed_file)

# %% [markdown]
# Reducing the vocabulary to words which occur more than 100 times since the total vocabulary count cannot be allocated as a 2D matrix

# %%

vocab = {x for x, count in vocab_counts.items() if count >= 100}

# %% [markdown]
# Necessary variables for making the Co-Occurance matrix

# %%
vocab_list = list(vocab)
vocab_pos = {vocab_list[i]: i for i in range(len(vocab_list))}
vocab_idx = vocab_pos.copy()
vocab_idx.update({i: w for i, w in enumerate(vocab_list)})



Sequential Processing - Co Occurance Matrix

In [12]:
def co_occurances(file):
	window = 5

	# Using a ramp of window 4
	ramp = [0] + [*range(window, 0, -1)]

	occurances = np.zeros((len(vocab), len(vocab)), dtype=np.int64)

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	pg_bar=tqdm(total=gramms_limit)

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)

	for grams in all_grams:

		if grams[0] in vocab:
			for idx, gram in enumerate(grams):
				if gram in vocab:
					occurances[vocab_idx[grams[0]]][vocab_idx[gram]] += ramp[idx]

		# Doing the same as above with the gram reversed
		grams_rev = grams[::-1]

		if grams_rev[0] in vocab:
			for idx, gram in enumerate(grams_rev):
				if gram in vocab:
					occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] += ramp[idx]

		gram_count+=1

		if gram_count % 100000 == 0:
			pg_bar.update(100000)

	pg_bar.close()

	print(f'\nGrams processed = {gram_count}')

	return occurances


start_time = time.perf_counter()

cocrmat = co_occurances(preprocessed_file)

end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("\n\nElapsed time:", elapsed_time, "seconds")



100%|█████████▉| 38700000/38735257 [03:04<00:00, 209991.78it/s]


Grams processed = 38735256


Elapsed time: 185.52026572000068 seconds





Parallel Processing - Co Occurance Matrix

In [13]:
def parse_grams(ramp,gramss,occurances):

	for grams in gramss:
			if grams[0] in vocab:
				for idx, gram in enumerate(grams):
					if gram in vocab:
						occurances[vocab_idx[grams[0]]*len(vocab)+vocab_idx[gram]] += ramp[idx]

			# Doing the same as above with the gram reversed
			grams_rev = grams[::-1]

			if grams_rev[0] in vocab:
				for idx, gram in enumerate(grams_rev):
					if gram in vocab:
						occurances[vocab_idx[grams_rev[0]]*len(vocab)+vocab_idx[gram]] += ramp[idx]



def parallel_co_occurances(file):
	window = 5

	# Using a ramp of window 4
	ramp = [0] + [*range(window, 0, -1)]

	occurances=multiprocessing.Array('i', (len(vocab)* len(vocab)))

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp



	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)
	num_processes = multiprocessing.cpu_count()
	process_count=(gramms_limit-1)//num_processes
	gram_parallel=[[] for _ in range(num_processes+1)]

	for grams in all_grams:
		gram_count+=1
		gram_parallel[gram_count//process_count].append(grams)

	gram_count=0

	p=[]

	for i in range(len(gram_parallel)):
		p.append(multiprocessing.Process(target=parse_grams, args=(ramp,gram_parallel[i],occurances,)))

	for i in tqdm(range(len(p))):
		p[i].start()

	for i in tqdm(range(len(p))):
		p[i].join()



	print(f'\nGrams processed = {gram_count}')

	mat=np.zeros((len(vocab), len(vocab)), dtype=np.int64)

	for i in tqdm(range(len(vocab)*len(vocab))):
		mat[i%len(vocab)][i//len(vocab)]=occurances[i]

	return mat


start_time = time.perf_counter()

cocrmat_p = parallel_co_occurances(preprocessed_file)

end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("\n\nElapsed time:", elapsed_time, "seconds")


100%|██████████| 17/17 [00:01<00:00, 13.03it/s]
100%|██████████| 17/17 [17:37<00:00, 62.18s/it]   



Grams processed = 0


100%|██████████| 349914436/349914436 [03:15<00:00, 1792867.45it/s]




Elapsed time: 1321.775573731 seconds


Checking Relation between Co-Occurance matrix generated from Sequential and Parallel operation.

In [14]:
avg=0
max_diff=0

for i in tqdm(range(len(vocab))):
    for j in range(len(vocab)):
        diff=abs(cocrmat[i][j]-cocrmat_p[i][j])
        if (diff>max_diff):
            max_diff=diff
        avg+=diff

avg /= (len(vocab)*len(vocab))

print ("Average Difference - ",avg)
print("Maximum Difference - ",max_diff)

100%|██████████| 18706/18706 [02:04<00:00, 150.17it/s]

Average Difference -  0.00019986314597206272
Maximum Difference -  6400





Common Functions

In [15]:
# %% [markdown]
# Calculating overall Pi and Pj values of the Co-Occurance matrix

# %%

def calculate_probability(co_occurrence_matrix):
	# Calculate row sums and column sums
	row_sums = np.sum(co_occurrence_matrix, axis=1)
	col_sums = np.sum(co_occurrence_matrix, axis=0)

	# Total co-occurrences
	N = np.sum(co_occurrence_matrix)

	if (N==0):
		return 0,0

	# Calculate pi and pj
	pi = row_sums / N
	pj = col_sums / N

	return pi, pj


# Calculate pi and pj
pi, pj = calculate_probability(cocrmat)

# %% [markdown]
# Generating the Positive Pointwise Mutual Information (PPMI) matrix from the Co-Occurance matrix

# %%

def gen_ppmi_matrix(co_occurrence_matrix):
	ppmi_matrix = np.zeros((len(vocab), len(vocab)))

	row_sums = np.sum(co_occurrence_matrix, axis=1)
	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):

			if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
				ppmi_matrix[i][j] = 0
				continue
			pij = ((co_occurrence_matrix[i][j]))/row_sums[i]
			ppmi_matrix[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

	return ppmi_matrix


ppmi_matrix = gen_ppmi_matrix(cocrmat)

# %% [markdown]
# Functions for calculating similar words

# %%
def calc_norm(vec):
	sum_of_squares = sum(x**2 for x in vec)
	return math.sqrt(sum_of_squares)


def cosine_similarity(vec1, vec2):
	v1 = list(vec1)
	v2 = list(vec2)

	dot_product = (0.0)

	for i in range(len(v1)):
		temp = dot_product
		dot_product += (v1[i]*v2[i])
	# print(dot_product)
	norm_vec1 = calc_norm(v1)
	norm_vec2 = calc_norm(v2)
	return dot_product / (norm_vec1 * norm_vec2)


def reshape_top(words, extra):
	low = 0
	high = len(words) - 1

	while low <= high:
		mid = (low + high) // 2
		if words[mid][1] < extra[1]:
			low = mid + 1
		elif words[mid][1] > extra[1]:
			high = mid - 1
		else:
			break

	words.insert(low, extra)
	return words[1:]


def find_similar_words(target_word, ppmi_matrix, top_n=10):
	top_n += 1
	if target_word not in vocab:
		print("Target word not found in the vocabulary.")
		return

	target_index = vocab_pos[target_word]
	target_vector = ppmi_matrix[target_index]

	similarities = [(vocab_idx[0], 0.0) for _ in range(top_n)]

	for i, vector in enumerate(ppmi_matrix):
		sim = cosine_similarity(target_vector, vector)
		similarities = reshape_top(similarities, (vocab_idx[i], sim))

	similarities.reverse()

	return similarities[1:]

100%|██████████| 18706/18706 [01:55<00:00, 162.53it/s]


Generating the Top 15 words and choosing the Top 10 nouns from them

In [18]:
top_15 = vocab_counts.most_common(15)
top_15 = [i for i, _ in top_15]
print("Top 15 popular words : ", top_15)

nouns = [top_15[0], top_15[1], top_15[2], top_15[4], top_15[5],
		top_15[6], top_15[7], top_15[8], top_15[9], top_15[11]]
print("Top 10 popular nouns : ", nouns)

Top 15 popular words :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'नहीं', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'नाम', 'ईसाई', 'विशेष', 'प्रकार', 'क्रिसमस']
Top 10 popular nouns :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'ईसाई']


Sequential Processing - Displaying similar words for the nouns

In [17]:
for noun in nouns:
	target_word = noun
	similar_words = find_similar_words(target_word, ppmi_matrix)

	print(f"Words similar to '{target_word}':")
	for word, similarity in similar_words:
		print(f"	{word}: {similarity}")

Words similar to 'रूप':
	नहीं: 0.9487072218611307
	उन्होंने: 0.930062101338945
	सबसे: 0.9295931282106806
	नाम: 0.9277516621623244
	कारण: 0.9259441237081271
	शामिल: 0.9186821937048143
	काम: 0.9028323314003853
	प्राप्त: 0.900039119634523
	मुख्य: 0.8995005017423224
	प्रमुख: 0.8914432497554596
Words similar to 'हिप्पोकैम्पस':
	विभेदस्थानिक: 0.7207213286524589
	भूमिकाहिप्पोकैम्पस: 0.7162927728740857
	दिशानिर्देशन: 0.6929263481764726
	भूमिकाा: 0.6427220666189857
	भूमिकाे: 0.6323230290634051
	भूमिकां: 0.6242357862046934
	भूमिकाी: 0.5855576984290632
	स्मृति: 0.3232409766622346
	मेडियल: 0.32040773148090806
	भूमिकारूप: 0.3168999478220528
Words similar to 'उपहार':
	उत्पत्तिउपहार: 0.5085022245806068
	सिंटरक्लास: 0.3055436794275975
	उत्पत्तिर: 0.3023470627281466
	मिलतेआधुनिक: 0.283018218268561
	दिन: 0.26910517450294663
	देने: 0.26812977062060644
	क्रिसमस: 0.2673538038063325
	घर: 0.2647296588651374
	बच्चों: 0.26410470799276714
	लोगों: 0.26390195090873525
Words similar to 'राज्य':
	भारत: 0.8729305925

KeyboardInterrupt: 

Parallel Processing - Displaying similar words for the nouns

In [19]:

def similar_noun(noun,num,return_dict):
	similars=""

	target_word = noun
	similar_words = find_similar_words(target_word, vocab, ppmi_matrix)

	similars+=(f"Words similar to '{target_word}':\n")
	for word, similarity in similar_words:
		similars+=(f"	{word}: {similarity}\n")

	return_dict[num]=similars

return_dict=multiprocessing.Manager().dict()
jobs=[]

for i in range(10):
	p=multiprocessing.Process(target=similar_noun, args=(nouns[i],i,return_dict,))
	jobs.append(p)
	p.start()

for job in jobs:
	job.join()

for i in range(10):
	print(return_dict[i])



Words similar to 'रूप':
	नहीं: 0.9487072218611307
	उन्होंने: 0.930062101338945
	सबसे: 0.9295931282106806
	नाम: 0.9277516621623244
	कारण: 0.9259441237081271
	शामिल: 0.9186821937048143
	काम: 0.9028323314003853
	प्राप्त: 0.900039119634523
	मुख्य: 0.8995005017423224
	प्रमुख: 0.8914432497554596

Words similar to 'हिप्पोकैम्पस':
	विभेदस्थानिक: 0.7207213286524589
	भूमिकाहिप्पोकैम्पस: 0.7162927728740857
	दिशानिर्देशन: 0.6929263481764726
	भूमिकाा: 0.6427220666189857
	भूमिकाे: 0.6323230290634051
	भूमिकां: 0.6242357862046934
	भूमिकाी: 0.5855576984290632
	स्मृति: 0.3232409766622346
	मेडियल: 0.32040773148090806
	भूमिकारूप: 0.3168999478220528

Words similar to 'उपहार':
	उत्पत्तिउपहार: 0.5085022245806068
	सिंटरक्लास: 0.3055436794275975
	उत्पत्तिर: 0.3023470627281466
	मिलतेआधुनिक: 0.283018218268561
	दिन: 0.26910517450294663
	देने: 0.26812977062060644
	क्रिसमस: 0.2673538038063325
	घर: 0.2647296588651374
	बच्चों: 0.26410470799276714
	लोगों: 0.26390195090873525

Words similar to 'राज्य':
	भारत: 0.8729305