Importing necessary files and Initialization

In [1]:

corpus_file = "HindiCorpus.txt"
preprocessed_file = "ProcessedCorpus.txt"
xml_file = "hiwiki-latest-pages-articles.xml"
stop_words_file="stopwords.txt"

corpus_limit=232729
preprocess_limit=4232433
gramms_limit=38735257 

# %% [markdown]
# Importing necessary libraries

# %%
import multiprocessing 
import time

from collections import Counter, defaultdict
from wiki_dump_reader import Cleaner, iterate

import string
import ctypes

from nltk import ngrams, word_tokenize
import nltk
nltk.download('punkt')

import numpy as np
from tqdm import tqdm
import math


[nltk_data] Downloading package punkt to /home/anurag/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Minimize Pre

In [1]:
def minimize_pre(so,de):
	i=10000
	with open(so, 'r',encoding="utf-8") as sor:
		with open(de, 'w',encoding="utf-8") as des:
				for line in sor:
					des.write(line[:(len(line)//i)])
	
minimize_pre('ProcessedCorpus (copy).txt','ProcessedCorpus.txt')

Common Functions

In [2]:
# %% [markdown]
# Functions for generating number of tokens and vocabulary

# %%
def gen_vocabulary(file):
	word_list = []
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			line=line.replace('\n','')

			word_list += (line.split(' '))

	word_counts = Counter(word_list)
	return word_counts

# %% [markdown]
# Displaying Token Count and Vocabulary Count

# %%

vocab_counts = gen_vocabulary(preprocessed_file)

# %% [markdown]
# Reducing the vocabulary to words which occur more than 100 times since the total vocabulary count cannot be allocated as a 2D matrix

# %%

vocab = {x for x, count in vocab_counts.items() if count >= 100}


# %% [markdown]
# Necessary variables for making the Co-Occurance matrix

# %%
vocab_list = list(vocab)
vocab_pos = {vocab_list[i]: i for i in range(len(vocab_list))}
vocab_idx = vocab_pos.copy()
vocab_idx.update({i: w for i, w in enumerate(vocab_list)})



Sequential Processing - Co Occurance Matrix

In [3]:

def co_occurances(file):
	window = 5

	# Using a ramp of window 4
	ramp = [0] + [*range(window, 0, -1)]

	occurances = np.zeros((len(vocab), len(vocab)), dtype=np.int64)

	gram_count=0
	line=''
 
	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	pg_bar=tqdm(total=gramms_limit)	

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)

	for grams in all_grams:
		if grams[0] in vocab:
			for idx, gram in enumerate(grams):
				if gram in vocab:
					occurances[vocab_idx[grams[0]]][vocab_idx[gram]] += ramp[idx]

		# Doing the same as above with the gram reversed
		grams_rev = grams[::-1]

		if grams_rev[0] in vocab:
			for idx, gram in enumerate(grams_rev):
				if gram in vocab:
					occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] += ramp[idx]

		gram_count+=1

		if gram_count % 100000 == 0:
			pg_bar.update(100000)

	pg_bar.close()

	print(f'\nGrams processed = {gram_count}')
	
	global cocrmat
	cocrmat=occurances

co_occurances(preprocessed_file)


 10%|█         | 3900000/38735257 [00:16<02:30, 232120.67it/s]


Grams processed = 3975363





Parallel Processing - Co Occurance Matrix

In [43]:
def parse_grams(ramp,gramss,occurances):

	for grams in gramss:
			if grams[0] in vocab:
				for idx, gram in enumerate(grams):
					if gram in vocab:
						occurances[vocab_idx[grams[0]]*len(vocab)+vocab_idx[gram]] += ramp[idx]

			# Doing the same as above with the gram reversed
			grams_rev = grams[::-1]

			if grams_rev[0] in vocab:
				for idx, gram in enumerate(grams_rev):
					if gram in vocab:
						occurances[vocab_idx[grams_rev[0]]*len(vocab)+vocab_idx[gram]] += ramp[idx]



def parallel_co_occurances(file):
	window = 5

	# Using a ramp of window 4
	ramp = [0] + [*range(window, 0, -1)]

	occurances=multiprocessing.Array('i', (len(vocab)* len(vocab)))

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)
	num_processes = multiprocessing.cpu_count()
	num_processes=15
	process_count=(gramms_limit-1)//num_processes
	gram_parallel=[[] for _ in range(num_processes+1)]

	for grams in all_grams:
		gram_count+=1
		gram_parallel[gram_count//process_count].append(grams)

	gram_count=0

	p=[]

	for i in range(len(gram_parallel)):
		p.append(multiprocessing.Process(target=parse_grams, args=(ramp,gram_parallel[i],occurances,)))

	for i in tqdm(range(len(p))):
		p[i].start()

	for i in tqdm(range(len(p))):
		p[i].join()



	print(f'\nGrams processed = {gram_count}')

	mat=np.zeros((len(vocab), len(vocab)), dtype=np.int64)
	
	for i in tqdm(range(len(vocab)*len(vocab))):
		mat[i%len(vocab)][i//len(vocab)]=occurances[i]

	return mat

cocrmat_p = parallel_co_occurances(preprocessed_file)


100%|██████████| 16/16 [00:00<00:00, 22.63it/s]
100%|██████████| 16/16 [00:26<00:00,  1.63s/it]



Grams processed = 0


100%|██████████| 14615329/14615329 [00:08<00:00, 1787394.05it/s]


In [34]:
avg=0
maxs=0

for i in tqdm(range(len(vocab))):
    for j in range(len(vocab)):
        diff=abs(cocrmat[i][j]-cocrmat_p[i][j])
        if (diff>maxs):
            maxs=diff
        avg+=diff
        
print(avg / (len(vocab)*len(vocab)))
print(maxs)

100%|██████████| 3823/3823 [00:06<00:00, 601.29it/s]

0.0
0





Common Functions

In [18]:
# %% [markdown]
# Calculating overall Pi and Pj values of the Co-Occurance matrix

# %%

def calculate_probability(co_occurrence_matrix):
	# Calculate row sums and column sums
	row_sums = np.sum(co_occurrence_matrix, axis=1)
	col_sums = np.sum(co_occurrence_matrix, axis=0)

	# Total co-occurrences
	N = np.sum(co_occurrence_matrix)

	if (N==0):
		return 0,0

	# Calculate pi and pj
	pi = row_sums / N
	pj = col_sums / N

	return pi, pj


# Calculate pi and pj
pi, pj = calculate_probability(cocrmat)

# %% [markdown]
# Generating the Positive Pointwise Mutual Information (PPMI) matrix from the Co-Occurance matrix

# %%

def gen_ppmi_matrix(co_occurrence_matrix):
	ppmi_matrix = np.zeros((len(vocab), len(vocab)))

	row_sums = np.sum(co_occurrence_matrix, axis=1)
	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):

			if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
				ppmi_matrix[i][j] = 0
				continue
			pij = ((co_occurrence_matrix[i][j]))/row_sums[i]
			ppmi_matrix[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

	return ppmi_matrix


ppmi_matrix = gen_ppmi_matrix(cocrmat)

# %% [markdown]
# Functions for calculating similar words

# %%
def calc_norm(vec):
	sum_of_squares = sum(x**2 for x in vec)
	return math.sqrt(sum_of_squares)


def cosine_similarity(vec1, vec2):
	v1 = list(vec1)
	v2 = list(vec2)

	dot_product = (0.0)

	for i in range(len(v1)):
		temp = dot_product
		dot_product += (v1[i]*v2[i])
	# print(dot_product)
	norm_vec1 = calc_norm(v1)
	norm_vec2 = calc_norm(v2)
	return dot_product / (norm_vec1 * norm_vec2)


def reshape_top(words, extra):
	low = 0
	high = len(words) - 1

	while low <= high:
		mid = (low + high) // 2
		if words[mid][1] < extra[1]:
			low = mid + 1
		elif words[mid][1] > extra[1]:
			high = mid - 1
		else:
			break

	words.insert(low, extra)
	return words[1:]


def find_similar_words(target_word, vocab, ppmi_matrix, top_n=10):
	top_n += 1
	if target_word not in vocab:
		print("Target word not found in the vocabulary.")
		return

	target_index = vocab_pos[target_word]
	target_vector = ppmi_matrix[target_index]

	similarities = [(vocab_idx[0], 0.0) for _ in range(top_n)]

	for i, vector in enumerate(ppmi_matrix):
		sim = cosine_similarity(target_vector, vector)
		similarities = reshape_top(similarities, (vocab_idx[i], sim))

	similarities.reverse()

	return similarities[1:]

  0%|          | 0/3823 [00:00<?, ?it/s]

100%|██████████| 3823/3823 [00:05<00:00, 663.41it/s]


Generating the Top 15 words and choosing the Top 10 nouns from them

In [20]:
top_15 = vocab_counts.most_common(15)
top_15 = [i for i, _ in top_15]
print("Top 15 popular words : ", top_15)

nouns = [top_15[0], top_15[1], top_15[2], top_15[4], top_15[5],
		top_15[6], top_15[7], top_15[8], top_15[9], top_15[11]]
print("Top 10 popular nouns : ", nouns)

Top 15 popular words :  ['राज्य', 'घी', 'प्रदेश', 'मद्रास', 'दक्षिण', 'जिले', 'रूप', 'नाम', 'नहीं', 'आन्ध्र', 'जिलों', 'केनरा', 'निर्माण', 'बीजिंग', 'मक्खन']
Top 10 popular nouns :  ['राज्य', 'घी', 'प्रदेश', 'दक्षिण', 'जिले', 'रूप', 'नाम', 'नहीं', 'आन्ध्र', 'केनरा']


Sequential Processing - Displaying similar words for the nouns

In [None]:
for noun in nouns:
	target_word = noun
	similar_words = find_similar_words(target_word, vocab, ppmi_matrix)

	print(f"Words similar to '{target_word}':")
	for word, similarity in similar_words:
		print(f"	{word}: {similarity}")
	print()

Parallel Processing - Displaying similar words for the nouns

In [6]:

def similar_noun(noun,num,return_dict):
	similars=""

	target_word = noun
	similar_words = find_similar_words(target_word, vocab, ppmi_matrix)

	similars+=(f"Words similar to '{target_word}':\n")
	for word, similarity in similar_words:
		similars+=(f"	{word}: {similarity}\n")

	return_dict[num]=similars

return_dict=multiprocessing.Manager().dict()
jobs=[]

for i in range(10):
	p=multiprocessing.Process(target=similar_noun, args=(nouns[i],i,return_dict,))
	jobs.append(p)
	p.start()

for job in jobs:
	job.join()

for i in range(10):
	print(return_dict[i])



Words similar to 'राज्य':
	भारत: 0.7943134831751983
	क्षेत्र: 0.7867147826430491
	देश: 0.7634178143134698
	रूप: 0.7551294184104643
	प्रमुख: 0.7490510195514426
	भारतीय: 0.7483550881219223
	सबसे: 0.7474029377969469
	शहर: 0.7464457790116953
	नाम: 0.7443590051920376
	पूर्व: 0.7420841912880451

Words similar to 'प्रदेश':
	उत्तर: 0.7604822167311955
	भारत: 0.7553183325819316
	क्षेत्र: 0.7489451229986199
	राज्य: 0.7368517065547928
	जिले: 0.7255353249416687
	भारतीय: 0.7170110662178009
	प्रमुख: 0.7150667354134218
	नाम: 0.7108643681774648
	स्थित: 0.7104384713702615
	पूर्व: 0.705646309832619

Words similar to 'घी':
	सघंटक: 0.682440569736589
	प्रतिशतघी: 0.6724056759941595
	उत्पन्नघी: 0.6167697005194525
	मक्खन: 0.5377301258917578
	अम्ल: 0.40557211515069347
	भार: 0.3662217244214159
	दूध: 0.36215728168234446
	दही: 0.35393713542482963
	तेल: 0.29355357318529185
	गुड़: 0.29292152199018046

Words similar to 'नहीं':
	रूप: 0.9646278795039884
	कारण: 0.9492937586060992
	उन्होंने: 0.929825396645016
	प्रकार: 0.

In [29]:


mat = np.zeros((9, 9), dtype=np.int64)
occurances=multiprocessing.Manager().dict()

for i in range(len(mat)):
	occurances[i]=dict()
	for j in range(len(mat)):
		l=occurances[i]
		l[j]=3
		occurances[i]=l

print(occurances)

{0: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 1: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 2: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 3: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 4: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 5: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 6: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 7: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}, 8: {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3}}


In [49]:
s=np.zeros((2, 2), dtype=np.int64)
occurances=multiprocessing.Array('i',[[3,4], [2,2]])
print(occurances[1])

TypeError: 'list' object cannot be interpreted as an integer