Importing Local File Paths

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

corpus_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/HindiCorpus.txt"
preprocessed_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/ProcessedCorpus.txt"
xml_file = "/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/hiwiki-latest-pages-articles.xml"
stop_words_file="/content/drive/MyDrive/Colab Notebooks/AnuragBMC202309_Assignment04/stopwords.txt"

corpus_limit=232729
preprocess_limit=4232433
gramms_limit=38735257

Mounted at /content/drive/


Importing necessary libraries

In [2]:
from collections import Counter, defaultdict
!pip install wiki_dump_reader
from wiki_dump_reader import Cleaner, iterate

import string

from nltk import ngrams, word_tokenize
import nltk
nltk.download('punkt')

import numpy as np
from tqdm import tqdm
import math

Collecting wiki_dump_reader
  Downloading wiki-dump-reader-0.0.4.tar.gz (3.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wiki_dump_reader
  Building wheel for wiki_dump_reader (setup.py) ... [?25l[?25hdone
  Created wheel for wiki_dump_reader: filename=wiki_dump_reader-0.0.4-py3-none-any.whl size=3983 sha256=a323e8f6f133e7be20c15cce2d6dee11c1fdd28678be51470e7c408e31309c87
  Stored in directory: /root/.cache/pip/wheels/78/81/3d/463b7f906f65d3e9e43db8446ebc5fb719bf1777a40b411cd2
Successfully built wiki_dump_reader
Installing collected packages: wiki_dump_reader
Successfully installed wiki_dump_reader-0.0.4


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Creating the corpus from the Wiki Dump Reader

In [None]:
def write_corpus(corpus):

	page_count = 0
	cleaner = Cleaner()
	with open(corpus, 'w', encoding='utf -8') as output:
		pg_bar=tqdm(total=corpus_limit)
		for title, text in iterate(xml_file):
			text = cleaner.clean_text(text)
			cleaned_text, _ = cleaner.build_links(text)
			output.write(title + '\n' + cleaned_text + '\n ')
			page_count += 1
			if page_count % 1000 == 0:
				pg_bar.update(1000)
		pg_bar.close()
		output.close()
	print(f"\nPage count = {page_count}")


write_corpus(corpus_file)

100%|█████████▉| 232000/232729 [20:39<00:03, 187.14it/s]


Page count = 232728





Creating functions needed for Pre Processing the corpus

In [3]:
stop_words = []

start = int('0x0900', 16)
end = int('0x097F', 16)

num_start = int('0x0966', 16)
num_end = int('0x096F', 16)

hindi_letters = [chr(code) for code in range(start, end + 1)]
hindi_letters.remove('॥')
hindi_letters.remove('।')
hindi_letters.remove('ॽ')

hindi_numbers = [chr(code) for code in range(num_start, num_end + 1)]


def create_stop_words(file_path):
	try:
		with open(file_path, 'r', encoding="utf-8") as file:
			for line in file:
				line=line.strip()
				words=line.split(" ")
				for word in words:
					stop_words.append(word)
	except:
		print()


def remove_numbers(text):
	tokens = list(text)
	filtered_tokens = []

	for token in tokens:
		if token not in hindi_numbers:
			filtered_tokens.append(token)

	result = ''.join(filtered_tokens)
	return result


def remove_punctuation(text):
	translator = str.maketrans('', '', string.punctuation)
	return text.translate(translator)


def remove_whitespace(text):
	return " ".join(text.split())


create_stop_words(stop_words_file)

Pre Processing the corpus for further analysis

In [4]:
def preprocess_line(text):
	text = remove_whitespace(remove_punctuation(remove_numbers(text)))

	words = text.split(" ")

	# Filter out foreign words
	filtered_tokens = []
	for word in words:
		if word in stop_words:
			continue

		breaker = False
		for letter in word:
			if letter not in hindi_letters:
				breaker = True
				break

		if breaker:
			continue

		filtered_tokens.append(word)

	# Reassemble the text
	processed_text = ' '.join(filtered_tokens)

	return processed_text


def preprocess_corpus(source_file, destination_file):
	line_count = 0
	with open(source_file, 'r', encoding="utf-8") as source:
		with open(destination_file, 'w', encoding="utf-8") as destination:
			pg_bar=tqdm(total=preprocess_limit)
			for line in source:
				destination.write(preprocess_line(line))
				line_count += 1
				if line_count % 10000 == 0:
					pg_bar.update(10000)
			pg_bar.close()

	print(f'\nLines processed = {line_count}')

preprocess_corpus(corpus_file, preprocessed_file)

100%|█████████▉| 4230000/4232433 [16:21<00:00, 4310.10it/s]


Lines processed = 4232432





Functions for generating number of tokens and vocabulary

In [3]:

def gen_token_count(file):
	token_len = 0
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			line=line.replace('\n','')
			space_count = 0
			for char in line:
				if char == ' ':
					space_count += 1

			token_len += space_count+1

	return token_len


def gen_token(file):
	tokens = []

	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			tokens += (line.split(' '))

	return tokens


def gen_vocabulary(file):
	word_list = []
	with open(file, 'r', encoding="utf-8") as f:
		for line in f:
			line=line.replace('\n','')
			word_list += (line.split(' '))

	word_counts = Counter(word_list)
	return word_counts

Displaying Token Count and Vocabulary Count

In [4]:
corpus_token_count = gen_token_count(corpus_file)
print("Number of Corpus Tokens :", corpus_token_count)

preprocess_token_count = gen_token_count(preprocessed_file)
print("Number of Pre Processed Corpus Tokens :", preprocess_token_count)

vocab_counts = gen_vocabulary(preprocessed_file)
print("Size of Vocabulary :", len(vocab_counts))

Number of Corpus Tokens : 86699267
Number of Pre Processed Corpus Tokens : 38735251
Size of Vocabulary : 2108503


Reducing the vocabulary to words which occur more than 100 times since the total vocabulary count cannot be allocated as a 2D matrix

In [5]:

vocab = {x for x, count in vocab_counts.items() if count >= 100}
print("Size of reduced Vocabulary :", len(vocab))


Size of reduced Vocabulary : 18706


Necessary variables for making the Co-Occurance matrix

In [6]:
vocab_list = list(vocab)
vocab_pos = {vocab_list[i]: i for i in range(len(vocab_list))}
vocab_idx = vocab_pos.copy()
vocab_idx.update({i: w for i, w in enumerate(vocab_list)})

Creating the Co-Occurance matrix

In [9]:
def co_occurances(file):
	window = 5

	# Using a ramp of window 4
	ramp = [0] + [*range(window, 0, -1)]

	occurances = np.zeros((len(vocab), len(vocab)), dtype=np.int64)

	gram_count=0
	line=''

	with open(file, 'r', encoding="utf-8") as corpus:
		for corp in corpus:
			line=corp

	pg_bar=tqdm(total=gramms_limit)

	all_grams = ngrams(word_tokenize(line), window+1,pad_right=True, pad_left=True)

	for grams in all_grams:

		if grams[0] in vocab:
			for idx, gram in enumerate(grams):
				if gram in vocab:
					occurances[vocab_idx[grams[0]]][vocab_idx[gram]] += ramp[idx]

		# Doing the same as above with the gram reversed
		grams_rev = grams[::-1]

		if grams_rev[0] in vocab:
			for idx, gram in enumerate(grams_rev):
				if gram in vocab:
					occurances[vocab_idx[grams_rev[0]]][vocab_idx[gram]] += ramp[idx]

		gram_count+=1

		if gram_count % 100000 == 0:
			pg_bar.update(100000)

	pg_bar.close()

	print(f'\nGrams processed = {gram_count}')

	return occurances

cocrmat = co_occurances(preprocessed_file)


100%|█████████▉| 38700000/38735257 [10:40<00:00, 60459.40it/s] 


Grams processed = 38735256





Calculating overall Pi and Pj values of the Co-Occurance matrix

In [12]:

def calculate_probability(co_occurrence_matrix):
	# Calculate row sums and column sums
	row_sums = np.sum(co_occurrence_matrix, axis=1)
	col_sums = np.sum(co_occurrence_matrix, axis=0)

	# Total co-occurrences
	N = np.sum(co_occurrence_matrix)

	if(N==0):
		return 0,0

	# Calculate pi and pj
	pi = row_sums / N
	pj = col_sums / N

	return pi, pj


# Calculate pi and pj
pi, pj = calculate_probability(cocrmat)

Generating the Positive Pointwise Mutual Information (PPMI) matrix from the Co-Occurance matrix

In [13]:

def gen_ppmi_matrix(co_occurrence_matrix):
	ppmi_matrix = np.zeros((len(vocab), len(vocab)))

	row_sums = np.sum(co_occurrence_matrix, axis=1)
	for i in tqdm(range(len(vocab))):
		for j in range(len(vocab)):

			if (row_sums[i] == 0 or co_occurrence_matrix[i][j] == 0):
				ppmi_matrix[i][j] = 0
				continue
			pij = ((co_occurrence_matrix[i][j]))/row_sums[i]
			ppmi_matrix[i][j] = max(0, math.log2(pij/(pi[i]*pj[j])))

	return ppmi_matrix


ppmi_matrix = gen_ppmi_matrix(cocrmat)

100%|██████████| 18706/18706 [06:30<00:00, 47.84it/s]


Functions for calculating similar words

In [14]:
def calc_norm(vec):
	sum_of_squares = sum(x**2 for x in vec)
	return math.sqrt(sum_of_squares)


def cosine_similarity(vec1, vec2):
	v1 = list(vec1)
	v2 = list(vec2)

	dot_product = (0.0)

	for i in range(len(v1)):
		temp = dot_product
		dot_product += (v1[i]*v2[i])
	# print(dot_product)
	norm_vec1 = calc_norm(v1)
	norm_vec2 = calc_norm(v2)
	return dot_product / (norm_vec1 * norm_vec2)


def reshape_top(words, extra):
	low = 0
	high = len(words) - 1

	while low <= high:
		mid = (low + high) // 2
		if words[mid][1] < extra[1]:
			low = mid + 1
		elif words[mid][1] > extra[1]:
			high = mid - 1
		else:
			break

	words.insert(low, extra)
	return words[1:]


def find_similar_words(target_word, vocab, ppmi_matrix, top_n=10):
	top_n += 1
	if target_word not in vocab:
		print("Target word not found in the vocabulary.")
		return

	target_index = vocab_pos[target_word]
	target_vector = ppmi_matrix[target_index]

	similarities = [(vocab_idx[0], 0.0) for _ in range(top_n)]

	for i, vector in enumerate(ppmi_matrix):
		sim = cosine_similarity(target_vector, vector)
		similarities = reshape_top(similarities, (vocab_idx[i], sim))

	similarities.reverse()

	return similarities[1:]

Generating the Top 15 words and choosing the Top 10 nouns from them

In [15]:
top_15 = vocab_counts.most_common(15)
top_15 = [i for i, _ in top_15]
print("Top 15 popular words : ", top_15)

nouns = [top_15[0], top_15[1], top_15[2], top_15[4], top_15[5],
		top_15[6], top_15[7], top_15[8], top_15[9], top_15[11]]
print("Top 10 popular nouns : ", nouns)

Top 15 popular words :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'नहीं', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'नाम', 'ईसाई', 'विशेष', 'प्रकार', 'क्रिसमस']
Top 10 popular nouns :  ['रूप', 'हिप्पोकैम्पस', 'उपहार', 'राज्य', 'ओडिन', 'बच्चों', 'भारत', 'ओलंपिक', 'स्मृति', 'ईसाई']


Displaying similar words for the nouns

In [16]:
for noun in nouns:
	target_word = noun
	similar_words = find_similar_words(target_word, vocab, ppmi_matrix)

	print(f"Words similar to '{target_word}':")
	for word, similarity in similar_words:
		print(f"	{word}: {similarity}")

Words similar to 'रूप':
	नहीं: 0.9487072218611369
	उन्होंने: 0.9300621013389434
	सबसे: 0.9295931282106752
	नाम: 0.9277516621623187
	कारण: 0.9259441237081238
	शामिल: 0.9186821937048171
	काम: 0.9028323314003894
	प्राप्त: 0.9000391196345271
	मुख्य: 0.8995005017423251
	प्रमुख: 0.8914432497554523
Words similar to 'हिप्पोकैम्पस':
	विभेदस्थानिक: 0.720721328652459
	भूमिकाहिप्पोकैम्पस: 0.716292772874085
	दिशानिर्देशन: 0.6929263481764727
	भूमिकाा: 0.6427220666189853
	भूमिकाे: 0.6323230290634059
	भूमिकां: 0.6242357862046938
	भूमिकाी: 0.5855576984290631
	स्मृति: 0.3232409766622341
	मेडियल: 0.32040773148090795
	भूमिकारूप: 0.3168999478220527
Words similar to 'उपहार':
	उत्पत्तिउपहार: 0.5085022245806071
	सिंटरक्लास: 0.3055436794275977
	उत्पत्तिर: 0.3023470627281467
	मिलतेआधुनिक: 0.28301821826856133
	दिन: 0.26910517450294785
	देने: 0.2681297706206066
	क्रिसमस: 0.26735380380633333
	घर: 0.26472965886513794
	बच्चों: 0.2641047079927667
	लोगों: 0.26390195090873453
Words similar to 'राज्य':
	भारत: 0.87293059