In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import defaultdict
from math import log, sqrt
import os
import re

# Fetch the NLTK data used below: the Punkt models behind word_tokenize and
# the WordNet corpus behind WordNetLemmatizer. Recent NLTK releases look up
# the Punkt models under 'punkt_tab', older ones under 'punkt', so both are
# requested; a resource that is already present is not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Directory holding the corpus, relative to the notebook's working directory.
dir_path = r'ResearchPapers'

# One stop word per line. The encoding is explicit so the result does not
# depend on the platform locale ('utf-8-sig' also tolerates a leading
# byte-order mark). Entries are casefolded because tokens are casefolded
# before being compared, and blank lines are skipped so the empty string
# never ends up in the set.
with open("Stopword-List.txt", "r", encoding="utf-8-sig") as f:
    stopwords = {line.strip().casefold() for line in f if line.strip()}

# Tokens longer than this many characters (after stemming) are not indexed.
MAX_TERM_LENGTH = 11

# documents: doc id -> {term: raw count in that document}
# index:     every term seen in any document (a set at this stage)
documents = {}
index = set()

for filename in os.listdir(dir_path):
    path = os.path.join(dir_path, filename)
    if not os.path.isfile(path):
        continue

    # Documents are keyed by the integer in their file name ("12.txt" -> 12).
    # Stray files that do not follow that convention (e.g. ".DS_Store") are
    # skipped instead of aborting the whole run with a ValueError.
    try:
        doc_id = int(filename.replace(".txt", ""))
    except ValueError:
        continue

    # Explicit encoding keeps the index independent of the platform locale.
    # NOTE(review): assumes the corpus is UTF-8 - confirm. Undecodable bytes
    # become U+FFFD, which the regex below strips with the punctuation.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        # Drop punctuation and digits before tokenising.
        text = re.sub(r'[^\w\s]|[\d]', '', f.read())

    term_counts = defaultdict(int)
    for token in word_tokenize(text):
        token = token.casefold()
        # Test the surface form first: stop-word lists hold unstemmed words,
        # and stemming can change them (e.g. "this" -> "thi") so that a check
        # made only after stemming lets them through.
        if token in stopwords:
            continue
        term = stemmer.stem(lemmatizer.lemmatize(token))
        # The post-stemming check is kept so that every token excluded
        # before this change is still excluded.
        if term not in stopwords and len(term) <= MAX_TERM_LENGTH:
            term_counts[term] += 1
            index.add(term)

    documents[doc_id] = dict(term_counts)

# Freeze the vocabulary into a sorted list so that position i of every vector
# built below refers to the term index[i].
index = sorted(index)

# vsm: doc id -> vector of raw term counts, aligned with `index`
# df:  term   -> number of documents containing that term
vsm = {}
df = defaultdict(int)

for doc_key, term_counts in documents.items():
    vsm[doc_key] = [term_counts.get(term, 0) for term in index]
    for term in term_counts:
        df[term] += 1

# Inverse document frequency per term: log10(N / df), aligned with `index`.
total_docs = len(vsm)
idf = []
for term in index:
    idf.append(log(total_docs / df[term], 10))

# Replace each raw-count vector by its weighted form:
#     (1 + log10(tf)) * idf / norm    for tf > 0, and 0 otherwise.
# NOTE(review): `norm` is the Euclidean length of the raw counts, not of the
# weighted vector, so the stored vectors are not unit length - confirm this
# is intended before relying on the scores as cosine similarities.
for doc_key in vsm:
    raw_counts = vsm[doc_key]
    norm = sqrt(sum(count * count for count in raw_counts))
    weighted = []
    for pos, count in enumerate(raw_counts):
        if count:
            weighted.append((1 + log(count, 10)) * idf[pos] / norm)
        else:
            weighted.append(0)
    vsm[doc_key] = weighted

ModuleNotFoundError: No module named 'nltk'

In [None]:
import numpy as np

def process_query(query, index, vsm, alpha=0.0005):
    """Rank documents against a query and return the ids that score above alpha.

    Args:
        query: Collection of query terms; ``query.count(term)`` is used as the
            term frequency of each vocabulary term.
        index: Ordered vocabulary; position i corresponds to component i of
            every vector in ``vsm``.
        vsm: Mapping of document id to its vector, aligned with ``index``.
        alpha: Score cut-off; only documents scoring strictly above it are
            returned. Defaults to 0.0005.

    Returns:
        List of document ids ordered by descending score. Documents with equal
        scores keep the iteration order of ``vsm``. The list is empty when no
        query term occurs in ``index``.
    """
    query_vector = np.array([query.count(word) for word in index], dtype=float)
    mag = np.sqrt(np.sum(np.square(query_vector)))

    # No query term is in the vocabulary (or the vocabulary is empty): nothing
    # can match, and normalising would divide by zero and fill every score
    # with NaN.
    if mag == 0:
        return []

    query_vector = query_vector / mag

    doc_rank = {doc_id: np.dot(vsm[doc_id], query_vector) for doc_id in vsm}
    sorted_doc_rank = sorted(doc_rank.items(), key=lambda x: x[1], reverse=True)

    return [doc_id for doc_id, score in sorted_doc_rank if score > alpha]

# Read a free-text query and normalise it the way the documents were
# normalised when the index was built: strip punctuation and digits first,
# then tokenise, casefold, lemmatise and stem. Without the stripping step a
# query such as "state-of-the-art" or "covid19" yields tokens that can never
# equal a term in `index`.
query_text = input("Enter the Query: ")
query_text = re.sub(r'[^\w\s]|[\d]', '', query_text)
query = [stemmer.stem(lemmatizer.lemmatize(word.casefold())) for word in word_tokenize(query_text)]

result = process_query(query, index, vsm)
print(result)