In [12]:
import glob
import nltk
import os
import string
from nltk.stem import PorterStemmer
import re
import math 
Dictionary = {} # Global index: term -> {document id: term frequency}, filled by DictionaryBuilder
DocVectors = {} # Global document vectors: document id -> list of TF-IDF weights, one slot per Dictionary term


def FileRead(Folder='ResearchPapers', Pattern='*.txt', DocCount=30):
    """Index every matching text file and allocate zeroed document vectors.

    Each file is lower-cased, stripped of punctuation, split on whitespace and
    Porter-stemmed, then passed to DictionaryBuilder under its numeric id
    (the file's base name without extension).

    Args:
        Folder: Directory searched for documents.
        Pattern: Glob pattern selecting files inside Folder.
        DocCount: Vectors are allocated in DocVectors for ids 1..DocCount.

    Returns:
        A new dict with the index entries in sorted term order (empty when no
        file matches).

    Raises:
        ValueError: If a matched file's base name is not an integer.
    """
    Stemmer = PorterStemmer() # One stemmer instance serves the whole corpus
    Index = {} # Stays empty when the folder has no matching file
    for Path in glob.glob(os.path.join(Folder, Pattern)): #Finding all Files in the given Folder
        with open(Path, 'r') as file:
            FileContents = file.read().lower() #Reading File text
        # str.strip() removes a set of characters, not a prefix/suffix, so
        # Path.strip("ResearchPapers\\.txt") left "/1" on "/"-separated paths
        # and int() failed. Take the base name without its extension instead.
        File_name = int(os.path.splitext(os.path.basename(Path))[0])
        Tokens = PunctuationRemove(FileContents).split() # Removing Punctuations and tokenizing
        FileStem = [Stemmer.stem(Token) for Token in Tokens] #Applying Stemming to all the tokens
        Index = DictionaryBuilder(FileStem, File_name)
    # Sorting once after the loop is enough: DictionaryBuilder keeps accumulating
    # into the same dict, so per-file sorted copies were thrown away anyway.
    SortedIndex = dict(sorted(Index.items()))
    # Initializing all Document Vectors with 0 for every word
    for DocId in range(1, DocCount + 1):
        DocVectors[DocId] = [0] * len(SortedIndex)
    return SortedIndex


def PunctuationRemove(File):
    """Return File with hyphens turned into spaces and every other character of string.punctuation deleted."""
    Table = dict.fromkeys(map(ord, string.punctuation)) # code point -> None deletes the character
    Table[ord('-')] = ' ' # Hyphens become word separators instead of vanishing
    return File.translate(Table)


def DictionaryBuilder(File,File_Name):
    """Add one document's tokens to the global Dictionary and return it.

    Args:
        File: Iterable of (already stemmed) tokens of the document.
        File_Name: Document id under which term frequencies are stored.

    Returns:
        The global Dictionary, mapping term -> {document id: term frequency}.
        Tokens found in Stopword-List.txt are not counted.
    """
    # `with` closes the stopword file; the previous bare open() leaked one handle per call.
    with open(r'Stopword-List.txt', 'r') as Stop:
        StopContents = set(Stop.read().split()) # set: constant-time membership per token
    for words in File: # Building Dictionary
        if words in StopContents:
            continue
        Postings = Dictionary.setdefault(words, {}) # First occurrence of a word creates its entry
        Postings[File_Name] = Postings.get(File_Name, 0) + 1 # Incrementing Term Frequency
    return Dictionary

def BuildDocumentVectors():
    """Fill DocVectors in place with TF-IDF weights taken from Dictionary.

    For every term (by its position in Dictionary) and every document that
    contains it, the slot is set to tf * idf, where idf is
    log10(number of document vectors / number of documents containing the term)
    rounded to two decimals.
    """
    for Position, Term in enumerate(Dictionary):
        Postings = Dictionary[Term]
        for DocId, Vector in DocVectors.items():
            if DocId not in Postings:
                continue # Term absent from this document: slot stays 0
            Idf = round(math.log(len(DocVectors) / len(Postings), 10), 2)
            Vector[Position] = Idf * Postings[DocId]


def QueryProcessor(Query):
    """Turn a free-text query into a TF-IDF vector aligned with Dictionary.

    Args:
        Query: Raw query string.

    Returns:
        List with one weight per Dictionary term (in Dictionary order): query
        term frequency * log10(number of document vectors / document frequency),
        0 for terms that do not occur in the query.
    """
    # Normalise the query the same way FileRead normalises documents (lower-case,
    # hyphens to spaces, punctuation removed); otherwise "Self-Attention," can
    # never match an indexed term.
    Query = PunctuationRemove(Query.lower()).split()
    Query = QueryStemmer(Query)
    QueryVector = [0] * len(Dictionary) # Initializing Query Vector
    QueryDict = {}
    for words in Query: # Building term-frequency Dictionary for Query
        QueryDict[words] = QueryDict.get(words, 0) + 1
    for Index, Key in enumerate(Dictionary): # Traversing Dictionary
        if Key in QueryDict:
            DocFreq = len(Dictionary[Key])
            InvertedDocFreq = math.log(len(DocVectors) / DocFreq, 10)
            QueryVector[Index] = InvertedDocFreq * QueryDict[Key]
    return QueryVector


def QueryStemmer(Query):
    """Drop stopwords from the query tokens and Porter-stem the remaining ones.

    Args:
        Query: List of query tokens.

    Returns:
        List of stemmed tokens, in their original order, without stopwords.
    """
    # `with` closes the stopword file; the previous bare open() leaked the handle.
    with open(r'Stopword-List.txt', 'r') as Stop:
        StopContents = set(Stop.read().split())
    Stemmer = PorterStemmer()
    # FileRead lower-cases documents before they are filtered, so an upper-case
    # stopword in a query ("THE") has to be dropped as well.
    return [Stemmer.stem(Val) for Val in Query if Val.lower() not in StopContents]

# Calculating Euclidean Length for a Vector
def EucDist(Vector):
    """Return the Euclidean length (square root of the sum of squares) of Vector."""
    SquaredTotal = 0
    for Component in Vector:
        SquaredTotal = SquaredTotal + Component ** 2
    return math.sqrt(SquaredTotal)

def Solver(Query, Threshold=0.05):
    """Rank the documents in DocVectors by cosine similarity to a query vector.

    Args:
        Query: Query weights, position-aligned with the vectors in DocVectors.
        Threshold: Only documents scoring strictly above this cosine are kept.

    Returns:
        List of (document id, cosine) tuples ordered by descending cosine;
        empty when the query vector has zero length.
    """
    ResultList = []
    QueryEucDist = EucDist(Query) # Calculating Euclidean Length for the Query
    if QueryEucDist == 0: # Return empty list if the query vector is all zeros
        return ResultList
    for Doc, DocVector in DocVectors.items():
        DocEucDist = EucDist(DocVector) # Calculating Euclidean Length for a given Document
        if DocEucDist == 0: # Skip calculation if the document vector is all zeros
            continue
        DotProduct = 0
        # Pair the weights directly instead of indexing up to len(Dictionary), so
        # the loop no longer depends on a global staying in sync with the vectors.
        for QueryWeight, DocWeight in zip(Query, DocVector):
            if QueryWeight != 0 and DocWeight != 0: # Zero products contribute nothing
                DotProduct += QueryWeight * DocWeight
        Cosine = DotProduct / (QueryEucDist * DocEucDist)
        if Cosine > Threshold:
            ResultList.append((Doc, Cosine))
    ResultList.sort(key=lambda Item: Item[1], reverse=True) # Sort results according to Cosine value
    return ResultList

# Build the index and the document vectors once, then answer queries interactively
Dictionary = FileRead()
BuildDocumentVectors()
Query = ''
while Query != '-1': # '-1' is the exit sentinel
    Query = input("Enter Query(Type -1 to exit): ")
    if Query != '-1':
        print(Solver(QueryProcessor(str(Query))))

Enter Query(Type -1 to exit):  transformer


[(21, 0.38735031509185675), (18, 0.2048209890122228)]


Enter Query(Type -1 to exit):  -1


In [9]:
################################################ ONLY REFACTORED
import glob
import nltk
import os
import string
from nltk.stem import PorterStemmer
import re
import math 

Dictionary = {}  # Global index: term -> {document id: term frequency}, filled by DictionaryBuilder
DocVectors = {}  # Global document vectors: document id -> list of TF-IDF weights, one slot per Dictionary term

def FileRead(Folder='ResearchPapers', Pattern='*.txt', DocCount=30):
    """Index every matching text file and allocate zeroed document vectors.

    Each file is lower-cased, stripped of punctuation, split on whitespace and
    Porter-stemmed, then passed to DictionaryBuilder under its numeric id
    (the file's base name without extension).

    Args:
        Folder: Directory searched for documents.
        Pattern: Glob pattern selecting files inside Folder.
        DocCount: Vectors are allocated in DocVectors for ids 1..DocCount.

    Returns:
        A new dict with the index entries in sorted term order (empty when no
        file matches).

    Raises:
        ValueError: If a matched file's base name is not an integer.
    """
    Stemmer = PorterStemmer()  # One stemmer instance serves the whole corpus
    Index = {}  # Stays empty when the folder has no matching file
    for Path in glob.glob(os.path.join(Folder, Pattern)):
        with open(Path, 'r') as file:
            FileContents = file.read().lower()
        # str.strip() removes a set of characters, not a prefix/suffix, so
        # Path.strip("ResearchPapers\\.txt") left "/1" on "/"-separated paths
        # and int() failed. Take the base name without its extension instead.
        File_name = int(os.path.splitext(os.path.basename(Path))[0])
        Tokens = PunctuationRemove(FileContents).split()
        FileStem = [Stemmer.stem(Token) for Token in Tokens]
        Index = DictionaryBuilder(FileStem, File_name)
    # Sorting once after the loop is enough: DictionaryBuilder keeps accumulating
    # into the same dict, so per-file sorted copies were thrown away anyway.
    SortedIndex = dict(sorted(Index.items()))
    # Initializing all Document Vectors with 0 for every word
    for DocId in range(1, DocCount + 1):
        DocVectors[DocId] = [0] * len(SortedIndex)
    return SortedIndex


def PunctuationRemove(File):
    """Return File with hyphens turned into spaces and every other character of string.punctuation deleted."""
    # A single translation table does both jobs: None deletes, ' ' replaces
    Table = str.maketrans({**dict.fromkeys(string.punctuation), '-': ' '})
    return File.translate(Table)


def DictionaryBuilder(File, File_Name):
    """Add one document's tokens to the global Dictionary and return it.

    Args:
        File: Iterable of (already stemmed) tokens of the document.
        File_Name: Document id under which term frequencies are stored.

    Returns:
        The global Dictionary, mapping term -> {document id: term frequency}.
        Tokens found in Stopword-List.txt are not counted.
    """
    # `with` closes the stopword file; the previous bare open() leaked one handle per call.
    with open('Stopword-List.txt', 'r') as Stop:
        StopContents = set(Stop.read().split())  # set: constant-time membership per token

    # Building dictionary
    for word in File:
        if word in StopContents:
            continue
        Postings = Dictionary.setdefault(word, {})  # First occurrence of a word creates its entry
        Postings[File_Name] = Postings.get(File_Name, 0) + 1  # Incrementing term frequency

    return Dictionary


def BuildDocumentVectors():
    """Fill DocVectors in place with TF-IDF weights taken from Dictionary.

    For every term (by its position in Dictionary) and every document that
    contains it, the slot is set to tf * idf, where idf is
    log10(number of document vectors / number of documents containing the term)
    rounded to two decimals.
    """
    for Position, Term in enumerate(Dictionary):
        Postings = Dictionary[Term]
        for DocId, Vector in DocVectors.items():
            if DocId not in Postings:
                continue  # Term absent from this document: slot stays 0
            Idf = round(math.log(len(DocVectors) / len(Postings), 10), 2)
            Vector[Position] = Idf * Postings[DocId]


def QueryProcessor(Query):
    """Turn a free-text query into a TF-IDF vector aligned with Dictionary.

    Args:
        Query: Raw query string.

    Returns:
        List with one weight per Dictionary term (in Dictionary order): query
        term frequency * log10(number of document vectors / document frequency),
        0 for terms that do not occur in the query.
    """
    # Normalise the query the same way FileRead normalises documents (lower-case,
    # hyphens to spaces, punctuation removed); otherwise "Self-Attention," can
    # never match an indexed term.
    Query = PunctuationRemove(Query.lower()).split()
    Query = QueryStemmer(Query)  # Stemming query words
    QueryVector = [0] * len(Dictionary)  # Initializing query vector

    # Building term-frequency dictionary for query
    QueryDict = {}
    for word in Query:
        QueryDict[word] = QueryDict.get(word, 0) + 1

    # Calculating TF-IDF for query vector
    for Index, Key in enumerate(Dictionary):
        if Key in QueryDict:
            DocFreq = len(Dictionary[Key])
            InvertedDocFreq = math.log(len(DocVectors) / DocFreq, 10)
            QueryVector[Index] = InvertedDocFreq * QueryDict[Key]

    return QueryVector


def QueryStemmer(Query):
    """Drop stopwords from the query tokens and Porter-stem the remaining ones.

    Args:
        Query: List of query tokens.

    Returns:
        List of stemmed tokens, in their original order, without stopwords.
    """
    # `with` closes the stopword file; the previous bare open() leaked the handle.
    with open('Stopword-List.txt', 'r') as Stop:
        StopContents = set(Stop.read().split())
    Stemmer = PorterStemmer()

    # FileRead lower-cases documents before they are filtered, so an upper-case
    # stopword in a query ("THE") has to be dropped as well.
    return [Stemmer.stem(Val) for Val in Query if Val.lower() not in StopContents]


def EucDist(Vector):
    """Return the Euclidean length (square root of the sum of squares) of Vector."""
    SquaredTotal = 0
    for Component in Vector:
        SquaredTotal = SquaredTotal + Component ** 2
    return math.sqrt(SquaredTotal)


def Solver(Query, Threshold=0.05):
    """Rank the documents in DocVectors by cosine similarity to a query vector.

    Args:
        Query: Query weights, position-aligned with the vectors in DocVectors.
        Threshold: Only documents scoring strictly above this cosine are kept.

    Returns:
        List of (document id, cosine) tuples ordered by descending cosine;
        empty when the query vector has zero length.
    """
    ResultList = []
    QueryEucDist = EucDist(Query)  # Calculating Euclidean length for the query

    if QueryEucDist == 0:
        return ResultList  # Return empty list if the query vector is all zeros

    for Doc, DocVector in DocVectors.items():
        DocEucDist = EucDist(DocVector)  # Calculating Euclidean length for a given document
        if DocEucDist == 0:
            continue  # Skip calculation if the document vector is all zeros

        DotProduct = 0
        # Pair the weights directly instead of indexing up to len(Dictionary), so
        # the loop no longer depends on a global staying in sync with the vectors.
        for QueryWeight, DocWeight in zip(Query, DocVector):
            if QueryWeight != 0 and DocWeight != 0:  # Zero products contribute nothing
                DotProduct += QueryWeight * DocWeight

        Cosine = DotProduct / (QueryEucDist * DocEucDist)
        if Cosine > Threshold:
            ResultList.append((Doc, Cosine))

    # Sort results according to cosine value
    ResultList.sort(key=lambda Item: Item[1], reverse=True)
    return ResultList

# Build the index and the document vectors once
Dictionary = FileRead()
BuildDocumentVectors()

# Interactive loop: keep answering queries until the '-1' sentinel is entered
Query = None
while Query != '-1':
    Query = input("Enter Query (Type '-1' to exit): ")
    if Query != '-1':
        print(Solver(QueryProcessor(str(Query))))


Enter Query (Type '-1' to exit):  MACHINE LEARNING


[(16, 0.130290461076697), (2, 0.09906766265502358), (24, 0.09594246904790407), (7, 0.09499223244572715), (1, 0.09090314851096121), (3, 0.08737820239762055), (17, 0.06266354326509087), (8, 0.061917936649195635)]


Enter Query (Type '-1' to exit):  TRANSFORMER


[(21, 0.38735031509185675), (18, 0.2048209890122228)]


Enter Query (Type '-1' to exit):  -1
