In [2]:
# implementation of vector space model for document retrieval

from xml.dom.minidom import Document
import pandas
# module to read the contents of the file from a csv file

from contextlib import redirect_stdout
# module to redirect the output to a text file

import math
# module to perform mathematical functions

terms = []
# list to store the terms present in the documents

keys = []
# list to store the names of the documents

vec_Dic = {}
# dictionary to store the name of the document and the weight as list

dicti = {}
# dictionary to store the name of the document and the terms present in it as a
# vector

dummy_List = []
# list for performing some operations and clearing them

term_Freq = {}
# dictionary to store the term and the number of times of its occurrence in the
# documents

idf = {}
# dictionary to store the term and the inverse document frequency

weight = {}
# dictionary to store the term and the weight which is the product of term
# frequency and inverse document frequency


def filter(documents, rows, cols):

    for i in range(rows):
        for j in range(cols):
            # traversal through the data frame

            if(j == 0):
                # first column has the name of the document in the csv file
                keys.append(documents.loc[i].iat[j])
            else:
                dummy_List.append(documents.loc[i].iat[j])
                # dummy list to update the terms in the dictionary

                if documents.loc[i].iat[j] not in terms:
                    # add the terms to the list if it is not present else continue
                    terms.append(documents.loc[i].iat[j])

        copy = dummy_List.copy()
        # copying the the dummy list to a different list

        dicti.update({documents.loc[i].iat[0]: copy})
        # adding the key value pair to a dictionary

        dummy_List.clear()
        # clearing the dummy list


def compute_Weight(doc_Count, cols):
    for i in terms:
        # initially adding all the elements into the dictionary and initialising
        # the values as zero
        if i not in term_Freq:
            term_Freq.update({i: 0})

    for key, value in dicti.items():
        # to get the number of occurrence of each terms
        for k in value:
            if k in term_Freq:
                term_Freq[k] += 1
                # value incremented by one if the term is found in the documents

    idf = term_Freq.copy()
    for i in term_Freq:
        term_Freq[i] = term_Freq[i]/cols
        # term frequency is number of occurrence divided by total number of
        # documents

    for i in idf:
        if idf[i] != doc_Count:
            idf[i] = math.log2(cols / idf[i])
            # inverse document frequency log of total number of documents divided
            # by number of occurrence of the terms
        else:
            idf[i] = 0
            # this is to avoid the zero division error

    for i in idf:
        weight.update({i: idf[i]*term_Freq[i]})
        # weight is the product of term frequency and the inverse document
        # frequency

    for i in dicti:
        for j in dicti[i]:
            dummy_List.append(weight[j])

        copy = dummy_List.copy()
        vec_Dic.update({i: copy})
        dummy_List.clear()
        # above operations performed to get the dictionary of weighted vector
        # for each of the documents


def get_Weight_For_Query(query):
    '''function to get the weight for each terms present in the query, here we
    consider the term frequency as the weight of the terms'''

    query_Freq = {}
    for i in terms:
        if i not in query_Freq:
            query_Freq.update({i: 0})

    for val in query:
        # to get the number of occurrence of each terms
        if val in query_Freq:
            query_Freq[val] += 1
            # value incremented by one if the term is found in the documents

    for i in query_Freq:
        query_Freq[i] = query_Freq[i] / len(query)

    return query_Freq
def similarity_Computation(query_Weight):
    numerator = 0
    denomi1 = 0
    denomi2 = 0
    similarity = {}
    for document in dicti:
        for terms in dicti[document]:
            # cosine similarity is calculated

            numerator += weight[terms] * query_Weight[terms]
            denomi1 += weight[terms] * weight[terms]
            denomi2 += query_Weight[terms] * query_Weight[terms]
            # the summation values of the weight is calculated and later they are
            # divided

        if denomi1 != 0 and denomi2 != 0:
            # to avoid the zero division error

            simi = numerator / (math.sqrt(denomi1) * math.sqrt(denomi2))
            similarity.update({document: simi})
            #dictionary is updated

            numerator = 0
            denomi2 = 0
            denomi1 = 0
            # reinitialisation of the variables to zero

    return (similarity)
def prediction(similarity, doc_count):
    '''Function to predict the document which is relevant to the query '''
    if len(similarity)!=0:
        ans = max(similarity, key=similarity.get)
        print(ans, "is the most relevant document")
        print("ranking of the documents")
    else:
        print("Document not found")
    for i in range(doc_count):
        if len(similarity)!=0:
            ans = max(similarity, key=lambda x: similarity[x])
            print(ans, "rank is", i+1)
        # to print the document name and its rank
            similarity.pop(ans)
def main():
    documents = pandas.read_excel(r'data.xlsx')
    rows = len(documents)
    cols = len(documents.columns)
    filter(documents, rows, cols)
    compute_Weight(rows, cols)
    print("Enter the query")
    query = input()
    query = query.split(' ')
    query_Weight = get_Weight_For_Query(query)
    similarity = similarity_Computation(query_Weight)
    prediction(similarity, rows)
main()


Enter the query
document12 is the most relevant document
ranking of the documents
document12 rank is 1
document183 rank is 2
document72 rank is 3
document10 rank is 4
document64 rank is 5
document178 rank is 6
document48 rank is 7
document156 rank is 8
