## Vector Space Model Ranking

1. Read necessary files

In [1]:
from collections import Counter
import math
import json
import numpy as np
import os
import re


# Load the precomputed corpus artifacts from the working directory.
# `queries`, `documents` and `Inverted_index` are parsed as JSON.
with open("queries", "r") as file:
    query_dic = json.load(file)

with open("documents", "r") as file:
    document_dic = json.load(file)

with open("Inverted_index", "r") as file:
    Inverted_index = json.load(file)

# The vocabulary is taken from the first line only, as space-separated terms.
# The line terminator is stripped first: otherwise the last term would be
# stored as "term\n" and could never match a query or document token.
with open("vocabulary", "r") as file:
    vocabulary = file.readline().rstrip("\r\n").split(" ")

2. Formulating the query vector with a simple binary (zero/one) scheme

In [2]:
# Formulating the query vector with simple zero and one scheme

def get_vector(tokens):
    """Build a binary (zero/one) query vector over the vocabulary.

    Args:
        tokens: iterable of query terms (assumed hashable, e.g. strings).

    Returns:
        A 1-D NumPy array of length ``len(vocabulary)`` holding 1 at the
        position of every token found in ``vocabulary`` and 0 elsewhere.
        Tokens that are not in the vocabulary are ignored.
    """
    Q = np.zeros(len(vocabulary))

    # Map each term to its first position once, instead of copying and
    # scanning the whole vocabulary for every token.
    positions = {}
    for ind, term in enumerate(vocabulary):
        positions.setdefault(term, ind)

    for token in tokens:
        ind = positions.get(token)
        if ind is not None:
            Q[ind] = 1

    return Q


Alternatively, formulate the query vector with tf-idf weights (to use it, replace the `get_vector` call in `cosine_sim` with `get_query_vector`)

In [3]:
# Formulating the query vector with tf_idf 

def get_query_vector(tokens):
    """Build a tf-idf weighted query vector over the vocabulary.

    The term frequency is collection-wide: the number of occurrences of the
    token across all documents divided by the total number of words in the
    collection. The inverse document frequency is
    ``log(number of documents / number of postings for the token)``.

    Args:
        tokens: iterable of query terms (assumed hashable, e.g. strings).

    Returns:
        A 1-D NumPy array of length ``len(vocabulary)``. Tokens missing from
        the vocabulary or from the inverted index keep a weight of 0.
    """
    Q = np.zeros(len(vocabulary))

    # Count every word of the collection once instead of rescanning the full
    # word list for each token.
    # NOTE(review): assumes each document is a sequence of tokens -- confirm.
    term_counts = Counter()
    for words in document_dic.values():
        term_counts.update(words)

    total_words = sum(term_counts.values())
    if total_words == 0:
        # Empty collection: no term frequency can be computed.
        return Q

    # First position of each vocabulary term.
    positions = {}
    for ind, term in enumerate(vocabulary):
        positions.setdefault(term, ind)

    n_docs = len(document_dic)

    for token in tokens:
        ind = positions.get(token)
        postings = Inverted_index.get(token)
        if ind is None or not postings:
            # Unknown term, or no postings to derive a document frequency.
            continue

        tf = term_counts[token] / total_words
        idf = math.log(n_docs / len(postings))
        Q[ind] = tf * idf

    return Q


3. Find all documents related to a given query

In [4]:
def find_documents(tokens):
    """Return the ids of the documents that contain every indexed query token.

    Tokens that do not appear in ``Inverted_index`` are ignored, so they do
    not empty the result.

    Args:
        tokens: iterable of query terms.

    Returns:
        A list (in arbitrary order) of the document ids shared by the posting
        lists of all indexed tokens. Returns an empty list when none of the
        tokens is in the index.
    """
    posting_sets = [
        set(Inverted_index[token])
        for token in tokens
        if token in Inverted_index
    ]

    # No token matched the index: there is nothing to intersect.
    if not posting_sets:
        return []

    # Intersection of the relevant documents across all matched tokens.
    return list(set.intersection(*posting_sets))
        

4. Formulating the document vectors

In [5]:
def document_vector(documents, tokens):
    """Build a tf-idf vector for each document, restricted to the query terms.

    Only terms that occur in the query are weighted; every other position of
    the vector stays 0. For a query term present in a document:

        tf  = occurrences of the term in the document / document length
        idf = log(number of documents / number of postings for the term)

    Args:
        documents: iterable of document ids (keys of ``document_dic``).
        tokens: iterable of query terms (assumed hashable, e.g. strings).

    Returns:
        A dict mapping each document id to a 1-D NumPy array of length
        ``len(vocabulary)``. Query terms missing from the vocabulary or from
        the inverted index are skipped.
    """
    # First position of each vocabulary term.
    positions = {}
    for ind, term in enumerate(vocabulary):
        positions.setdefault(term, ind)

    n_docs = len(document_dic)

    # The vector position and idf depend only on the token, so compute them
    # once here rather than once per document.
    weights = {}
    for token in tokens:
        ind = positions.get(token)
        postings = Inverted_index.get(token)
        if ind is None or not postings:
            continue
        weights[token] = (ind, math.log(n_docs / len(postings)))

    # Document id -> tf-idf vector.
    dic = {}

    for doc in documents:
        # NOTE(review): assumes each document is a sequence of tokens -- confirm.
        d = document_dic[doc]
        count = Counter(d)
        doc_vec = np.zeros(len(vocabulary))

        for token, (ind, idf) in weights.items():
            occurrences = count[token]
            if occurrences:
                doc_vec[ind] = occurrences / len(d) * idf

        dic[doc] = doc_vec

    return dic
    

5. Get document rank for a particular query based on previous functions

In [6]:
def cosine_sim(query):
    """Score the documents related to a query by cosine similarity.

    The query vector comes from ``get_vector``, the candidate documents from
    ``find_documents`` and their vectors from ``document_vector``.

    Args:
        query: iterable of query terms.

    Returns:
        A dict mapping each candidate document id to its cosine similarity
        with the query. A document gets a score of 0.0 when either vector has
        zero norm, rather than the NaN a division by zero would produce.
    """
    query_vec = get_vector(query)
    documents = find_documents(query)
    document_vec = document_vector(documents, query)

    # The query norm is the same for every document.
    query_norm = np.linalg.norm(query_vec)

    result = {}

    for doc_id, doc_vec in document_vec.items():
        denominator = query_norm * np.linalg.norm(doc_vec)

        if denominator == 0:
            # Cosine similarity is undefined for a zero vector; NaN scores
            # would also make the later sort order unreliable.
            result[doc_id] = 0.0
        else:
            result[doc_id] = np.dot(query_vec, doc_vec) / denominator

    return result


In [7]:
# Rank all the documents 

def rank_documents(scores):
    """Order documents from the highest score to the lowest.

    Args:
        scores: dict mapping a document id to its similarity score.

    Returns:
        A list of ``(document id, score)`` tuples sorted by descending score.
        Entries with equal scores keep their order from ``scores``.
    """
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
    

6. Output results file

In [8]:
# Create the target directory first so that opening the file cannot fail
# merely because "output/" does not exist yet.
os.makedirs("output", exist_ok=True)

# Write one line per ranked document:
#   <query id> <iter> <document id> <rank> <similarity> <run tag>
# The iter field is always "1" and the rank field is always "0"
# (irrelevant information).
with open("output/result_vsm.txt", "w") as file:
    for query_id, query in query_dic.items():
        # Get rank scores
        scores = cosine_sim(query)
        rank = rank_documents(scores)

        # Only keep the top 100 results. Slicing avoids the off-by-one of a
        # "counter > 100" check, which let a 101st row through.
        for doc_id, score in rank[:100]:
            file.write(f"{query_id} 1 {doc_id} 0 {score!s} run1\n")
