In [46]:
#implement an Information Retrieval (IR) system based on the vector space model, for a collection of documents
#For weighting, you can use the tf-idf weighting scheme (wij = tfij∙idfi)
#For each query, your system will produce a ranked list of documents, starting with the most similar to the query and ending with the least similar. For the query terms you can use a modified tf-idf weighting scheme wiq = (0.5 + 0.5 tfiq)∙idfi
#For the ranking, you can use the cosine similarity measure
import time
import Porter_Stemming as ps
import pandas as pd
from bs4 import BeautifulSoup
import os
import string
import csv
from sklearn.metrics.pairwise import cosine_similarity
import math

from collections import defaultdict

In [67]:
#Step 1 Preprocessing
#Input: Documents that are read one by one from the collection
#Output: Tokens to be added to the index (vocabulary)
#Get Start time 
start_time = time.time()

coll_files = [f for f in os.listdir(r"./coll/") if os.path.isfile(os.path.join(r"./coll/", f))]
stop_words = open(r"./stopwords.txt", "r").read().split()

def collect_info(coll_files, stop_words):
    files = {} 
    list_of_words = [] 
    vocabulary = set([]) #We use a set because we don't want to have duplicates in the vocabulary
    total_length = len(coll_files)
    count = 0
    for file in coll_files: #For each file in the collection
        count += 1
        with open(r"./coll/" + file, "r") as f: #Open the file
            soup = BeautifulSoup(f, 'lxml') 
            
            for doc in soup.find_all('doc'): #For each document in the file
                docno = doc.find_all('docno')[0].text.strip() #Get the docno

                temptxt = str(doc.find('text')).replace('<text>', ' ').replace('</text>', ' ').replace('\n', ' ').replace('\t', ' ') #Get the text, replace the tags and new lines with spaces
                temptxt = temptxt.lower()
                temptxt = temptxt.translate(str.maketrans(string.punctuation, " " * len(string.punctuation))) #Remove punctuation
                temptxt = temptxt.translate(str.maketrans(string.digits, " " * len(string.digits)))
                
                list_of_words = temptxt.split()
                
                porter = ps.PorterStemmer()
                list_of_words = [porter.stem(word, 0, len(word)-1) for word in list_of_words]
                
                #temptxt = list(set(list_of_words) - set(stop_words))
                temptxt = [word for word in list_of_words if word not in stop_words]
                
                vocabulary.update(set(temptxt))
                files.update({docno: temptxt}) #We cannot check the length of the list of words because we don't know how many words are in the stop words list
        print ("Progress: " + str(count) + "/" + str(total_length))
    return files, vocabulary

In [60]:


def create_inverted_index(files, vocabulary, output_csv : bool = False):
    inverted_index = {i:{'documentFrequency':0} for i in vocabulary}
    count = 0
    csv_columns = ['word', 'documentFrequency']
    for file in files.items(): #key is the docno, value is the list of words
        count += 1
        key, value = file
        for word in value: #if the word is in the document, then we add that document to the inverted index TODO: we add the docno each time the word is found, fix this
            if key not in inverted_index[word]: #if the docno is not in the list of docnos for that word, then we add it
                inverted_index[word].update({key:1})
                inverted_index[word].update({'documentFrequency':(inverted_index[word]['documentFrequency'] + 1)})
            else: #if the docno is in the list of docnos for that word, then we increment the frequency
                inverted_index[word].update({key:(inverted_index[word][key] + 1)})
            
            
        if count % 1000 == 0:
            print("creating prog: ", count)
        csv_columns.append(key)

    if output_csv:
        #Send the inverted index to csv using a pandas dataframe
        count = 0
        csv_file = r"./inverted_index.csv"
        try:
            with open(csv_file, 'w') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
                writer.writeheader()
                for data in inverted_index:
                    count += 1
                    row = {'word': data, 'documentFrequency': inverted_index[data]['documentFrequency']}
                    row.update(inverted_index[data])
                    writer.writerow(row)
                    if count % 1000 == 0:
                        print("writing prog: ", count)
        except IOError:
            print("I/O error")
    
    

    return inverted_index

In [61]:
def collect_queries():
    queries = {}
    list_of_words = [] 
    
    
    with open(r"./topics1-50.txt", "r") as f:
        soup = BeautifulSoup(f, 'lxml') 
        
        for top in soup.find_all('top'):
            num = top.find_all('num')[0].text.strip()[0:2].strip() #this is a stupid way to do it, don't follow my example 

            temptxt = str(top.find('title')).replace('<title>', ' ').replace('</title>', ' ').replace('\n', ' ') #replace the title tag with either title or top to test different query sections
            temptxt = temptxt.lower()
            temptxt = temptxt.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
            temptxt = temptxt.translate(str.maketrans(string.digits, " " * len(string.digits)))
            
            list_of_words = temptxt.split()
            
            porter = ps.PorterStemmer()
            list_of_words = [porter.stem(word, 0, len(word)-1) for word in list_of_words]
            
            #temptxt = list(set(list_of_words) - set(stop_words))
            temptxt = [word for word in list_of_words if word not in stop_words]
            
            queries.update({num: temptxt}) #We cannot check the length of the list of words because we don't know how many words are in the stop words list

    return queries

In [5]:
#read the inverted index csv into a pandas dataframe
def read_inverted_index():
    return pd.read_csv(r"./inverted_index.csv")
    

In [106]:
def create_tfidf_index(inverted_index, total_documents, files):
    tfidf_index = inverted_index.copy()
    for word in tfidf_index:
        document_frequency = tfidf_index[word]['documentFrequency']
        for docno in tfidf_index[word]:
            if docno != 'documentFrequency':
                tf = (tfidf_index[word][docno]/len(files[docno]))
                tfidf_index[word][docno] = tf * math.log((total_documents/document_frequency), 2)
    return tfidf_index
        
def query_tfidf_index(queries, inverted_index, total_documents): 
    results = {str(i):{} for i in range(1,(len(queries.items())+1))}
    #We need to get the total number of 
    for word in inverted_index:
        for key,query in queries.items():
            if word in query:
                tf = (query.count(word)/len(query))
                tfidf = tf * math.log((total_documents/inverted_index[word]['documentFrequency']), 2)
                results[key].update({word:tfidf})
        
    return results

In [79]:
def calculate_doc_lengths(files,tfidf_index):
    doc_lengths = {}
    for docno in files:
        length = 0
        for word in files[docno]:
            length += tfidf_index[word][docno]**2
            # doc_lengths.update({docno:sqrt(tfidf_index[word][docno]**2)})
        doc_lengths.update({docno:math.sqrt(length)})
    return doc_lengths

In [63]:
def compute_cosine_similarity(query, document):
    #get the query vector
    query_vector = []
    for word in query:
        if word in document:
            query_vector.append(1)
        else:
            query_vector.append(0)
    
    #get the document vector
    document_vector = []
    for word in document:
        if word in query:
            document_vector.append(1)
        else:
            document_vector.append(0)
    
    #compute the cosine similarity
    return cosine_similarity([query_vector], [document_vector])[0][0]


In [68]:
files, vocabulary = collect_info(coll_files, stop_words)

print("vocabulary length: " , len(vocabulary))
print("files length: " , len(files))

Progress: 1/322
Progress: 2/322
Progress: 3/322
Progress: 4/322
Progress: 5/322
Progress: 6/322
Progress: 7/322
Progress: 8/322
Progress: 9/322
Progress: 10/322
Progress: 11/322
Progress: 12/322
Progress: 13/322
Progress: 14/322
Progress: 15/322
Progress: 16/322
Progress: 17/322
Progress: 18/322
Progress: 19/322
Progress: 20/322
Progress: 21/322
Progress: 22/322
Progress: 23/322
Progress: 24/322
Progress: 25/322
Progress: 26/322
Progress: 27/322
Progress: 28/322
Progress: 29/322
Progress: 30/322
Progress: 31/322
Progress: 32/322
Progress: 33/322
Progress: 34/322
Progress: 35/322
Progress: 36/322
Progress: 37/322
Progress: 38/322
Progress: 39/322
Progress: 40/322
Progress: 41/322
Progress: 42/322
Progress: 43/322
Progress: 44/322
Progress: 45/322
Progress: 46/322
Progress: 47/322
Progress: 48/322
Progress: 49/322
Progress: 50/322
Progress: 51/322
Progress: 52/322
Progress: 53/322
Progress: 54/322
Progress: 55/322
Progress: 56/322
Progress: 57/322
Progress: 58/322
Progress: 59/322
Progre

In [69]:
number_of_documents = len(files)


In [70]:
inverted_index = create_inverted_index(files, vocabulary)


creating prog:  1000
creating prog:  2000
creating prog:  3000
creating prog:  4000
creating prog:  5000
creating prog:  6000
creating prog:  7000
creating prog:  8000
creating prog:  9000
creating prog:  10000
creating prog:  11000
creating prog:  12000
creating prog:  13000
creating prog:  14000
creating prog:  15000
creating prog:  16000
creating prog:  17000
creating prog:  18000
creating prog:  19000
creating prog:  20000
creating prog:  21000
creating prog:  22000
creating prog:  23000
creating prog:  24000
creating prog:  25000
creating prog:  26000
creating prog:  27000
creating prog:  28000
creating prog:  29000
creating prog:  30000
creating prog:  31000
creating prog:  32000
creating prog:  33000
creating prog:  34000
creating prog:  35000
creating prog:  36000
creating prog:  37000
creating prog:  38000
creating prog:  39000
creating prog:  40000
creating prog:  41000
creating prog:  42000
creating prog:  43000
creating prog:  44000
creating prog:  45000
creating prog:  460

In [72]:
queries = collect_queries()

In [99]:
queries

{'1': ['cope',
  'overcrowd',
  'prison',
  'desc',
  'document',
  'provid',
  'inform',
  'jail',
  'prison',
  'overcrowd',
  'inmat',
  'ar',
  'forc',
  'cope',
  'condit',
  'reveal',
  'plan',
  'reliev',
  'overcrowd',
  'condit',
  'narr',
  'relev',
  'document',
  'describ',
  'scene',
  'overcrowd',
  'becom',
  'common',
  'jail',
  'prison',
  'countri',
  'document',
  'identifi',
  'inmat',
  'ar',
  'forc',
  'cope',
  'overcrowd',
  'condit',
  'correct',
  'system',
  'plan',
  'allevi',
  'crowd',
  'condit',
  'narr',
  'desc'],
 '2': ['accus',
  'cheat',
  'contractor',
  'defens',
  'project',
  'desc',
  'document',
  'refer',
  'alleg',
  'illeg',
  'commit',
  'ani',
  'entiti',
  'seek',
  'contract',
  'behalf',
  'militari',
  'forc',
  'narr',
  'relev',
  'document',
  'mention',
  'alleg',
  'improprieti',
  'improprieti',
  'individu',
  'entiti',
  'compani',
  'corpor',
  'domest',
  'foreign',
  'attempt',
  'provid',
  'servic',
  'product',
  'rela

In [77]:
tfidf_index = create_tfidf_index(inverted_index, number_of_documents, files)

In [80]:
doc_lengths = calculate_doc_lengths(files, tfidf_index)

In [107]:
queries_tfidf = query_tfidf_index(queries, inverted_index, number_of_documents)

In [108]:
queries_tfidf

{'1': {'forc': 0.0987808755959802,
  'scene': 0.10699862285510595,
  'crowd': 0.09596385960914511,
  'describ': 0.08585397049698502,
  'allevi': 0.18627535439077275,
  'system': 0.06788853748407671,
  'narr': 0.47654724577158697,
  'relev': 0.17195216963049847,
  'identifi': 0.07994946453847752,
  'reliev': 0.1474206619630755,
  'provid': 0.06477952828257463,
  'common': 0.09126588486940368,
  'countri': 0.04693273744287369,
  'condit': 0.25305873715730015,
  'ar': 0.035776960348572126,
  'inform': 0.07035192949318182,
  'reveal': 0.11712688092251006,
  'cope': 0.472770629825735,
  'overcrowd': 0.9126557430640159,
  'becom': 0.0705386918895575,
  'jail': 0.20078157623264534,
  'correct': 0.11498817892089759,
  'plan': 0.09648257825697089,
  'inmat': 0.2842124597077882,
  'prison': 0.2418969550408742,
  'document': 0.29830561401648453},
 '2': {'term': 0.052812348552008705,
  'forc': 0.039344925025517535,
  'attempt': 0.06008230729629636,
  'seek': 0.05804499315710246,
  'bid': 0.0737047

In [None]:
# print(compute_cosine_similarity(queries['1'], list(files[2].values())[0]))
scores = []
for t in range(1, 51):
    for i in range(len(files)-1):
        scores.append(compute_cosine_similarity(queries['3'], list(files[i].values())[0]))
    

print(max(scores))

In [43]:
print(compute_cosine_similarity("I love horror movies", "Lights out is a horror movie"))

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 20 while Y.shape[1] == 28