In [63]:
#Implement an Information Retrieval (IR) system based on the vector space model for a collection of documents.
#For document weighting, use the tf-idf scheme: w_ij = tf_ij * idf_i
#For each query, the system produces a ranked list of documents, from the most similar to the query down to the least similar. Query terms can use the modified tf-idf weighting: w_iq = (0.5 + 0.5 * tf_iq) * idf_i
#For ranking, use the cosine similarity measure.
import csv
import math
import os
import string
import sys
import time
from collections import Counter, defaultdict

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

import Porter_Stemming as ps

In [107]:
#Step 1 Preprocessing
#Input: Documents that are read one by one from the collection
#Output: Tokens to be added to the index (vocabulary)
#Get Start time 
# Record the wall-clock start time of the preprocessing run.
start_time = time.time()

# Names of the regular files in ./coll/. Sorted because os.listdir() returns
# entries in arbitrary order, which would make document order differ per run.
coll_files = sorted(f for f in os.listdir(r"./coll/") if os.path.isfile(os.path.join(r"./coll/", f)))

# Whitespace-separated stop words; the context manager closes the file handle.
with open(r"./stopwords.txt", "r") as stop_file:
    stop_words = stop_file.read().split()

def collect_info(coll_files, stop_words):
    """Tokenise every document in the collection and build the vocabulary.

    Each file in ``./coll/`` is parsed with BeautifulSoup. For every ``<doc>``
    element, the ``<text>`` markup is lower-cased, punctuation and digits are
    replaced by spaces, the remaining tokens are Porter-stemmed, and tokens
    found in ``stop_words`` are dropped.

    Args:
        coll_files: file names (not paths) located under ``./coll/``.
        stop_words: iterable of stop words to exclude from the index.

    Returns:
        A ``(files, vocabulary)`` tuple. ``files`` is a list with one
        single-entry dict ``{docno: [token, ...]}`` per document (duplicates
        kept, in document order); ``vocabulary`` is the set of all kept tokens.
    """
    files = []
    vocabulary = set()  # a set, so every term is stored only once
    stop_set = set(stop_words)  # O(1) membership instead of scanning a list per token

    # One translation table that blanks out both punctuation and digits.
    removable = string.punctuation + string.digits
    to_space = str.maketrans(removable, " " * len(removable))

    porter = ps.PorterStemmer()
    total_length = len(coll_files)

    for count, file in enumerate(coll_files, start=1):
        with open(r"./coll/" + file, "r") as f:
            soup = BeautifulSoup(f, 'lxml')

            for doc in soup.find_all('doc'):
                docno = doc.find_all('docno')[0].text.strip()

                text_tag = doc.find('text')
                # A document without <text> yields no tokens (str(None) would
                # otherwise be indexed as the word "none").
                raw_text = '' if text_tag is None else str(text_tag)
                raw_text = raw_text.replace('<text>', ' ').replace('</text>', ' ').replace('\n', ' ')
                cleaned = raw_text.lower().translate(to_space)

                stems = [porter.stem(word, 0, len(word) - 1) for word in cleaned.split()]

                # Filter the stemmed token list. The previous code iterated over
                # the cleaned *string*, producing single characters.
                # Stop words are matched against stems, as in collect_queries.
                tokens = [word for word in stems if word not in stop_set]

                vocabulary.update(tokens)
                files.append({docno: tokens})
        print ("Progress: " + str(count) + "/" + str(total_length))
    return files, vocabulary

In [102]:


def create_inverted_index(files, vocabulary, output_csv : bool = False):
    """Build an inverted index mapping each term to its per-document counts.

    Args:
        files: list of single-entry dicts ``{docno: [token, ...]}``.
        vocabulary: iterable of all terms; every token in ``files`` must be in it.
        output_csv: when True, also write the index to ``./inverted_index.csv``
            with one row per term and one column per document.

    Returns:
        ``{term: {'documentFrequency': df, docno: tf, ...}}`` where ``df`` is
        the number of distinct documents containing the term and ``tf`` is the
        number of occurrences of the term in that document.

    Raises:
        KeyError: if a token in ``files`` is missing from ``vocabulary``.
    """
    inverted_index = {term: {'documentFrequency': 0} for term in vocabulary}
    csv_columns = ['word', 'documentFrequency']

    for count, file in enumerate(files, start=1):
        # Each entry holds exactly one docno -> token-list pair.
        key, value = list(file.items())[0]
        for word in value:
            postings = inverted_index[word]
            if key not in postings:
                # First occurrence in this document: new posting, and the
                # document frequency grows by exactly one.
                postings[key] = 1
                postings['documentFrequency'] += 1
            else:
                # Repeat occurrence: only the term frequency changes.
                postings[key] += 1

        if count % 1000 == 0:
            print("creating prog: ", count)
        csv_columns.append(key)

    if output_csv:
        # Write the index with csv.DictWriter; documents that do not contain a
        # term are left as empty cells in that term's row.
        csv_file = r"./inverted_index.csv"
        try:
            # newline='' is required by the csv module to avoid blank rows on
            # platforms that translate line endings.
            with open(csv_file, 'w', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
                writer.writeheader()
                for written, (term, postings) in enumerate(inverted_index.items(), start=1):
                    writer.writerow({'word': term, **postings})
                    if written % 1000 == 0:
                        print("writing prog: ", written)
        except IOError as err:
            # Best effort: the in-memory index is still returned.
            print("I/O error", err)

    return inverted_index

In [11]:
def collect_queries():
    """Read ``./topics1-50.txt`` and return the query terms of every topic.

    For each ``<top>`` element the ``<title>`` markup is lower-cased,
    punctuation and digits are replaced by spaces, the tokens are
    Porter-stemmed, and the module-level ``stop_words`` are removed.

    Returns:
        ``{num: [term, ...]}`` where ``num`` is the first two characters of the
        stripped ``<num>`` text (stripped again) and the terms are the unique
        stems left after stop-word removal, in no particular order.
    """
    queries = {}

    # Punctuation and digits are both mapped to a space in a single pass.
    blanked = string.punctuation + string.digits
    blank_table = str.maketrans(blanked, " " * len(blanked))

    with open(r"./topics1-50.txt", "r") as f:
        soup = BeautifulSoup(f, 'lxml')

        for topic in soup.find_all('top'):
            # Topic id: keep only the first two characters of the <num> text.
            num_text = topic.find_all('num')[0].text.strip()
            num = num_text[0:2].strip()

            # Swap 'title' for another tag here to test a different query section.
            markup = str(topic.find('title'))
            for fragment in ('<title>', '</title>', '\n'):
                markup = markup.replace(fragment, ' ')
            cleaned = markup.lower().translate(blank_table)

            porter = ps.PorterStemmer()
            stems = [porter.stem(token, 0, len(token) - 1) for token in cleaned.split()]

            # The set difference removes stop words and duplicate stems.
            queries[num] = list(set(stems) - set(stop_words))

    return queries

In [12]:
#read the inverted index csv into a pandas dataframe
def read_inverted_index(path=r"./inverted_index.csv"):
    """Load an inverted-index CSV into a pandas DataFrame.

    Args:
        path: location of the CSV file; defaults to ``./inverted_index.csv``,
            the file written by ``create_inverted_index(..., output_csv=True)``.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)
    

In [13]:
def compute_cosine_similarity(query, document):
    """Return the cosine similarity between the term counts of two texts.

    Each argument may be a list of tokens or a raw string; strings are split
    on whitespace. Terms are weighted by their raw occurrence count, and no
    idf factor is applied here.

    Args:
        query: query tokens, or a whitespace-separated string.
        document: document tokens, or a whitespace-separated string.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the inputs share no term or when
        either input is empty.
    """
    query_terms = query.split() if isinstance(query, str) else query
    document_terms = document.split() if isinstance(document, str) else document

    query_tf = Counter(query_terms)
    document_tf = Counter(document_terms)

    # Only shared terms contribute; a Counter returns 0 for missing keys.
    dot_product = sum(weight * document_tf[term] for term, weight in query_tf.items())
    if dot_product == 0:
        # Also covers empty inputs, so the division below never sees a zero norm.
        return 0.0

    query_norm = math.sqrt(sum(weight * weight for weight in query_tf.values()))
    document_norm = math.sqrt(sum(weight * weight for weight in document_tf.values()))
    return dot_product / (query_norm * document_norm)


In [108]:
# Run the preprocessing pass over the whole collection. `files` is a list of
# single-entry {docno: tokens} dicts and `vocabulary` is a set of tokens.
files, vocabulary = collect_info(coll_files, stop_words)

# Report how many distinct tokens and how many documents were collected.
print("vocabulary length: " , len(vocabulary))
print("files length: " , len(files))

Progress: 1/322
Progress: 2/322
Progress: 3/322
Progress: 4/322
Progress: 5/322
Progress: 6/322
Progress: 7/322
Progress: 8/322
Progress: 9/322
Progress: 10/322
Progress: 11/322
Progress: 12/322
Progress: 13/322
Progress: 14/322
Progress: 15/322
Progress: 16/322
Progress: 17/322
Progress: 18/322
Progress: 19/322
Progress: 20/322
Progress: 21/322
Progress: 22/322
Progress: 23/322
Progress: 24/322
Progress: 25/322
Progress: 26/322
Progress: 27/322
Progress: 28/322
Progress: 29/322
Progress: 30/322
Progress: 31/322
Progress: 32/322
Progress: 33/322
Progress: 34/322
Progress: 35/322
Progress: 36/322


KeyboardInterrupt: 

In [103]:
# Build the inverted index in memory; output_csv is left at its default (False), so no CSV is written.
inverted_index = create_inverted_index(files, vocabulary)


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


KeyboardInterrupt: 

In [99]:
# Display the current value of inverted_index as the cell output.
inverted_index

{'cuddi': {'documentFrequency': 3,
  'AP880516-0139': 1,
  'AP880524-0073': 1,
  'AP880829-0097': 1},
 'libertin': {'documentFrequency': 3,
  'AP880426-0259': 1,
  'AP880816-0244': 1,
  'AP880924-0001': 1},
 'adlai': {'documentFrequency': 24,
  'AP880317-0035': 1,
  'AP880317-0067': 1,
  'AP880321-0020': 1,
  'AP880606-0131': 1,
  'AP880707-0175': 1,
  'AP880718-0026': 1,
  'AP880721-0055': 1,
  'AP880721-0238': 1,
  'AP880808-0168': 1,
  'AP880816-0215': 1,
  'AP880830-0217': 1,
  'AP880927-0241': 1,
  'AP881003-0273': 1,
  'AP881007-0074': 1,
  'AP881009-0020': 1,
  'AP881010-0203': 1,
  'AP881017-0215': 1,
  'AP881022-0087': 1,
  'AP881028-0188': 1,
  'AP881102-0096': 1,
  'AP881102-0240': 1,
  'AP881111-0012': 1,
  'AP881214-0135': 1,
  'AP881225-0049': 1},
 'tomiko': {'documentFrequency': 1, 'AP880314-0080': 1},
 'kielti': {'documentFrequency': 1, 'AP880228-0089': 1},
 'blasso': {'documentFrequency': 1, 'AP880705-0150': 1},
 'churchman': {'documentFrequency': 16,
  'AP880225-0090'

In [7]:
# Reload the index from ./inverted_index.csv via pd.read_csv.
# NOTE(review): another cell assigns this same name the dict returned by
# create_inverted_index(), so the type it holds depends on which cell ran last.
inverted_index = read_inverted_index()
print(inverted_index)

             word                                          documents
0          wittak                 ['AP880614-0002', 'AP880615-0290']
1         nominin                                  ['AP881214-0008']
2            krew  ['AP880213-0167', 'AP880214-0058', 'AP880215-0...
3         pullman  ['AP880519-0296', 'AP880522-0075', 'AP880531-0...
4         partier                 ['AP880608-0195', 'AP880815-0015']
...           ...                                                ...
116849     endara  ['AP880504-0004', 'AP880528-0137', 'AP880810-0...
116850   lizichev  ['AP880525-0084', 'AP880525-0260', 'AP880526-0...
116851    outmigr                                  ['AP880425-0024']
116852     tadtad                                  ['AP880309-0041']
116853  axpproxim                                  ['AP880616-0181']

[116854 rows x 2 columns]


In [None]:
# Parse ./topics1-50.txt into a {topic number: [terms]} dict.
queries = collect_queries()

In [None]:
# Inspect the second collected document: a single-entry dict keyed by its docno.
files[1]

In [None]:
# Score every document against each of the 50 topics (keys '1'..'50' in
# `queries`). Results are appended query by query, document by document.
scores = []
for t in range(1, 51):
    # Use the topic for this iteration; the loop previously scored topic '3' every time.
    query_terms = queries[str(t)]
    # Iterate over all documents; range(len(files)-1) skipped the last one.
    for doc_entry in files:
        scores.append(compute_cosine_similarity(query_terms, list(doc_entry.values())[0]))


print(max(scores))

In [None]:
# Ad-hoc check that passes two raw sentences (strings); the scoring cell passes token lists instead.
print(compute_cosine_similarity("I love horror movies", "Lights out is a horror movie"))