In [1]:
import re
import os
import csv
import nltk
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from collections import Counter

# Number of article files in the corpus; the loops below read article_1.tsv .. article_<N_doc>.tsv.
N_doc=26543

In [2]:
# Shared text-processing tools used by the indexing and query cells.
# The tokenizer keeps only runs of the letters a-z (callers lowercase the text first),
# so digits and punctuation are dropped. Add 0-9 to the character class if numbers
# should be kept as plot tokens.
tokenizer = RegexpTokenizer(r'[a-z]+')
stop_words = set(stopwords.words("english"))  # NLTK English stop words, as a set
stemmer= PorterStemmer()

In [5]:
# Collect the set of distinct stemmed plot tokens over the whole corpus.
# For every article file: lowercase the "Plot" field, tokenize it, drop the
# English stop words and stem what is left. The term_id dictionary is derived
# from this set afterwards.

tokens_set = set()
for i in range(1, N_doc + 1):
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        # The reader gets its own name so it is never rebound while being iterated.
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            # update() grows the set in place; building a new set with union()
            # would copy the whole vocabulary once per row.
            tokens_set.update(
                stemmer.stem(word) for word in Plot_words if word not in stop_words
            )

In [16]:
# Give every token an integer id that follows alphabetical order:
# sorted token list -> (token, id) pairs -> token-to-id dictionary.

tokens_list = sorted(tokens_set)
tokens_tuple = [(token, position) for position, token in enumerate(tokens_list)]
tokens_dictionary = dict(tokens_tuple)

In [20]:
# Vocabulary size: how many distinct stemmed tokens received an id.
len(tokens_dictionary)

55037

#### Storing the vocabulary as a pkl file
```python
with open('vocabulary.pkl', 'wb') as handle:
    pickle.dump(tokens_dictionary, handle)
```

In [8]:
# Load the vocabulary from vocabulary.pkl into `vocabulary`.
# The cells below use it as a mapping from a stemmed token to its integer term id.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('vocabulary.pkl', 'rb') as handle:
    vocabulary = pickle.load(handle)

In [11]:
#Creating the first inverted index

#We're using the vocabulary without the numbers.


In [6]:
# Build the first inverted index: term_id -> set of the ids of the documents
# whose plot contains that term.

# Every term id in the vocabulary gets an entry, starting with an empty set.
inverted_index = {term_id: set() for term_id in range(len(vocabulary))}

for j in range(1, N_doc + 1):
    with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            # Lowercase and tokenize the plot, drop stop words, stem the rest.
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            # Only the distinct stems matter here. Adding straight into sets
            # avoids storing one list entry per token occurrence and
            # de-duplicating everything at the end.
            distinct_stems = {stemmer.stem(word) for word in Plot_words if word not in stop_words}

            for stem in distinct_stems:
                inverted_index[vocabulary[stem]].add(j)
    

#### Storing the first inverted index as a pkl file
```python
with open('inverted_index_1.pkl', 'wb') as handle:
    pickle.dump(inverted_index, handle)
```

In [10]:
# Load the first inverted index from inverted_index_1.pkl into `inverted_index`.
# The query cell indexes it by term id and intersects the values as sets of document ids.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('inverted_index_1.pkl', 'rb') as handle:
    inverted_index = pickle.load(handle)

In [125]:
# Search engine 1 (conjunctive query): print every book whose plot contains
# ALL of the terms typed by the user.
# Need to consider edge cases here when doing this in the main file.
query = input()
query = tokenizer.tokenize(query.lower())
query_stem_test = [stemmer.stem(word) for word in query if word not in stop_words]

# Keep only the stems that exist in the vocabulary; report the others.
query_stems = []
for word in query_stem_test:
    if word in vocabulary:
        query_stems.append(word)
    else:
        print("Stem",word,"not found. It will be ignored.")

# The matching documents are the intersection of the document-id sets of every
# known stem. With no usable stem the result stays empty and nothing is printed.
matching_docs = set()
if query_stems:
    matching_docs = set.intersection(
        *(inverted_index[vocabulary[stem]] for stem in query_stems)
    )

to_print = sorted(matching_docs)

# Show title, plot and url of each matching article, in increasing document id.
for i in to_print:
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            print("BookTitle:",row["bookTitle"])
            print("Plot:")
            print(row["Plot"])
            print("Url:",row["Url"])
            print()

dfghjk,
Stem dfghjk not found. It will be ignored.


In [60]:
# Inverted index with repetitions: term_id -> list of document ids, where a
# document id appears once for every occurrence of the term in that document's
# plot. The repetitions are what the term-frequency computation counts.
inverted_index2 = {term_id: [] for term_id in range(len(vocabulary))}

# N_doc (instead of a repeated literal) keeps this loop in step with the other
# corpus loops if the corpus size changes.
for j in range(1, N_doc + 1):
    with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            # Lowercase and tokenize the plot, drop stop words, stem the rest.
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]

            for word in tokens_without_sw:
                inverted_index2[vocabulary[word]].append(j)


In [74]:
# Document frequency table: vocabulary2[term_id] = number of rows (one row per
# article file is assumed - TODO confirm) whose plot contains the term at least
# once. Used as the denominator of the IDF.

vocabulary2 = {}
for i in range(1, N_doc + 1):
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        # The reader gets its own name so it is never rebound while being iterated.
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            # Lowercase and tokenize the plot, drop stop words, stem the rest.
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            # A set, so a term is counted once per plot however often it occurs.
            distinct_stems = {stemmer.stem(word) for word in Plot_words if word not in stop_words}
            for stem in distinct_stems:
                term_id = vocabulary[stem]
                vocabulary2[term_id] = vocabulary2.get(term_id, 0) + 1

In [5]:
# length_documents[doc_id] = number of plot tokens left after stop-word removal
# (repetitions included). Used as the denominator of the term frequency.
length_documents = {}
for j in range(1, N_doc + 1):
    with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            # Stemming maps each word to exactly one stem, so it cannot change
            # the count; only the stop-word filter matters here.
            length_documents[j] = sum(1 for word in Plot_words if word not in stop_words)

In [62]:
# TF-IDF inverted index: term_id -> list of (doc_id, tf-idf) pairs, where
#   tf  = occurrences of the term in the document / length_documents[doc_id]
#   idf = log(N_doc / number of documents containing the term)
tfIdf_index = {}
for key, value in inverted_index2.items():
    tfIdf_index[key] = []
    if not value:
        # No occurrences recorded for this term: nothing to score, and
        # vocabulary2 may have no entry for it.
        continue

    # The IDF depends only on the term, so it is computed once per term
    # rather than once per document.
    Idf = np.log(N_doc/vocabulary2[key])

    # Counter turns the repeated doc ids into {doc_id: occurrences}.
    for element, occurrences in Counter(value).items():
        tf = occurrences/length_documents[element]
        tfIdf_index[key].append((element, tf*Idf))

# New structure here

In [21]:
# Index for the second search engine:
# BookTokens[doc_id] = list of (term_id, tf-idf) pairs for the distinct terms of
# the document's plot, sorted by term_id.

BookTokens = {}
for i in range(1, N_doc+1):
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter = '\t')
        for row in reader:
            # Lowercase and tokenize the plot, drop stop words, stem the rest.
            Plot_words = tokenizer.tokenize(row["Plot"].lower())
            tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
            n_ij = Counter(tokens_without_sw)
            # One pair per distinct term, sorted once. Going through the counts
            # avoids rebuilding and re-sorting the list for every single token
            # occurrence.
            BookTokens[i] = sorted(
                (
                    vocabulary[word],
                    (occurrences/length_documents[i]) * np.log(N_doc/vocabulary2[vocabulary[word]]),
                )
                for word, occurrences in n_ij.items()
            )

In [12]:
# Reload the vocabulary from vocabulary.pkl into `vocabulary` (same file and
# variable as the earlier loading cell).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('vocabulary.pkl', 'rb') as handle:
    vocabulary = pickle.load(handle)

In [8]:
# Load `tfIdf_index` from tfIdf_index.pkl.
# NOTE(review): the cell that writes this file is not visible here; presumably it
# pickles the tfIdf_index dictionary built above - confirm before relying on its layout.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('tfIdf_index.pkl', 'rb') as handle:
    tfIdf_index = pickle.load(handle)