In [20]:
import re
import os
import csv
import nltk
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from collections import Counter

N_doc=26543

In [21]:
tokenizer = RegexpTokenizer(r'[a-z]+') #Change this line by removing 0-9 if we don't want numbers in the plot tokens.
stop_words = set(stopwords.words("english"))
stemmer= PorterStemmer()

In [None]:
def get_text_document(document_number)
    with open('articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        temp = csv.DictReader(file, delimiter = '\t')
        for row in temp:
            #tokenizing the plot, making everything lowercase and removing unwanted characters
            Plot_words=tokenizer.tokenize(row["Plot"].lower())
            tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
    return tokens_without_sw

## Vocabulary dictionary

In [5]:
# Create 'vocabulary' dictionary
# - keys: all words in the all documents
# - values: index of each word from 0 to 55037
tokens_set=set()
for i in range(1,N_doc+1):
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        temp = csv.DictReader(file, delimiter = '\t')
        for row in temp:
            #tokenizing the plot, making everything lowercase and removing unwanted characters
            Plot_words=tokenizer.tokenize(row["Plot"].lower())
            tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
            temp=set(tokens_without_sw)
            
            tokens_set=tokens_set.union(temp)

#A few steps to save it in a dictionary ordered alphabetically
tokens_list=list(tokens_set)
tokens_list.sort()
tokens_tuple=[(tokens_list[i],i) for i in range(len(tokens_list))]
tokens_dictionary=dict((x, y) for x, y in tokens_tuple)

In [20]:
len(tokens_dictionary)

55037

#### Storing a file as a pkl file
```with open('vocabulary.pkl', 'wb') as handle:
    pickle.dump(tokens_dictionary, handle)```

In [3]:
#loading a pkl file
with open('vocabulary.pkl', 'rb') as handle:
    vocabulary = pickle.load(handle)

In [50]:
vocabulary

{'aa': 0,
 'aaaanah': 1,
 'aachim': 2,
 'aahana': 3,
 'aahz': 4,
 'aalish': 5,
 'aaliya': 6,
 'aaliyah': 7,
 'aaman': 8,
 'aamir': 9,
 'aang': 10,
 'aap': 11,
 'aar': 12,
 'aarav': 13,
 'aardema': 14,
 'aaren': 15,
 'aaron': 16,
 'aarsen': 17,
 'aarti': 18,
 'aasmaani': 19,
 'aastra': 20,
 'aavarana': 21,
 'aayan': 22,
 'ab': 23,
 'aba': 24,
 'ababa': 25,
 'aback': 26,
 'abadabad': 27,
 'abaddon': 28,
 'abadi': 29,
 'abagnal': 30,
 'abah': 31,
 'abalam': 32,
 'abalon': 33,
 'abandon': 34,
 'abandonato': 35,
 'abarat': 36,
 'abarrach': 37,
 'abas': 38,
 'abascar': 39,
 'abash': 40,
 'abat': 41,
 'abattoir': 42,
 'abb': 43,
 'abba': 44,
 'abbado': 45,
 'abbadon': 46,
 'abbatoir': 47,
 'abbear': 48,
 'abberlain': 49,
 'abberwick': 50,
 'abbess': 51,
 'abbey': 52,
 'abbeybeast': 53,
 'abbeydal': 54,
 'abbi': 55,
 'abbot': 56,
 'abbott': 57,
 'abbottempo': 58,
 'abbraccio': 59,
 'abbrevi': 60,
 'abbzug': 61,
 'abc': 62,
 'abd': 63,
 'abdal': 64,
 'abdel': 65,
 'abdic': 66,
 'abdiel': 67,
 '

## Inverted Index dictionary

In [6]:
# Create the first inverted index
#We're using the vocabulary without the numbers.

#Initializing the inverted index

# - keys: index of each word from 0 to 55037
# - values: each book containing the unique word (index number)
inverted_index={}
for i in range(len(vocabulary)):
    inverted_index[i]=[]

for j in range(1,26544):
        with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
            temp = csv.DictReader(file, delimiter = '\t')
            for row in temp:
                #tokenizing the plot, making everything lowercase and removing unwanted characters
                Plot_words=tokenizer.tokenize(row["Plot"].lower())
                tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
                
                for word in tokens_without_sw:
                    inverted_index[vocabulary[word]].append(j)

for i in range(len(inverted_index)):
    inverted_index[i]=set(inverted_index[i])
    

#### Storing a the inverted_index file as a pkl file
```with open('inverted_index_1.pkl', 'wb') as handle:
    pickle.dump(inverted_index, handle)```

In [44]:
#loading the inverted index as a pkl file
with open('inverted_index_1.pkl', 'rb') as handle:
    inverted_index = pickle.load(handle)

In [54]:
inverted_index

{0: {1630, 16376, 17707, 22524},
 1: {23931},
 2: {14426, 14653},
 3: {23941},
 4: {5377, 11723},
 5: {25354},
 6: {8873},
 7: {21656},
 8: {17341},
 9: {9896},
 10: {9915, 12713, 13440, 14249, 18285, 20730, 21893, 22446, 23521},
 11: {8264},
 12: {22215},
 13: {24346},
 14: {15613},
 15: {18737},
 16: {185,
  384,
  650,
  1452,
  2060,
  3008,
  3242,
  3550,
  3670,
  4174,
  4595,
  4669,
  4670,
  7547,
  8345,
  8625,
  9926,
  9932,
  10468,
  10497,
  11048,
  11279,
  11609,
  12137,
  12264,
  12366,
  13503,
  13816,
  13847,
  13903,
  14281,
  14908,
  15135,
  15223,
  15263,
  15309,
  15500,
  15581,
  16545,
  16554,
  17406,
  18056,
  18234,
  18361,
  18405,
  18709,
  18991,
  19011,
  19048,
  19070,
  19309,
  20292,
  20589,
  20894,
  21171,
  21299,
  22314,
  22851,
  23118,
  23590,
  24444,
  24565,
  24885,
  25058,
  26057,
  26122,
  26264},
 17: {23712},
 18: {6394},
 19: {25845},
 20: {20440},
 21: {11672},
 22: {10712},
 23: {2375,
  2901,
  3442,
  5

## Inverted Index 2  dictionary with repeated word

In [7]:
#Initializing the inverted index with repetitions

# - keys: index of each word from 0 to 55037
# - values: each book containing the word with repetitions (index number)

inverted_index2={}
for i in range(len(vocabulary)):
    inverted_index2[i]=[]

for j in range(1,26544):
        with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
            temp = csv.DictReader(file, delimiter = '\t')
            for row in temp:
                #tokenizing the plot, making everything lowercase and removing unwanted characters
                Plot_words=tokenizer.tokenize(row["Plot"].lower())
                tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
                
                for word in tokens_without_sw:
                    inverted_index2[vocabulary[word]].append(j)


#### Storing a the inverted_index file as a pkl file
```with open('inverted_index_2.pkl', 'wb') as handle:
    pickle.dump(inverted_index2, handle)```

#### loading the inverted_2 index as a pkl file
```with open('inverted_index_2.pkl', 'rb') as handle:
    inverted_index_2 = pickle.load(handle)```

## Vocabulary 2 dictionary

In [47]:
# We create the term_id dictionary
# - keys: index of each word from 0 to 55037
# - values: number of times a word appear in all books
vocabulary2={}
for i in range(1,26544):
    with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
    #with open('articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
        temp = csv.DictReader(file, delimiter = '\t')
        for row in temp:
            #tokenizing the plot, making everything lowercase and removing unwanted characters
            Plot_words=tokenizer.tokenize(row["Plot"].lower())
            tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
            temp=set(tokens_without_sw)
            for word in temp:
                if vocabulary[word] in vocabulary2:
                    vocabulary2[vocabulary[word]]+=1
                else:
                    vocabulary2[vocabulary[word]]=1

#### Storing a the vocsbulary2 file as a pkl file
```with open('vocabulary2.pkl', 'wb') as handle:
    pickle.dump(vocabulary2, handle)```

#### Storing a the inverted_index file as a pkl file
```with open('inverted_index_2.pkl', 'wb') as handle:
    pickle.dump(inverted_index2, handle)```

In [49]:
vocabulary2

{10472: 3076,
 27854: 10019,
 47120: 2187,
 47074: 988,
 33232: 1258,
 25396: 5,
 10105: 87,
 44382: 1427,
 43139: 190,
 53708: 1527,
 32440: 2347,
 17437: 2958,
 35378: 4,
 7557: 19,
 8840: 1115,
 34192: 513,
 44422: 393,
 46144: 835,
 11785: 1744,
 50208: 447,
 26263: 1646,
 18302: 1026,
 22401: 3199,
 43780: 175,
 52978: 155,
 28541: 7515,
 27997: 937,
 30618: 1962,
 10956: 255,
 9350: 1168,
 39987: 205,
 54533: 357,
 1455: 1190,
 54366: 7426,
 29372: 5027,
 47602: 4822,
 15746: 5,
 655: 2056,
 35856: 1,
 34811: 4258,
 13072: 81,
 39684: 1350,
 36101: 112,
 34931: 11082,
 53436: 779,
 6079: 2234,
 1244: 1223,
 41450: 502,
 32327: 377,
 14544: 227,
 16823: 1902,
 1917: 74,
 37280: 2575,
 27839: 1272,
 47115: 540,
 50188: 217,
 43099: 526,
 11835: 2952,
 28163: 6343,
 33217: 1013,
 42931: 2244,
 17576: 338,
 18994: 3232,
 25519: 2311,
 45993: 2075,
 22450: 229,
 53524: 1428,
 4367: 3990,
 20852: 170,
 15753: 2593,
 42892: 1301,
 13683: 2220,
 53739: 245,
 3232: 3704,
 32831: 3933,
 15

## Length documents

In [64]:
# Create the 'length_documents' dict
# - keys: books in number (from 1 to 26543)
# - values: len of each book
length_documents={}
for j in range(1,26544):
        with open('F://FILE_ADM_3//articles/article_' + str(j) +'.tsv', 'r', encoding="utf-8") as file:
            temp = csv.DictReader(file, delimiter = '\t')
            for row in temp:
                #tokenizing the plot, making everything lowercase and removing unwanted characters
                Plot_words=tokenizer.tokenize(row["Plot"].lower())
                tokens_without_sw = [stemmer.stem(word) for word in Plot_words if word not in stop_words]
                
                length_documents[j]=len(tokens_without_sw)

In [56]:
length_documents

{1: 86,
 2: 73,
 3: 76,
 4: 57,
 5: 27,
 6: 88,
 7: 65,
 8: 99,
 9: 94,
 10: 55,
 11: 21,
 12: 89,
 13: 63,
 14: 123,
 15: 85,
 16: 74,
 17: 46,
 18: 42,
 19: 46,
 20: 53,
 21: 72,
 22: 117,
 23: 70,
 24: 50,
 25: 79,
 26: 108,
 27: 51,
 28: 62,
 29: 98,
 30: 140,
 31: 166,
 32: 120,
 33: 60,
 34: 75,
 35: 94,
 36: 70,
 37: 125,
 38: 92,
 39: 46,
 40: 25,
 41: 138,
 42: 87,
 43: 81,
 44: 242,
 45: 86,
 46: 169,
 47: 38,
 48: 25,
 49: 97,
 50: 53,
 51: 25,
 52: 73,
 53: 145,
 54: 34,
 55: 50,
 56: 104,
 57: 59,
 58: 32,
 59: 52,
 60: 57,
 61: 68,
 62: 54,
 63: 45,
 64: 61,
 65: 152,
 66: 34,
 67: 74,
 68: 40,
 69: 48,
 70: 102,
 71: 50,
 72: 50,
 73: 70,
 74: 74,
 75: 70,
 76: 113,
 77: 78,
 78: 70,
 79: 46,
 80: 60,
 81: 98,
 82: 72,
 83: 47,
 84: 84,
 85: 77,
 86: 146,
 87: 100,
 88: 40,
 89: 45,
 90: 51,
 91: 75,
 92: 118,
 93: 82,
 94: 108,
 95: 25,
 96: 84,
 97: 15,
 98: 18,
 99: 181,
 100: 50,
 101: 80,
 102: 80,
 103: 62,
 104: 70,
 105: 72,
 106: 85,
 107: 150,
 108: 65,
 109: 2

## Create TfIdf Index

In [10]:
# Create the 'tfIdf_index' dict
# - keys: words mapped in numbers
# - values: {book in which the word appears, score of the word with respect to the book}
tfIdf_index={}
for key,value in inverted_index2.items():
    tfIdf_index[key]=[]
    n_ij=dict(Counter(value))
    print(n_ij)

    for element in n_ij:
        # issue with length_documents, we count also the duplicated words
        tf=n_ij[element]/length_documents[element]
        # issue with vocabulary2, we count also the duplicated words
        Idf=np.log(N_doc/vocabulary2[key])
        
        tfIdf_index[key].append((element,tf*Idf))


{16376: 1, 17707: 1, 22524: 1, 1630: 1}
16376


In [63]:
# MY TRIAL TO CHECK THE tf AND Idf
tfIdf_index_temp={}
key = 'circu'
value = {2886}
tfIdf_index_temp[key]=[]
n_ij=dict(Counter(value))
print(n_ij)

for element in n_ij:
    tf=n_ij[element]/length_documents[element]

    Idf=np.log(N_doc/vocabulary2[element])

    tfIdf_index_temp[key].append((element,tf*Idf))
print(tf)
print(Idf)
print(tfIdf_index_temp)

{2886: 1}
0.011494252873563218
9.087909049755293
{'circu': [(2886, 0.10445872470983095)]}


In [66]:
#Need to consider edge cases here when doing this in the main file
query = input()
query = tokenizer.tokenize(query.lower())
query_stems = [stemmer.stem(word) for word in query if word not in stop_words]

query_stem_test=query_stems
query_stems=[]

#Checking if input stems exists in the vocabulary

for word in query_stem_test:
    try:
        vocabulary[word]
        query_stems.append(word)
    except KeyError:
        print("Word",word,"not found. It will be ignored.")
temp=set()

#

if len(query_stems)>0:
    temp=inverted_index[vocabulary[query_stems[0]]]
    for stem in query_stems:
        temp=temp.intersection(inverted_index[vocabulary[stem]])

to_print=list(sorted(temp))

for i in to_print:
    with open('F://FILE_ADM_3//articles/article_' + str(i) +'.tsv', 'r', encoding="utf-8") as file:
            temp = csv.DictReader(file, delimiter = '\t')
            for row in temp:
                print("BookTitle:",row["bookTitle"])
                print("Plot:")
                print(row["Plot"])
                print("Url:",row["Url"])
                print()

yhjugfvc juygvbh
Word yhjugfvc not found. It will be ignored.
Word juygvbh not found. It will be ignored.
