In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import os
import re
from datetime import datetime
import nltk
from nltk.stem import PorterStemmer
import csv
import json
import pandas as pd
import numpy as np
import heapq

## 2. Search Engine

In [2]:
def text_mining(string):
    # gather all the stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenization
    tokens = nltk.word_tokenize(string.lower())
    # remove punctuations and numbers and then word stemming
    res_tok = [PorterStemmer().stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return res_tok

In [3]:
def create_vocab():
    vocabulary = dict()
    for i in tqdm(range(1, 384)):
        path = f'pages_tsv/pages_tsv/page_{i}/'
        for file in os.listdir(path):
            tsv_file = open(path+file, 'r', encoding='utf-8')
            anime = csv.DictReader(tsv_file, delimiter='\t')
            descr = anime.__next__()['animeDescription']

            for word in text_mining(descr):
                if word not in vocabulary.keys():
                    vocabulary[word] = len(vocabulary)

    file_voc = open("vocabulary.json", "w", encoding='utf-8')
    json.dump(vocabulary, file_voc, ensure_ascii=False)
    file_voc.close()

In [4]:
def invertedIndex():

    inverted_index = dict()

    voc_json = open('vocabulary.json', 'r', encoding='utf-8')
    vocabulary = json.load(voc_json)

    # creating an empty inverted_index dictionary
    for word in vocabulary:
        inverted_index[vocabulary[word]] = []

    for i in tqdm(range(1, 384)):
        path = f'pages_tsv/pages_tsv/page_{i}/'

        for file in os.listdir(path):
            tsv_file = open(path+file, 'r', encoding='utf-8')
            document_id = 'document_' + (''.join(re.findall(r'\d+', file)))
            anime = csv.DictReader(tsv_file, delimiter='\t')
            descr = anime.__next__()['animeDescription']

            for word in text_mining(descr):
                if document_id not in inverted_index[vocabulary[word]]:
                    inverted_index[vocabulary[word]].append(document_id)

    file_inv_ind = open("inverted_index.json", "w", encoding='utf-8')
    json.dump(inverted_index, file_inv_ind, ensure_ascii=False)
    file_inv_ind.close()

### 2.1. Conjunctive query

In [5]:
create_vocab()

100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [00:49<00:00,  7.69it/s]


#### 2.1.1) Create your index!

In [6]:
invertedIndex()

100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [00:43<00:00,  8.84it/s]


#### 2.1.2) Execute the query

In [7]:
# opening the vocabulary
voc_json = open('vocabulary.json', 'r', encoding='utf-8')
vocabulary = json.load(voc_json)
voc_json.close()

# opening the inverted_index
inv_ind_json = open('inverted_index.json', 'r', encoding='utf-8')
inverted_index = json.load(inv_ind_json)
inv_ind_json.close()

# opening the anime_links
list_url_txt = open('anime_links.txt', 'r', encoding='utf-8')
list_url = list_url_txt.read().splitlines()
list_url_txt.close()

In [11]:
query = input()
assert len(query) > 0, "The query is empty!!"
# stemming the query
query_stemmed = text_mining(query)

saiyan race


In [12]:
# creating index query dictionary
query_dict = dict()
for word in query_stemmed:
    if word in vocabulary.keys():
        query_dict[vocabulary[word]] = inverted_index[str(vocabulary[word])]

In [13]:
# saving the inverted_index of the query
query_index = list(query_dict.keys())


# searching for the documents requested from the query
doc_list = set(query_dict[query_index[0]])
for query_word in query_index[1:]:
    doc_list.intersection_update(query_dict[query_word])

print("List of documents found with the query =", doc_list)

# creating a pandas dataframe for the final result
doc_df = pd.DataFrame(columns=["animeTitle", "animeDescription", "Url"])
for doc in doc_list:
    i = int(''.join(re.findall(r'\d+', doc)))
    doc_page = (i-1)//50 + 1
    path = f'pages_tsv/pages_tsv/page_{doc_page}/anime_{i}.tsv'
    tsv_file = open(path, 'r', encoding='utf-8')
    anime_tsv = csv.DictReader(tsv_file, delimiter='\t')
    anime = anime_tsv.__next__()
    doc_df.loc[doc, ["animeTitle", "animeDescription", "Url"]] = [anime["animeTitle"], anime["animeDescription"], list_url[i-1]]

List of documents found with the query = {'document_365', 'document_1035', 'document_401', 'document_1469'}


In [14]:
doc_d = dict(selector="th",
             props=[('text-align', 'center')])
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

doc_df.style.set_properties(**{'text-align':'center'}).set_table_styles([doc_d]).format({'Url': make_clickable})

Unnamed: 0,animeTitle,animeDescription,Url
document_365,Dragon Ball Z,"Five years after winning the World Martial Arts tournament, Gokuu is now living a peaceful life with his wife and son. This changes, however, with the arrival of a mysterious enemy named Raditz who presents himself as Gokuu's long-lost brother. He reveals that Gokuu is a warrior from the once powerful but now virtually extinct Saiyan race, whose homeworld was completely annihilated. When he was sent to Earth as a baby, Gokuu's sole purpose was to conquer and destroy the planet; but after suffering amnesia from a head injury, his violent and savage nature changed, and instead was raised as a kind and well-mannered boy, now fighting to protect others. With his failed attempt at forcibly recruiting Gokuu as an ally, Raditz warns Gokuu's friends of a new threat that's rapidly approaching Earth—one that could plunge Earth into an intergalactic conflict and cause the heavens themselves to shake. A war will be fought over the seven mystical dragon balls, and only the strongest will survive in Dragon Ball Z. [Written by MAL Rewrite]",https://myanimelist.net/anime/813/Dragon_Ball_Z
document_1035,Dragon Ball Kai,"Five years after the events of Dragon Ball, martial arts expert Gokuu is now a grown man married to his wife Chi-Chi, with a four-year old son named Gohan. While attending a reunion on Turtle Island with his old friends Master Roshi, Krillin, Bulma and others, the festivities are interrupted when a humanoid alien named Raditz not only reveals the truth behind Gokuu's past, but kidnaps Gohan as well. With Raditz displaying power beyond anything Gokuu has seen before, he is forced to team up with his old nemesis, Piccolo, in order to rescue his son. But when Gokuu and Piccolo reveal the secret of the seven mystical wish-granting Dragon Balls to Raditz, he informs the duo that there is more of his race, the Saiyans, and they won’t pass up an opportunity to seize the power of the Dragon Balls for themselves. These events begin the saga of Dragon Ball Kai, a story that finds Gokuu and his friends and family constantly defending the galaxy from increasingly more powerful threats. Bizarre, comical, heartwarming and threatening characters come together in a series of battles that push the powers and abilities of Gokuu and his friends beyond anything they have ever experienced.",https://myanimelist.net/anime/6033/Dragon_Ball_Kai
document_401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of the infamous Saiyan warrior race, King Vegeta noticed a baby named Broly whose latent power exceeded that of his own son. Believing that Broly's power would one day surpass that of his child, Vegeta, the king sends Broly to the desolate planet Vampa. Broly's father Paragus follows after him, intent on rescuing his son. However, his ship gets damaged, causing the two to spend years trapped on the barren world, unaware of the salvation that would one day come from an unlikely ally. Years later on Earth, Gokuu Son and Prince Vegeta—believed to be the last survivors of the Saiyan race—are busy training on a remote island. But their sparring is interrupted when the appearance of their old enemy Frieza drives them to search for the last of the wish-granting Dragon Balls on a frozen continent. Once there, Frieza shows off his new allies: Paragus and the now extremely powerful Broly. A legendary battle that shakes the foundation of the world ensues as Gokuu and Vegeta face off against Broly, a warrior without equal whose rage is just waiting to be unleashed. [Written by MAL Rewrite]",https://myanimelist.net/anime/36946/Dragon_Ball_Super__Broly
document_1469,Dragon Ball Z Special 1: Tatta Hitori no Saishuu Kessen,"Bardock, Son Goku's father, is a low-ranking Saiyan soldier who was given the power to see into the future by the last remaining alien on a planet he just destroyed. He witnesses the destruction of his race and must now do his best to stop Frieza's impending massacre. (Source: ANN)",https://myanimelist.net/anime/986/Dragon_Ball_Z_Special_1__Tatta_Hitori_no_Saishuu_Kessen


## 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index

We create two dictionaries:
- inverted_term such that for each word we have the list of documents in which it is contained in, and the relative tfIdf score.
- inverte_doc such that for each document we have the sum of the squares of the tfidf, we will use this dictonary in the execute query.

We compute the tf-idf as $tf*idf$:
- $tf=\frac{n_i}{|d|}$, n is the number of occurences of the i-th word in the document and |d| is the number of words in the document
- $idf=log_{10}\left(\frac{N}{n_d}\right)$, N is the total number of documents and $n_d$ is the number of documents contaning the word

In [15]:
def inverted():
    
    inverted_term = dict()
    inverted_doc = dict()
    
    #opening the vocabulary
    voc_json = open('vocabulary.json', 'r', encoding='utf-8')
    vocabulary = json.load(voc_json)
    
    # opening the inverted_index
    inv_ind_json = open('inverted_index.json', 'r', encoding='utf-8')
    inverted_index = json.load(inv_ind_json)
    inv_ind_json.close()
    

    # creating an empty inverted_index dictionary
    for word in vocabulary:
        inverted_term[vocabulary[word]] = []

    for i in tqdm(range(1, 384)):
        path = f'pages_tsv/pages_tsv/page_{i}/'

        for file in os.listdir(path):
            tsv_file = open(path+file, 'r', encoding='utf-8')
            document_id = 'document_' + (''.join(re.findall(r'\d+', file)))
            anime = csv.DictReader(tsv_file, delimiter='\t')
            descr = anime.__next__()['animeDescription']
            descr = text_mining(descr)

            for word in descr:
                tf = descr.count(word) / len(descr)
                idf = np.log10(19128/len(inverted_index[str(vocabulary[word])]))
                tfidf = tf * idf
                if (document_id, tfidf) not in inverted_term[vocabulary[word]]:
                    inverted_term[vocabulary[word]].append((document_id, tfidf))
                if document_id not in inverted_doc:
                    inverted_doc[document_id] = np.square(tfidf)
                else:
                    inverted_doc[document_id] = inverted_doc[document_id] + np.square(tfidf)
                        

    file_inv_term = open("inverted_term.json", "w", encoding='utf-8')
    json.dump(inverted_term, file_inv_term, ensure_ascii=False)
    file_inv_term.close()
    
    file_inv_doc = open("inverted_doc.json", "w", encoding='utf-8')
    json.dump(inverted_doc, file_inv_doc, ensure_ascii=False)
    file_inv_doc.close()

In [16]:
# creates the inverted_term and inverted_doc dictionaries and stores them in a json file
inverted()

100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [01:03<00:00,  6.07it/s]


### 2.2.2) Execute the query

Given a query we get the set of documents containing all the words in the query and sort them according to their similairty to the query

- First we consider only the documents that contain all the words in the query.

- We create a dictonary called numerator such that for each document we have the sum of tf-idf of the words in the query in reference to the document.

- We compute the cosine similarity for each document as $ cos(\theta) = \frac{(\vec{q} \cdot \vec{d})}{(|{\vec{q}}| \cdot |{\vec{d}}|)}$ where:
<p> $(\vec{q} \cdot \vec{d})$ is the intersection of the document and the query vectors, so we use the numerator dictonary argument for each document as intersection;<p>
<p> $|{\vec{q}}| \cdot |{\vec{d}}|$  are the norms of the document and query vectors . We compute $|{\vec{q}}|$ as the square root of the length of the query (because the query vector has only components equal to 1 corresponding to the words of the query). We compute $|{\vec{d}}|$ as the square root of the sum of the squares of tfidf of all words in the document (so we use the inverted_doc dictonary to compute this norm)<p>

- Then we create the result dictonary to store for each document the corresponding cosine similarity to the query.

In [17]:
# opening the vocabulary
voc_json = open('vocabulary.json', 'r', encoding='utf-8')
vocabulary = json.load(voc_json)
voc_json.close()

# opening the inverted_index
inv_ind_json = open('inverted_index.json', 'r', encoding='utf-8')
inverted_index = json.load(inv_ind_json)
inv_ind_json.close()

#opening the inverted_term
inv_term_json = open('inverted_term.json', 'r', encoding='utf-8')
inverted_term = json.load(inv_term_json)
inv_term_json.close()

#opening the inverted_doc
inv_doc_json = open('inverted_doc.json', 'r', encoding='utf-8')
inverted_doc = json.load(inv_doc_json)
inv_doc_json.close()


# opening the anime_links
list_url_txt = open('anime_links.txt', 'r', encoding='utf-8')
list_url = list_url_txt.read().splitlines()
list_url_txt.close()

In [18]:
print("insert the query: ")
query = input()
print("insert k: ")
k = int(input())
assert len(query) > 0, "The query is empty!!"
# stemming the query
query_stemmed_2 = text_mining(query)

insert the query: 
saiyan race
insert k: 
3


In [19]:
# creating index query dictionary
query_dict_2 = dict()
for word in query_stemmed_2:
    if word in vocabulary.keys():
        query_dict_2[vocabulary[word]] = inverted_index[str(vocabulary[word])]

In [20]:
# saving the inverted_index of the query
query_index_2 = list(query_dict_2.keys())

# searching for the documents requested from the query
doc_list_2 = set(query_dict_2[query_index_2[0]])

for query_word in query_index_2[1:]:
    doc_list_2.intersection_update(query_dict_2[query_word])

# create the heap list in order to take the first k documents     
heap = list()
heapq.heapify(heap)
result, numerator = dict(), dict()

#creating the numerator dictonary
for word in query_stemmed_2:
    for elem in inverted_term[str(vocabulary[word])]:
        if elem[0] in doc_list_2:
            if elem[0] not in numerator:
                numerator[elem[0]] = elem[1]
            else:
                numerator[elem[0]] = numerator[elem[0]] + elem[1]

for document in doc_list_2:
    cos_sim = numerator[document]/(np.sqrt(inverted_doc[document]) * np.sqrt(len(query_stemmed_2)))
    result[document] = cos_sim
    heapq.heappush(heap, cos_sim)


We create the final_doc dictonary to store only the first k documents with their corresponding cosine similarity

In [21]:
# we are taking the first k similar documents to the query using heapq
heap_k = heapq.nlargest(k, heap)
final_doc = dict()
for i in range(len(heap_k)):
    pos = list(result.values()).index(heap_k[i])
    final_doc[list(result)[pos]] = result[list(result)[pos]]

In [22]:
# creating a pandas dataframe for the final result
doc_df = pd.DataFrame(columns=["animeTitle", "animeDescription", "Url", "Similarity"])
for doc in final_doc:
    i = int(''.join(re.findall(r'\d+', doc)))
    doc_page = (i-1)//50 + 1
    path = f'pages_tsv/pages_tsv/page_{doc_page}/anime_{i}.tsv'
    tsv_file = open(path, 'r', encoding='utf-8')
    anime_tsv = csv.DictReader(tsv_file, delimiter='\t')
    anime = anime_tsv.__next__()
    doc_df.loc[doc, ["animeTitle", "animeDescription", "Url", "Similarity"]] = [anime["animeTitle"], anime["animeDescription"], list_url[i-1], result[doc]]

In [23]:
doc_d = dict(selector="th",
             props=[('text-align', 'center')])
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

doc_df.style.set_properties(**{'text-align':'center'}).set_table_styles([doc_d]).format({'Url': make_clickable})

Unnamed: 0,animeTitle,animeDescription,Url,Similarity
document_1469,Dragon Ball Z Special 1: Tatta Hitori no Saishuu Kessen,"Bardock, Son Goku's father, is a low-ranking Saiyan soldier who was given the power to see into the future by the last remaining alien on a planet he just destroyed. He witnesses the destruction of his race and must now do his best to stop Frieza's impending massacre. (Source: ANN)",https://myanimelist.net/anime/986/Dragon_Ball_Z_Special_1__Tatta_Hitori_no_Saishuu_Kessen,0.320894
document_401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of the infamous Saiyan warrior race, King Vegeta noticed a baby named Broly whose latent power exceeded that of his own son. Believing that Broly's power would one day surpass that of his child, Vegeta, the king sends Broly to the desolate planet Vampa. Broly's father Paragus follows after him, intent on rescuing his son. However, his ship gets damaged, causing the two to spend years trapped on the barren world, unaware of the salvation that would one day come from an unlikely ally. Years later on Earth, Gokuu Son and Prince Vegeta—believed to be the last survivors of the Saiyan race—are busy training on a remote island. But their sparring is interrupted when the appearance of their old enemy Frieza drives them to search for the last of the wish-granting Dragon Balls on a frozen continent. Once there, Frieza shows off his new allies: Paragus and the now extremely powerful Broly. A legendary battle that shakes the foundation of the world ensues as Gokuu and Vegeta face off against Broly, a warrior without equal whose rage is just waiting to be unleashed. [Written by MAL Rewrite]",https://myanimelist.net/anime/36946/Dragon_Ball_Super__Broly,0.085567
document_365,Dragon Ball Z,"Five years after winning the World Martial Arts tournament, Gokuu is now living a peaceful life with his wife and son. This changes, however, with the arrival of a mysterious enemy named Raditz who presents himself as Gokuu's long-lost brother. He reveals that Gokuu is a warrior from the once powerful but now virtually extinct Saiyan race, whose homeworld was completely annihilated. When he was sent to Earth as a baby, Gokuu's sole purpose was to conquer and destroy the planet; but after suffering amnesia from a head injury, his violent and savage nature changed, and instead was raised as a kind and well-mannered boy, now fighting to protect others. With his failed attempt at forcibly recruiting Gokuu as an ally, Raditz warns Gokuu's friends of a new threat that's rapidly approaching Earth—one that could plunge Earth into an intergalactic conflict and cause the heavens themselves to shake. A war will be fought over the seven mystical dragon balls, and only the strongest will survive in Dragon Ball Z. [Written by MAL Rewrite]",https://myanimelist.net/anime/813/Dragon_Ball_Z,0.067677
