# 2.Search Engine

In [1]:
import csv
import nltk
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from numpy.linalg import norm
import heapq
import json

We define the text_mining function to compute: tokenization, stemming and removal of stopwords, numbers and punctuations of the input text. We used the nltk library for these operations.

In [2]:
def text_mining(synopsis):
    tokens = nltk.word_tokenize(synopsis)                           #tokenization
    tokens = [word for word in tokens if word.isalpha()]            #remove punctuations and numbers
    stop_words = set(nltk.corpus.stopwords.words('english'))        #remove stopwords
    tokens = [word for word in tokens if not word.lower() in stop_words]       
    tokens = [PorterStemmer().stem(word) for word in tokens]        #stemming
    tokens = [str(hash(word)) for word in tokens]       #consider the hash code of the words will be useful for the next queries
    return tokens

In [3]:
def vocab(text):
    for word in set(text):                                          #create the vocabuary according to the synopsis 
        vocabulary.append(word)

In [4]:
description1, description2 = {}, {}
vocabulary = []
len_anime = 19128
for j in range(0,383):
    for i in range((50*j)+1, (50*(j+1))+1):
        if i == 7242 or i ==15009:
            continue
        if i == 19131:
            break
        #doc = "".join("documento "+str(i))
        tsv_file = open("pages_tsv/pages_tsv/page_" + str(j+1) + "/anime_" + str(i)+ ".tsv", 'r', encoding="utf-8")
        anime = csv.DictReader(tsv_file, delimiter='\t')
        anime = anime.__next__()
        title = anime['animeTitle']
        synopsis = anime['animeDescription']
        description2[i] = [synopsis, title] 
        synopsis = text_mining(synopsis)
        vocab(synopsis)
        description1[i] = synopsis 


### 2.1. Conjunctive query

In [5]:
vocabulary = list(set(vocabulary))                    #create the vocabulatory with non repetitions of words
file = open("vocabulary.txt", "w", encoding = "utf-8")
for word in vocabulary:
    file.write(str(hash(word)) +"\n")                 #create the file .txt that maps each word to a term_id using tha hash
file.close()                                          #function of python

### 2.1.1) Create your index!

In [6]:
inverted_index = {}
for term_id in vocabulary:
    for i in description1:
        if term_id in description1[i]:
            if term_id not in inverted_index:
                inverted_index[term_id] =  [i]
            else:
                inverted_index[term_id] = inverted_index[term_id] + [i]

In [7]:
with open("inverted_index.json", "w", encoding="utf-8") as file: 
    json.dump(dict(zip(inverted_index.keys(), map(list, inverted_index.values()))), file, indent = 4)
file.close()

### 2.1.2) Execute the query

In [8]:
with open("inverted_index.json", "r", encoding="utf-8") as file:
    inverted_index = json.load(file) 
file.close()

In [9]:
def search_word(query):
    d = []
    query = set(text_mining(query))
    for word in query:
        if word in vocabulary:
            d.append(inverted_index[word])
        else:
            return set()
    if len(d)==1:
        return d[0]
    else:
        for i in range(len(d)-1):
            intersection = set(d[i]).intersection(set(d[i+1]))
        return  intersection    

In [10]:
url = []
with open("anime_links.txt", "r", encoding = "utf-8") as file:
    for line in file:
        url.append(line)
file.close()

In [11]:
query = "saiyan race"
result = pd.DataFrame(columns=['animeTitle', 'animeDescription', 'Url'])
set_result = search_word(query)
for i in set_result:
    result.loc[i, "animeDescription"] = description2[i][0]
    result.loc[i, "animeTitle"] = description2[i][1]
    result.loc[i, "Url"] = url[i-1]
result

Unnamed: 0,animeTitle,animeDescription,Url
401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...
1035,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...
365,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n
1469,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...


### 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index

In [12]:
inverted_term = {}
inverted_doc = {}
for term_id in vocabulary:
    for i in description1:
        if term_id in description1[i]:
            tf = (description1[i].count(term_id)) / len(description1[i])
            idf = np.log10(len_anime/(len(inverted_index[term_id])))
            tfidf = tf * idf
            if term_id not in inverted_term:
                inverted_term[term_id] =  [[i, tfidf]]
            else:
                inverted_term[term_id] = inverted_term[term_id] + [[i, tfidf]]
            if i not in inverted_doc:
                inverted_doc[i] = tfidf
            else:
                inverted_doc[i] = inverted_doc[i] + tfidf  

In [13]:
with open("inverted_term.json", "w", encoding="utf-8") as file: 
    json.dump(inverted_term, file, ensure_ascii=False)
file.close()

In [14]:
with open("inverted_doc.json", "w", encoding="utf-8") as file: 
    json.dump(inverted_doc, file, ensure_ascii=False)
file.close()

### 2.2.2) Execute the query

In [15]:
with open("inverted_term.json", "r", encoding="utf-8") as file:
    inverted_term = json.load(file) 
file.close()

In [16]:
with open("inverted_doc.json", "r", encoding="utf-8") as file:
    inverted_doc = json.load(file) 
file.close()

In [20]:
def search_similarity(query):
    documents = search_word(query)
    query = set(text_mining(query))
    heap = []
    heapq.heapify(heap)
    result = {}

    numeratore = {}
    for word in query:
        for i in inverted_term[word]:
            if i[0] in documents:
                if i[0] not in numeratore:
                    numeratore[i[0]] = i[1]
                else:
                    numeratore[i[0]] = numeratore[i[0]]+ i[1]
    count = 0
    for elem in numeratore:
        count += numeratore[elem]
        
    for document in documents:
        cos_sim = numeratore[document]/(np.sqrt(inverted_doc[str(document)]) * np.sqrt(count))
        result[document] = cos_sim
        heapq.heappush(heap, cos_sim)
    return result, heap
              
        
    

In [21]:
def top_k_anime(query, k):
    res, heap = search_similarity(query)
    heap_k = heapq.nlargest(k, heap)
    result = []
    for i in range(len(heap_k)):
        pos = list(res.values()).index(heap_k[i])
        result.append(list(res)[pos])
    return result, res     

In [22]:
query = "saiyan race"
k = 3
data, dictionary = top_k_anime(query, k)
print(dictionary)
result2 = pd.DataFrame(columns=['animeTitle', 'animeDescription', 'Url', 'Similarity'])
for i in data:
    result2.loc[i, "animeDescription"] = description2[i][0]
    result2.loc[i, "animeTitle"] = description2[i][1]
    result2.loc[i, "Url"] = url[i-1]
    result2.loc[i, "Similarity"] = dictionary[i]
result2

{401: 0.08140734173914635, 1035: 0.05119910412232713, 365: 0.060830150659738194, 1469: 0.22620464338614163}


Unnamed: 0,animeTitle,animeDescription,Url,Similarity
1469,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...,0.226205
401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...,0.0814073
365,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n,0.0608302
