# 2.Search Engine

In [1]:
import csv
import nltk
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from numpy.linalg import norm
import heapq

We define the text_mining function to compute: tokenization, stemming and removal of stopwords, numbers and punctuations of the input text. We used the nltk library for these operations.

In [2]:
def text_mining(synopsis):
    tokens = nltk.word_tokenize(synopsis)                           #tokenization
    tokens = [word for word in tokens if word.isalpha()]            #remove punctuations and numbers
    stop_words = set(nltk.corpus.stopwords.words('english'))        #remove stopwords
    tokens = [word for word in tokens if not word.lower() in stop_words]       
    tokens = [PorterStemmer().stem(word) for word in tokens]        #stemming
    tokens = [str(hash(word)) for word in tokens]       #consider the hash code of the words will be useful for the next queries
    return tokens

In [3]:
def vocab(text):
    for word in set(text):                                          #create the vocabuary according to the synopsis 
        vocabulary.append(word)

In [4]:
description1, description2 = {}, {}
vocabulary = []
len_anime = 19128
for j in range(0,383):
    for i in range((50*j)+1, (50*(j+1))+1):
        if i == 7242 or i ==15009:
            continue
        if i == 19131:
            break
        doc = "".join("documento "+str(i))
        tsv_file = open("pages_tsv/pages_tsv/page_" + str(j+1) + "/anime_" + str(i)+ ".tsv", 'r', encoding="utf-8")
        anime = csv.DictReader(tsv_file, delimiter='\t')
        anime = anime.__next__()
        title = anime['animeTitle']
        synopsis = anime['animeDescription']
        description2[doc] = [synopsis, title] 
        synopsis = text_mining(synopsis)
        vocab(synopsis)
        description1[doc] = synopsis 


### 2.1. Conjunctive query

In [5]:
vocabulary = list(set(vocabulary))                    #create the vocabulatory with non repetitions of words
file = open("vocabulary.txt", "w", encoding = "utf-8")
for word in vocabulary:
    file.write(str(hash(word)) +"\n")                 #create the file .txt that maps each word to a term_id using tha hash
file.close()                                          #function of python

### 2.1.1) Create your index!

In [6]:
inverted_index = {}
for term_id in vocabulary:
    for i in description1:
        if term_id in description1[i]:
            if term_id not in inverted_index:
                inverted_index[term_id] =  [i]
            else:
                inverted_index[term_id] = inverted_index[term_id] + [i]

### 2.1.2) Execute the query

In [7]:
def search_word(query):
    d = []
    query = set(text_mining(query))
    for word in query:
        if word in vocabulary:
            d.append(inverted_index[word])
        else:
            return set()
    if len(d)==1:
        return d[0]
    else:
        for i in range(len(d)-1):
            intersection = set(d[i]).intersection(set(d[i+1]))
        return  intersection    

In [8]:
url = []
with open("anime_links.txt", "r", encoding = "utf-8") as file:
    for line in file:
        url.append(line)
file.close()

In [9]:
query = "saiyan race"
result = pd.DataFrame(columns=['animeTitle', 'animeDescription', 'Url'])
set_result = search_word(query)
for i in set_result:
    result.loc[i, "animeDescription"] = description2[i][0]
    result.loc[i, "animeTitle"] = description2[i][1]
    num = i.split(" ")
    result.loc[i, "Url"] = url[int(num[1])-1]
result

Unnamed: 0,animeTitle,animeDescription,Url
documento 401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...
documento 365,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n
documento 1469,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...
documento 1035,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...


### 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index

In [10]:
inverted = {}
for term_id in vocabulary:
    for i in description1:
        if term_id in description1[i]:
            tf = (description1[i].count(term_id)) / len(description1[i])
            idf = np.log10(len_anime/(len(inverted_index[term_id])))
            tfidf = tf * idf
            if term_id not in inverted:
                inverted[term_id] =  [i, tfidf]
            else:
                inverted[term_id] = inverted[term_id] + [i, tfidf]

### 2.2.2) Execute the query

In [11]:
def create_query_array(query):
    query = text_mining(query)
    query_array = np.zeros(len(vocabulary))
    for i in range(len(vocabulary)):
        if vocabulary[i] in query:
            query_array[i] = 1
    return query_array

In [12]:
#cell to create the anime array
anime_array = np.zeros([len_anime, len(vocabulary)])
for i in range(len(vocabulary)):
    for word in inverted:
        tfidf = inverted[word][1]
        doc = (inverted[word][0]).split(" ")
        num_doc = int(doc[1])-1
        anime_array[num_doc][i] = tfidf

In [13]:
def search_similarity(query):
    result = {}
    documents = search_word(query)
    query_array = create_query_array(query)
    heap = []
    heapq.heapify(heap)
            
    for document in documents:
        docs = document.split(" ")
        num_document = int(docs[1])-1
        anime = anime_array[num_document]
        cos_sim = np.dot(query_array, anime)/(norm(anime)*norm(query_array))
        result[document] = cos_sim
        heapq.heappush(heap, cos_sim)
    return result, heap

In [18]:
def top_k_anime(query, k):
    res, heap = search_similarity(query)
    heap_k = heapq.nlargest(k, heap)
    result = []
    for i in range(len(heap_k)):
        pos = list(res.values()).index(heap_k[i])
        result.append(list(res)[pos])
    return result, res     

In [20]:
query = "saiyan race"
k = 3
data, dictionary = top_k_anime(query, k)
print(dictionary)
result2 = pd.DataFrame(columns=['animeTitle', 'animeDescription', 'Url', 'Similarity'])
for i in data:
    result2.loc[i, "animeDescription"] = description2[i][0]
    result2.loc[i, "animeTitle"] = description2[i][1]
    num = i.split(" ")
    result2.loc[i, "Url"] = url[int(num[1])-1]
    result2.loc[i, "Similarity"] = dictionary[i]
result2

{'documento 401': 0.0076008823616397795, 'documento 365': 0.007600882361639766, 'documento 1469': 0.0076008823616397735, 'documento 1035': 0.007600882361639773}


Unnamed: 0,animeTitle,animeDescription,Url,Similarity
documento 401,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...,0.00760088
documento 1469,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...,0.00760088
documento 1035,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...,0.00760088
