In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import os
import re
from datetime import datetime
import nltk
from nltk.stem import PorterStemmer
import csv
import json
import pandas as pd
import numpy as np
import heapq
from dateutil import parser

We define the new score as:
\begin{equation}
score={\sum_{j∈columns}(score_j)}
\end{equation}
where $columns={animeTitle, animeType, ..., animeStaff}$

We compute the scores in this way:
- the title score: $score_t={\sum_{i∈query}\frac{2*n_i}{len(title)+len(query)}}$, where $n_i$ is the number of occurences of $word_i$ of the query in the title. We are considering how many times the query word appears in the title, normalizing by the sum of query length and the title lenght. We decided to multiply the numerator by two in order to try to give greater importance of similarity to the queries with a word that appeared in the title. 
- the type score: $score_p={\sum_{i∈query}\frac{n_i}{2}}$, where $n_i$ is different from zero, if in the query is specified the anime type, in that case we consider as score: $n_i$ divided by two to not give much importance to the score type.
- score_episode, score_members, score_animeScore, score_users, score_rank, score_popularity are computed such that their sum is equal to one; we give much importance if the query match with score_animeScore and with score_rank, score_episode. The others are less significant in the similarity in our opinion, so we did not give them much weight.
- the description score: $score_d={\sum_{i∈query}\frac{2*d_i}{len(description)+len(query)}}$, where $d_i$ is the occurences of the $word_i$ of the query in the description. We compute this score with the same rules of the title_score.
- the releaseDate_score and endDate_score: we consider these scores equal to 0.5 if in the query appears an year that coincides with the releasedate or enddate year. We compute these scores taking into account only the year because we think the user is more likely to do a search by remembering only the releasedate or enddate year rather than the exact date with day and month.
- the score_staff, score_characters, score_voice: we compute these scores adding 0.5 every time a name in the query appears in the list of strings. We decided to add 0.5 because in this way, if the users only enter the name or the surname of the character/staff/voice the relative score will be 0.5, instead if the users enters both the name and the surname the relative score is equal to 1 (as the similarity is more precise).

In [2]:
def text_mining_score(string):
    # gather all the stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenization
    tokens = nltk.word_tokenize(string.lower())
    # remove punctuations and numbers and then word stemming
    res_tok = [PorterStemmer().stem(word) for word in tokens if  word not in stop_words]
    return res_tok

In [3]:
print("insert the query: ")
query = input()
print("insert k: ")
k = int(input())
assert len(query) > 0, "The query is empty!!"

insert the query: 
gintama enchousen Youichi 2018 64
insert k: 
5


In [4]:
def new_score(query):
    
    #formatting the query
    query = text_mining_score(query)
    
    score_dict = dict()
    heap = list()
    heapq.heapify(heap)

    for i in tqdm(range(1, 384)):
        path = f'pages_tsv/pages_tsv/page_{i}/'

        for file in os.listdir(path):
            tsv_file = open(path+file, 'r', encoding='utf-8')
            document_id = 'document_' + (''.join(re.findall(r'\d+', file)))
            anime = csv.DictReader(tsv_file, delimiter='\t')
            anime = anime.__next__()
            
            #considering all the columns of the anime link
            title = text_mining_score(anime['animeTitle'])
            typ = text_mining_score(anime['animeType'])
            num_episode = anime['animeNumEpisode']
            releaseDate = anime['releaseDate']
            endDate = anime['endDate']
            animeNumMembers = anime['animeNumMembers']
            animeScore = anime['animeScore']
            animeUsers = anime['animeUsers']
            animeRank = anime['animeRank']
            animePopularity = anime['animePopularity']
            descr = text_mining_score(anime['animeDescription'])
            animeRelated = anime['animeRelated']
            animeCharacters = anime['animeCharacters']
            animeVoices = anime['animeVoices']
            animeStaff = anime['animeStaff']
            
            #initializing to 0 every score column
            score_title, score_type, score_description, score_staff, score_characters, score_voice, score_releaseDate, score_endDate = 0, 0, 0, 0, 0, 0, 0, 0
            score_episode, score_members, score_animeScore, score_users, score_rank, score_popularity = 0, 0, 0, 0, 0, 0
            
            # define the score column for each document
            for word in query:
                if word.isnumeric():
                    if word == num_episode:
                        score_episode += 0.2
                    if word == animeNumMembers:
                        score_members += 0.05
                    if word == animeUsers:
                        score_users += 0.05
                    if word == animeRank:
                        score_rank += 0.2
                    if word == animePopularity:
                        score_popularity += 0.1
                    if releaseDate!="" and word == str((parser.parse(releaseDate)).year):
                        score_releaseDate += 0.5
                    if endDate!="" and word == str((parser.parse(endDate)).year) :
                        score_endDate += 0.5
                else:
                    score_title += (title.count(word)*2)/(len(title) + len(query))
                    score_type += typ.count(word)/2
                    score_description += (descr.count(word)*2)/(len(descr) + len(query))
                    if word == animeScore:
                        score_animeScore += 0.4
                    for character in animeCharacters.split(","):
                        character = character.replace("[","").replace("]", "").replace(" ' ", "").replace("'", "")
                        character = text_mining_score(character)
                        if character == word.split():
                            score_characters += 0.5
                    for voice in animeVoices.split(","):
                        voice = voice.replace("[","").replace("]", "").replace(" ' ", "").replace("'", "")
                        voice = text_mining_score(voice)
                        if voice == word.split():
                            score_voice += 0.5
                    for member in animeStaff.split(","):
                        member = member.replace("[","").replace("]", "").replace(" ' ", "").replace("'", "")
                        member = text_mining_score(member)
                        if member == word.split():
                            score_staff += 0.5


            score = score_title + score_type + score_episode + score_members + score_animeScore + score_users + score_rank + score_popularity + score_description + score_staff + score_characters + score_voice
            
            #adding to the score_dict the corresponding score for every document
            score_dict[document_id] = score
            heapq.heappush(heap, score)
    
    #saving in a json file the score_dict and the heap list
    file_score_dict = open("score_dict.json", "w", encoding='utf-8')
    json.dump(score_dict, file_score_dict, ensure_ascii=False)
    file_score_dict.close()
    
    file = open("heap.json", "w", encoding='utf-8')
    json.dump(heap, file, ensure_ascii=False)
    file.close()

In [5]:
new_score(query)

100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [09:53<00:00,  1.55s/it]


In [6]:
def top_k_documents(k):
    
    # opening the score_dict
    score_dict_json = open('score_dict.json', 'r', encoding='utf-8')
    score_dict = json.load(score_dict_json)
    score_dict_json.close()
    
    # opening the heap list
    heap_json = open('heap.json', 'r', encoding='utf-8')
    heap = json.load(heap_json)
    heap_json.close()
    
    # we are taking the first k similar documents to the query using heapq
    heap_k = heapq.nlargest(k, heap)
    final_doc = dict()
    for i in range(len(heap_k)):
        pos = list(score_dict.values()).index(heap_k[i])
        final_doc[list(score_dict)[pos]] = score_dict[list(score_dict)[pos]]
    return final_doc, score_dict

In [7]:
# opening the anime_links
list_url_txt = open('anime_links.txt', 'r', encoding='utf-8')
list_url = list_url_txt.read().splitlines()
list_url_txt.close()

In [8]:
# creating a pandas dataframe for the final result
doc_df = pd.DataFrame(columns=["animeTitle", "animeDescription", "Url", "Similarity"])
final_doc, score_dict = top_k_documents(k)
for doc in final_doc:
    i = int(''.join(re.findall(r'\d+', doc)))
    doc_page = (i-1)//50 + 1
    path = f'pages_tsv/pages_tsv/page_{doc_page}/anime_{i}.tsv'
    tsv_file = open(path, 'r', encoding='utf-8')
    anime_tsv = csv.DictReader(tsv_file, delimiter='\t')
    anime = anime_tsv.__next__()
    doc_df.loc[doc, ["animeTitle", "animeDescription", "Url", "Similarity"]] = [anime["animeTitle"], anime["animeDescription"], list_url[i-1], score_dict[doc]]

In [9]:
doc_dict = dict(selector="th",
             props=[('text-align', 'center')])
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

doc_df.style.set_properties(**{'text-align':'center'}).set_table_styles([doc_dict]).format({'Url': make_clickable})

Unnamed: 0,animeTitle,animeDescription,Url,Similarity
document_9,Gintama': Enchousen,"While Gintoki Sakata was away, the Yorozuya found themselves a new leader: Kintoki, Gintoki's golden-haired doppelganger. In order to regain his former position, Gintoki will need the help of those around him, a troubling feat when no one can remember him! Between Kintoki and Gintoki, who will claim the throne as the main character? In addition, Yorozuya make a trip back down to red-light district of Yoshiwara to aid an elderly courtesan in her search for her long-lost lover. Although the district is no longer in chains beneath the earth's surface, the trio soon learn of the tragic backstories of Yoshiwara's inhabitants that still haunt them. With flashback after flashback, this quest has Yorozuya witnessing everlasting love and protecting it as best they can with their hearts and souls. Gintama': Enchousen includes moments of action-packed intensity along with their usual lighthearted, slapstick humor for Gintoki and his friends. [Written by MAL Rewrite]",https://myanimelist.net/anime/15417/Gintama__Enchousen,0.979227
document_10,Gintama: The Final,New Gintama movie.,https://myanimelist.net/anime/39486/Gintama__The_Final,0.972222
document_22,Gintama.: Shirogane no Tamashii-hen - Kouhan-sen,Second Season of the final arc of Gintama.,https://myanimelist.net/anime/37491/Gintama__Shirogane_no_Tamashii-hen_-_Kouhan-sen,0.848485
document_15,Gintama,"The Amanto, aliens from outer space, have invaded Earth and taken over feudal Japan. As a result, a prohibition on swords has been established, and the samurai of Japan are treated with disregard as a consequence. However one man, Gintoki Sakata, still possesses the heart of the samurai, although from his love of sweets and work as a yorozuya, one might not expect it. Accompanying him in his jack-of-all-trades line of work are Shinpachi Shimura, a boy with glasses and a strong heart, Kagura with her umbrella and seemingly bottomless stomach, as well as Sadaharu, their oversized pet dog. Of course, these odd jobs are not always simple, as they frequently have run-ins with the police, ragtag rebels, and assassins, oftentimes leading to humorous but unfortunate consequences. Who said life as an errand boy was easy? [Written by MAL Rewrite]",https://myanimelist.net/anime/918/Gintama,0.833333
document_6,Gintama',"After a one-year hiatus, Shinpachi Shimura returns to Edo, only to stumble upon a shocking surprise: Gintoki and Kagura, his fellow Yorozuya members, have become completely different characters! Fleeing from the Yorozuya headquarters in confusion, Shinpachi finds that all the denizens of Edo have undergone impossibly extreme changes, in both appearance and personality. Most unbelievably, his sister Otae has married the Shinsengumi chief and shameless stalker Isao Kondou and is pregnant with their first child. Bewildered, Shinpachi agrees to join the Shinsengumi at Otae and Kondou's request and finds even more startling transformations afoot both in and out of the ranks of the the organization. However, discovering that Vice Chief Toushirou Hijikata has remained unchanged, Shinpachi and his unlikely Shinsengumi ally set out to return the city of Edo to how they remember it. With even more dirty jokes, tongue-in-cheek parodies, and shameless references, Gintama' follows the Yorozuya team through more of their misadventures in the vibrant, alien-filled world of Edo. [Written by MAL Rewrite]",https://myanimelist.net/anime/9969/Gintama,0.801462
