In [14]:
import numpy as np # To perform linear algebra
import pandas as pd # For data processing

In [15]:
anime_data = pd.read_csv("anime-dataset-2023.csv")

In [16]:
anime_data.head(3)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,Premiered,Status,Producers,Licensors,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26,"Apr 3, 1998 to Apr 24, 1999",spring 1998,Finished Airing,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41,43,78525,914193,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1,1-Sep-01,UNKNOWN,Finished Airing,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189,602,1448,206248,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26,"Apr 1, 1998 to Sep 30, 1998",spring 1998,Finished Airing,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328,246,15035,356739,727252,https://cdn.myanimelist.net/images/anime/7/203...


In [17]:
anime_data.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [18]:
import re
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    return text

anime_data['Name'] = anime_data['Name'].apply(text_cleaning)
anime_data['English name'] = anime_data['English name'].apply(text_cleaning)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

#getting tfidf
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

# Filling NaNs with empty string
anime_data['Genres'] = anime_data['Genres'].fillna('')

genres_str = anime_data['Genres'].str.split(',').astype(str) 
studios_str = anime_data['Studios'].str.split(',').astype(str) 
name_str = anime_data['Name'].str.split(',').astype(str)
total = name_str + studios_str + genres_str
print(total)
tfv_matrix = tfv.fit_transform(total)

0        ['Cowboy Bebop']['Sunrise']['Action', ' Award ...
1        ['Cowboy Bebop: Tengoku no Tobira']['Bones']['...
2        ['Trigun']['Madhouse']['Action', ' Adventure',...
3        ['Witch Hunter Robin']['Sunrise']['Action', ' ...
4        ['Bouken Ou Beet']['Toei Animation']['Adventur...
                               ...                        
24900    ['Wu Nao Monu']['UNKNOWN']['Comedy', ' Fantasy...
24901    ['Bu Xing Si: Yuan Qi']['UNKNOWN']['Action', '...
24902    ['Di Yi Xulie']['UNKNOWN']['Action', ' Adventu...
24903    ['Bokura no Saishuu Sensou']['UNKNOWN']['UNKNO...
24904             ['Shijuuku Nichi']['UNKNOWN']['UNKNOWN']
Length: 24905, dtype: object


In [7]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

#getting the indices of anime title
indices = pd.Series(anime_data.index, index=anime_data['Name']).drop_duplicates()

In [8]:
def recommend(title, sig=sig):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:11]

    # Movie indices
    anime_indices = [i[0] for i in sig_scores]
    
    for i in anime_indices[0:10]:
        print(anime_data.iloc[i].Name)


In [9]:
recommend('Death Note') # Need to apply popularity factors to this model\
print('---')

Death Note: Rewrite
Death Billiards
Death Parade
Mouryou no Hako
Mousou Dairinin
Mouryou no Hako: Chuuzenji Atsuko no Jikenbo - Hako no Yurei no Koto
Gyakkyou Burai Kaiji: Hakairoku-hen
Tesla Note
Rainbow: Nisha Rokubou no Shichinin
Aoi Bungaku Series
---


In [10]:
recommend('Tokyo Ghoul') # Need to apply popularity factors to this model\
print('---')

Tokyo Ghoul √A
Tokyo Ghoul: "Pinto"
Tokyo Ghoul:re
Tokyo Ghoul: "Jack"
Tokyo Ghoul:re 2nd Season
Neo Ranga
Kyoufu Shinbun
3x3 Eyes
Ayakashi
Youma
---


In [11]:
recommend('Shingeki no Kyojin') # Need to apply popularity factors to this model\
print('---')

Shingeki no Kyojin Season 2
Shingeki no Kyojin: Kuinaki Sentaku
Shingeki no Kyojin OVA
Shingeki no Kyojin: Chronicle
Shingeki no Kyojin: Lost Girls
Shingeki no Kyojin Movie 2: Jiyuu no Tsubasa
Shingeki no Kyojin Movie 1: Guren no Yumiya
Shingeki no Kyojin: Ano Hi Kara
Jinyu
Mida
---


In [12]:
recommend('Kimi no Na wa.') # Need to apply popularity factors to this model\
print('---')

Hadakanbou
Taisei Kensetsu CMs
Kotonoha no Niwa
Tenki no Ko
Hoshi no Koe
Makasete Iruka!
Kumo no Mukou, Yakusoku no Basho
Hanare Toride no Yonna
Sarusuberi: Miss Hokusai
Shisha no Sho
---


In [13]:
import pickle
pickle.dump(anime_data, open('anime_list.pkl','wb'))
pickle.dump(tfv_matrix, open('matrix.pkl','wb'))