## Import Library

In [22]:
import pandas as pd;
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Load Data

In [23]:
#Loading Dataset
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
animes.head() #display in table

# print(animes)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyssâa gaping chasm stretching down int...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


## Clean + Processing Data

In [24]:
#for checking is there any null value
print("Total null value in dataset:")
print(animes.isnull().sum())
print("\n--------------------------------\n")

print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())


Total null value in dataset:
uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64

--------------------------------

Total null value in dataset (specific column):
uid           0
title         0
synopsis    975
dtype: int64


In [25]:
#cleaning null value
animes['synopsis'] = animes['synopsis'].fillna("")

#for checking is there still have any null value
print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())


Total null value in dataset (specific column):
uid         0
title       0
synopsis    0
dtype: int64


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Remove all english stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(animes['synopsis'])


## Calculate Similarity

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Turn similarity matrix into DataFrame with titles as labels
animes_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_df = pd.DataFrame(animes_sim, 
                             index=animes['title'], 
                             columns=animes['title'])

print(cosine_sim_df)
# print(cosine_sim)


title                                        Haikyuu!! Second Season  \
title                                                                  
Haikyuu!! Second Season                                     1.000000   
Shigatsu wa Kimi no Uso                                     0.051963   
Made in Abyss                                               0.033850   
Fullmetal Alchemist: Brotherhood                            0.025713   
Kizumonogatari III: Reiketsu-hen                            0.016301   
...                                                              ...   
Flip Flappers                                               0.039112   
Fushigi Yuugi                                               0.053759   
Gakkou no Kaidan                                            0.037640   
InuYasha Movie 2: Kagami no Naka no Mugenjo                 0.008497   
Mobile Suit Gundam: Char's Counterattack                    0.036072   

title                                        Shigatsu wa Kimi n

## Function

In [None]:
indices = pd.Series(animes.index, index=animes['title']).drop_duplicates()

def recommend_anime(title, n=10):
    if title not in indices:
        return ["Anime not found!"]
    
    idx = indices[title]
    # Get the similarity scores for this specific anime
    sim_scores = animes_sim[idx]
    # Create list of (index, score) pairs
    sim_scores = list(enumerate(sim_scores))
    # Sort by the similarity score (x[1])
    # sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=False)
    sim_scores = sim_scores[1:n+1]  # skip itself
    anime_indices = [i[0] for i in sim_scores]
    return animes['title'].iloc[anime_indices].tolist()
# Example
print(recommend_anime("Cowboy Bebop", n=10))


['Shigatsu wa Kimi no Uso']
