## Import Library

In [8]:
import pandas as pd;
import numpy as np;

## Load Data

In [9]:
#Loading Dataset
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
animes.head() #display in table

# print(animes)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyssâa gaping chasm stretching down int...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


## Clean + Processing Data

In [10]:
#for checking is there any null value
print("Total null value in dataset:")
print(animes.isnull().sum())
print("\n--------------------------------\n")

print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())


Total null value in dataset:
uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64

--------------------------------

Total null value in dataset (specific column):
uid           0
title         0
synopsis    975
dtype: int64


In [11]:
#cleaning null value
animes['synopsis'] = animes['synopsis'].fillna("")

#for checking is there still have any null value
print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())


Total null value in dataset (specific column):
uid         0
title       0
synopsis    0
dtype: int64


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Remove all english stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(animes['synopsis'])


## Calculate Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Turn similarity matrix into DataFrame with titles as labels
animes_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
animes_sim_df = pd.DataFrame(animes_sim, 
                             index=animes['title'], 
                             columns=animes['title'])

print(animes_sim_df)


title                                        Haikyuu!! Second Season  \
title                                                                  
Haikyuu!! Second Season                                     1.000000   
Shigatsu wa Kimi no Uso                                     0.027058   
Made in Abyss                                               0.019472   
Fullmetal Alchemist: Brotherhood                            0.017405   
Kizumonogatari III: Reiketsu-hen                            0.012047   
...                                                              ...   
Flip Flappers                                               0.020680   
Fushigi Yuugi                                               0.034918   
Gakkou no Kaidan                                            0.023487   
InuYasha Movie 2: Kagami no Naka no Mugenjo                 0.005957   
Mobile Suit Gundam: Char's Counterattack                    0.025065   

title                                        Shigatsu wa Kimi n

## Function

In [None]:
indices = pd.Series(animes.index, index=animes['title'])

def recommend_anime(title, num_recommendations=10):
    
    if title not in indices.index:
        return ["Anime not found!"]

    idx = indices[title]

    if isinstance(idx, pd.Series):  
        idx = idx.values  # multiple row indices

   # Take the average similarity across all duplicates
    sim_vector = np.mean(animes_sim[idx], axis=0)

    sim_scores = list(enumerate(sim_vector))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]

    anime_indices = [i[0] for i in sim_scores]

    return animes['title'].iloc[anime_indices].tolist()
# Example
print(recommend_anime("Toradora!"))


['Toradora!', 'Toradora!: Bentou no Gokui', 'Puzzle & Dragon', 'Kuroko no Basket', 'Kuroko no Basket', 'Kyouhaku: Owaranai Ashita', 'Kyouhaku: Owaranai Ashita', 'Ring ni Kakero 1: Nichibei Kessen-hen', 'Toko-chan Chokkin', 'Gokujou!! Mecha Mote Iinchou']


## Final

In [2]:
import pandas as pd;
import numpy as np;
from sklearn.feature_extraction.text import TfidfVectorizer

animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
animes.head() #display in table

#for checking is there any null value
print("Total null value in dataset:")
print(animes.isnull().sum())
print("\n--------------------------------\n")

print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())

#cleaning null value
animes['synopsis'] = animes['synopsis'].fillna("")

#for checking is there still have any null value
print("Total null value in dataset (specific column):")
print(animes[['uid','title','synopsis']].isnull().sum())

#Remove all english stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(animes['synopsis'])

from sklearn.metrics.pairwise import cosine_similarity

# Turn similarity matrix into DataFrame with titles as labels
animes_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
animes_sim_df = pd.DataFrame(animes_sim, 
                             index=animes['title'], 
                             columns=animes['title'])


indices = pd.Series(animes.index, index=animes['title'])

def recommend_anime(title, num_recommendations=10):
    
    if title not in indices.index:
        return ["Anime not found!"]

    idx = indices[title]

    if isinstance(idx, pd.Series):  
        idx = idx.values  # multiple row indices

   # Take the average similarity across all duplicates
    sim_vector = np.mean(animes_sim[idx], axis=0)

    sim_scores = list(enumerate(sim_vector))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]

    anime_indices = [i[0] for i in sim_scores]

    return animes['title'].iloc[anime_indices].tolist()
# Example
print(recommend_anime("Toradora!"))


Total null value in dataset:
uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64

--------------------------------

Total null value in dataset (specific column):
uid           0
title         0
synopsis    975
dtype: int64
Total null value in dataset (specific column):
uid         0
title       0
synopsis    0
dtype: int64
['Toradora!', 'Toradora!: Bentou no Gokui', 'Puzzle & Dragon', 'Kuroko no Basket', 'Kuroko no Basket', 'Kyouhaku: Owaranai Ashita', 'Kyouhaku: Owaranai Ashita', 'Ring ni Kakero 1: Nichibei Kessen-hen', 'Toko-chan Chokkin', 'Gokujou!! Mecha Mote Iinchou']
