## Import Library

In [None]:
import pandas as pd;

## Load Data

In [2]:
#Loading Dataset
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
animes.head() #display in table

# print(animes)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyssâa gaping chasm stretching down int...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


## Clean Data

In [3]:
#for checking is there any null value
print("Total null value in dataset:")
print(animes.isnull().sum())



Total null value in dataset:
uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64


In [4]:
#cleaning null value
animes['synopsis'] = animes['synopsis'].fillna("")
animes['genre'] = animes['genre'].fillna("")

#for checking is there still have any null value
print("\nTotal null value in dataset (specific column):")
print(animes[['uid','title','synopsis', 'genre']].isnull().sum())



Total null value in dataset (specific column):
uid         0
title       0
synopsis    0
genre       0
dtype: int64


In [5]:
# Keep only useful columns
animes = animes[['uid', 'title', 'synopsis', 'genre']]

## NLP

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine synopsis + genres into one content field
animes['content'] = animes['synopsis'] + " " + animes['genre'].apply(lambda x: " ".join(eval(x)) if isinstance(x, str) else "")

#Remove all english stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(animes['content'])
print(tfidf_matrix)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 652591 stored elements and shape (19311, 45351)>
  Coords	Values
  (0, 13440)	0.07228844586734062
  (0, 29494)	0.11764360126691784
  (0, 18865)	0.10734408418119634
  (0, 16894)	0.15251335576012381
  (0, 20540)	0.3650416111251174
  (0, 34787)	0.07725417566997717
  (0, 43066)	0.314356900282006
  (0, 39615)	0.12421519731067973
  (0, 3267)	0.07844754972417589
  (0, 32392)	0.1370969556351493
  (0, 11204)	0.08630043042403712
  (0, 1473)	0.09602710760815286
  (0, 7767)	0.0877511005122935
  (0, 37582)	0.08357591111733502
  (0, 40855)	0.08316913976857038
  (0, 18789)	0.07271015927436972
  (0, 32200)	0.08655327742577085
  (0, 19081)	0.10184270303541475
  (0, 23244)	0.0606850874977704
  (0, 37719)	0.090171297645984
  (0, 33430)	0.0773463866625316
  (0, 27162)	0.1370969556351493
  (0, 1408)	0.09137729587339492
  (0, 22527)	0.07741273491350009
  (0, 40963)	0.0724276135759678
  :	:
  (19310, 6229)	0.07006209789680688
  (19310, 42078)	0.08

## Function

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_by_query(query, top_n=20):
    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).ravel()
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    return animes['title'].iloc[top_indices].tolist()

# Example
print(recommend_by_query("comedy and school"))


['Yuyushiki: Komarasetari, Komarasaretari', 'Tales of HR', '3-Nen D-Gumi Glass no Kamen: Tobidase! Watashitachi no Victory Road', 'Kaden Manzai John TV Show!', 'Kawauso Labo', 'Mandamgangho', 'Chamebou Kuukijuu no Maki', 'Hentatsu', 'Catchy-kun no Nice Catch!', '100% Pascal-sensei', 'Chokin no Susume', 'Ore wa Chokkaku', 'Daifuku-kun@Kin Tele 2nd Season', 'Choukadou Girl â\x85\x99: Nona Video Nikki', 'Gakkyuu Ou Yamazaki Specials', 'Wake Up, Girl Zoo! Taiwan de Go!', 'Rusuden Hour: Sodan Brothers', 'Hentatsu', 'Puru Pom', 'Akagaki Genzou: Tokuri no Wakare']


## Final

In [26]:
import pandas as pd;
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Loading Dataset
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
animes.head() #display in table

#for checking is there any null value
print("Total null value in dataset:")
print(animes.isnull().sum())

#cleaning null value
animes['synopsis'] = animes['synopsis'].fillna("")
animes['genre'] = animes['genre'].fillna("")

#for checking is there still have any null value
print("\nTotal null value in dataset (specific column):")
print(animes[['uid','title','synopsis', 'genre']].isnull().sum())

# Keep only useful columns
animes = animes[['uid', 'title', 'synopsis', 'genre']]

# Combine synopsis + genres into one content field
animes['content'] = animes['synopsis'] + " " + animes['genre'].apply(lambda x: " ".join(eval(x)) if isinstance(x, str) else "")

#Remove all english stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(animes['content'])
print(tfidf_matrix)

def recommend_by_query(query, top_n=20):
    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).ravel()
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    return animes['title'].iloc[top_indices].tolist()

# Example
print(recommend_by_query("comedy and school"))


Total null value in dataset:
uid              0
title            0
synopsis       975
genre            0
aired            0
episodes       706
members          0
popularity       0
ranked        3212
score          579
img_url        180
link             0
dtype: int64

Total null value in dataset (specific column):
uid         0
title       0
synopsis    0
genre       0
dtype: int64
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 652591 stored elements and shape (19311, 45351)>
  Coords	Values
  (0, 13440)	0.07228844586734062
  (0, 29494)	0.11764360126691784
  (0, 18865)	0.10734408418119634
  (0, 16894)	0.15251335576012381
  (0, 20540)	0.3650416111251174
  (0, 34787)	0.07725417566997717
  (0, 43066)	0.314356900282006
  (0, 39615)	0.12421519731067973
  (0, 3267)	0.07844754972417589
  (0, 32392)	0.1370969556351493
  (0, 11204)	0.08630043042403712
  (0, 1473)	0.09602710760815286
  (0, 7767)	0.0877511005122935
  (0, 37582)	0.08357591111733502
  (0, 40855)	0.08316913976857038