## Import Libraries

In [None]:
import pandas as pd;

## Load Data

In [None]:
# Load the anime dataset from the project's data folder; the CSV is decoded as ISO-8859-1.
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")
# Preview the first rows; as the cell's last expression it is displayed as a table.
animes.head()

## Clean Data

In [None]:
# Report the number of missing values per column before any cleaning is applied.
print("Total null value in dataset:", animes.isnull().sum(), sep="\n")



In [None]:
# Replace missing text in the two free-text columns with empty strings.
for text_col in ("synopsis", "genre"):
    animes[text_col] = animes[text_col].fillna("")

# Re-check the columns used downstream to confirm no missing values remain.
print(
    "\nTotal null value in dataset (specific column):",
    animes[['uid', 'title', 'synopsis', 'genre']].isnull().sum(),
    sep="\n",
)


In [None]:
# Keep only the columns the recommender uses. The explicit .copy() makes the
# result an independent DataFrame, so adding the 'content' column in the NLP
# cell does not raise pandas' SettingWithCopyWarning.
animes = animes[['uid', 'title', 'synopsis', 'genre']].copy()

## NLP

In [None]:
import ast

from sklearn.feature_extraction.text import TfidfVectorizer


def _genre_text(raw):
    """Turn one raw 'genre' cell into a space-separated string of genre names.

    The cell is expected to hold the text of a Python list literal. It is
    parsed with ast.literal_eval, which only accepts literals, instead of
    eval(), which would execute arbitrary code stored in the CSV.

    Args:
        raw: The cell value; anything that is not a non-empty string yields "".

    Returns:
        The genre names joined by single spaces, or the raw text unchanged when
        it is not a valid Python literal.
    """
    # Cells emptied by fillna("") are not valid literals, so short-circuit them.
    if not isinstance(raw, str) or not raw.strip():
        return ""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the text so its words are still indexed.
        return raw
    if isinstance(parsed, (list, tuple, set)):
        return " ".join(str(name) for name in parsed)
    return str(parsed)


# Combine synopsis + genres into one content field. assign() returns a new
# DataFrame, which avoids SettingWithCopyWarning on the column-sliced frame.
animes = animes.assign(
    content=animes['synopsis'] + " " + animes['genre'].apply(_genre_text)
)

# Fit TF-IDF on the combined text, dropping English stop words such as 'the', 'a'.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(animes['content'])
print(tfidf_matrix)


## Function: Content-Based Recommendation

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_by_query(query, top_n=20):
    """Return the titles whose content best matches a free-text query.

    The query is transformed with the notebook-level ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text description of what the user is looking for.
        top_n: Maximum number of titles to return. Values <= 0 give an empty list.

    Returns:
        A list of at most ``top_n`` titles from ``animes['title']``, ordered from
        most to least similar. Rows with zero similarity are dropped, so the
        list may be shorter than ``top_n`` or empty.
    """
    # Guard first: a slice of [-0:] would select every row instead of none, and
    # a negative top_n would slice from the front of the ranking.
    if top_n <= 0:
        return []

    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).ravel()

    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    # Drop rows that share no terms with the query rather than recommending them.
    top_indices = top_indices[sim_scores[top_indices] > 0]
    return animes['title'].iloc[top_indices].tolist()

# Example: print the recommendations for a keyword query and a natural-language question.
for sample_query in (
    "comedy and school",
    "Is there any sports high school anime recommended?",
):
    print(recommend_by_query(sample_query))


## Setup
- Data loading and cleaning process

In [None]:
import pandas as pd;
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ast  # standard library; used to parse the genre column without eval()


def _genre_text(raw):
    """Turn one raw 'genre' cell into a space-separated string of genre names.

    The cell is expected to hold the text of a Python list literal. It is
    parsed with ast.literal_eval, which only accepts literals, instead of
    eval(), which would execute arbitrary code stored in the CSV.

    Args:
        raw: The cell value; anything that is not a non-empty string yields "".

    Returns:
        The genre names joined by single spaces, or the raw text unchanged when
        it is not a valid Python literal.
    """
    # Cells emptied by fillna("") are not valid literals, so short-circuit them.
    if not isinstance(raw, str) or not raw.strip():
        return ""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the text so its words are still indexed.
        return raw
    if isinstance(parsed, (list, tuple, set)):
        return " ".join(str(name) for name in parsed)
    return str(parsed)


# Load the dataset; the CSV is decoded as ISO-8859-1.
animes = pd.read_csv("data/animes.csv", encoding="ISO-8859-1")

# Report missing values per column before cleaning.
print("Total null value in dataset:")
print(animes.isnull().sum())

# Replace missing text with empty strings.
animes['synopsis'] = animes['synopsis'].fillna("")
animes['genre'] = animes['genre'].fillna("")

# Confirm the columns used downstream no longer contain missing values.
print("\nTotal null value in dataset (specific column):")
print(animes[['uid','title','synopsis', 'genre']].isnull().sum())

# Keep only useful columns; .copy() gives an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
animes = animes[['uid', 'title', 'synopsis', 'genre']].copy()

# Combine synopsis + genres into one content field
animes['content'] = animes['synopsis'] + " " + animes['genre'].apply(_genre_text)


def recommend_by_query(query, top_n=20):
    """Return the titles whose content best matches a free-text query.

    The query is transformed with the notebook-level ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text description of what the user is looking for.
        top_n: Maximum number of titles to return. Values <= 0 give an empty list.

    Returns:
        A list of at most ``top_n`` titles from ``animes['title']``, ordered from
        most to least similar. Rows with zero similarity are dropped, so the
        list may be shorter than ``top_n`` or empty.
    """
    # Guard first: a slice of [-0:] would select every row instead of none, and
    # a negative top_n would slice from the front of the ranking.
    if top_n <= 0:
        return []

    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).ravel()

    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    # Drop rows that share no terms with the query rather than recommending them.
    top_indices = top_indices[sim_scores[top_indices] > 0]
    return animes['title'].iloc[top_indices].tolist()

# Example queries are intentionally not run in this cell: the `vectorizer` and
# `tfidf_matrix` that recommend_by_query reads are only fitted in the
# "NLP Processing" cell further down. The "Sample Test" cell runs the examples.


## NLP Processing

In [None]:
# Build the TF-IDF vectorizer; stop_words='english' drops common English words
# such as 'the' and 'a' so they do not influence similarity.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit on the combined 'content' text and keep the resulting matrix;
# recommend_by_query compares each query against it.
tfidf_matrix = vectorizer.fit_transform(animes['content'])
print(tfidf_matrix)

## Function
- Content-Based Filtering

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_by_query(query, top_n=20):
    """Return the titles whose content best matches a free-text query.

    The query is transformed with the notebook-level ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text description of what the user is looking for.
        top_n: Maximum number of titles to return. Values <= 0 give an empty list.

    Returns:
        A list of at most ``top_n`` titles from ``animes['title']``, ordered from
        most to least similar. Rows with zero similarity are dropped, so the
        list may be shorter than ``top_n`` or empty.
    """
    # Guard first: a slice of [-0:] would select every row instead of none, and
    # a negative top_n would slice from the front of the ranking.
    if top_n <= 0:
        return []

    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_matrix).ravel()

    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    # Drop rows that share no terms with the query rather than recommending them.
    top_indices = top_indices[sim_scores[top_indices] > 0]
    return animes['title'].iloc[top_indices].tolist()

## Sample Test
- Test Run Content-Based Filtering

In [None]:
# Example: print the recommendations for a keyword query and a natural-language question.
for sample_query in (
    "comedy and school",
    "Is there any sports high school anime recommended?",
):
    print(recommend_by_query(sample_query))