In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
df = df.sort_values(by='anime_id').reset_index(drop=True)

In [4]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,8.40,137636
2,6,Trigun,"Action, Comedy, Sci-Fi",TV,26,8.32,283069
3,7,Witch Hunter Robin,"Action, Drama, Magic, Mystery, Police, Superna...",TV,26,7.36,64905
4,8,Beet the Vandel Buster,"Adventure, Fantasy, Shounen, Supernatural",TV,52,7.06,9848
...,...,...,...,...,...,...,...
12289,34514,Pokemon Generations,"Action, Adventure, Fantasy, Game, Kids",ONA,18,7.21,295
12290,34519,Mobile Suit Gakuen: G-Reco Koushien,Comedy,Special,9,5.67,94
12291,34522,"Wake Up, Girls! Shin Shou","Drama, Music",TV,Unknown,,381
12292,34525,Centaur no Nayami,"Comedy, Fantasy, Slice of Life, Supernatural",TV,Unknown,,108


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
df.name.unique()

array(['Cowboy Bebop', 'Cowboy Bebop: Tengoku no Tobira', 'Trigun', ...,
       'Wake Up, Girls! Shin Shou', 'Centaur no Nayami',
       'Gou-chan. Moko to Chinjuu no Mori no Nakama-tachi'],
      shape=(12292,), dtype=object)

In [7]:
len(df.name.unique())

12292

In [8]:
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [9]:
# Handling missing values in Genre and Type column
import warnings
warnings.filterwarnings('ignore')
df['genre'].fillna('Unknown', inplace=True)
df['type'].fillna('Unknown', inplace=True)

In [10]:
df['rating'].fillna(df['rating'].mean(), inplace=True)

In [11]:
df.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [12]:
# Converting episodes to numeric, replacing 'Unknown' with NaN and then filling with median
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'].fillna(df['episodes'].median(), inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  float64
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB


In [14]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26.0,8.82,486824
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1.0,8.4,137636
2,6,Trigun,"Action, Comedy, Sci-Fi",TV,26.0,8.32,283069
3,7,Witch Hunter Robin,"Action, Drama, Magic, Mystery, Police, Superna...",TV,26.0,7.36,64905
4,8,Beet the Vandel Buster,"Adventure, Fantasy, Shounen, Supernatural",TV,52.0,7.06,9848


#### Feature Extraction

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Converting genres into a numerical representation using CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(", "))
genre_matrix = vectorizer.fit_transform(df['genre'])

In [17]:
# One-hot encoding on 'type' column
encoder = OneHotEncoder(sparse_output=False)
type_matrix = encoder.fit_transform(df[['type']])

In [18]:
# Combining the genre and type features into a single matrix
feature_matrix = np.hstack((genre_matrix.toarray(), type_matrix))

In [19]:
feature_matrix.shape

(12294, 51)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Computing cosine similarity between all anime based on extracted features
cosine_sim = cosine_similarity(feature_matrix)

# Function to get recommendations based on cosine similarity
def recommend_anime(title, df, cosine_sim_matrix, top_n=10):
    # Getting the index of the anime that matches the title
    idx = df[df['name'] == title].index

    if len(idx) == 0:
        return f"Anime '{title}' not found in the dataset."
    
    idx = idx[0]

    # Getting similarity scores for all anime with respect to the given anime
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sorting anime based on similarity scores (excluding itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Getting the indices of the most similar anime
    anime_indices = [i[0] for i in sim_scores]

    # Returning the top recommended anime titles
    return df.iloc[anime_indices][['anime_id', 'name', 'genre', 'type', 'rating', 'episodes', 'members']]

In [21]:
# Testing recommendation function with an example
recommend_anime("Steins;Gate", df, cosine_sim)

Unnamed: 0,anime_id,name,genre,type,rating,episodes,members
1649,1812,Hanoka,Sci-Fi,TV,4.27,12.0,1686
3709,4547,RoboDz,Sci-Fi,TV,5.0,26.0,125
6181,10348,Fireball Charming,Sci-Fi,TV,6.94,13.0,5640
6514,11213,Hoshi no Ko Poron,Sci-Fi,TV,6.76,260.0,117
6838,12767,Yuusei Kamen,Sci-Fi,TV,6.44,39.0,103
12204,34208,Escha Chron,Sci-Fi,TV,6.473902,2.0,797
215,239,Gankutsuou,"Drama, Mystery, Sci-Fi, Supernatural, Thriller",TV,8.27,24.0,103828
119,141,Jinki:Extend,"Mecha, Sci-Fi",TV,6.29,12.0,7496
359,383,Galaxy Angel,"Comedy, Sci-Fi",TV,7.14,24.0,17601
511,545,Mousou Kagaku Series: Wandaba Style,"Comedy, Sci-Fi",TV,6.18,12.0,2077


In [22]:
recommend_anime("Cowboy Bebop", df, cosine_sim)

Unnamed: 0,anime_id,name,genre,type,rating,episodes,members
1118,1226,Seihou Tenshi Angel Links,"Action, Adventure, Comedy, Drama, Romance, Sci...",TV,6.06,13.0,4817
376,400,Seihou Bukyou Outlaw Star,"Action, Adventure, Comedy, Sci-Fi, Space",TV,7.98,24.0,78600
1352,1490,Ginga Tetsudou Monogatari,"Action, Adventure, Drama, Sci-Fi, Space",TV,7.29,26.0,5947
2016,2203,Waga Seishun no Arcadia: Mugen Kidou SSX,"Action, Adventure, Drama, Sci-Fi, Space",TV,7.56,22.0,2587
2502,2717,Ginga Tetsudou Monogatari: Eien e no Bunkiten,"Action, Adventure, Drama, Sci-Fi, Space",TV,7.09,24.0,2072
186,209,R.O.D the TV,"Action, Adventure, Comedy, Drama, Sci-Fi, Supe...",TV,7.64,26.0,47053
905,1000,Uchuu Kaizoku Captain Harlock,"Action, Adventure, Drama, Sci-Fi, Seinen, Space",TV,7.87,42.0,14869
927,1022,Generator Gawl,"Action, Adventure, Comedy, Drama, Sci-Fi, Shounen",TV,7.13,12.0,5457
1079,1184,Lost Universe,"Adventure, Comedy, Drama, Sci-Fi, Shounen, Space",TV,7.1,26.0,9293
1181,1293,Urusei Yatsura,"Action, Adventure, Comedy, Drama, Romance, Sci-Fi",TV,7.78,195.0,23020


In [23]:
df[df['anime_id'] == 1]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26.0,8.82,486824


In [24]:
df[df.name == "Steins;Gate"]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
5689,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572


In [25]:
# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [26]:
# Evaluating the Recommendation system
from sklearn.metrics import precision_score, recall_score, f1_score

In [27]:
total_tp = 0
total_fp = 0
total_fn = 0

for idx, row in test_df.iterrows():
    recommendations = recommend_anime(row['name'], df, cosine_sim)
    recommended_ids = set(recommendations['anime_id'])

    # animes with similar genres
    genres = row['genre'].split(', ')

    relevant_anime = set(df[
            (df['genre'].str.contains('|'.join(genres))) &  
            (df['anime_id'] != row['anime_id'])
        ]['anime_id'])

    # Calculating True Positive, False Positive and False Negative

    tp = len(recommended_ids.intersection(relevant_anime))
    fp = len(recommended_ids - relevant_anime)
    fn = len(relevant_anime - recommended_ids)

    total_tp += tp
    total_fp += fp
    total_fn += fn

precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision}")
print(f"Precision: {recall}")
print(f"Precision: {f1}")

Precision: 0.9622610817405449
Precision: 0.0022711405056786193
Precision: 0.004531585507199748


### Interview Questions:

##### 1. Can you explain the difference between user-based and item-based collaborative filtering?

- User-based collaborative filtering is where users who liked the same things in the past will like the same things in the future.

- Example: If User-A and User-B have rated several animes similarly, and User B gave high ratings to “Naruto” which User A hasn’t watched, then Naruto is recommended to User A.

- Item-based collaborative filtering is where items liked by similar users tend to be similar.

- Example: If many users who watched "Attack on Titan" also watched "Tokyo Ghoul", then "Tokyo Ghoul" would be recommended to someone who watched "Attack on Titan".

##### 2. What is collaborative filtering, and how does it work?

- Collaborative Filtering (CF) is a recommendation technique that makes automatic predictions (filtering) about the interests of a user by collecting preferences from many users (collaborating).

- Collaborative Filtering has two main types:

- User-based Collaborative Filtering
- Item-based Collaborative Filtering