In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

## Data Preprocessing

### 1a.Load the dataset

In [2]:
df=pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
df.shape

(12294, 7)

In [4]:
#Display the structure of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
#Show statistical summary of numerical columns
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
df.duplicated().any()

False

### 2a.Handle Missing Value

In [7]:
import warnings as warn 
warn.filterwarnings('ignore')

In [8]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [9]:
median = df['rating'].median()
median

6.57

In [10]:
mode1 = df['genre'].mode()
mode1


0    Hentai
Name: genre, dtype: object

In [11]:
mode2 = df['type'].mode()
mode2

0    TV
Name: type, dtype: object

In [12]:
df.fillna({'rating':median , 'genre':mode1[0] , 'type':mode2[0]} , inplace=True)

In [13]:
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

### 3a.Explore the data

In [14]:
#Check the data types of each column
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [15]:
df['type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

In [16]:
df['genre'].unique()

array(['Drama, Romance, School, Supernatural',
       'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen',
       'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen',
       ..., 'Hentai, Sports', 'Drama, Romance, School, Yuri',
       'Hentai, Slice of Life'], dtype=object)

In [17]:
df.nunique()

anime_id    12294
name        12292
genre        3264
type            6
episodes      187
rating        598
members      6706
dtype: int64

In [18]:
df['type'].value_counts()

type
TV         3812
OVA        3311
Movie      2348
Special    1676
ONA         659
Music       488
Name: count, dtype: int64

In [19]:
df['genre'].value_counts()

genre
Hentai                                                  885
Comedy                                                  523
Music                                                   301
Kids                                                    199
Comedy, Slice of Life                                   179
                                                       ... 
Adventure, Drama, Fantasy, Game, Sci-Fi                   1
Adventure, Demons, Fantasy, Historical                    1
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen      1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen           1
Hentai, Slice of Life                                     1
Name: count, Length: 3264, dtype: int64

In [21]:
df['rating'].value_counts()

rating
6.57    283
6.00    141
7.00     99
6.50     90
6.25     84
       ... 
3.47      1
3.71      1
3.87      1
3.91      1
3.14      1
Name: count, Length: 598, dtype: int64

In [22]:
df.describe(include='all')

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
count,12294.0,12294,12294,12294,12294.0,12294.0,12294.0
unique,,12292,3264,6,187.0,,
top,,Shi Wan Ge Leng Xiaohua,Hentai,TV,1.0,,
freq,,2,885,3812,5677.0,,
mean,14058.221653,,,,,6.4757,18071.34
std,11455.294701,,,,,1.017179,54820.68
min,1.0,,,,,1.67,5.0
25%,3484.25,,,,,5.9,225.0
50%,10260.5,,,,,6.57,1550.0
75%,24794.5,,,,,7.17,9437.0


# 2.Feature Extraction

In [23]:
# Selected features
selected_columns = ['genre', 'type', 'rating', 'members', 'episodes']


In [24]:
# One-hot encode 'type'
OHT = pd.get_dummies(df, columns=['type'], drop_first=True)

In [25]:
# One-hot encode 'type'
OHG = pd.get_dummies(df, columns=['genre'], drop_first=True)

In [26]:
data = pd.concat([df ,OHG ,OHT] , axis=1)
data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,anime_id.1,name.1,type.1,...,name.2,genre.1,episodes.1,rating.1,members.1,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,32281,Kimi no Na wa.,Movie,...,Kimi no Na wa.,"Drama, Romance, School, Supernatural",1,9.37,200630,False,False,False,False,False
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,5114,Fullmetal Alchemist: Brotherhood,TV,...,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",64,9.26,793665,False,False,False,False,True
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,28977,Gintama°,TV,...,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",51,9.25,114262,False,False,False,False,True
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,9253,Steins;Gate,TV,...,Steins;Gate,"Sci-Fi, Thriller",24,9.17,673572,False,False,False,False,True
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,9969,Gintama&#039;,TV,...,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",51,9.16,151266,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211,9316,Toushindai My Lover: Minami tai Mecha-Minami,OVA,...,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,1,4.15,211,False,False,True,False,False
12290,5543,Under World,Hentai,OVA,1,4.28,183,5543,Under World,OVA,...,Under World,Hentai,1,4.28,183,False,False,True,False,False
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219,5621,Violence Gekiga David no Hoshi,OVA,...,Violence Gekiga David no Hoshi,Hentai,4,4.88,219,False,False,True,False,False
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,...,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,1,4.98,175,False,False,True,False,False


In [27]:
#drop the original categorical column
data.drop(['genre' , 'type'] , axis =1 , inplace = True)

In [30]:
data.isnull().any()

anime_id        False
name            False
episodes        False
rating          False
members         False
                ...  
type_Music      False
type_ONA        False
type_OVA        False
type_Special    False
type_TV         False
Length: 3283, dtype: bool

In [31]:
user_matrix = df.pivot_table(index='anime_id',columns='name',values='rating')
user_matrix

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,&quot;Eiyuu&quot; Kaitai,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,...,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,Üks Uks,ēlDLIVE,◯
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34514,,,,,,,,,,,...,,,,,,,,,,
34519,,,,,,,,,,,...,,,,,,,,,,
34522,,,,,,,,,,,...,,,,,,,,,,
34525,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# replace null with o , this is called sparse matrix.

# 3. Recommendation System Using Cosine Similarity

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
user_similarity_matrix = cosine_similarity(user_matrix.fillna(0))
user_similarity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [35]:
## fill diagonal values with 0

In [36]:
np.fill_diagonal(user_similarity_matrix,0)

In [37]:
user_similarity_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
user_similarity_df = pd.DataFrame(user_similarity_matrix,index=df.anime_id.unique(),columns=df.anime_id.unique())

In [40]:
user_similarity_df

Unnamed: 0,32281,5114,28977,9253,9969,32935,11061,820,15335,15417,...,26031,34399,10368,9352,5541,9316,5543,5621,6133,26081
32281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
#Most Similar Users
sim_users=user_similarity_df.idxmax(axis=1)[0:50]
sim_users

32281    32281
5114     32281
28977    32281
9253     32281
9969     32281
32935    32281
11061    32281
820      32281
15335    32281
15417    32281
4181     32281
28851    32281
918      32281
2904     32281
28891    32281
199      32281
23273    32281
24701    32281
12355    32281
1575     32281
263      32281
44       32281
1        32281
30276    32281
164      32281
7311     32281
17074    32281
21939    32281
457      32281
2001     32281
245      32281
32983    32281
5258     32281
28957    32281
11665    32281
431      32281
11741    32281
31757    32281
19       32281
12365    32281
1535     32281
32366    32281
30654    32281
20583    32281
19647    32281
4282     32281
10379    32281
22135    32281
21329    32281
31043    32281
dtype: int64

In [42]:
sim_users = sim_users.index
sim_users

Index([32281,  5114, 28977,  9253,  9969, 32935, 11061,   820, 15335, 15417,
        4181, 28851,   918,  2904, 28891,   199, 23273, 24701, 12355,  1575,
         263,    44,     1, 30276,   164,  7311, 17074, 21939,   457,  2001,
         245, 32983,  5258, 28957, 11665,   431, 11741, 31757,    19, 12365,
        1535, 32366, 30654, 20583, 19647,  4282, 10379, 22135, 21329, 31043],
      dtype='int64')

In [45]:
df[(df['anime_id']==1535) | (df['anime_id']==19)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
38,19,Monster,"Drama, Horror, Mystery, Police, Psychological,...",TV,74,8.72,247562
40,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917


In [46]:
user1 = df[df['anime_id']==19]
user1

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
38,19,Monster,"Drama, Horror, Mystery, Police, Psychological,...",TV,74,8.72,247562


In [47]:
user2 = df[df['anime_id']==1535]
user2

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
40,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917


In [48]:
user1.name

38    Monster
Name: name, dtype: object

In [49]:
user2.name

40    Death Note
Name: name, dtype: object

In [50]:
anime_watched_by_user1 = list(set(user1.name))
anime_watched_by_user2 = list(set(user2.name))

for anime_name in anime_watched_by_user1:
  if anime_name not in anime_watched_by_user2:
    print("Recommendation : ", anime_name)

Recommendation :  Monster


## Interview Questions

### 1. Can you explain the difference between user-based and item-based collaborative filtering?


1. User-based collaborative filtering works by identifying users who share similar tastes or preferences with the target user. 

  -> For example, if user A and user B have liked many of the same items, user A’s preferences can be used to recommend items to user B that user A liked but user B hasn't discovered yet. The assumption is that if two users agree on some items, they will likely agree on others as well.

2. Item-based collaborative filtering, on the other hand, recommends items that are similar to the items the user has already interacted with. 

  -> It works by finding items that are frequently co-rated or liked by similar users. If a user has liked item X, the system will recommend other items that are often liked by users who liked item X

### 2. What is collaborative filtering, and how does it work?

Collaborative filtering is a technique used in recommendation systems to predict a user’s preferences based on the preferences of others.

-> It operates on the principle that users who have agreed in the past will agree in the future. It relies on user-item interactions like ratings, clicks, or purchases to identify patterns.

There are two main types of collaborative filtering:

-> User-based: Recommends items based on similar users' preferences.

-> Item-based: Recommends items that are similar to the ones the user has interacted with.

Collaborative filtering works by leveraging these patterns (either user similarities or item similarities) to make personalized recommendations.

