## Importing Depenencies

In [50]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [51]:
#Loading Data
anime = pd.read_csv('anime.csv')
credits = pd.read_csv('rating.csv')

In [52]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [53]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [54]:
credits.head()
#Not usefull for our project - will not go further with this data

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [55]:
### We will create recommendation system based on [genre type and rating]
df = anime[['anime_id','name','genre','type','rating']]

In [56]:
df.head()

Unnamed: 0,anime_id,name,genre,type,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16


##### We will also not consider rating here as - completely different anime could have same ratings also - which will falsify the recommendations

In [57]:
df.drop('rating',axis=1,inplace=True)

In [58]:
df.head()

Unnamed: 0,anime_id,name,genre,type
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  12294 non-null  int64 
 1   name      12294 non-null  object
 2   genre     12232 non-null  object
 3   type      12269 non-null  object
dtypes: int64(1), object(3)
memory usage: 384.3+ KB


### Preprocessing

In [60]:
df.isna().sum()

anime_id     0
name         0
genre       62
type        25
dtype: int64

In [61]:
## It is ideal to fill the NaN with proper values based on Domain Knowledge But here we are just
## Dropping NAN
df = df.dropna().reset_index()

In [62]:
df.isna().sum()

index       0
anime_id    0
name        0
genre       0
type        0
dtype: int64

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12210 entries, 0 to 12209
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     12210 non-null  int64 
 1   anime_id  12210 non-null  int64 
 2   name      12210 non-null  object
 3   genre     12210 non-null  object
 4   type      12210 non-null  object
dtypes: int64(2), object(3)
memory usage: 477.1+ KB


In [64]:
df.head()

Unnamed: 0,index,anime_id,name,genre,type
0,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie
1,1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV
2,2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV
3,3,9253,Steins;Gate,"Sci-Fi, Thriller",TV
4,4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV


In [65]:
df.tail()

Unnamed: 0,index,anime_id,name,genre,type
12205,12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA
12206,12290,5543,Under World,Hentai,OVA
12207,12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA
12208,12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA
12209,12293,26081,Yasuji no Pornorama: Yacchimae!!,Hentai,Movie


#### Creating a column - Tags by Combining [genre , type]

In [66]:
df['tags'] = df['genre'] + ',' + df['type']

In [67]:
df.head()

Unnamed: 0,index,anime_id,name,genre,type,tags
0,0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,"Drama, Romance, School, Supernatural,Movie"
1,1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,"Action, Comedy, Historical, Parody, Samurai, S..."
3,3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,"Sci-Fi, Thriller,TV"
4,4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,"Action, Comedy, Historical, Parody, Samurai, S..."


In [68]:
#Getting Required data only
new_df = df[['anime_id','name','tags']]

In [69]:
new_df.head()

Unnamed: 0,anime_id,name,tags
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural,Movie"
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,Steins;Gate,"Sci-Fi, Thriller,TV"
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."


In [70]:
new_df.tail()

Unnamed: 0,anime_id,name,tags
12205,9316,Toushindai My Lover: Minami tai Mecha-Minami,"Hentai,OVA"
12206,5543,Under World,"Hentai,OVA"
12207,5621,Violence Gekiga David no Hoshi,"Hentai,OVA"
12208,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,"Hentai,OVA"
12209,26081,Yasuji no Pornorama: Yacchimae!!,"Hentai,Movie"


In [71]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12210 entries, 0 to 12209
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  12210 non-null  int64 
 1   name      12210 non-null  object
 2   tags      12210 non-null  object
dtypes: int64(1), object(2)
memory usage: 286.3+ KB


In [72]:
new_df['tags'][1]

'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen,TV'

In [73]:
#Removing Commas form tags
new_df['tags'] = new_df['tags'].apply(lambda x : x.replace(","," "))

In [74]:
new_df.head()

Unnamed: 0,anime_id,name,tags
0,32281,Kimi no Na wa.,Drama Romance School Supernatural Movie
1,5114,Fullmetal Alchemist: Brotherhood,Action Adventure Drama Fantasy Magic Mili...
2,28977,Gintama°,Action Comedy Historical Parody Samurai S...
3,9253,Steins;Gate,Sci-Fi Thriller TV
4,9969,Gintama&#039;,Action Comedy Historical Parody Samurai S...


In [75]:
#Converting Tags into lowercase - recommended practice
new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())

In [76]:
new_df.head()

Unnamed: 0,anime_id,name,tags
0,32281,Kimi no Na wa.,drama romance school supernatural movie
1,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic mili...
2,28977,Gintama°,action comedy historical parody samurai s...
3,9253,Steins;Gate,sci-fi thriller tv
4,9969,Gintama&#039;,action comedy historical parody samurai s...


### The data is ready to process Further

#### Steps
- We dont see any need for stemming but still doing it as a good practice 
- Use Text Vectorization

In [77]:
## Stemming
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [78]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [79]:
## Applying Steming on Tags
new_df['tags'] = new_df['tags'].apply(stem)

In [80]:
new_df.head()

Unnamed: 0,anime_id,name,tags
0,32281,Kimi no Na wa.,drama romanc school supernatur movi
1,5114,Fullmetal Alchemist: Brotherhood,action adventur drama fantasi magic militari s...
2,28977,Gintama°,action comedi histor parodi samurai sci-fi sho...
3,9253,Steins;Gate,sci-fi thriller tv
4,9969,Gintama&#039;,action comedi histor parodi samurai sci-fi sho...


## CountVectorizer

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
vect = CountVectorizer(max_features=100,stop_words='english')
vectors = vect.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [83]:
vectors.shape

(12210, 51)

In [84]:
### Lets see those most common 51 words (alphabetically arranged)
vect.get_feature_names()

['action',
 'adventur',
 'ai',
 'art',
 'car',
 'comedi',
 'dementia',
 'demon',
 'drama',
 'ecchi',
 'fantasi',
 'fi',
 'game',
 'harem',
 'hentai',
 'histor',
 'horror',
 'josei',
 'kid',
 'life',
 'magic',
 'martial',
 'mecha',
 'militari',
 'movi',
 'music',
 'mysteri',
 'ona',
 'ova',
 'parodi',
 'polic',
 'power',
 'psycholog',
 'romanc',
 'samurai',
 'school',
 'sci',
 'seinen',
 'shoujo',
 'shounen',
 'slice',
 'space',
 'special',
 'sport',
 'super',
 'supernatur',
 'thriller',
 'tv',
 'vampir',
 'yaoi',
 'yuri']

## Finding Similar Anime using Cosine Similarity

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

In [86]:
cosine_similarity(vectors)

array([[1.        , 0.15811388, 0.        , ..., 0.        , 0.        ,
        0.31622777],
       [0.15811388, 1.        , 0.35355339, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.35355339, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.5       ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.5       ],
       [0.31622777, 0.        , 0.        , ..., 0.5       , 0.5       ,
        1.        ]])

In [87]:
cosine_similarity(vectors).shape

(12210, 12210)

In [88]:
similarity = cosine_similarity(vectors)

## Creating a Recommend Function

1. Given a anime name - Find its index
2. Use that index to pass to similarity matrix - so to access array in simmilarity

e.g <br>
- given - 'Kimi no Na wa.'
- found index - 1
- use similarity[1]
- sort values in similarity[1]
- fetch top 5
- again using index - fetch their names from new_df

In [89]:
new_df[new_df['name'] == 'Kimi no Na wa.'].index[0]

0

In [90]:
#Problem
#We relie on the position of value in the similarity matrix
#So if we sort the values in the similarity matrix - we'll lose the movie index

In [91]:
#Lets have a look on the problem
#IF we sort - we are losing the movie index
sorted(similarity[0],reverse=True)

[0.9999999999999999,
 0.9128709291752769,
 0.8944271909999159,
 0.8944271909999159,
 0.8944271909999159,
 0.7999999999999999,
 0.7999999999999999,
 0.7999999999999999,
 0.7999999999999999,
 0.7999999999999999,
 0.7999999999999999,
 0.7745966692414835,
 0.7745966692414835,
 0.7745966692414835,
 0.7745966692414835,
 0.7745966692414835,
 0.7302967433402215,
 0.7302967433402215,
 0.6761234037828131,
 0.6761234037828131,
 0.6761234037828131,
 0.6761234037828131,
 0.6761234037828131,
 0.6761234037828131,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932499369,
 0.6708203932

In [92]:
#So we will use enumerate - to preserve the indexes
sorted(list(enumerate(similarity[0])),reverse=True)

[(12209, 0.3162277660168379),
 (12208, 0.0),
 (12207, 0.0),
 (12206, 0.0),
 (12205, 0.0),
 (12204, 0.0),
 (12203, 0.0),
 (12202, 0.0),
 (12201, 0.22360679774997896),
 (12200, 0.0),
 (12199, 0.0),
 (12198, 0.0),
 (12197, 0.0),
 (12196, 0.0),
 (12195, 0.0),
 (12194, 0.0),
 (12193, 0.0),
 (12192, 0.0),
 (12191, 0.0),
 (12190, 0.0),
 (12189, 0.0),
 (12188, 0.0),
 (12187, 0.0),
 (12186, 0.0),
 (12185, 0.0),
 (12184, 0.0),
 (12183, 0.0),
 (12182, 0.0),
 (12181, 0.0),
 (12180, 0.0),
 (12179, 0.0),
 (12178, 0.0),
 (12177, 0.0),
 (12176, 0.0),
 (12175, 0.25819888974716115),
 (12174, 0.0),
 (12173, 0.0),
 (12172, 0.25819888974716115),
 (12171, 0.25819888974716115),
 (12170, 0.25819888974716115),
 (12169, 0.0),
 (12168, 0.25819888974716115),
 (12167, 0.0),
 (12166, 0.0),
 (12165, 0.0),
 (12164, 0.0),
 (12163, 0.0),
 (12162, 0.25819888974716115),
 (12161, 0.0),
 (12160, 0.0),
 (12159, 0.0),
 (12158, 0.0),
 (12157, 0.0),
 (12156, 0.0),
 (12155, 0.0),
 (12154, 0.0),
 (12153, 0.0),
 (12152, 0.0),
 (1

In [93]:
#But Sorting is done based on Index which we dont want - so using key
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[-1])

[(0, 0.9999999999999999),
 (1111, 0.9128709291752769),
 (208, 0.8944271909999159),
 (1494, 0.8944271909999159),
 (1959, 0.8944271909999159),
 (60, 0.7999999999999999),
 (1199, 0.7999999999999999),
 (2103, 0.7999999999999999),
 (5803, 0.7999999999999999),
 (6391, 0.7999999999999999),
 (11005, 0.7999999999999999),
 (894, 0.7745966692414835),
 (1697, 0.7745966692414835),
 (5695, 0.7745966692414835),
 (6116, 0.7745966692414835),
 (10083, 0.7745966692414835),
 (3542, 0.7302967433402215),
 (5794, 0.7302967433402215),
 (45, 0.6761234037828131),
 (878, 0.6761234037828131),
 (986, 0.6761234037828131),
 (1389, 0.6761234037828131),
 (1604, 0.6761234037828131),
 (2060, 0.6761234037828131),
 (11, 0.6708203932499369),
 (15, 0.6708203932499369),
 (265, 0.6708203932499369),
 (504, 0.6708203932499369),
 (1201, 0.6708203932499369),
 (1435, 0.6708203932499369),
 (1436, 0.6708203932499369),
 (1595, 0.6708203932499369),
 (1631, 0.6708203932499369),
 (1907, 0.6708203932499369),
 (2280, 0.6708203932499369),


In [94]:
#Get five Most Similar Anime
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[-1])[1:6]

[(1111, 0.9128709291752769),
 (208, 0.8944271909999159),
 (1494, 0.8944271909999159),
 (1959, 0.8944271909999159),
 (60, 0.7999999999999999)]

In [95]:
#Get their names
lst = sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[-1])[1:6]
for i in lst:
    print(new_df.iloc[i[0]]['name'])

Aura: Maryuuin Kouga Saigo no Tatakai
Kokoro ga Sakebitagatterunda.
Harmonie
Air Movie
Hotarubi no Mori e


In [96]:
## Creating a Recommend Function
def recommend(anime):
    anime_index = new_df[new_df['name']==anime].index[0]
    cos_simi = similarity[anime_index]
    anime_list = sorted(list(enumerate(cos_simi)),reverse=True,key=lambda x:x[-1])[1:6]
    
    for i in anime_list:
        print(new_df.iloc[i[0]]['name'])
    

In [97]:
new_df['name']

0                                           Kimi no Na wa.
1                         Fullmetal Alchemist: Brotherhood
2                                                 Gintama°
3                                              Steins;Gate
4                                            Gintama&#039;
                               ...                        
12205         Toushindai My Lover: Minami tai Mecha-Minami
12206                                          Under World
12207                       Violence Gekiga David no Hoshi
12208    Violence Gekiga Shin David no Hoshi: Inma Dens...
12209                     Yasuji no Pornorama: Yacchimae!!
Name: name, Length: 12210, dtype: object

In [98]:
recommend('Yasuji no Pornorama: Yacchimae!!')

Kanashimi no Belladonna
Senya Ichiya Monogatari
Cleopatra
Blue Seagull
Hi Gekiga Ukiyoe Senya Ichiya


## Next Task - Convert this to a website