In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the anime catalogue CSV (absolute path on the author's machine).
ANIME_CSV_PATH = "E:/A.I/Coursework/New folder/anime.csv"

# Read the catalogue into a DataFrame and preview the first five rows.
df = pd.read_csv(ANIME_CSV_PATH)
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# Exploring the data: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
# Number of missing (NaN) entries in each column
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
# For this content-based system we will use the name, genre and type columns.

In [6]:
# Preprocessing
# NaN ("Not a Number") is the special value pandas and numpy use to mark a
# cell whose value is missing.
# Replace the missing entries of the two text columns with the label 'Unknown'.
for column in ('type', 'genre'):
    df[column] = df[column].fillna('Unknown')

In [7]:
# Re-check the missing values per column after filling 'type' and 'genre'
df.isna().sum()

anime_id      0
name          0
genre         0
type          0
episodes      0
rating      230
members       0
dtype: int64

In [8]:
# Removing unwanted columns: rating, members and episodes are not used by the
# content-based recommender.
# Re-assigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for columns that are already gone.
df = df.drop(columns=['rating', 'members', 'episodes'], errors='ignore')
df.head()

Unnamed: 0,anime_id,name,genre,type
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV


In [9]:
# Lower-case every genre string; splitting and re-joining on whitespace also
# collapses any run of whitespace into a single space.
df['genre'] = df['genre'].map(
    lambda genres: " ".join(token.lower() for token in genres.split())
)

In [10]:
# Build one text "document" per title: its genre string and its type,
# kept together and separated by ', '.
df['bag_of_words'] = df['genre'] + ', ' + df['type']

In [11]:
# Data after preprocessing (note the new bag_of_words column)
df.head()

Unnamed: 0,anime_id,name,genre,type,bag_of_words
0,32281,Kimi no Na wa.,"drama, romance, school, supernatural",Movie,"drama, romance, school, supernatural, Movie"
1,5114,Fullmetal Alchemist: Brotherhood,"action, adventure, drama, fantasy, magic, mili...",TV,"action, adventure, drama, fantasy, magic, mili..."
2,28977,Gintama°,"action, comedy, historical, parody, samurai, s...",TV,"action, comedy, historical, parody, samurai, s..."
3,9253,Steins;Gate,"sci-fi, thriller",TV,"sci-fi, thriller, TV"
4,9969,Gintama&#039;,"action, comedy, historical, parody, samurai, s...",TV,"action, comedy, historical, parody, samurai, s..."


In [12]:
# Convert the collection of raw documents (one bag_of_words string per title)
# into a matrix of TF-IDF features.
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary from the column and returns the
# TF-IDF weighted document-term matrix as a sparse matrix.
tfidf = vectorizer.fit_transform(df["bag_of_words"])

In [13]:
# Cosine similarity between every pair of titles, computed from their TF-IDF
# rows. With a single argument the matrix is compared against itself.
# NOTE(review): the result is presumably a dense n x n float array; for the
# 12294 rows reported by df.info() that is roughly 1.2 GB - confirm it fits.
cosine_sim = cosine_similarity(tfidf)

In [14]:
# Recommender function
def recommendation(name, cosine_sim, data=None, top_n=10):
    """Return the index labels of the titles most similar to ``name``.

    Parameters
    ----------
    name : str
        Exact value to look up in the ``'name'`` column. If several rows
        share the name, the first one is used as the query.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of the catalogue.
    data : pandas.DataFrame, optional
        Catalogue containing a ``'name'`` column. Defaults to the notebook's
        global ``df``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    numpy.ndarray
        Index labels of ``data`` (usable with ``data.loc[...]``), ordered
        from least to most similar. The queried row itself is never
        included. Fewer than ``top_n`` labels are returned when the
        catalogue is too small; an empty array when ``top_n`` <= 0.

    Raises
    ------
    IndexError
        If no row of the catalogue is called ``name``.
    """
    frame = df if data is None else data

    # Work with row *positions*: the similarity matrix is positional, whereas
    # frame.index holds labels that need not be 0..n-1.
    matches = np.flatnonzero((frame['name'] == name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no title named {name!r} in the catalogue")
    query_pos = matches[0]

    # Copy the row so that masking the query does not alter the caller's matrix.
    scores = np.array(cosine_sim[query_pos], dtype=float)
    scores[query_pos] = -np.inf  # a title must not recommend itself

    k = min(top_n, scores.size - 1)
    if k <= 0:
        return frame.index.to_numpy()[np.array([], dtype=int)]

    # argpartition guarantees that the last k slots hold the k largest scores
    # (in arbitrary order); sort just those k by score, ascending.
    top = np.argpartition(scores, -k)[-k:]
    top = top[np.argsort(scores[top])]
    return frame.index.to_numpy()[top]

In [15]:
# Titles similar to "Under the Dog"
indices = recommendation('Under the Dog', cosine_sim)
df.loc[indices, ['anime_id', 'name', 'genre', 'type']]

Unnamed: 0,anime_id,name,genre,type
7649,3064,Kizuoibito,"action, adventure, seinen, thriller",OVA
493,5630,Higashi no Eden,"action, comedy, drama, mystery, romance, sci-f...",TV
6889,12723,Loups=Garous Pilot,"mystery, sci-fi, thriller",Special
5525,7598,Loups=Garous,"mystery, sci-fi, thriller",Movie
5477,13863,Arve Rezzle: Kikaijikake no Yousei-tachi,"action, drama, mystery, sci-fi, thriller",Movie
2518,6610,Ibara no Ou,"action, mystery, sci-fi, thriller",Movie
196,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"sci-fi, thriller",Special
126,10863,Steins;Gate: Oukoubakko no Poriomania,"sci-fi, thriller",Special
59,11577,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,"sci-fi, thriller",Movie
3,9253,Steins;Gate,"sci-fi, thriller",TV


In [16]:
# Titles similar to "Kizuoibito"
indices = recommendation('Kizuoibito', cosine_sim)
df.loc[indices, ['anime_id', 'name', 'genre', 'type']]

Unnamed: 0,anime_id,name,genre,type
4286,6736,Mouryou no Hako Special,"mystery, seinen, supernatural, thriller",Special
2199,4879,Mouryou no Hako,"mystery, seinen, supernatural, thriller",TV
53,6114,Rainbow: Nisha Rokubou no Shichinin,"drama, historical, seinen, thriller",TV
5126,27387,Under the Dog,"action, sci-fi, thriller",OVA
6689,1928,Early Reins,"action, adventure, seinen",OVA
7139,20329,Koroshiya-san: The Hired Gun,"action, comedy, police, seinen, thriller",TV
5297,2698,Sanctuary,"action, drama, police, seinen, thriller",OVA
6009,838,Narutaru: Mukuro Naru Hoshi Tama Taru Ko,"drama, seinen, thriller",TV
353,5682,Phantom: Requiem for the Phantom,"action, drama, seinen, thriller",TV
1254,4039,Golgo 13 (TV),"action, adventure, drama, seinen, thriller",TV
