In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


df = pd.read_csv("./dataset/anime_with_synopsis.csv")


In [3]:
df.head()
     

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [4]:
#returns the number of missing values in the dataset
df.isnull().sum()

MAL_ID       0
Name         0
Score        0
Genres       0
sypnopsis    8
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
#that describe which rows in the DataFrame are "duplicated" and no
df.duplicated().sum()

0

In [7]:
df["Score"] = df["Score"].map(lambda x:np.nan if x=="Unknown" else x)

In [8]:
df["Score"].fillna(df["Score"].median(),inplace = True)

In [9]:
df["Score"] = df["Score"].astype(float)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MAL_ID     16206 non-null  int64  
 1   Name       16206 non-null  object 
 2   Score      16206 non-null  float64
 3   Genres     16206 non-null  object 
 4   sypnopsis  16206 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 759.7+ KB


In [11]:
df['Genres'] = df['Genres'].apply(lambda x:x.split())
df['sypnopsis'] = df['sypnopsis'].apply(lambda x:x.split())

In [13]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"[Action,, Adventure,, Comedy,, Drama,, Sci-Fi,...","[In, the, year, 2071,, humanity, has, colonize..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"[Action,, Drama,, Mystery,, Sci-Fi,, Space]","[other, day,, another, bounty—such, is, the, l..."
2,6,Trigun,8.24,"[Action,, Sci-Fi,, Adventure,, Comedy,, Drama,...","[Vash, the, Stampede, is, the, man, with, a, $..."
3,7,Witch Hunter Robin,7.27,"[Action,, Mystery,, Police,, Supernatural,, Dr...","[ches, are, individuals, with, special, powers..."
4,8,Bouken Ou Beet,6.98,"[Adventure,, Fantasy,, Shounen,, Supernatural]","[It, is, the, dark, century, and, the, people,..."


In [14]:
# remove space between two words
df['Genres'] = df['Genres'].apply(lambda x:[i.replace(" ","") for i in x])
df['sypnopsis'] = df['sypnopsis'].apply(lambda x:[i.replace(" ","") for i in x])

In [15]:
df['features'] = df['Genres'] + df['sypnopsis'] 
     


In [16]:
new_df = df[['Name', 'features']]

In [17]:

# convert list to string
new_df['features'] = new_df['features'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features'] = new_df['features'].apply(lambda x:" ".join(x))


In [18]:
new_df

Unnamed: 0,Name,features
0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Spac..."
1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space other da..."
2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shou..."
3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural It i..."
...,...,...
16209,Daomu Biji Zhi Qinling Shen Shu,"Adventure, Mystery, Supernatural No synopsis i..."
16210,Mieruko-chan,"Comedy, Horror, Supernatural ko is a typical h..."
16211,Higurashi no Naku Koro ni Sotsu,"Mystery, Dementia, Horror, Psychological, Supe..."
16212,Yama no Susume: Next Summit,"Adventure, Slice of Life, Comedy New Yama no S..."


In [19]:

#Stemming is the process of producing morphological variants of a root/base word.
"""
root word "like" include:

-> "likes"
-> "liked"
-> "likely"
-> "liking"

"""

import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [20]:

def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [21]:
new_df['features'] = new_df['features'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features'] = new_df['features'].apply(stem)


In [22]:
# convert to lowercase
new_df['features'] = new_df['features'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features'] = new_df['features'].apply(lambda x:x.lower())


In [23]:
new_df.head()

Unnamed: 0,Name,features
0,Cowboy Bebop,"action, adventure, comedy, drama, sci-fi, spac..."
1,Cowboy Bebop: Tengoku no Tobira,"action, drama, mystery, sci-fi, space other da..."
2,Trigun,"action, sci-fi, adventure, comedy, drama, shou..."
3,Witch Hunter Robin,"action, mystery, police, supernatural, drama, ..."
4,Bouken Ou Beet,"adventure, fantasy, shounen, supernatur it is ..."


In [24]:
"""
Countvectorizer is a method to convert text to numerical data

The CountVectorizer will select the words/features/terms which occur the most frequently.
It takes absolute values so if you set the ‘max_features = 3’, it will select the 3 most 
common words in the data.

If ‘english’, a built-in stop word list for English is used. 
"""

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [25]:
vectors = cv.fit_transform(new_df['features']).toarray()

In [26]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 3, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
     

#we use cosine similarity between these vectors to find their similarity.
similarity = cosine_similarity(vectors)

In [28]:
cosine_similarity(vectors).shape

(16206, 16206)

In [29]:
# creates a tupple and stores every similarity index
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:10]

[(3149, 0.3493261963115963),
 (5545, 0.3351775152257364),
 (1145, 0.31619510292053465),
 (5949, 0.29793556908954344),
 (15573, 0.29793556908954344),
 (365, 0.2931856917889426),
 (3669, 0.2922959000805237),
 (4028, 0.28783310518446176),
 (2077, 0.2844400619942872)]

In [39]:
def recommend(anime):
    movie_index = new_df[new_df['Name'] == anime].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:15]

    # create a list to store recommended anime names and their details
    recommended_anime = []
    for i in movies_list:
        anime_name = new_df.iloc[i[0]].Name
        anime_score = df[df['Name'] == anime_name]['Score'].values[0] # get the score from the original dataframe
        anime_genres = df[df['Name'] == anime_name]['Genres'].values[0] # get the genres from the original dataframe
        anime_synopsis = df[df['Name'] == anime_name]['sypnopsis'].values[0] # get the synopsis from the original dataframe
        recommended_anime.append((anime_name, anime_score, anime_genres, anime_synopsis))
        
    # sort the list of recommended anime by their score in descending order
    recommended_anime = sorted(recommended_anime, key=lambda x:x[1], reverse=True)
    
    # return the top 10 anime names along with their details
    for i in range(10):
        print(f"Name: {recommended_anime[i][0]}")
        print(f"Score: {recommended_anime[i][1]}")
        print(f"Genres: {' '.join(recommended_anime[i][2])}") # convert list of genres to a string
        print(f"Synopsis: {' '.join(recommended_anime[i][3])}") # convert list of synopsis to a string
        print("\n")


In [40]:
recommend('Shingeki no Kyojin')

Name: Shingeki no Kyojin Season 3
Score: 8.59
Genres: Action, Military, Mystery, Super Power, Drama, Fantasy, Shounen
Synopsis: Still threatened by the "Titans" that rob them of their freedom, mankind remains caged inside the two remaining walls. Efforts to eradicate these monsters continue; however, threats arise not only from the Titans beyond the walls, but from the humans within them as well. After being rescued from the Colossal and Armored Titans, Eren Yaeger devotes himself to improving his Titan form. Krista Lenz struggles to accept the loss of her friend, Captain Levi chooses Eren and his friends to form his new personal squad, and Commander Erwin Smith recovers from his injuries. All seems well for the soldiers, until the government suddenly demands custody of Eren and Krista. The Survey Corps' recent successes have drawn attention, and a familiar face from Levi's past is sent to collect the wanted soldiers. Sought after by the government, Levi and his new squad must evade th

Ginga Senpuu Braiger
Planetes Picture Drama
Odin: Koushi Hansen Starlight
Happening Star ☆
Konglong Baobei Zhi Longshen Yongshi 3
Seihou Bukyou Outlaw Star
Uchuu Kuubo Blue Noah
Byeolnala Samchongsa
Space Adventure Cobra
Rokushin Gattai GodMars (1982)
Sol Bianca: Taiyou no Fune
Ginga Tetsudou Monogatari: Eien e no Bunkiten
Konglong Baobei Zhi Longshen Yongshi 2
Uchuu no Kishi Tekkaman
