# **Anime Recommendation System**

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
import operator
import time
import math
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

from wordcloud import WordCloud, STOPWORDS

from scipy.spatial.distance import cosine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

#Metrics (Computation)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import silhouette_score

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

<hr>

**<h3> Anime Dataset </h3>**

In [None]:
#Load the anime catalogue (latin-1 encoded CSV) and preview its first rows
anime_dataset = pd.read_csv(
    "../input/anime-recommendations-database/anime.csv",
    encoding="latin-1",
)
display(anime_dataset.head())

#Report the number of rows, then the number of columns together with their names
print("Number of diffrent Anime Programs: ", len(anime_dataset))
print("Number of features in the dataset: ", len(anime_dataset.columns), anime_dataset.columns.values)

**<h4>Data Preprocessing</h4>**

In [None]:
#Column dtypes and non-null counts.
#DataFrame.info() prints its report itself and returns None, so it is called
#directly; wrapping it in display() would additionally render a stray "None".
print("Data type and Null Values: ")
anime_dataset.info()

#Summary statistics of the catalogue's "rating" column
print("\nStatistics about the dataset: ")
display(anime_dataset["rating"].describe())

In [None]:
#Discard every row that has at least one missing value
anime_dataset = anime_dataset.dropna()

#Replace the "Unknown" placeholder in "episodes" with 0, then cast the column to int
anime_dataset["episodes"] = (
    anime_dataset["episodes"]
    .where(anime_dataset["episodes"] != "Unknown", 0)
    .astype(int)
)

<hr>

**<h3> Rating Dataset </h3>**

In [None]:
#Load the user ratings table and preview its first rows
rating_dataset = pd.read_csv("../input/anime-recommendations-database/rating.csv")
display(rating_dataset.head())

#Report row count, column count and column names
print("Number of entries in the dataset: ", len(rating_dataset))
print("Number of features in the dataset: ", len(rating_dataset.columns))
print("The features are: ", rating_dataset.columns.values)

<hr>

**<h3>Final Dataset</h3>**

We will now merge the two datasets "anime_dataset" and "rating_dataset" and create a new dataset "final_dataset" which will be used to train our DM/ML models.

In [None]:
#Right-join the catalogue onto the ratings (every rating row is kept), rename the
#two clashing "rating" columns, then drop every row that contains a null value
final_dataset = (
    anime_dataset
    .merge(rating_dataset[["user_id","anime_id","rating"]], on="anime_id", how="right")
    .rename(columns={'rating_x':'AverageRating','rating_y':'rating'})
    .dropna()
)

print("Final Dataset: ")
display(final_dataset.head(10))

<hr>

**<h3> Exploratory Data Analysis (EDA) </h3>**

**1) Top 10 and bottom 10 Anime programs based on average rating**

In [None]:
#Average rating of each anime (one row per anime_id / name / genre combination)
grouped_by_anime_id = final_dataset.groupby(["anime_id","name","genre"])[["AverageRating"]].mean().reset_index()
#Best-rated shows first; reset_index() keeps the pre-sort position in an "index" column
sorted_grouped_by_anime_id = grouped_by_anime_id.sort_values(by = ["AverageRating"],ascending=False).reset_index()

#Both extremes are taken by position (head/tail) rather than by hard-coded row
#numbers, so the cell keeps working when the number of shows changes.
top_10_shows = sorted_grouped_by_anime_id.head(10)
worst_10_shows = sorted_grouped_by_anime_id.tail(10)
display(top_10_shows)
display(worst_10_shows)

plt.figure(figsize=(20,8))
rating_panels = [
    (top_10_shows, "Top 10 Anime Shows based on average rating"),
    (worst_10_shows, "Worst 10 Anime Shows based on average rating"),
]
for panel_no, (shows, panel_title) in enumerate(rating_panels, start=1):
    panel_ax = plt.subplot(1, 2, panel_no)
    #x and y are passed by keyword: recent seaborn releases reject positional data vectors
    sns.barplot(data=shows, x="name", y="AverageRating", palette="Blues_r", ax=panel_ax)
    panel_ax.spines['bottom'].set_linewidth(1.5)
    for side in ['right', 'top', 'left']:
        panel_ax.spines[side].set_visible(False)

    #Write each show's average rating just above its bar
    for bar, rating in zip(panel_ax.patches, shows["AverageRating"]):
        panel_ax.text(bar.get_x()+0.1, bar.get_height()+0.01, rating, fontname = 'monospace', fontsize = 12, color = 'Blue')

    #Zoom the y-axis onto the plotted ratings so that small differences stay visible
    lowest, highest = shows["AverageRating"].min(), shows["AverageRating"].max()
    margin = max(0.1 * (highest - lowest), 0.05)
    panel_ax.set_ylim(lowest - margin, highest + margin)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=90)
    panel_ax.grid()
plt.show()

**2) Distribution of Average Ratings**

In [None]:
#Box plot of the per-anime average ratings held in grouped_by_anime_id
plt.figure(figsize=(20,8))
#NOTE(review): the Series is passed positionally; how seaborn interprets the first
#positional argument differs between releases - confirm the plot looks as intended.
sns.boxplot(grouped_by_anime_id["AverageRating"],palette="Blues_r",orient="v")
plt.grid()
plt.show()

<hr>

**3) Word Cloud for Genre column** <br>
This will visualize the most common genres across all the animes watched by users

In [None]:
#Genre strings of every show in the rating-sorted table
genres = sorted_grouped_by_anime_id["genre"]

#Default WordCloud stop-word list, used unchanged
stopwords = set(STOPWORDS)

#Lower-case every whitespace-separated token of every genre string and
#concatenate everything into one space-delimited text (with a trailing space)
comment_words = "".join(
    " ".join(token.lower() for token in str(genre_entry).split()) + " "
    for genre_entry in genres
)

wordcloud = WordCloud(
    width = 2000,
    height = 1000,
    background_color ='black',
    stopwords = stopwords,
    min_font_size = 10,
).generate(comment_words)

#Render the word cloud without axes
plt.figure(figsize = (13, 10), facecolor = None)
plt.imshow(wordcloud)
plt.title("Top Watched Genre")
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

<hr>

**4) Group the Anime based on "Type"**

In [None]:
#"type" value of every row in the merged dataset
anime_type = final_dataset["type"]
print("The different type of Anime Shows are: ",anime_type.unique())

#Count the rows per type once and reuse the result for both labels and slice sizes
type_counts = anime_type.value_counts()
labels = type_counts.index
values = type_counts.values

#Pie chart of the type counts
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']
type_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[type_pie])
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=10,
    textfont_color = "white",
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
)

#Centred title on a black background
pie_title = {
    'text': "Types of Anime Shows",
    'y':0.9,
    'x':0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=pie_title, paper_bgcolor="black")

fig.show()

In [None]:
#Mean AverageRating of the rows of each anime type, rounded to 3 decimals
labels = ["TV","Movie","Special","OVA","ONA","Music"]
mean_avgrat_type = [
    round(final_dataset.loc[final_dataset["type"] == type_name, "AverageRating"].mean(), 3)
    for type_name in labels
]

print("Average Rating based on type: ")
for type_name, type_mean in zip(labels, mean_avgrat_type):
    print(type_name,":",type_mean)

#Line plot with markers: one point per type
plt.figure(figsize = (16,8))
plt.plot(labels, mean_avgrat_type, label = "Average Rating", c = "Blue", linewidth = 2)
plt.scatter(labels, mean_avgrat_type, c = "Blue", linewidth = 3)
plt.title("Average Ratings based on type of Anime")
plt.grid()
plt.legend()
plt.show()

<h3> Let's look at the top 100 Anime shows based on average rating </h3>

In [None]:
#Collapse the merged dataset to one row per show, keeping its episode count
top_100_anime_shows = (
    final_dataset
    .groupby(["anime_id","name","genre","episodes"])[["AverageRating"]]
    .mean()
    .reset_index()
)
#Order by average rating (best first) and keep the first 100 rows
sorted_top_100_anime_shows = (
    top_100_anime_shows
    .sort_values(by="AverageRating", ascending=False)
    .reset_index()
    .head(100)
)
display(sorted_top_100_anime_shows)

<hr>

**Generate Word Cloud to visualize popular genres**

In [None]:
#Genre strings of the 100 best-rated shows
genres = sorted_top_100_anime_shows["genre"]
stopwords = set(STOPWORDS)

#Normalise each genre string (lower case, single spaces between tokens), then
#chain all of them into one text in which every entry is followed by a space
normalised_genres = [" ".join(str(entry).lower().split()) for entry in genres]
comment_words = "".join(text + " " for text in normalised_genres)

cloud_builder = WordCloud(width = 2000, height = 1000, background_color ='black',
                          stopwords = stopwords, min_font_size = 10)
wordcloud = cloud_builder.generate(comment_words)

#Show the generated image full-bleed, without axes
plt.figure(figsize = (13, 10), facecolor = None)
plt.imshow(wordcloud)
plt.title("Top 100 Anime")
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

<hr>

**Number of episodes in top 100 animes**

In [None]:
#Distribution plot of the episode counts of the 100 best-rated shows
plt.figure(figsize=(20,8))
sns.distplot(a=sorted_top_100_anime_shows["episodes"])
plt.grid()
plt.title("Distribution of Number of Episodes")
plt.show()

<hr>

**<h3>K-Means Clustering</h3>**

In [None]:
#Only the first 100,000 rows of the merged dataset are used for clustering
clustering_data = final_dataset.iloc[:100000]
#Rows are user ids, columns are anime names, cells count the matching rows
user_anime_crosstab = pd.crosstab(
    index=clustering_data['user_id'],
    columns=clustering_data['name'],
)
user_anime_crosstab.head()

In [None]:
#Apply Principal Component Analysis to project every user row onto 3 components.
#The seed is fixed because PCA may pick a randomized SVD solver for large inputs,
#which would otherwise make the components (and the clusters built on them)
#differ from run to run.
pca = PCA(n_components=3, random_state=30)
pca.fit(user_anime_crosstab)
pca_samples = pca.transform(user_anime_crosstab)

ps = pd.DataFrame(pca_samples)
display(ps.head())

#Columns 0, 1 and 2 hold the three principal components
tocluster = pd.DataFrame(ps[[0,1,2]])

In [None]:
#Default figure size for this and the following matplotlib figures
plt.rcParams['figure.figsize'] = (14, 6)

fig = plt.figure()
#Create the 3D axes through the figure itself: a bare Axes3D(fig) is not attached
#to the figure on recent matplotlib releases, which leaves the rendered plot empty.
ax = fig.add_subplot(111, projection='3d')
#Plotted as (component 0, component 2, component 1)
ax.scatter(tocluster[0], tocluster[2], tocluster[1])

ax.set_title('Data points in 3D PCA axis', fontsize=20)
plt.show()

In [None]:
#Elbow method - Choose the optimum value of K
#Every candidate model is fitted exactly once and with a fixed seed, so the
#inertia curve is reproducible and no clustering work is repeated.
Error =[]
cluster_counts = range(1,8)

for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=30).fit(tocluster)
    #inertia_: sum of squared distances of the samples to their closest centre
    Error.append(kmeans.inertia_)

plt.plot(cluster_counts,Error)
plt.title("Elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("Error")
plt.grid()
plt.show()

In [None]:
#Fit a 3-cluster K-Means model (fixed seed) on the PCA-reduced data
clustering_model = KMeans(n_clusters=3, random_state=30).fit(tocluster)

#Cluster label of every row, and the coordinates of the fitted centres
c_preds = clustering_model.predict(tocluster)
centers = clustering_model.cluster_centers_

print("Coordinates for the 3 clusters are: \n", centers)

In [None]:
fig = plt.figure()
#Create the 3D axes through the figure itself: a bare Axes3D(fig) is not attached
#to the figure on recent matplotlib releases, which leaves the rendered plot empty.
ax = fig.add_subplot(111, projection='3d')
#Same (component 0, component 2, component 1) view, coloured by cluster label
ax.scatter(tocluster[0], tocluster[2], tocluster[1], c = c_preds)
ax.set_title('Data points in 3D PCA axis', fontsize=20)
plt.show()

In [None]:
fig = plt.figure(figsize=(12,6))

#Draw the data points once, coloured by cluster label (x: component 1, y: component 0)
plt.scatter(tocluster[1],tocluster[0],c = c_preds,s=50,marker="o")

#Mark every cluster centre with a red cross in the same two components
for c in centers:
    plt.plot(c[1], c[0], 'x', markersize=8, color='red', alpha=1)

plt.xlabel('x_values')
plt.ylabel('y_values')

plt.title('Data points with cluster centers', fontsize=20)
plt.grid()
plt.show()

In [None]:
#Attach each row's cluster label to the crosstab
user_anime_crosstab['cluster'] = c_preds

#Per-cluster mean of every anime column (the label column itself is excluded)
c0, c1, c2 = (
    user_anime_crosstab.loc[user_anime_crosstab['cluster'] == cluster_label]
    .drop(columns='cluster')
    .mean()
    for cluster_label in (0, 1, 2)
)

In [None]:
#The 15 anime columns with the highest mean in cluster 1
c1.sort_values(ascending=False).head(15)

<hr>

**<h3> Collaborative Filtering</h3>**

1) **User-User CF**

In [None]:
#Keep the rows whose rating is not -1, restricted to the columns used for collaborative filtering
CF_dataset = final_dataset.loc[
    final_dataset["rating"] != -1,
    ["name","anime_id","user_id","rating","AverageRating"],
]
#Limit the data to user ids up to 10000
CF_dataset = CF_dataset.loc[CF_dataset["user_id"] <= 10000]
display(CF_dataset.head())
CF_dataset.shape

In [None]:
#Per-user mean and standard deviation of the ratings. Both frames are kept as
#module-level names because user_item_rating_prediction reads them to undo the
#normalisation.
Mean = CF_dataset.groupby(by="user_id",as_index=False)['rating'].mean()
Std = CF_dataset.groupby(by="user_id",as_index=False)["rating"].std()
grouped_userid = Mean  #same per-user means under the cell's older name

#Attach both statistics to every rating row. Columns left behind by a previous
#run of this cell are dropped first so that the cell can be re-executed safely.
user_stats = pd.merge(
    Mean.rename(columns={"rating": "MeanRating"}),
    Std.rename(columns={"rating": "StdRating"}),
    on="user_id",
)
CF_dataset = pd.merge(
    CF_dataset.drop(columns=["MeanRating", "StdRating", "z_score"], errors="ignore"),
    user_stats,
    on="user_id",
)
#A user whose ratings are all identical has a standard deviation of 0 and cannot be standardised
CF_dataset = CF_dataset[CF_dataset["StdRating"] != 0]

#Standardise each rating against the rating user's OWN mean and spread. The
#anime-wide AverageRating is not used here: the prediction step de-normalises
#with the user's mean and standard deviation, so the z-score must use the same pair.
CF_dataset["z_score"] = (CF_dataset["rating"] - CF_dataset["MeanRating"]) / CF_dataset["StdRating"]
CF_dataset.head()

In [None]:
#Users x anime matrix of z-scores; user/anime pairs without a value are filled with 0
user_rating_z = CF_dataset.pivot_table(
    values="z_score",
    index="user_id",
    columns="anime_id",
).fillna(0)
user_rating_z.head()

**Similarity Metric - Cosine Similarity**

In [None]:
#Pairwise cosine similarity between the users' z-score rows
cosine_similarity_user = cosine_similarity(user_rating_z)
#Zero the diagonal so that a user is never reported as similar to themselves
np.fill_diagonal(cosine_similarity_user, 0)

#Wrap the matrix in a DataFrame labelled with the user ids on both axes
user_similarity = pd.DataFrame(
    cosine_similarity_user,
    index=user_rating_z.index,
    columns=user_rating_z.index,
)
user_similarity.head()

In [None]:
#Pairwise cosine similarity between the anime columns (the matrix is transposed first)
cosine_similarity_item = cosine_similarity(user_rating_z.T)

#Label both axes with the anime ids
item_similarity = pd.DataFrame(
    cosine_similarity_item,
    index=user_rating_z.columns,
    columns=user_rating_z.columns,
)
item_similarity.head()

**<h4>Now we define a few functions that will help us build our Recommendation System with the similarity data.</h4>**

1. **Top Anime**

In [None]:
# Print the 10 shows with the highest cosine similarity to the given show
def top_animes(anime_name):
    """Print the ten shows most similar to ``anime_name``.

    Parameters
    ----------
    anime_name
        Label of the show in ``item_similarity``. That frame is labelled with
        ``anime_id`` values, so despite the parameter name this is an anime id.

    Returns
    -------
    None when the list was printed, or a "No data available" message string
    when the id is not part of ``item_similarity`` (same convention as
    ``top_users``).
    """
    if anime_name not in item_similarity.columns:
        return('No data available on anime {}'.format(anime_name))

    print('Top 10 shows similar to {} include:\n'.format(anime_name))
    # Remove the queried show explicitly instead of assuming it sorts first:
    # ties, or a show whose self-similarity is 0, would otherwise either list the
    # show itself or silently drop a genuine neighbour.
    neighbours = (
        item_similarity[anime_name]
        .drop(labels=anime_name)
        .sort_values(ascending=False)
        .index[:10]
    )
    for count, item in enumerate(neighbours, start=1):
        print('No. {}: #{} - {}'.format(count,item,CF_dataset[CF_dataset["anime_id"] == item]["name"].unique()[0]))

2. **Top Users**

In [None]:
# Print the 10 users with the highest cosine similarity to the given user
def top_users(user):
    """Print the ten users most similar to ``user`` with their similarity values.

    Parameters
    ----------
    user
        User id; must be a column label of ``user_similarity``.

    Returns
    -------
    None when the list was printed, or a "No data available" message string
    when the user is not part of ``user_similarity``.
    """
    if user not in user_similarity.columns:
        return('No data available on user {}'.format(user))

    print('TOp 10 users similar to {} include:\n'.format(user))
    # The diagonal of user_similarity is zeroed when the matrix is built, so the
    # user does not sort first. Skipping the first row would discard the single
    # most similar user; the user's own row is removed explicitly instead.
    closest = (
        user_similarity[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .head(10)
    )
    for sim_user, sim in closest.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(sim_user, sim))

3. **Similar User Recommendations**

In [None]:
# Collect the highest rated show(s) of each of the 10 most similar users and return
# the 5 shows that occur most often, together with how often they occur
def similar_user_recs(user):
    """Recommend shows that the most similar users rated highest.

    For each of the ten users most similar to ``user``, the show(s) carrying
    that user's maximum z-score in ``user_rating_z`` are collected; the shows
    are then counted across all ten users.

    Parameters
    ----------
    user
        User id; must be a column label of ``user_similarity``.

    Returns
    -------
    A list of up to five ``(show name, count)`` tuples ordered by descending
    count, or a "No data available" message string when the user is unknown.
    A show whose name cannot be found in ``CF_dataset`` is reported by its
    anime id.
    """
    if user not in user_similarity.columns:
        return('No data available on user {}'.format(user))

    # The diagonal of user_similarity is zeroed, so the user's own row is dropped
    # explicitly rather than by skipping the first sorted entry.
    sim_users = (
        user_similarity[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .index[:10]
    )

    # Favourites come from the ratings matrix (users x anime), not from the
    # user-user similarity matrix, so that the result contains shows.
    most_common = Counter()
    for sim_user in sim_users:
        scores = user_rating_z.loc[sim_user]
        most_common.update(scores[scores == scores.max()].index)

    # Translate the anime ids of the winners into show names
    names = CF_dataset.drop_duplicates("anime_id").set_index("anime_id")["name"]
    return [(names.get(anime_id, anime_id), count) for anime_id, count in most_common.most_common(5)]

4. **Predict Rating**

In [None]:
def find_n_neighbours(df,n):
    """Return, for every row of ``df``, the column labels of its ``n`` largest values.

    Parameters
    ----------
    df : pandas.DataFrame
        Square similarity matrix (or any numeric frame); rows are processed
        independently.
    n : int
        Number of labels to keep per row; must not exceed the number of columns.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns ``Top1`` .. ``Top<n>``; ``Top1`` holds
        the label of the largest value in the row.
    """
    top_labels = ['Top{}'.format(i) for i in range(1,n+1)]
    # Sort each row in descending order and keep the labels of the first n entries
    return df.apply(
        lambda row: pd.Series(row.sort_values(ascending=False).iloc[:n].index, index=top_labels),
        axis=1,
    )

In [None]:
#Number of nearest neighbours kept per user
N=3
#For every user: the ids of the N users with the highest similarity value
top_N_user_similarity = find_n_neighbours(df=user_similarity, n=N)
top_N_user_similarity.head()

In [None]:
def user_item_rating_prediction(user,item):
    """Print the rating ``user`` is predicted to give to the anime ``item``.

    The prediction is the similarity-weighted average of the nearest
    neighbours' z-scores for the item, converted back to the user's own rating
    scale (``z * std_user + mean_user``).

    Parameters
    ----------
    user
        User id; must be in the index of ``top_N_user_similarity``.
    item
        Anime id; must be a column of ``user_rating_z``.

    Returns
    -------
    None when the prediction was printed, or a "No data available" message
    string when the user or the anime is unknown.

    Notes
    -----
    ``user_rating_z`` has its missing entries filled with 0 when it is built,
    so a neighbour without a z-score for the item contributes 0 rather than
    being skipped.
    """
    if user not in top_N_user_similarity.index or item not in user_rating_z.columns:
        return('No data available on user {} / anime {}'.format(user, item))

    # Ids of the user's nearest neighbours (columns Top1..TopN)
    TopN_UserList = top_N_user_similarity.loc[user].tolist()

    # The neighbours' z-scores for the item and their similarity to the user
    TopN_ItemRating = user_rating_z.loc[TopN_UserList, item]
    sim = user_similarity.loc[user, TopN_UserList]

    # Similarity-weighted mean z-score. Absolute weights are used for the
    # normalisation, and a zero total weight falls back to z = 0 (the user's mean).
    weight_total = np.sum(np.abs(sim))
    weighted_AVG = np.sum(TopN_ItemRating*sim)/weight_total if weight_total != 0 else 0.0

    avg_user = Mean.loc[Mean['user_id']== user, 'rating'].values[0]
    std_user = Std.loc[Std['user_id']==user,'rating'].values[0]
    # Undo the standardisation: scale by the user's spread, then shift by the user's mean
    rating_prediction=weighted_AVG*std_user+avg_user
    print("The predicted rating for user",user, "for the anime", CF_dataset[CF_dataset["anime_id"] == item]["name"].unique()[0], "is: ",round(rating_prediction,2))

In [None]:
def top_anime_for_user(user):
    """Return the three highest rated show names of each nearest neighbour of ``user``.

    The neighbours are read from ``top_N_user_similarity``; the result is a list
    with one pandas Series of show names per neighbour.
    """
    neighbour_ids = top_N_user_similarity[top_N_user_similarity.index==user].values.squeeze().tolist()

    def three_best(neighbour):
        # Rows rated by this neighbour, best rating first, reduced to the first 3 names
        rated = CF_dataset[CF_dataset["user_id"] == neighbour]
        return rated.sort_values(by="rating",ascending=False)["name"][:3]

    return [three_best(neighbour) for neighbour in neighbour_ids]

def unique(list1):
    """Print the sorted distinct values of ``list1`` as a NumPy array."""
    distinct_values = np.unique(np.array(list1))
    print(distinct_values)

In [None]:
#Dropdown for choosing an anime id; the selection is read back later through P1.result
def f(x):
    """Display the selected value and return it unchanged."""
    display(x)
    return x

C1 = item_similarity.columns
anime_dropdown_1 = widgets.Dropdown(options=C1, value=1, description='Anime ID:', disabled=False)
P1 = interactive(f, x=anime_dropdown_1)
print("Select an Item: ")
display(P1)

In [None]:
#Anime id currently selected in the dropdown, then its most similar shows
item1 = P1.result
top_animes(anime_name=item1)

In [None]:
#Dropdown for choosing a user id; the selection is read back later through P2.result
def f(x):
    """Display the selected value and return it unchanged."""
    display(x)
    return x

C2 = user_similarity.columns
user_dropdown_2 = widgets.Dropdown(options=C2, value=3, description='User ID:', disabled=False)
P2 = interactive(f, x=user_dropdown_2)
print("Select a user: ")
display(P2)

In [None]:
#User id currently selected in the dropdown, then the users most similar to it
user1 = P2.result
top_users(user=user1)

In [None]:
#Dropdown for the user whose rating is to be predicted (read back through P3.result)
def f(x):
    """Display the selected value and return it unchanged."""
    display(x)
    return x

C3 = top_N_user_similarity.index
user_dropdown_3 = widgets.Dropdown(options=C3, value=7, description='User ID:', disabled=False)
P3 = interactive(f, x=user_dropdown_3)
print("Select a user: ")
display(P3)

#Dropdown for the anime whose rating is to be predicted (read back through P4.result)
def f(x):
    """Display the selected value and return it unchanged."""
    display(x)
    return x

C4 = item_similarity.columns
anime_dropdown_4 = widgets.Dropdown(options=C4, value=19, description='Anime ID:', disabled=False)
P4 = interactive(f, x=anime_dropdown_4)
print("Select an Item: ")
display(P4)

In [None]:
#Read the user id and the anime id currently selected in the two dropdowns
user2, item2 = P3.result, P4.result

#Print the predicted rating of that user for that anime
user_item_rating_prediction(user=user2, item=item2)

In [None]:
#Dropdown for the user who receives recommendations (read back through P5.result)
def f(x):
    """Display the selected value and return it unchanged."""
    display(x)
    return x

C5 = user_similarity.columns
user_dropdown_5 = widgets.Dropdown(options=C5, value=3, description='User ID:', disabled=False)
P5 = interactive(f, x=user_dropdown_5)
print("Select a user to generate recommendations: ")
display(P5)

In [None]:
#User id currently selected in the dropdown
user3 = P5.result
print("Recommendations for user {} are: \n".format(user3))

#Query the neighbours' favourite shows once; top_anime_for_user returns one Series
#of show names per neighbour, however many neighbours there are
new_list = [shows.values for shows in top_anime_for_user(user3)]

#Flatten the per-neighbour arrays into a single list of show names
final_rec = [item for sublist in new_list for item in sublist]

#Print the distinct show names (np.unique also sorts them)
print(np.unique(np.array(final_rec)))

<hr>

**<h3> Decision Tree Model </h3>**

In [None]:
#Create the dataset for our Decision Tree Model: six columns, first 50,000 rows of the merged data
tree_columns = ["name","genre","type","episodes","members","AverageRating"]
decision_tree_data = final_dataset.loc[:, tree_columns].iloc[:50000]
display(decision_tree_data.head())
decision_tree_data.shape

**<h4>Data Preprocessing</h4>**

In [None]:
#Count the missing values of every column
decision_tree_data.isna().sum()

In [None]:
#Split every genre string into its individual, whitespace-trimmed genres.
#The list is kept under the name `data` because the outlier cell reads it.
data = [[part.strip() for part in entry.split(",")] for entry in decision_tree_data["genre"]]

#Multi-hot encode the genre column: one indicator column per distinct genre.
#Series.str.get_dummies keeps the index of decision_tree_data, so the indicators
#stay aligned with their rows when concatenated (a frame built from a plain list
#would get a fresh 0..n-1 index and be paired with the wrong rows).
encoded_data = (
    decision_tree_data["genre"]
    .str.replace(r"\s*,\s*", ",", regex=True)
    .str.strip()
    .str.get_dummies(sep=",")
    .add_prefix("genre_")
)
df = pd.concat([decision_tree_data.drop(columns=["genre"]), encoded_data], axis=1)

#One-hot encode the type column; the prefix keeps type columns distinct from genre columns
type_encoded = pd.get_dummies(df["type"], prefix="type")
df = pd.concat([df, type_encoded], axis=1).drop(columns=["type","name"])

In [None]:
#Show the null count of every column, then remove the rows that contain nulls
display(df.isna().sum())
df = df.dropna()
display(df.head())
df.shape

In [None]:
#Calculate percentage of outliers in continuous columns
#(values at or beyond 1.5 * IQR below the first or above the third quartile)
for k, v in df[["episodes","members","AverageRating"]].items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    irq = q3 - q1
    v_col = v[(v <= q1 - 1.5 * irq) | (v >= q3 + 1.5 * irq)]
    #The share is taken relative to the rows of the column being examined,
    #not to the length of an unrelated object from another cell
    perc = len(v_col) * 100.0 / len(v)
    print("Column %s outliers = %.2f%%" % (k, perc))

In [None]:
#Features (everything except the target) and target
X=df.drop(["AverageRating"],axis=1)
Y=df["AverageRating"]

#Scale the continuous columns using Standard Scaler.
#The scaled frame reuses X's index: with a default 0..n-1 index, pd.concat would
#align on labels, pair rows with the wrong scaled values and add NaN rows
#wherever X's index has gaps.
SS = StandardScaler()
X_scaling = pd.DataFrame(SS.fit_transform(X[["episodes","members"]]),columns=["episodes_scaled","members_scaled"],index=X.index)
X = pd.concat([X,X_scaling],axis=1)
X.drop(["episodes","members"],axis=1,inplace=True)

#Split data into training (70%) and testing (30%) with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
#Add timestamps to calculate execution time (covers both fitting and predicting)
start=time.time()
#Define the Decision Tree Model and fit training data to it.
#No criterion is passed: the default is the squared-error criterion that "mse"
#used to name, and the "mse" spelling is rejected by newer scikit-learn releases.
model_decision_tree = DecisionTreeRegressor(max_depth=20,random_state=10)
model_decision_tree.fit(X_train,Y_train)
#Generate predictions using test data
Y_pred = model_decision_tree.predict(X_test)
end = time.time()
model_decision_tree_time=end-start
print(f"Execution time of model: {round((model_decision_tree_time),5)} seconds")

In [None]:
#Evaluate the predictions of the Decision Tree Model: scatter plot plus error metrics
def compute(Y_pred,Y_test):
    """Plot predicted against actual ratings and print R2, MSE and RMSE.

    ``Y_test`` must expose an ``index`` (it is used as the x-axis of both
    scatter series); ``Y_pred`` holds the predictions in the same order.
    """
    #Both series share the test set's index as x positions
    sample_ids = Y_test.index
    plt.figure(figsize=(20,8))
    for series, series_label in ((Y_pred, "Predicted"), (Y_test, "Actual")):
        plt.scatter(sample_ids, series, label=series_label)
    plt.title("Predicted vs Actual")
    plt.grid()
    plt.legend()
    plt.show()

    #Error metrics; R2 is printed as a percentage
    squared_error = mean_squared_error(Y_test,Y_pred)
    r_squared = r2_score(Y_test,Y_pred)
    print("\nR2 Score: {}%".format(round((r_squared*100),4)))
    print("Mean Square Error (MSE): {}".format(round((squared_error),4)))
    print("Root Mean Squared Error (RMSE): {}".format(round(math.sqrt(squared_error),4)))

In [None]:
#Plot and score the decision tree's predictions on the held-out test set
compute(Y_pred=Y_pred, Y_test=Y_test)