In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

%matplotlib inline

In [2]:
# Load the anime table and the genre table from the working directory.
anime_df = pd.read_csv('anime_df_copy_csv')
genres_df = pd.read_csv('Genre_csv')

# pd.concat(axis=1) aligns rows on the index and pads non-matching rows with NaN
# instead of failing, so refuse to join frames whose row counts differ.
if len(anime_df) != len(genres_df):
    raise ValueError(
        f"Row count mismatch: anime_df has {len(anime_df)} rows, "
        f"genres_df has {len(genres_df)} rows"
    )

# Append the genre columns to the right of the anime columns, row by row.
anime_df = pd.concat([anime_df, genres_df], axis=1)
anime_df

Unnamed: 0,anime_name,Format,Number Of Episodes,Episode Duration,status,Popularity,favorites,Studios,Genres,mean_score,...,Mecha,Music,Mystery,Psychological,Romance,Sci_Fi,Slice_of_Life,Sports,Supernatural,Thriller
0,Shingeki no Kyojin,1.0,25.0,24,Finished,522270.0,47674.0,Wit Studio,Action|Drama|Fantasy|Mystery,85.0,...,0,0,1,0,0,0,0,0,0,0
1,DEATH NOTE,1.0,37.0,23,Finished,474846.0,35688.0,MADHOUSE,Mystery|Psychological|Supernatural|Thriller,84.0,...,0,0,1,1,0,0,0,0,1,1
2,Boku no Hero Academia,1.0,13.0,24,Finished,471660.0,23789.0,bones,Action|Adventure|Comedy,79.0,...,0,0,0,0,0,0,0,0,0,0
3,Kimetsu no Yaiba,1.0,26.0,24,Finished,468624.0,40432.0,ufotable,Action|Adventure|Drama|Fantasy|Supernatural,85.0,...,0,0,0,0,0,0,0,0,1,0
4,HUNTER×HUNTER (2011),1.0,148.0,24,Finished,433766.0,56845.0,MADHOUSE,Action|Adventure|Fantasy,90.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5943,NOMAD: Megalo Box 2 - Hachidori to tabibito.,3.0,1.0,0,Finished,1225.0,11.0,XX,XX,72.0,...,0,0,0,0,0,0,0,0,0,0
5944,The Big O (2003),1.0,13.0,24,Finished,1225.0,54.0,Sunrise,Action|Mecha|Mystery|Psychological|Sci-Fi,73.0,...,1,0,1,1,0,1,0,0,0,0
5945,Zoids Wild,1.0,50.0,25,Finished,1225.0,10.0,OLM,Action|Adventure|Mecha|Sci-Fi,51.0,...,1,0,0,0,0,1,0,0,0,0
5946,Sanctuary,4.0,1.0,67,Finished,1225.0,9.0,Pastel,Action|Drama|Thriller,58.0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Pull out the target column, then strip the frame down to the model features:
# the listed text columns, both score columns (binned target and raw mean_score)
# and 'Creator' are removed.
cols = ['anime_name', 'status', 'Studios', 'Genres', 'mean_score_binned', 'mean_score', 'Creator']
y = anime_df["mean_score_binned"]

# NOTE: the drop mutates anime_df in place, so re-running this cell without first
# re-running the loading cell raises KeyError (the target column is already gone).
anime_df.drop(columns=cols, inplace=True)

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# At this point anime_df contains only the columns left after the preceding drop.
XTrain, XTest, yTrain, yTest = train_test_split(
    anime_df,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [5]:
# Fit a k-nearest-neighbours classifier on the training split.
# k = 29 is the value the grid search further down reports as its best parameter.
# NOTE(review): the features are passed in unscaled; the table shown after loading
# has Popularity values in the thousands-to-hundreds-of-thousands next to 0/1 genre
# flags, so distances are presumably dominated by the large-magnitude columns --
# consider standardising the features before fitting.
k = 29
clf = KNeighborsClassifier(n_neighbors=k).fit(XTrain, yTrain)
clf

KNeighborsClassifier(n_neighbors=29)

In [6]:
# Predict the held-out rows with the fitted classifier.
y_pred = clf.predict(XTest)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print(metrics.confusion_matrix(yTest, y_pred))

# Overall fraction of test rows whose predicted class matches the true class.
print('Accuracy = ', metrics.accuracy_score(yTest, y_pred))

[[ 32   0   0   0   0   0   0   1]
 [  0   0   0   0   0   1   0   0]
 [  0   0   0   0   3   2   0   0]
 [  0   0   0   0   8  23   0   0]
 [  4   0   0   0  20 124   8   1]
 [  3   0   0   0  15 440  74   1]
 [  4   0   0   0   3 191 148   7]
 [  1   0   0   0   0  11  59   6]]
Accuracy =  0.5428571428571428


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Cross-validated search over the odd neighbour counts 1, 3, ..., 49,
# run on the training split only.
parameters = {'n_neighbors': range(1, 50, 2)}
knn = KNeighborsClassifier()

# The built-in "accuracy" scorer is the same metric as make_scorer(accuracy_score).
# Note that clf is rebound here from the fitted KNN model to the search object.
clf = GridSearchCV(knn, parameters, scoring="accuracy")
clf.fit(XTrain, yTrain)

print("best parameter set is:", clf.best_params_, " and its score was", clf.best_score_)



best parameter set is: {'n_neighbors': 29}  and its score was 0.5361510219229648


In [8]:
# Sweep k = 1..19 and record accuracy on both splits to compare training fit
# against held-out performance. The sweep stops below the k = 29 used for the
# model fitted earlier.
# NOTE: the loop rebinds the notebook-level names k, clf, y_pred_train and y_pred;
# after this cell they refer to the last iteration (k = 19).
k_s, train_accuracies, test_accuracies = [], [], []

for k in range(1, 20):
    clf = KNeighborsClassifier(n_neighbors=k).fit(XTrain, yTrain)
    y_pred_train = clf.predict(XTrain)
    y_pred = clf.predict(XTest)

    k_s.append(k)
    train_accuracies.append(metrics.accuracy_score(yTrain, y_pred_train))
    test_accuracies.append(metrics.accuracy_score(yTest, y_pred))

# One row per k, with the two accuracy columns side by side.
df = pd.DataFrame(
    {
        "k": k_s,
        "train_accuracy": train_accuracies,
        "test_accuracy": test_accuracies,
    }
)
df

Unnamed: 0,k,train_accuracy,test_accuracy
0,1,1.0,0.485714
1,2,0.736444,0.446218
2,3,0.716267,0.489916
3,4,0.675914,0.489076
4,5,0.65889,0.509244
5,6,0.651114,0.501681
6,7,0.641656,0.503361
7,8,0.628836,0.504202
8,9,0.620429,0.509244
9,10,0.614754,0.518487
