In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import surprise
from surprise import Dataset
from surprise import Reader
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the anime catalogue and the user ratings; both CSVs sit in the same
# input directory, so the directory prefix is written once.
anime_data, rates_data = (
    pd.read_csv('../input/anime-recommendations-database/' + csv_name)
    for csv_name in ('anime.csv', 'rating.csv')
)

In [3]:
# Show the catalogue's (rows, columns) shape, then its per-column summary.
print("Data tabel anime berupa (row, column):{0}".format(anime_data.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
anime_data.info()

In [4]:
# Show the ratings table's (rows, columns) shape, then its per-column summary.
print("Data tabel rate berupa (row, column):{0}".format(rates_data.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
rates_data.info()

In [5]:
# Join the catalogue with the ratings on `anime_id`. Columns that exist in both
# frames keep their catalogue name (empty left suffix) while the ratings-side
# copy gets a `_user` suffix; `rating_user` is then renamed to `user_rating`
# and `name` to `anime_title`.
anim_fulldata = (
    anime_data
    .merge(rates_data, on='anime_id', suffixes=['', '_user'])
    .rename(columns={'name': 'anime_title', 'rating_user': 'user_rating'})
)
anim_fulldata.head()

In [6]:
# Count the non-null `user_rating` rows per title (rows lacking a title are
# dropped first). The count stays in a column called `user_rating`.
gabung_anime_rates = anim_fulldata.dropna(axis=0, subset=['anime_title'])
anim_rateCount = (
    gabung_anime_rates
    .groupby(by=['anime_title'])['user_rating']
    .count()
    .reset_index()
)

# Keep the ten titles with the highest counts, largest first.
top_animerate = (
    anim_rateCount
    .sort_values(by='user_rating', ascending=False)
    .head(10)
)

# Bar chart of those ten titles; tick labels are rotated so long titles fit.
ax = sns.barplot(data=top_animerate, x='anime_title', y='user_rating', palette='Dark2')
ax.set_xticklabels(ax.get_xticklabels(), fontsize=11, rotation=40, ha='right')
ax.set_title('Top 10 Anime dengan rating terbanyak', fontsize=22)
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('User Rating count', fontsize=20)

In [7]:
# Work on a copy so the merged frame stays untouched.
anim_feature = anim_fulldata.copy()

# Turn the -1 values in `user_rating` into NaN (presumably -1 is a
# "no rating given" sentinel -- TODO confirm against the dataset description).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `anim_feature["user_rating"]` is a chained in-place update, which pandas
# warns about and which no longer modifies the frame under Copy-on-Write.
anim_feature["user_rating"] = anim_feature["user_rating"].replace({-1: np.nan})

# Drop every row that has a missing value in any column, then show the
# remaining per-column null counts as the cell output.
anim_feature = anim_feature.dropna(axis=0, how='any')
anim_feature.isnull().sum()

In [8]:
# Keep only the rows belonging to users who have at least 200 rows in
# `anim_feature`.
counts = anim_feature['user_id'].value_counts()
frequent_user_ids = counts[counts >= 200].index
from_frequent_user = anim_feature['user_id'].isin(frequent_user_ids)
anim_feature = anim_feature[from_frequent_user]

In [9]:
# Title-by-user table of `user_rating` (pivot_table's default aggregation is
# applied when a title/user pair occurs more than once). Pairs with no rating
# are filled with 0.
anim_pivot = pd.pivot_table(
    anim_feature,
    values='user_rating',
    index='anime_title',
    columns='user_id',
)
anim_pivot = anim_pivot.fillna(0)
anim_pivot.head()

In [10]:
# Store the pivot table as a CSR sparse matrix and fit a brute-force
# nearest-neighbour index on it using cosine distance.
anime_matrix = csr_matrix(anim_pivot.values)
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(anime_matrix)
model_knn

In [11]:
# Choose one row of the pivot table at random (no seed is set here, so the
# chosen title changes from run to run) and ask the fitted model for its six
# nearest rows.
# NOTE: the name `indices` is bound again further down by the content-based
# section of this notebook, so this cell's result is only valid until then.
query_index = np.random.choice(anim_pivot.shape[0])

distances, indices = model_knn.kneighbors(
    anim_pivot.iloc[[query_index]].values,  # one-row 2-D array for the query
    n_neighbors=6,
)

In [12]:
# Print the query title followed by its neighbours. The first returned
# neighbour is treated as the query itself and is only used for the header.
# The arrays are flattened once up front instead of on every iteration.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

for i, (position, distance) in enumerate(zip(neighbor_positions, neighbor_distances)):
    if i == 0:
        print('Rekomendasi {0}:\n'.format(anim_pivot.index[query_index]))
    else:
        # The third field is the distance returned by kneighbors, so it is
        # labelled "jarak" (distance); the old label called it an index.
        print('{0}: {1}, dengan jarak {2}:'.format(i,
                                                   anim_pivot.index[position],
                                                   distance
                                                  ))

In [13]:
def text_cleaning(text):
    """Remove HTML-entity residue and the ``.hack//`` prefix from a title.

    Replacements, applied in this order:

    * ``&quot;``   -> removed
    * ``.hack//``  -> removed (matched literally)
    * ``A&#039;s`` -> removed
    * ``I&#039;``  -> ``I'``
    * ``&#039;``   -> removed (every remaining occurrence)
    * ``&amp;``    -> ``and``

    Args:
        text: The title to clean. Values that are not ``str`` (for example a
            missing value) are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Plain substring replacement is used because every pattern is a literal;
    # as a regex, the leading "." of ".hack//" would match any character.
    text = text.replace('&quot;', '')
    text = text.replace('.hack//', '')
    # The two specific apostrophe rules must run before the catch-all
    # "&#039;" rule, otherwise the catch-all consumes the entity first and
    # these rules can never match.
    text = text.replace('A&#039;s', '')
    text = text.replace('I&#039;', "I'")
    text = text.replace('&#039;', '')
    text = text.replace('&amp;', 'and')

    return text

# Run text_cleaning over every title and store the result back in `name`.
anime_data['name'] = anime_data['name'].map(text_cleaning)

In [14]:
# Missing genres become empty strings so every row can be vectorised.
anime_data['genre'] = anime_data['genre'].fillna('')

# Each genre string is split on commas and the resulting list is turned back
# into its string form; that text is what the vectoriser tokenises.
genres_str = anime_data['genre'].str.split(',').astype(str)

# Word-level TF-IDF over 1- to 3-grams. Tokens are runs of one or more word
# characters; terms that appear in fewer than 3 rows are ignored.
TfV = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)
tfv_matrix = TfV.fit_transform(genres_str)

In [15]:
# Pairwise sigmoid-kernel similarity between all rows of the TF-IDF matrix.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Title -> row-position lookup. Positions are used (not index labels) because
# the rows of `sig` are addressed by position.
# NOTE: this rebinds the name `indices` that the kNN section also uses.
indices = pd.Series(np.arange(len(anime_data)), index=anime_data['name'])
# Keep only the first row for each title. Series.drop_duplicates() would
# compare the values (the positions, which are all unique) and leave duplicate
# titles in the index, making `indices[title]` return a Series for them.
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
def minta_rec(title, sig=sig):
    """Return up to ten titles whose similarity score to ``title`` is highest.

    Args:
        title: Anime title to look up in the module-level ``indices`` Series.
        sig: Square similarity matrix whose row ``i`` holds the scores of row
            ``i`` of ``anime_data`` against every row. Defaults to the
            module-level ``sig`` as it was when this function was defined.

    Returns:
        A DataFrame with the columns ``'Nama Anime'`` and ``'Rating'`` taken
        from ``anime_data``, ordered from the highest score to the lowest. The
        queried title's own row is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Row position of the requested title.
    idx = indices[title]
    # A title that occurs more than once in the lookup comes back as a Series;
    # use its first occurrence so `sig[idx]` stays a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every row by its score against the query, highest first. sorted()
    # is stable, so rows with equal scores keep their original relative order.
    sig_scores = sorted(enumerate(sig[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the query's own row explicitly. Slicing off the first element would
    # remove a different title (and keep the query) whenever another row ties
    # with the query's self-score and sorts ahead of it.
    anime_indices = [position for position, _ in sig_scores if position != idx][:10]

    # Top matches with their catalogue rating.
    return pd.DataFrame({'Nama Anime': anime_data['name'].iloc[anime_indices].values,
                         'Rating': anime_data['rating'].iloc[anime_indices].values
                        })

## Hasil Akhir

In [17]:
# Change this to the anime title you want recommendations for.
minta_rec(title='No Game No Life')