In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation, BatchNormalization, Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

from wordcloud import wordcloud
%matplotlib inline

In [49]:
import os
os.getcwd()

'C:\\Users\\mkuma\\OneDrive\\Desktop\\ANIME RECOMMENDER SYSTEM\\artifacts\\raw_data'

In [50]:
os.chdir('C:\\Users\\mkuma\\OneDrive\\Desktop\\ANIME RECOMMENDER SYSTEM\\artifacts\\raw_data')

In [51]:
os.getcwd()


'C:\\Users\\mkuma\\OneDrive\\Desktop\\ANIME RECOMMENDER SYSTEM\\artifacts\\raw_data'

In [52]:
rating_df = pd.read_csv("animelist.csv", low_memory=True, usecols=["user_id", "anime_id", "rating"])
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [53]:
rating_df.shape[0]

109224747

#### DATA PROCESING

In [54]:
n_ratings = rating_df["user_id"].value_counts()
n_ratings

user_id
20807     17546
140590    17534
281232    17533
147331    17526
131988    17520
          ...  
353195        1
353215        1
353235        1
353306        1
353307        1
Name: count, Length: 325770, dtype: int64

In [55]:
rating_df = rating_df[rating_df["user_id"].isin(n_ratings[n_ratings>400].index)].copy()

In [56]:
rating_df.shape

(71311714, 3)

In [57]:
n_users = rating_df["user_id"].nunique()
n_users

91375

In [58]:
n_anime = rating_df["anime_id"].nunique()
n_anime

17560

In [59]:
min_rating = rating_df['rating'].min()
min_rating

np.int64(0)

In [60]:
max_rating = rating_df['rating'].max()
max_rating

np.int64(10)

In [61]:
avg_rating = rating_df['rating'].mean()
avg_rating

np.float64(4.04729767117924)

In [62]:
rating_df["rating"] = rating_df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values.astype(np.float64)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
213,2,24833,0.0
214,2,235,1.0
215,2,36721,0.0
216,2,40956,0.0
217,2,31933,0.0


In [63]:
rating_df.duplicated().sum()

np.int64(1)

In [64]:
rating_df.drop_duplicates(inplace=True)

In [65]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [66]:
user_ids = rating_df["user_id"].unique().tolist()

user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user2user_decoded = {i: x for i, x in enumerate(user_ids)}

rating_df["user_encoded"] = rating_df["user_id"].map(user2user_encoded)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user_encoded
213,2,24833,0.0,0
214,2,235,1.0,0
215,2,36721,0.0,0
216,2,40956,0.0,0
217,2,31933,0.0,0


In [67]:
len(user2user_encoded)

91375

In [68]:
anime_ids = rating_df["anime_id"].unique().tolist()

anime2anime_encoded = {x: i for i, x in enumerate(anime_ids)}
anime2anime_decoded = {i: x for i, x in enumerate(anime_ids)}

rating_df["anime_encoded"] = rating_df["anime_id"].map(anime2anime_encoded)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user_encoded,anime_encoded
213,2,24833,0.0,0,0
214,2,235,1.0,0,1
215,2,36721,0.0,0,2
216,2,40956,0.0,0,3
217,2,31933,0.0,0,4


In [69]:
len(anime2anime_encoded)

17560

In [70]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user_encoded,anime_encoded
213,2,24833,0.0,0,0
214,2,235,1.0,0,1
215,2,36721,0.0,0,2
216,2,40956,0.0,0,3
217,2,31933,0.0,0,4


In [71]:
X = rating_df[["user_encoded", "anime_encoded"]].values
y = rating_df["rating"]

In [72]:
test_size = 1000
train_indices = rating_df.shape[0] - test_size

In [73]:
X_train, X_test, y_train, y_test = (
    X[:train_indices],
    X[train_indices:],
    y[:train_indices],
    y[train_indices:]
)

In [74]:
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

#### MODEL ARCHITECTURE

In [75]:
def RecommenderNet():
    embedding_size = 128

    user = Input(name="user", shape=[1])

    user_embedding = Embedding(name="user_embedding", input_dim=n_users, output_dim=embedding_size)(user)

    anime = Input(name="anime", shape=[1])

    anime_embedding = Embedding(name="anime_embedding", input_dim=n_anime, output_dim=embedding_size)(anime)

    x = Dot(name="dot_product", normalize=True, axes=2)([user_embedding, anime_embedding])

    x = Flatten()(x)

    x = Dense(1, kernel_initializer="he_normal")(x)
    x = BatchNormalization()(x)
    x = Activation("sigmoid")(x)

    model = Model(inputs=[user, anime], outputs=x)
    model.compile(loss='binary_crossentropy', metrics=['mae','mse'], optimizer=Adam(learning_rate=0.001))

    return model

In [76]:
model = RecommenderNet()

In [77]:
model.summary()

In [78]:
start_learing_rate = 0.0001
min_lr = 0.0001
max_lr = 0.001
batch_size = 10000

ramup_epochs = 5
sustain_epochs = 0
exp_decay = 0.8

def lrfn(epoch):
    if epoch < ramup_epochs:
        lr = (max_lr - start_learing_rate) / ramup_epochs * epoch + start_learing_rate
    elif epoch < ramup_epochs + sustain_epochs:
        lr = max_lr
    else:
        lr = (max_lr - min_lr) * exp_decay**(epoch - ramup_epochs - sustain_epochs) + min_lr
    return lr

In [79]:
lr_callback = LearningRateScheduler(lambda epoch: lrfn(epoch), verbose=0)
checkpoint_filepath = './weights.weights.h5'


model_checkpoint = ModelCheckpoint(filepath=checkpoint_filepath, save_weights_only=True, monitor='val_loss', mode='min', save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True, mode='min')

In [80]:
my_callbacks = [lr_callback, model_checkpoint, early_stopping]

In [None]:
history = model.fit(
    x = X_train_array,
    y = y_train,
    batch_size=batch_size,
    epochs=20,
    validation_data=(X_test_array, y_test),
    verbose=1,
    callbacks=my_callbacks
)

In [None]:
metrics = ["loss", "mae", "mse"]

fig, axes = plt.subplots(len(metrics), 1, figsize=(8, len(metrics) * 4))

for i, metric in enumerate(metrics):
    ax = axes[i]
    ax.plot(history.history[metric][0:-2], marker="o", label=f"train {metric}")
    ax.plot(history.history[f"val_{metric}"][0:-2], marker="o", label=f"test {metric}")
    ax.set_title(f"Model {metric.capitalize()}")
    ax.set_ylabel(metric.capitalize())
    ax.set_xlabel("Epoch")
    ax.legend(loc="upper left")
    ax.grid(True)

plt.tight_layout()
plt.show()

In [81]:
def extract_weights(name, model):
    weight_layer = model.get_layer(name)
    weights = weight_layer.get_weights()[0]
    norm = np.linalg.norm(weights, axis=1, keepdims=True) + 1e-10
    return weights / norm

In [83]:
from tensorflow.keras.models import load_model
weights = model.load_weights("anime_checkpoint.weights.h5")
model = load_model("anime_model.keras")
# anime_weights = extract_weights("anime_embedding", model)

  saveable.load_own_variables(weights_store.get(inner_path))


In [86]:
anime_weights = extract_weights("anime_embedding", model)
joblib.dump(anime_weights, "anime_weights.h5")


['anime_weights.h5']

In [87]:
user_weights = extract_weights("user_embedding", model)
joblib.dump(user_weights, "user_weights.h5")

['user_weights.h5']

#### READING anime.csv

In [40]:
df = pd.read_csv("anime.csv", low_memory=True)
df.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0


In [41]:
df = df.replace("Unknown", np.nan)
df.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0


In [42]:
def getAnimeName(anime_id):
    try:
        name = df[df.anime_id == anime_id].eng_version.values[0]
        if name is np.nan:
            name = df[df.anime_id == anime_id].Name.values[0]
        return name
    except Exception as e:
        print(f"Error: {e}")


In [43]:
df["anime_id"] = df["MAL_ID"]
df["eng_version"] = df["English name"]
df['eng_version'] = df['anime_id'].apply(lambda x: getAnimeName(x))

In [44]:
df.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id,eng_version
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",,...,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,5,Cowboy Bebop:The Movie


In [45]:
df.sort_values(by="Score", ascending=False, na_position="last",inplace=True)

In [46]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id,eng_version
3971,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64,"Apr 5, 2009 to Jul 4, 2010",Spring 2009,...,199160.0,70045.0,20210.0,9308.0,3222.0,1536.0,2162.0,16806.0,5114,Fullmetal Alchemist:Brotherhood
15926,40028,Shingeki no Kyojin: The Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",Attack on Titan Final Season,進撃の巨人 The Final Season,TV,16,"Dec 7, 2020 to ?",Winter 2021,...,26016.0,8793.0,2674.0,1336.0,588.0,382.0,514.0,11061.0,40028,Attack on Titan Final Season
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",Steins;Gate,STEINS;GATE,TV,24,"Apr 6, 2011 to Sep 14, 2011",Spring 2011,...,140914.0,57740.0,21375.0,11126.0,5061.0,2292.0,1678.0,5255.0,9253,Steins;Gate
14963,38524,Shingeki no Kyojin Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Sho...",Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,TV,10,"Apr 29, 2019 to Jul 1, 2019",Spring 2019,...,110481.0,33662.0,8365.0,2974.0,1108.0,550.0,385.0,4169.0,38524,Attack on Titan Season 3 Part 2
9913,28977,Gintama°,9.1,"Action, Comedy, Historical, Parody, Samurai, S...",Gintama Season 4,銀魂°,TV,51,"Apr 8, 2015 to Mar 30, 2016",Spring 2015,...,21360.0,10215.0,3898.0,2311.0,952.0,648.0,1100.0,4508.0,28977,Gintama Season 4


In [88]:
user2user_encoded

{2: 0,
 6: 1,
 12: 2,
 16: 3,
 17: 4,
 19: 5,
 21: 6,
 41: 7,
 42: 8,
 44: 9,
 47: 10,
 53: 11,
 55: 12,
 60: 13,
 66: 14,
 73: 15,
 74: 16,
 85: 17,
 89: 18,
 90: 19,
 94: 20,
 98: 21,
 102: 22,
 108: 23,
 111: 24,
 112: 25,
 120: 26,
 121: 27,
 122: 28,
 135: 29,
 145: 30,
 146: 31,
 147: 32,
 153: 33,
 155: 34,
 156: 35,
 172: 36,
 174: 37,
 184: 38,
 190: 39,
 193: 40,
 194: 41,
 198: 42,
 204: 43,
 205: 44,
 209: 45,
 214: 46,
 219: 47,
 222: 48,
 227: 49,
 228: 50,
 235: 51,
 238: 52,
 240: 53,
 243: 54,
 248: 55,
 251: 56,
 252: 57,
 257: 58,
 264: 59,
 267: 60,
 272: 61,
 274: 62,
 275: 63,
 284: 64,
 285: 65,
 286: 66,
 290: 67,
 291: 68,
 293: 69,
 300: 70,
 301: 71,
 306: 72,
 308: 73,
 310: 74,
 313: 75,
 314: 76,
 316: 77,
 320: 78,
 321: 79,
 324: 80,
 325: 81,
 326: 82,
 327: 83,
 330: 84,
 336: 85,
 340: 86,
 345: 87,
 346: 88,
 349: 89,
 350: 90,
 366: 91,
 367: 92,
 371: 93,
 372: 94,
 375: 95,
 381: 96,
 382: 97,
 386: 98,
 389: 99,
 398: 100,
 405: 101,
 406: 102,
 

In [47]:
df.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1', 'anime_id',
       'eng_version'],
      dtype='str')

In [48]:
df = df[["anime_id", "eng_version", "Score", "Genres", "Episodes", "Type", "Members", "Premiered"]]
df.head()

Unnamed: 0,anime_id,eng_version,Score,Genres,Episodes,Type,Members,Premiered
3971,5114,Fullmetal Alchemist:Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",64,TV,2248456,Spring 2009
15926,40028,Attack on Titan Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",16,TV,733260,Winter 2021
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",24,TV,1771162,Spring 2011
14963,38524,Attack on Titan Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Sho...",10,TV,1073626,Spring 2019
9913,28977,Gintama Season 4,9.1,"Action, Comedy, Historical, Parody, Samurai, S...",51,TV,404121,Spring 2015


In [49]:
def getAnimeFrame(anime, df):
    try:
        if isinstance(anime, int):
            return df[df.anime_id == anime]
        if isinstance(anime, str):
            return df[df.eng_version == anime]
    except Exception as e:
        print(f"Error: {e}")

In [50]:
getAnimeFrame(40028, df)

Unnamed: 0,anime_id,eng_version,Score,Genres,Episodes,Type,Members,Premiered
15926,40028,Attack on Titan Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",16,TV,733260,Winter 2021


#### READING anime_with_synopsis.csv 

In [51]:
cols = ["MAL_ID", "Name", "Genres", "sypnopsis"]

In [52]:
synopsis_df = pd.read_csv("anime_with_synopsis.csv", low_memory=True, usecols=cols)
synopsis_df.head()

Unnamed: 0,MAL_ID,Name,Genres,sypnopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [53]:
def getSynopsis(anime, df):
    try:
        if isinstance(anime, int):
            return df[df.MAL_ID == anime].sypnopsis.values[0]
        if isinstance(anime, str):
            return df[df.Name == anime].sypnopsis.values[0]
    except Exception as e:
        print(f"Error: {e}")

In [54]:
getSynopsis(40028, synopsis_df)

"Gabi Braun and Falco Grice have been training their entire lives to inherit one of the seven titans under Marley's control and aid their nation in eradicating the Eldians on Paradis. However, just as all seems well for the two cadets, their peace is suddenly shaken by the arrival of Eren Yeager and the remaining members of the Survey Corps. Having finally reached the Yeager family basement and learned about the dark history surrounding the titans, the Survey Corps has at long last found the answer they so desperately fought to uncover. With the truth now in their hands, the group set out for the world beyond the walls. In Shingeki no Kyojin: The Final Season , two utterly different worlds collide as each party pursues its own agenda in the long-awaited conclusion to Paradis' fight for freedom."

### CONTENT/ITEM BASED RECOMMENDER SYSTEM

In [55]:
# pd.set_option('max_colwidth', None)

In [56]:
def find_similar_animes(name, anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df, top_n = 10, return_dists = False, neg = False):
    try:
        anime_id = getAnimeFrame(name, df).anime_id.values[0]
        encoded_anime_id = anime2anime_encoded.get(anime_id, None)

        weights = anime_weights

        dists = np.dot(weights, weights[encoded_anime_id])
        sorted_dist = np.argsort(dists)

        if neg:
            closest_animes = sorted_dist[:top_n]
        else:
            closest_animes = sorted_dist[-(top_n + 1):]

        print(f"Top {top_n} similar animes to '{name}':")

        if return_dists:
            return dists, closest_animes

        similar_animes = []

        for closest in closest_animes:
            decoded_anime_id = anime2anime_decoded.get(closest, None)
            synopsis = getSynopsis(decoded_anime_id, synopsis_df)
            anime_frame = getAnimeFrame(decoded_anime_id, df)
            anime_name = anime_frame.eng_version.values[0]
            genre = anime_frame.Genres.values[0]

            similar_animes.append({
                "anime_id": decoded_anime_id,
                "anime_name": anime_name,
                "genre": genre,
                "synopsis": synopsis,
                "similarity": dists[closest]
            })

        similar_animes_df =  pd.DataFrame(similar_animes).sort_values(by='similarity', ascending=False)
        similar_animes_df = similar_animes_df[similar_animes_df.anime_id != anime_id].reset_index(drop=True)
        return similar_animes_df
    except Exception as e:
        print(f"Error: {e}")


In [57]:
find_similar_animes("Haikyu!!", anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df, 10, False, False)

Top 10 similar animes to 'Haikyu!!':


Unnamed: 0,anime_id,anime_name,genre,synopsis,similarity
0,28891,Haikyu!! 2nd Season,"Comedy, Sports, Drama, School, Shounen",Following their participation at the Inter-Hig...,0.900792
1,32935,Haikyu!! 3rd Season,"Comedy, Sports, Drama, School, Shounen","fter the victory against Aoba Jousai High, Kar...",0.851921
2,16894,Kuroko's Basketball 2,"Comedy, Sports, School, Shounen","h the Interhigh Championship finally over, Sei...",0.843239
3,11771,Kuroko's Basketball,"Comedy, School, Shounen, Sports",Teikou Junior High School's basketball team is...,0.820068
4,24415,Kuroko's Basketball 3,"Comedy, Sports, School, Shounen","Following their triumph against Yousen High, S...",0.797531
5,11061,Hunter x Hunter,"Action, Adventure, Fantasy, Shounen, Super Power",Hunter x Hunter is set in a world where Hunter...,0.770372
6,20507,Noragami:Stray God,"Action, Adventure, Comedy, Supernatural, Shounen","In times of need, if you look in the right pla...",0.768087
7,16498,Attack on Titan,"Action, Military, Mystery, Super Power, Drama,...","Centuries ago, mankind was slaughtered to near...",0.756924
8,28171,Food Wars! Shokugeki no Soma,"Ecchi, School, Shounen","Ever since he was a child, fifteen-year-old So...",0.755009
9,31964,My Hero Academia,"Action, Comedy, School, Shounen, Super Power","The appearance of ""quirks,"" newly discovered s...",0.745956


### USER BASED RECOMMENDATION

In [61]:
user2user_encoded[2]

0

In [None]:
def find_similar_users(item_input, user_weights, user2user_encoded, user2user_decoded, top_n = 10, return_dists = False, neg = False):
    user_id = item_input
    encoded_user_id = user2user_encoded.get(user_id)
    print(type(encoded_user_id), encoded_user_id)

    weights = user_weights

    dists = np.dot(weights, weights[encoded_user_id])
    sorted_dist = np.argsort(dists)
    if neg:
        closest_users = sorted_dist[:top_n]
    else:
        closest_users = sorted_dist[-(top_n + 1):]

    if return_dists:
        return dists, closest_users

    print(f"Top {top_n} similar users to '{user_id}':")

    similar_users = []

    for closest in closest_users:
        decoded_user_id = user2user_decoded.get(closest, None)
        similar_users.append({
            "user_id": decoded_user_id,
            "similarity": dists[closest]
        })

    similar_users_df = pd.DataFrame(similar_users).sort_values(by="similarity", ascending=False)
    similar_users_df = similar_users_df[similar_users_df.user_id != user_id].reset_index(drop=True)
    return similar_users_df

In [63]:
user_weights.shape

(91375, 128)

In [68]:
find_similar_users(2, user_weights, user2user_encoded, user2user_decoded, 10, False, False)

<class 'int'> 0
Top 10 similar users to '2':


Unnamed: 0,user_id,similarity
0,306210,0.845988
1,130106,0.840286
2,164066,0.838719
3,222177,0.836974
4,307026,0.831423
5,147331,0.831209
6,222811,0.830155
7,264001,0.828499
8,284246,0.828068
9,76102,0.826568


In [83]:
from wordcloud import WordCloud
def showWordCloud(all_genres):
    wordcloud = WordCloud(width=700, height=400, background_color='white', colormap='gnuplot').generate_from_frequencies(all_genres)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
from collections import defaultdict

def getFavGenre(frame, plot = False):

    frame = frame.dropna()

    all_genres = defaultdict(int)
    genre_lst = []
    for genres in frame.Genres:
        for genre in genres.split(","):
            genre = genre.strip()
            genre_lst.append(genre)
            all_genres[genre] +=1

    if plot:
        showWordCloud(all_genres)

    return genre_lst

In [96]:
x = getAnimeFrame("Haikyu!!", df)

In [98]:
getFavGenre(x,False)

['Comedy', 'Sports', 'Drama', 'School', 'Shounen']

In [None]:
def getUserPreferences(user_id, rating_df, df, plot = False):
    animes_watched_by_user = rating_df[rating_df.user_id == user_id]

    user_rating_percentile = np.percentile(animes_watched_by_user.rating, 75)

    anime_greater_than_percentile = animes_watched_by_user[animes_watched_by_user.rating >= user_rating_percentile]

    top_watched_anime_by_user = anime_greater_than_percentile.sort_values(by='rating', ascending = False).anime_id.values

    anime_df_rows = df[df.anime_id.isin(top_watched_anime_by_user)]
    anime_df_rows  = anime_df_rows[["anime_id", "eng_version", "Genres"]]

    if plot:
        getFavGenre(anime_df_rows, plot)

    return anime_df_rows


In [103]:
getUserPreferences(2, rating_df, df, False)

Unnamed: 0,anime_id,eng_version,Genres
0,5114,Fullmetal Alchemist:Brotherhood,"Action, Military, Adventure, Comedy, Drama, Ma..."
1,9253,Steins;Gate,"Thriller, Sci-Fi"
2,38524,Attack on Titan Season 3 Part 2,"Action, Drama, Fantasy, Military, Mystery, Sho..."
3,11061,Hunter x Hunter,"Action, Adventure, Fantasy, Shounen, Super Power"
4,9969,Gintama Season 2,"Action, Sci-Fi, Comedy, Historical, Parody, Sa..."
...,...,...,...
489,8668,The Qwaser Of Stigmata Picture Drama,Ecchi
490,38831,Yue ni Hitozuma wa Netorareta.,Hentai
491,20959,Kansen: Ball Buster The Animation,Hentai
492,10325,Kyuuketsuki,"Hentai, Vampire"


In [143]:
from collections import Counter

def getUserRecommendation(similar_users, user_preferences, df, rating_df, synopsis_df, top_n = 10):
    recommendation_anime = []
    anime_pool = []

    watched_anime = set(user_preferences.eng_version.values)

    for user_id in similar_users.user_id:

        pref_df = getUserPreferences(user_id, rating_df, df, plot = False)

        if pref_df.empty:
            continue

        pref_df = pref_df[~pref_df.eng_version.isin(watched_anime)]

        anime_pool.extend(pref_df.eng_version.values)

    if not anime_pool:
        return pd.DataFrame()

    top_animes = Counter(anime_pool).most_common(top_n)

    for anime_name, count in top_animes:

        frame = getAnimeFrame(anime_name, df)

        if frame.empty:
            continue

        anime_id = frame.anime_id.values[0]
        genre = frame.Genres.values[0]
        synopsis = getSynopsis(anime_name, synopsis_df)

        recommendation_anime.append({
            "anime_name": anime_name,
            "genre": genre,
            "synopsis": synopsis,
            "number_of_similar_users_preferred": count
        })

    recommendation_anime_df = pd.DataFrame(recommendation_anime)
    return recommendation_anime_df

In [120]:
similar_users = find_similar_users(108, user_weights, user2user_encoded, user2user_decoded, 10, False, False)

<class 'int'> 23
Top 10 similar users to '108':


In [121]:
user_pref = getUserPreferences(108, rating_df, df, False)

In [144]:
getUserRecommendation(similar_users, user_pref, df, rating_df, synopsis_df, top_n = 10)

Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0


Unnamed: 0,anime_name,genre,synopsis,number_of_similar_users_preferred
0,Nausicaä of the Valley of the Wind,"Adventure, Fantasy",,10
1,Castle in the Sky,"Adventure, Fantasy, Romance, Sci-Fi",,10
2,Howl's Moving Castle,"Adventure, Drama, Fantasy, Romance",,9
3,Bakemonogatari,"Romance, Supernatural, Mystery, Vampire","Koyomi Araragi, a third-year high school stude...",9
4,Princess Mononoke,"Action, Adventure, Fantasy",,8
5,Grave of the Fireflies,"Drama, Historical",,8
6,My Neighbor Totoro,"Adventure, Comedy, Supernatural",,8
7,Kiki's Delivery Service,"Adventure, Comedy, Drama, Magic, Romance, Fantasy",,8
8,Whisper of the Heart,"Slice of Life, Drama, Romance, Shoujo",,8
9,The Girl Who Leapt Through Time,"Adventure, Drama, Romance, Sci-Fi",,8


### HYBRID RECOMMENDER SYSTEM

In [149]:
def hybrid_recommendation(user_id, user_weight=0.5, content_weight=0.5):

    # ---------- USER BASED ----------
    similar_users = find_similar_users(
        user_id, user_weights, user2user_encoded, user2user_decoded, top_n=10
    )

    user_pref = getUserPreferences(user_id, rating_df, df)

    user_recommended_anime = getUserRecommendation(
        similar_users, user_pref, df, rating_df, synopsis_df
    )

    user_recommended_anime_lst = user_recommended_anime.anime_name.to_list()

    # ---------- CONTENT BASED ----------
    content_recommended_anime_lst = []

    for anime in user_recommended_anime_lst:

        similar_anime = find_similar_animes(
            anime,
            anime_weights,
            anime2anime_encoded,
            anime2anime_decoded,
            df,
            synopsis_df,
            top_n=10
        )

        if similar_anime is not None and not similar_anime.empty:
            content_recommended_anime_lst.extend(
                similar_anime.anime_name.to_list()
            )

    # ---------- COMBINE SCORES ----------
    combined_scores = {}

    for anime in user_recommended_anime_lst:
        combined_scores[anime] = combined_scores.get(anime, 0) + user_weight

    for anime in content_recommended_anime_lst:
        combined_scores[anime] = combined_scores.get(anime, 0) + content_weight

    # ---------- SORT ----------
    sorted_animes = sorted(
        combined_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return [anime for anime, score in sorted_animes[:10]]

In [150]:
hybrid_recommendation(11880, user_weight=0.1, content_weight=0.3)

<class 'int'> 3000
Top 10 similar users to '11880':
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Error: index 0 is out of bounds for axis 0 with size 0
Top 10 similar animes to 'Code Geass:Lelouch of the Rebellion R2':
Top 10 similar animes to 'Haikyu!! 3rd Season':
Top 10 similar animes to 'Spirited Away':
Top 10 similar animes to 'Code Geass:Lelouch of the Rebellion':
Top 10 similar animes to 'Fate/Zero Season 2':
Top 10 similar animes to 'Hotarubi no Mori e':
Top 10 similar animes to 'Your Name.':
Top 10 similar animes to 'Your Lie in April':
Top 10 similar animes to 'Haikyu!! 2nd Season':
Top 1

['One Punch Man',
 'Fullmetal Alchemist:Brotherhood',
 'Attack on Titan',
 'Death Note',
 'Steins;Gate',
 'My Hero Academia 2',
 'Haikyu!!',
 'My Hero Academia',
 'A Silent Voice',
 'Code Geass:Lelouch of the Rebellion R2']