In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

data = pd.read_csv('../../joined_datasets/joined_rating_dataset.csv')
user_df = pd.read_csv("../../cleaned_datasets/users_details_dataset_cleaned.csv")
anime_df = pd.read_csv("../../cleaned_datasets/anime_dataset_cleaned.csv")

In [2]:
top_countries = user_df['Location'].value_counts().head(10)

# # Get the list of top 10 countries
top_10_countries = top_countries.index.tolist()
user_df_filtered = user_df[user_df['Location'].isin(top_10_countries)]
user_df_filtered.info()
print(user_df_filtered['Location'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 27242 entries, 0 to 41485
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        27242 non-null  int64  
 1   Mal ID            27242 non-null  int64  
 2   Username          27242 non-null  object 
 3   Gender            26452 non-null  object 
 4   Birthday          27242 non-null  object 
 5   Location          27242 non-null  object 
 6   Joined            27242 non-null  object 
 7   Days Watched      27241 non-null  float64
 8   Mean Score        27241 non-null  float64
 9   Watching          27241 non-null  float64
 10  Completed         27241 non-null  float64
 11  On Hold           27241 non-null  float64
 12  Dropped           27241 non-null  float64
 13  Plan to Watch     27241 non-null  float64
 14  Total Entries     27241 non-null  float64
 15  Rewatched         27241 non-null  float64
 16  Episodes Watched  27241 non-null  float64
 17

In [3]:
user_one_hot = pd.get_dummies(user_df_filtered, columns=['Gender', 'Location'])
user_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27242 entries, 0 to 41485
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              27242 non-null  int64  
 1   Mal ID                  27242 non-null  int64  
 2   Username                27242 non-null  object 
 3   Birthday                27242 non-null  object 
 4   Joined                  27242 non-null  object 
 5   Days Watched            27241 non-null  float64
 6   Mean Score              27241 non-null  float64
 7   Watching                27241 non-null  float64
 8   Completed               27241 non-null  float64
 9   On Hold                 27241 non-null  float64
 10  Dropped                 27241 non-null  float64
 11  Plan to Watch           27241 non-null  float64
 12  Total Entries           27241 non-null  float64
 13  Rewatched               27241 non-null  float64
 14  Episodes Watched        27241 non-null  flo

In [4]:
user_one_hot["Age"] = ((pd.to_datetime('Jan 01, 2024', format='%b %d, %Y') - pd.to_datetime(user_one_hot["Birthday_Date"], format='%Y-%m-%d')) / np.timedelta64(52, 'W'))
user_one_hot = user_one_hot.round()
user_df_filtered["Age"] = user_one_hot["Age"]
user_one_hot.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df_filtered["Age"] = user_one_hot["Age"]


Index(['Unnamed: 0', 'Mal ID', 'Username', 'Birthday', 'Joined',
       'Days Watched', 'Mean Score', 'Watching', 'Completed', 'On Hold',
       'Dropped', 'Plan to Watch', 'Total Entries', 'Rewatched',
       'Episodes Watched', 'Birthday_Date', 'Joined_Date', 'Age_Join',
       'Gender_Female', 'Gender_Male', 'Gender_Non-Binary',
       'Location_Australia', 'Location_Brazil', 'Location_Canada',
       'Location_France', 'Location_Germany', 'Location_Philippines',
       'Location_Poland', 'Location_Russia', 'Location_Sweden',
       'Location_United States', 'Age'],
      dtype='object')

In [5]:
import re
features_columns_regex = [r"^Gender_.*", r"^Location_.*", r"^Age$"]

matching_columns = []

for col in user_one_hot.columns:
    if any(re.match(pattern, col) for pattern in features_columns_regex):
        matching_columns.append(col)


In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler_fit = scaler.fit(user_one_hot[['Age']])
user_one_hot[['Age']] = scaler_fit.transform(user_one_hot[['Age']])
user_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27242 entries, 0 to 41485
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              27242 non-null  int64  
 1   Mal ID                  27242 non-null  int64  
 2   Username                27242 non-null  object 
 3   Birthday                27242 non-null  object 
 4   Joined                  27242 non-null  object 
 5   Days Watched            27241 non-null  float64
 6   Mean Score              27241 non-null  float64
 7   Watching                27241 non-null  float64
 8   Completed               27241 non-null  float64
 9   On Hold                 27241 non-null  float64
 10  Dropped                 27241 non-null  float64
 11  Plan to Watch           27241 non-null  float64
 12  Total Entries           27241 non-null  float64
 13  Rewatched               27241 non-null  float64
 14  Episodes Watched        27241 non-null  flo

In [7]:
import pickle
import joblib


knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(user_one_hot[matching_columns])

joblib.dump(knn, './models/knn_model.pkl')

['./models/knn_model.pkl']

In [None]:
user = pd.DataFrame({
    'Gender_Female':[0],
    'Gender_Male':[1], 
    'Gender_Non-Binary':[0], 
    'Location_Australia':[1], 
    'Location_Brazil':[0], 
    'Location_Canada':[0], 
    'Location_France':[0], 
    'Location_Germany':[0], 
    'Location_Philippines':[0], 
    'Location_Poland':[0], 
    'Location_Russia':[0], 
    'Location_Sweden':[0], 
    'Location_United States':[0], 
    'Age':[25]
    })

scaler = StandardScaler()
user[['Age']] = scaler_fit.transform(user[['Age']])
user

Unnamed: 0,Gender_Female,Gender_Male,Gender_Non-Binary,Location_Australia,Location_Brazil,Location_Canada,Location_France,Location_Germany,Location_Philippines,Location_Poland,Location_Russia,Location_Sweden,Location_United States,Age
0,0,1,0,1,0,0,0,0,0,0,0,0,0,-1.760891


In [19]:
knn_loaded = joblib.load('./models/knn_model.pkl')

distances, indices = knn_loaded.kneighbors(user)
print("Indices of similar users:", indices)
print("Distances to similar users:", distances)

Indices of similar users: [[20770 22804 11894 26223 15932 24543 22805 23659 18521 24626]]
Distances to similar users: [[2.22044605e-16 2.31756022e-03 2.31756022e-03 2.31756022e-03
  2.31756022e-03 2.31756022e-03 1.09274598e-02 1.09274598e-02
  1.09274598e-02 1.09274598e-02]]


In [20]:
similar_user_ids = user_df_filtered.iloc[indices[0]]["Mal ID"].values
user_df_filtered.iloc[indices[0]]

Unnamed: 0.1,Unnamed: 0,Mal ID,Username,Gender,Birthday,Location,Joined,Days Watched,Mean Score,Watching,...,On Hold,Dropped,Plan to Watch,Total Entries,Rewatched,Episodes Watched,Birthday_Date,Joined_Date,Age_Join,Age
31799,450813,519857,Naboo2,Male,1998-12-07T00:00:00+00:00,Australia,2011-07-21T00:00:00+00:00,27.7,8.55,6.0,...,1.0,4.0,4.0,99.0,15.0,1581.0,1998-12-07,2011-07-21,12.662088,25.0
34872,540718,751459,wu-0140,Male,1998-05-20T00:00:00+00:00,Australia,2011-10-15T00:00:00+00:00,3.6,8.71,1.0,...,0.0,0.0,1.0,10.0,0.0,214.0,1998-05-20,2011-10-15,13.450549,26.0
18238,284742,342494,Seladawe,Male,1997-12-24T00:00:00+00:00,Australia,2010-06-17T00:00:00+00:00,101.1,7.35,5.0,...,0.0,0.0,1.0,263.0,0.0,5976.0,1997-12-24,2010-06-17,12.521978,26.0
39984,704713,1206469,SazhBolt,Male,1998-04-06T00:00:00+00:00,Australia,2012-03-19T00:00:00+00:00,31.3,7.97,9.0,...,13.0,2.0,15.0,180.0,0.0,1897.0,1998-04-06,2012-03-19,14.0,26.0
24523,357346,416779,Sidius,Male,1998-01-14T00:00:00+00:00,Australia,2010-12-06T00:00:00+00:00,30.6,9.05,2.0,...,0.0,1.0,0.0,22.0,67.0,1829.0,1998-01-14,2010-12-06,12.936813,26.0
37482,660641,1103955,ryan_S10R,Male,1998-06-12T00:00:00+00:00,Australia,2012-01-14T00:00:00+00:00,47.4,8.44,25.0,...,0.0,0.0,0.0,128.0,11.0,2814.0,1998-06-12,2012-01-14,13.637363,26.0
34873,540756,751535,STIGMA2X,Male,1996-12-31T00:00:00+00:00,Australia,2011-10-15T00:00:00+00:00,12.3,9.99,8.0,...,0.0,0.0,5.0,73.0,0.0,721.0,1996-12-31,2011-10-15,14.837912,27.0
36176,639526,1061003,soggysocks,Male,1997-05-01T00:00:00+00:00,Australia,2011-12-16T00:00:00+00:00,37.8,7.75,8.0,...,9.0,11.0,37.0,131.0,12.0,2224.0,1997-05-01,2011-12-16,14.675824,27.0
28439,411783,474703,Blackrockzz,Male,1996-12-25T00:00:00+00:00,Australia,2011-04-16T00:00:00+00:00,0.0,9.5,2.0,...,0.0,0.0,0.0,2.0,0.0,2.0,1996-12-25,2011-04-16,14.354396,27.0
37604,662340,1107393,DiLiNiTi,Male,1996-12-14T00:00:00+00:00,Australia,2012-01-17T00:00:00+00:00,77.9,7.38,4.0,...,0.0,7.0,15.0,150.0,30.0,4591.0,1996-12-14,2012-01-17,15.142857,27.0


In [21]:
similar_users_ratings = data[data['user_id'].isin(similar_user_ids)][["anime_id", "rating"]]
similar_users_ratings
similar_users_ratings['anime_id'].unique()

array([14967,  1735,    21, 15989,   101,  8676,  6547,  7817,  6347,
        7805,  5081,  2156,  5337,  7674,    67,    68,  2589,   269,
        1686,  2889,  4835,   762,   834,   150, 10719,  1689, 10012,
          59,   596, 14741,  2167,  1723,  4181,  6351,  4059,  1575,
        2904,     1,  1482,  2025,  4182,  7338,  6573,  1535,  9379,
       10638,   223,   225,   987,  6033,   502,   891,   892,   893,
         813,   894,   895,   896,   897,   898,   899,   900,   901,
         902,   903,   904,   905,   906,   986,   985,   356, 33050,
        6922, 10087,  8536,   120,    71,    73,    72,   121,  5114,
         430,  5670,  6959,  4872,  8425, 10504,   245,  3299,   263,
         264,   265,  5258,  2026,  4192,  5521,  8074,   135, 11061,
        7088,   249,   452,   450,   451,  6811,  5680,  7791,  6862,
        7017,  6205,  6045,  9656,  8841, 10209,   189,  2034,  1887,
        4472, 34599,  9756, 10110,  8460, 10620,  8424,    20,   442,
         936,  2144,

In [22]:
def calculate_relevance(anime_id, similar_users_ratings):
    # Filter ratings for the specific anime
    anime_ratings = similar_users_ratings[similar_users_ratings['anime_id'] == anime_id]
    return anime_ratings['rating'].mean()

In [23]:
from sklearn.preprocessing import MultiLabelBinarizer

anime_df_one_hot = pd.get_dummies(anime_df, columns=['Type'])
anime_df_one_hot
anime_df_one_hot['Genres'] = anime_df_one_hot['Genres'].str.strip().str.replace(', ', ',').str.replace(' ,', ',').str.split(',')
mlb = MultiLabelBinarizer()
genres_one_hot = pd.DataFrame(mlb.fit_transform(anime_df_one_hot['Genres']), columns=mlb.classes_, index=anime_df.index)
genres_one_hot = genres_one_hot.add_prefix("Genres_")
genres_one_hot
anime_features_df = pd.concat([anime_df, genres_one_hot], axis=1).drop('Genres', axis=1)
anime_features_df
import re
features_columns_regex = [r"^Genres_.*", r"^Type_.*", r"^Episodes_Norm$"]

matching_columns = []

for col in anime_features_df.columns:
    if any(re.match(pattern, col) for pattern in features_columns_regex):
        matching_columns.append(col)


In [24]:

def calculate_diversity(selected_animes, anime_id, anime_feature_df):
    if not selected_animes:
        return 0
    
    candidate_features = anime_feature_df.loc[anime_df[anime_df['anime_id'] == anime_id].index].values.reshape(1, -1)
    selected_features = anime_feature_df.loc[anime_df[anime_df['anime_id'].isin(selected_animes)].index].values
    diversity_score = cosine_similarity(candidate_features, selected_features).mean()
    return diversity_score

In [25]:
def recommend_top_animes(similar_users_ratings, anime_feature_df, num_recommendations=10, alpha=0.7):
    candidate_animes = similar_users_ratings['anime_id'].unique()
    selected_animes = []
    mmr_scores = {}

    for _ in range(num_recommendations): 
        best_anime = None
        best_score = -np.inf

        for anime_id in candidate_animes:
            if anime_id in selected_animes:
                continue
            
            relevance = calculate_relevance(anime_id, similar_users_ratings)
            diversity = calculate_diversity(selected_animes, anime_id, anime_feature_df)

            mmr_score = alpha * relevance - (1 - alpha) * diversity

            if mmr_score > best_score:
                best_score = mmr_score
                best_anime = anime_id
        
        if best_anime is not None:
            selected_animes.append(best_anime)
            mmr_scores[best_anime] = best_score

    return selected_animes


In [26]:
top_animes = recommend_top_animes(similar_users_ratings, anime_features_df[matching_columns], num_recommendations=5, alpha=0.7)
print("Top Recommended Animes:", anime_df.loc[anime_df[anime_df['anime_id'].isin(top_animes)].index])

Top Recommended Animes:       Unnamed: 0  anime_id                                               Name  \
4616        4724      6768  Code Geass: Hangyaku no Lelouch R2 Special Edi...   
4703        4820      6948                               Bakemonogatari Recap   
5502        5667      9253                                        Steins;Gate   
7632        7988     19111          Love Live! School Idol Project 2nd Season   
7698        8061     19489       Little Witch Academia: Mahoujikake no Parade   

                                     English name  \
4616                                      UNKNOWN   
4703                                      UNKNOWN   
5502                                  Steins;Gate   
7632             Love Live! School Idol Project 2   
7698  Little Witch Academia: The Enchanted Parade   

                                           Other name Score  \
4616  コードギアス 反逆のルルーシュ R2 Special Edition Zero Requiem  7.78   
4703                                        

In [27]:
print(scaler_fit.mean_)
print(scaler_fit.var_)
print(scaler_fit.var_**0.5)


[32.76650026]
[19.45296631]
[4.4105517]
