In [1]:
import os 
import numpy as np 
import pandas as pd 
import warnings
import scipy as sp 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
rating_path  = "C:\\Users\\71519\\Documents\\dataScienceWorkArea\\dataset\\rating.csv"
anime_path = "C:\\Users\\71519\\Documents\\dataScienceWorkArea\\dataset\\anime.csv"

In [3]:
rating_df = pd.read_csv(rating_path)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
anime_df = pd.read_csv(anime_path)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
anime_df.shape

(12294, 7)

In [6]:
rating_df.shape

(7813737, 3)

In [7]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [8]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [9]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [10]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [11]:
anime_df=anime_df[~np.isnan(anime_df["rating"])] # note to self: this step drops anime with rating 0

In [12]:
anime_df['genre'] = anime_df['genre'].fillna(
anime_df['genre'].dropna().mode().values[0])

In [13]:
anime_df['type'] = anime_df['type'].fillna(
anime_df['type'].dropna().mode().values[0])

In [14]:
anime_df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [15]:
rating_df['rating'].replace(-1, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rating_df['rating'].replace(-1, np.nan, inplace=True)


In [16]:
from scipy.sparse import csr_matrix
user_item_matrix = rating_df.pivot_table(index='user_id', columns='anime_id', values='rating')
user_item_matrix.fillna(0, inplace=True)
user_item_sparse = csr_matrix(user_item_matrix)

In [17]:
# Use NearestNeighbors to find similar users
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_item_sparse)

In [18]:
def recommend_anime(user_id, num_recommendations=20):
    if user_id not in user_item_matrix.index:
        return "User ID not found."

    # Get the user's ratings
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = user_item_matrix.iloc[user_index]

    # Find the nearest neighbors
    distances, indices = knn.kneighbors(user_item_sparse[user_index], n_neighbors=10)

    # Get recommendations from similar users
    recommendations = {}
    for i in range(1, len(indices[0])):  # Skip the first neighbor (itself)
        similar_user_index = indices[0][i]
        similar_user_ratings = user_item_matrix.iloc[similar_user_index]
        for anime_id in user_item_matrix.columns:
            if user_ratings[anime_id] == 0 and similar_user_ratings[anime_id] > 0:
                if anime_id not in recommendations:
                    recommendations[anime_id] = similar_user_ratings[anime_id]
                else:
                    recommendations[anime_id] += similar_user_ratings[anime_id]

    # Sort recommendations by predicted rating
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # Check if there are any recommendations
    if not sorted_recommendations:
        return "No recommendations available for this user."

    # Scale ratings to be between 1 and 5
    min_rating = min(recommendations.values())
    max_rating = max(recommendations.values())
    scaled_recommendations = [(anime_id, ((pred_rating - min_rating) / (max_rating - min_rating)) * 4 + 1) for anime_id, pred_rating in sorted_recommendations]

    # Get anime names and scaled predicted ratings from anime_df
    recommended_animes = [(anime_df.loc[anime_df['anime_id'] == anime_id, 'name'].values[0], round(pred_rating, 2)) 
                          for anime_id, pred_rating in scaled_recommendations[:num_recommendations]]

    return recommended_animes


In [19]:
# Example usage
user_id_to_test = 5
recommend_anime(user_id_to_test)

[('Shingeki no Kyojin', 5.0),
 ('One Punch Man', 5.0),
 ('Hunter x Hunter (2011)', 4.85),
 ('Bakemonogatari', 4.69),
 ('Nisemonogatari', 4.38),
 ('Tengen Toppa Gurren Lagann', 4.33),
 ('Monogatari Series: Second Season', 4.33),
 ('Clannad', 4.28),
 ('Clannad: After Story', 4.28),
 ('Oda Nobuna no Yabou', 4.23),
 ('Shokugeki no Souma', 4.23),
 ('Death Parade', 4.13),
 ('Btooom!', 4.13),
 ('Katanagatari', 4.08),
 ('Deadman Wonderland', 4.08),
 ('Toaru Majutsu no Index II', 4.08),
 ('Fate/Zero', 4.08),
 ('Zankyou no Terror', 4.08),
 ('Bakuman. 3rd Season', 4.03),
 ('Beelzebub', 4.03)]

In [25]:
from sklearn.decomposition import TruncatedSVD
def recommend_anime_svd(user_id, num_recommendations=20):
    if user_id not in user_item_matrix.index:
        return "User ID not found."

    # Apply Truncated SVD
    svd = TruncatedSVD(n_components=20)
    user_item_matrix_svd = svd.fit_transform(user_item_matrix)

    # Get the user's ratings in the transformed space
    user_index = user_item_matrix.index.get_loc(user_id)
    user_vector = user_item_matrix_svd[user_index]

    # Predict ratings for all animes
    predicted_ratings = np.dot(user_item_matrix_svd, user_vector)

    # Get anime IDs the user hasn't rated
    user_rated_animes = set(user_item_matrix.columns[user_item_matrix.iloc[user_index] > 0])
    recommendations = {anime_id: predicted_ratings[i] for i, anime_id in enumerate(user_item_matrix.columns) if anime_id not in user_rated_animes}

    # Sort recommendations by predicted rating
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # Scale ratings to be between 1 and 5
    min_rating = min(recommendations.values())
    max_rating = max(recommendations.values())
    scaled_recommendations = [(anime_id, ((pred_rating - min_rating) / (max_rating - min_rating)) * 4 + 1) for anime_id, pred_rating in sorted_recommendations]

    # Get anime names and predicted ratings from anime_df
    recommended_animes = [(anime_df.loc[anime_df['anime_id'] == anime_id, 'name'].values[0], round(pred_rating, 2)) 
                          for anime_id, pred_rating in scaled_recommendations[:num_recommendations]]

    return recommended_animes

In [26]:
recommend_anime_svd(user_id_to_test)

[('Last Orders', 5.0),
 ('Oni Chichi 2: Harvest', 4.81),
 ('Girls und Panzer Specials', 4.71),
 ('Fushigi na Somera-chan Special', 4.65),
 ('Saibi', 4.58),
 ('Spectral Force', 4.36),
 ('Mashiro-iro Symphony: Airi ga Anata no Kanojo ni!?', 4.23),
 ('Takarajima (Movie)', 4.21),
 ('Hyakka Ryouran: Samurai After', 4.18),
 ('Shunga', 4.17),
 ('Hishoka Drop The Animation', 4.16),
 ('Yes! Precure 5 Movie: Kagami no Kuni no Miracle Daibouken!', 4.12),
 ('Answer', 4.12),
 ('Senran Kagura', 4.03),
 ('Mitsudomoe', 3.97),
 ('Kinnikuman: Kessen! Shichinin no Seigi Choujin vs. Uchuu Nobushi', 3.96),
 ('Penguin&#039;s Memory: Shiawase Monogatari', 3.91),
 ('Ningyou no Yakata', 3.91),
 ('Aria The Animation', 3.9),
 ('Tamayura: More Aggressive - Tsuitachi dake no Shuugakuryokou, Nanode',
  3.86)]