In [1]:
import pandas as pd
from fastai.collab import *
from fastai.tabular.all import *
from fastai.data.transforms import RandomSplitter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from time import gmtime, strftime

# --- Load the raw user/anime ratings -------------------------------------
path = "datasets/users-score-2023.csv"
# The Username column is discarded right after loading; user_id is the key
# used everywhere below.
ratings = pd.read_csv(path).drop(columns='Username')

ratings.info()

# A MinMaxScaler-based 0..1 scaling of 'rating' used to live here; it is
# disabled in favour of the per-user mean-centring further down.

# --- Popularity thresholds -------------------------------------------------
# Both comparisons below are strict (>), so a user/anime sitting exactly on
# the threshold is dropped.
user_threshold = 10      # keep users with MORE than 10 ratings
anime_threshold = 5000   # keep anime with MORE than 5000 ratings

# Number of ratings per user and per title.
user_counts = ratings['user_id'].value_counts()
anime_counts = ratings['Anime Title'].value_counts()

# Ids / titles that clear their threshold.
filtered_users = user_counts.loc[lambda counts: counts > user_threshold].index
filtered_animes = anime_counts.loc[lambda counts: counts > anime_threshold].index

# A row survives only when BOTH its user and its anime are frequent enough.
is_frequent_user = ratings['user_id'].isin(filtered_users)
is_popular_anime = ratings['Anime Title'].isin(filtered_animes)
kept_ratings = ratings[is_frequent_user & is_popular_anime]

# --- Mean-centre each rating on its user's average -------------------------
# The average is computed on the filtered rows only; the merge adds it as
# 'rating_avg' (the left-hand 'rating' keeps its name thanks to the suffixes).
user_avg_ratings = kept_ratings.groupby('user_id')['rating'].mean()
filtered_ratings = (
    kept_ratings
    .merge(user_avg_ratings, on='user_id', suffixes=('', '_avg'))
    .assign(scaled_rating=lambda df: df['rating'] - df['rating_avg'])
)

filtered_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24325191 entries, 0 to 24325190
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      int64 
 1   anime_id     int64 
 2   Anime Title  object
 3   rating       int64 
dtypes: int64(3), object(1)
memory usage: 742.3+ MB


Unnamed: 0,user_id,anime_id,Anime Title,rating,rating_avg,scaled_rating
0,1,21,One Piece,9,7.80597,1.19403
1,1,48,.hack//Sign,7,7.80597,-0.80597
2,1,320,A Kite,5,7.80597,-2.80597
3,1,49,Aa! Megami-sama!,8,7.80597,0.19403
4,1,304,Aa! Megami-sama! Movie,8,7.80597,0.19403


In [None]:
# Lookup table: dataset anime_id -> display title.
anime = (
    pd.read_csv("datasets/anime-dataset-2023.csv")[['anime_id', 'Name']]
    .rename(columns={'Name': 'title'})
)

# Items are keyed by 'Anime Title' below, so anime_id is dropped here.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
filtered_ratings = filtered_ratings.drop(columns='anime_id', errors='ignore')

# Name every column explicitly instead of relying on column position: the
# frame also carries 'rating_avg' and 'scaled_rating', and the target used
# for training is the raw 'rating'.
dls = CollabDataLoaders.from_df(
    filtered_ratings,
    user_name='user_id',
    item_name='Anime Title',
    rating_name='rating',
    bs=64,
    num_workers=4,
)
dls.show_batch()

# Vocabulary sizes as seen by the DataLoaders; used to size the model.
n_users = len(dls.classes['user_id'])
n_animes = len(dls.classes['Anime Title'])

# NOTE(review): n_factors and the two random tensors below appear unused in
# the visible notebook -- the model is later constructed with a literal 30
# factors. Kept so existing references keep working; confirm and remove.
n_factors = 5
user_factors = torch.randn(n_users, n_factors)
anime_factors = torch.randn(n_animes, n_factors)

def create_params(size):
  """Create a trainable parameter of shape `size` drawn from N(0, 0.1).

  `size` is an iterable of dimensions (e.g. `[n_rows, n_cols]` or `[n]`).
  Returns an `nn.Parameter` wrapping the freshly initialised tensor.
  """
  weights = torch.zeros(*size)
  # In-place fill with samples of mean 0 and standard deviation 0.1.
  weights.normal_(0, 0.1)
  return nn.Parameter(weights)
class DotProductBias(Module):
  """Collaborative-filtering model: latent-factor dot product plus biases.

  Every user and every item gets an `n_factors`-wide factor vector and a
  scalar bias, all created with `create_params`. The attribute names keep the
  `movie_` prefix for the item side.
  """

  def __init__(self, n_users, n_animes, n_factors, y_range = (0, 10.5)):
    """Allocate the factor and bias tables.

    n_users / n_animes: number of rows in the user / item tables.
    n_factors: width of each factor vector.
    y_range: (low, high) bounds handed to `sigmoid_range` in `forward`.
    """
    # User-side tables first, then item-side, matching the original order.
    self.user_factors = create_params([n_users, n_factors])
    self.user_bias = create_params([n_users])
    self.movie_factors = create_params([n_animes, n_factors])
    self.movie_bias = create_params([n_animes])
    self.y_range = y_range

  def forward(self, x):
    """Score a batch of (user, item) index pairs.

    `x` is indexed as `x[:, 0]` (row into the user tables) and `x[:, 1]`
    (row into the item tables). Returns one value per row of `x`, passed
    through `sigmoid_range` with the bounds in `self.y_range`.
    """
    user_idx = x[:, 0]
    item_idx = x[:, 1]
    # Dot product of the two factor vectors, one scalar per row.
    interaction = (self.user_factors[user_idx] * self.movie_factors[item_idx]).sum(dim=1)
    bias = self.user_bias[user_idx] + self.movie_bias[item_idx]
    return sigmoid_range(interaction + bias, *self.y_range)

# Sanity check before training. The original bare `isna().sum()` expression
# sat mid-cell, so its result was never displayed; fail loudly instead if any
# column has missing values.
assert not filtered_ratings.isna().any().any(), "filtered_ratings contains missing values"

torch.set_num_threads(4)

# Build the model with 30 latent factors and train it against the ratings
# with a flattened MSE loss: 5 epochs of one-cycle at max LR 1e-2, weight
# decay 0.1.
model = DotProductBias(n_users, n_animes, 30)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 1e-2, wd=0.1)

# Timestamped export name (UTC). The extension was '.pk1' (digit one), a typo
# for the pickle extension '.pkl'.
filename = 'export' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + '.pkl'
learn.export(fname=filename)

In [None]:
# filename = 'User_Scores_Model_' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# learn.save(filename)

# movie_bias = learn.model.movie_bias.squeeze()
# idxs = movie_bias.argsort()[:5]
# [dls.classes['Anime Title'][i] for i in idxs]

# idxs = movie_bias.argsort(descending=True)[:5]
# [dls.classes['Anime Title'][i] for i in idxs]

# def get_recommendations(user_id, n_animes, model, rated_animes, top_n=5, device='cpu'):
#     model.eval()  # Set model to evaluation mode
#     # Generate predictions for all movies for the given user
#     user_tensor = torch.tensor([user_id] * n_animes, dtype=torch.long, device=device)
#     movie_tensor = torch.tensor(range(n_animes), dtype=torch.long, device=device)
#     with torch.no_grad():
#         predictions = model(torch.stack([user_tensor, movie_tensor], dim=1))
    
#     # Convert to numpy for easier processing
#     movie_ids = torch.arange(n_animes, device=device).cpu().numpy()
#     movie_predictions = predictions.cpu().numpy()
    
#     # Exclude movies that the user has already rated
#     # rated_movie_ids = set(rated_animes)
#     recommendations = [(movie_id, rating) for movie_id, rating in zip(movie_ids, movie_predictions) if movie_id not in rated_animes]
    
#     # Get top N recommendations
#     top_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
#     return top_recommendations

# NOTE(review): this disabled demo looks like it mixes three different id
# spaces -- confirm before re-enabling:
#   * get_recommendations builds candidate ids with torch.arange(n_animes),
#     i.e. row indices into the model's item tables;
#   * `anime_ids` below is a list of 'Anime Title' strings, so the
#     `movie_id not in rated_animes` exclusion would presumably never match;
#   * the 'anime_id' column built from top_recommendations holds those row
#     indices, so merging with `anime` on the dataset's anime_id would
#     presumably pair them with unrelated titles. The commented code earlier
#     in this cell maps indices to titles via dls.classes['Anime Title'][i].
# NOTE(review): user_id is passed straight through as a row index into the
# user tables; presumably it needs mapping through dls.classes['user_id']
# first -- verify.
# user_id = 1

# filtered_df = filtered_ratings[filtered_ratings['user_id'] == 1]

# anime_ids = filtered_df['Anime Title'].tolist()

# top_recommendations = get_recommendations(user_id, n_animes, model, anime_ids)

# recommendations = pd.DataFrame(top_recommendations, columns=['anime_id', 'rating'])
# recommendations = recommendations.merge(anime, on='anime_id')


# # Print recommendations
# print(f"Top recommendations for user {user_id}:")
# for index, row in recommendations.iterrows():
#     print(f"{row['title']}, Predicted Rating: {row['rating']:.2f}")

# recommendations.head()

