In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import re
from functools import partial
from collections import defaultdict
import torch

In [2]:
# Anime metadata: load only the columns that the feature engineering below uses.
anime_df = pd.read_csv(
    'data/anime.csv',
    usecols=[
        'MAL_ID',
        'Episodes',
        'Premiered',
        'Duration',
        'Genres',
        'Score',
    ],
)
# Rating table, loaded with all of its columns.
ratings_df = pd.read_csv('data/rating_complete.csv')
 
def convert_episodes(ep):
    """Return ``ep`` as a float, or NaN when it cannot be converted.

    Values for which ``float()`` raises ``ValueError`` or ``TypeError``
    (for example non-numeric text or ``None``) yield ``np.nan``.
    """
    try:
        episodes = float(ep)
    except (ValueError, TypeError):
        episodes = np.nan
    return episodes

def convert_duration(duration):
    """Convert a duration label such as ``"1 hr. 55 min."`` into minutes.

    Every ``<number> hr``, ``<number> min`` and ``<number> sec`` component
    found in the text is converted to minutes and summed, so
    ``"24 min. per ep."`` gives ``24.0``, ``"2 hr."`` gives ``120.0`` and
    ``"30 sec."`` gives ``0.5``.

    Parameters
    ----------
    duration : object
        Raw duration value; only ``str`` inputs are parsed.

    Returns
    -------
    float
        Duration in minutes, or ``np.nan`` when ``duration`` is not a string
        or contains no recognised component (e.g. ``"Unknown"``). The
        function never returns ``None``.
    """
    if not isinstance(duration, str):
        # Covers NaN/None as well as any other non-text value.
        return np.nan

    components = re.findall(r'(\d+(?:\.\d+)?)\s*(hr|min|sec)', duration)
    if not components:
        # Explicit NaN instead of silently falling off the end of the function.
        return np.nan

    total_minutes = 0.0
    for amount, unit in components:
        value = float(amount)
        if unit == 'hr':
            total_minutes += value * 60
        elif unit == 'min':
            total_minutes += value
        else:  # 'sec'
            total_minutes += value / 60
    return total_minutes

def extract_year(premiered):
    """Return the first four-digit run in ``premiered`` as a float.

    Non-string inputs, and strings without four consecutive digits,
    yield ``np.nan``.
    """
    if not isinstance(premiered, str):
        return np.nan
    found = re.search(r'\d{4}', premiered)
    if found is None:
        return np.nan
    return float(found.group())

def _pack_indicator_bits(indicators):
    """Collapse every 0/1 row of ``indicators`` into a single integer.

    The columns are read left to right as binary digits, so the first column
    becomes the most significant bit.
    """
    bit_strings = indicators.apply(lambda flags: ''.join(flags.astype(str)), axis=1)
    return bit_strings.apply(lambda bits: int(bits, 2))


# Work on a copy so the frame loaded from disk is left as-is.
anime_features = anime_df.copy()

# Numeric clean-up: anything that cannot be parsed becomes NaN.
anime_features['Episodes'] = pd.to_numeric(anime_features['Episodes'], errors='coerce')
anime_features['Duration_minutes'] = anime_features['Duration'].apply(convert_duration)
anime_features['year'] = anime_features['Premiered'].apply(extract_year)
anime_features['Score'] = pd.to_numeric(anime_features['Score'], errors='coerce')

# Multi-hot genre indicators (one column per genre), packed into one integer.
genres = anime_features['Genres'].str.get_dummies(sep=', ')
anime_features['Encoded_Genres'] = _pack_indicator_bits(genres)

# One-hot season indicators, with an extra column for a missing season,
# packed into one integer the same way.
anime_features['Season'] = anime_features['Premiered'].str.extract(r'(Spring|Summer|Fall|Winter)')
seasons = pd.get_dummies(anime_features['Season'], prefix='Season', dummy_na=True).astype('int')
anime_features["enc_season"] = _pack_indicator_bits(seasons)

# Final feature table with compact dtypes.
features = anime_features[
    ['MAL_ID', 'Episodes', 'Duration_minutes', 'Score', 'year', 'Encoded_Genres', 'enc_season']
].astype({
    'MAL_ID': 'int32',
    'Episodes': 'float32',
    'Duration_minutes': 'float32',
    'Score': 'float32',
    'year': 'float32',
    'Encoded_Genres': 'int64',
    'enc_season': 'int8',
})

# Replace missing numeric values with the column median.
features = features.fillna({
    column: features[column].median()
    for column in ('Episodes', 'Duration_minutes', 'Score', 'year')
})


# MAL_ID -> row position in `features`.
Anime_ID_map = dict(zip(features['MAL_ID'], range(len(features))))
# Column labels of the indicator frames, kept so the packed integers can be decoded.
Genere_index = genres.columns
Seaon_index = seasons.columns

# Release the intermediate frames.
del anime_features, genres, seasons

In [3]:
# Size of the feature table in MiB (bytes / 2**20), followed by a preview.
print(features.memory_usage(deep=True).sum() / 2**20)
features.head()

0.4858264923095703


Unnamed: 0,MAL_ID,Episodes,Duration_minutes,Score,year,Encoded_Genres,enc_season
0,1,26.0,24.0,8.78,1998.0,14431090147584,8
1,5,1.0,115.0,8.39,2010.0,8933536203008,1
2,6,26.0,24.0,8.24,1998.0,14431090149376,8
3,7,26.0,25.0,7.27,2002.0,8933671436320,4
4,8,52.0,23.0,6.98,2004.0,4432406251552,16


In [4]:
def str_remap(value, bit_size = 44):
    """Expand an integer into a NumPy array of its binary digits.

    ``value`` is formatted in base 2, zero-padded on the left to at least
    ``bit_size`` digits, and each digit becomes one array element (most
    significant bit first). Values needing more than ``bit_size`` digits are
    not truncated, so the result is then longer than ``bit_size``.
    """
    digits = format(value, f"0{bit_size}b")
    return np.array(list(map(int, digits)))

In [5]:
# Decode a few packed values back into bit vectors as a sanity check.
# Only the last expression of a cell is rendered, so the genre vectors are
# computed here but not shown.
features["Encoded_Genres"].head().apply(str_remap)
features["enc_season"].head().apply(str_remap, bit_size=5)

0    [0, 1, 0, 0, 0]
1    [0, 0, 0, 0, 1]
2    [0, 1, 0, 0, 0]
3    [0, 0, 1, 0, 0]
4    [1, 0, 0, 0, 0]
Name: enc_season, dtype: object

In [6]:
# Number of rating rows currently held in ratings_df.
ratings_df.shape[0]

57633278

In [7]:
# Keep a reproducible 0.2% random subsample of the ratings (seed 42).
# The result is indexed rather than unpacked into `_`: binding the discarded
# 99.8% split to `_` would keep that frame alive in the kernel and override
# IPython's "last output" variable.
# NOTE: this overwrites ratings_df, so re-running the cell subsamples the
# already-subsampled frame again; reload the CSV before re-running.
ratings_df = train_test_split(ratings_df, test_size=0.998, random_state=42)[0]

In [8]:
# Number of rating rows currently held in ratings_df.
ratings_df.shape[0]

115266

In [9]:
# First hold out 20% of the ratings as the test set ...
ratingsTrain, ratingsTest = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
)
# ... then hold out 20% of the remainder as the validation set.
ratingsTrain, ratingsValid = train_test_split(
    ratingsTrain,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Distinct users present in the training split.
users = ratingsTrain["user_id"].unique()

# Per-user lists of rated anime ids and of the matching ratings. Both come
# from the same groupby key on the same frame, so their element order agrees.
anime_ids_by_user = ratingsTrain.groupby("user_id")["anime_id"].apply(list)
anime_rating_per_user = ratingsTrain.groupby("user_id")["rating"].apply(list)

# user_id -> (row indices into the anime feature table, ratings) as tensors.
# Built as a plain dict in a single pass rather than by writing tuples back
# into the pandas Series that is being iterated.
anime_per_user = {
    user: (
        torch.tensor([Anime_ID_map[anime] for anime in anime_list], dtype=torch.int64),
        torch.tensor(anime_rating_per_user[user], dtype=torch.float32),
    )
    for user, anime_list in anime_ids_by_user.items()
}

In [None]:
# Best validation MSE seen so far and the parameters that produced it;
# both are updated by fit_model through `global`.
mse_min, best_vals = float('inf'), None
# Number of passes over the training users per fit_model call.
Num_epochs = 15

def fit_model(lambda_reg, k=3, lr=0.01, device=None):
    """Train a biased latent-factor rating model with Adam, one user at a time.

    The prediction for user ``u`` and anime ``a`` is
    ``alpha + beta_u[u] + beta_a[a] + sum(gamma_u[u] * gamma_a[a])``.

    Reads the notebook globals ``users``, ``Anime_ID_map``, ``ratingsTrain``,
    ``anime_per_user`` and ``Num_epochs``. After every epoch the model is
    scored with ``mse_on_validation``; when the score improves, the globals
    ``mse_min`` and ``best_vals`` are updated.

    Parameters
    ----------
    lambda_reg : float
        Passed to Adam as ``weight_decay`` for the bias and factor
        parameters. The global offset ``alpha`` is not decayed.
    k : int
        Number of latent factors per user and per anime.
    lr : float
        Adam learning rate.
    device : str or torch.device, optional
        Where the parameters live. Defaults to CUDA when available,
        otherwise CPU.

    Returns
    -------
    None
        Results are published through ``mse_min`` / ``best_vals``.
    """
    global mse_min, best_vals
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    n_users = len(users)
    n_animes = len(Anime_ID_map)

    # Global offset starts at the mean training rating. Built with tensor ops
    # rather than torch.tensor(<tensor>), which emits a copy-construct warning.
    train_ratings = torch.tensor(ratingsTrain["rating"].values, dtype=torch.float32)
    alpha = train_ratings.mean().to(device).requires_grad_()
    beta_u = {user: torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device) for user in users}
    beta_a = torch.zeros(n_animes, dtype=torch.float32, requires_grad=True, device=device)
    gamma_u = {user: torch.rand(k, dtype=torch.float32, requires_grad=True, device=device) for user in users}
    gamma_a = torch.rand(n_animes, k, dtype=torch.float32, requires_grad=True, device=device)

    # alpha sits in its own group without weight decay so the penalty does not
    # pull the global mean towards zero; this matches the SGD fit_model later
    # in this notebook, which also leaves alpha unregularised.
    optimizer = torch.optim.Adam(
        [
            {"params": [alpha], "weight_decay": 0.0},
            {"params": [*gamma_u.values(), gamma_a, *beta_u.values(), beta_a]},
        ],
        lr=lr,
        weight_decay=lambda_reg,
    )

    def _validation_mse():
        # No autograd bookkeeping is needed for evaluation; float() makes the
        # score a plain number whatever type the helper returns.
        with torch.no_grad():
            return float(mse_on_validation(alpha, beta_u, beta_a, gamma_u, gamma_a))

    def _snapshot():
        # Detached copies: the live tensors keep changing in later epochs, so
        # storing references would leave best_vals pointing at the final
        # parameters instead of the best ones.
        return (
            alpha.detach().clone(),
            {user: bias.detach().clone() for user, bias in beta_u.items()},
            beta_a.detach().clone(),
            {user: factors.detach().clone() for user, factors in gamma_u.items()},
            gamma_a.detach().clone(),
        )

    mse = _validation_mse()
    print(f"Initial MSE: {mse}")
    for epoch in range(Num_epochs):
        for i, (user, (anime_list, ratings)) in enumerate(anime_per_user.items()):
            anime_list = anime_list.to(device)
            ratings = ratings.to(device)
            pred = (
                alpha
                + beta_u[user]
                + beta_a[anime_list]
                + torch.sum(gamma_u[user] * gamma_a[anime_list], dim=1)
            )
            loss = torch.mean((ratings - pred) ** 2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 1000 == 0:
                print(f"Epoch {epoch}, {i}/{n_users}, Loss: {loss.item()}")
        mse = _validation_mse()
        print(f"Epoch {epoch+1}, MSE: {mse}")
        if mse < mse_min:
            mse_min = mse
            best_vals = _snapshot()

def mse_on_validation(alpha, beta_u, beta_a, gamma_u, gamma_a):
    """Mean squared error of the latent-factor model on ``ratingsValid``.

    Each row of the global ``ratingsValid`` is unpacked as
    ``(user, anime, rating)``.

    * Users missing from ``beta_u`` are predicted with ``alpha`` alone.
    * Anime missing from the global ``Anime_ID_map`` contribute no item
      terms; the prediction is ``alpha + beta_u[user]``.
    * Otherwise the prediction is
      ``alpha + beta_u[u] + beta_a[a] + sum(gamma_u[u] * gamma_a[a])``.

    Parameters
    ----------
    alpha, beta_a, gamma_a : torch.Tensor
        Global offset, per-anime biases and per-anime factors, the latter
        two indexed by ``Anime_ID_map`` positions.
    beta_u, gamma_u : dict
        Per-user bias / factor tensors keyed by user id.

    Returns
    -------
    float
        Sum of squared errors divided by ``len(ratingsValid)``.
    """
    sse = 0.0
    # Evaluation only: without no_grad the running sum would drag an autograd
    # graph along whenever a grad-tracking tensor is added to it.
    with torch.no_grad():
        global_offset = float(alpha)
        for u, a, r in ratingsValid.values:
            if u not in beta_u:
                pred = global_offset
            else:
                pred = alpha + beta_u[u]
                a_id = Anime_ID_map.get(a)
                if a_id is not None:
                    # Indexing with None would add an axis instead of
                    # selecting a row, so item terms are only added for
                    # anime that have a mapped position.
                    pred = pred + beta_a[a_id] + torch.sum(gamma_u[u] * gamma_a[a_id])
                pred = pred.item()
            sse += (r - pred) ** 2
    return float(sse / len(ratingsValid))

# Keyword arguments keep the call unambiguous: the notebook defines another
# fit_model further down whose third positional parameter is not `lr`.
fit_model(lambda_reg=0.05, k=5, lr=0.01)

In [None]:
def fit_model(lambda_reg, k=3, initals = None, lr = 0.01, num_epochs = 20):
    """Train a biased latent-factor model with per-rating SGD (NumPy version).

    Running this cell rebinds the name ``fit_model``, replacing the PyTorch
    function of the same name defined earlier in the notebook.

    Reads the notebook globals ``ratingsTrain`` and ``users``, plus ``animes``
    when no ``initals`` are given. After every epoch the model is scored with
    ``mse_on_validation``; on improvement the globals ``mse_min`` and
    ``best_vals`` are updated.

    NOTE(review): ``animes`` is not defined in the cells visible alongside
    this function - confirm it exists before this cell runs.
    NOTE(review): this passes dict-based parameters keyed by raw ids to
    ``mse_on_validation`` - confirm the helper in scope accepts them.

    Parameters
    ----------
    lambda_reg : float
        L2 penalty applied to the bias and factor updates (not to alpha).
    k : int
        Number of latent factors.
    initals : tuple, optional
        ``(alpha, beta_u, beta_i, gamma_u, gamma_i)`` to resume from. The
        containers are copied, so the caller's objects are left untouched.
    lr : float
        Initial learning rate; multiplied by 0.99 at the start of each epoch.
    num_epochs : int
        Number of passes over the training ratings.

    Returns
    -------
    tuple
        ``(alpha, beta_u, beta_i, gamma_u, gamma_i)`` after the last epoch.
    """
    import copy  # function-scope import: only this function needs it

    global mse_min, best_vals
    learning_rate = lr

    # Materialise the rows once instead of on every epoch.
    train_rows = ratingsTrain.values
    n_train = len(ratingsTrain)

    if initals:
        alpha, beta_u, beta_i, gamma_u, gamma_i = initals
        # Shallow container copies are sufficient: the updates below rebind
        # entries to new arrays and never modify an array in place.
        beta_u, beta_i = copy.copy(beta_u), copy.copy(beta_i)
        gamma_u, gamma_i = copy.copy(gamma_u), copy.copy(gamma_i)
    else:
        # Global offset: mean of the third column of the training rows.
        alpha = float(np.mean(train_rows[:, 2]))
        beta_u = defaultdict(float)
        beta_i = defaultdict(float)
        # Small random latent factors.
        gamma_u = {u: np.random.normal(scale=0.05, size=k) for u in users}
        gamma_i = {b: np.random.normal(scale=0.05, size=k) for b in animes}

    for epoch in range(num_epochs):
        learning_rate*=0.99
        for index, (u, i, r) in enumerate(train_rows):
            user_vec = gamma_u[u]
            item_vec = gamma_i[i]

            prediction = alpha + np.dot(user_vec, item_vec) + beta_u[u] + beta_i[i]
            error = r - prediction

            # Global offset is not regularised.
            alpha += learning_rate * (error)

            beta_u[u] += learning_rate * (error - lambda_reg * beta_u[u])
            beta_i[i] += learning_rate * (error - lambda_reg * beta_i[i])

            # Compute both factor steps from the pre-update vectors so the
            # item update does not see the already-updated user vector.
            step_u = error * item_vec - lambda_reg * user_vec
            step_i = error * user_vec - lambda_reg * item_vec
            gamma_u[u] = user_vec + learning_rate * step_u
            gamma_i[i] = item_vec + learning_rate * step_i

            if index % 1000000 == 0:
                print(f'Epoch {epoch + 1}/{num_epochs}, {index}/{n_train}')
        mse = mse_on_validation(alpha, beta_u, beta_i, gamma_u, gamma_i)
        if mse < mse_min:
            print(f'Epoch {epoch + 1}/{num_epochs}, MSE on validation set: {mse}')
            mse_min = mse
            # Copy the containers: the live ones keep being updated in later
            # epochs, which would silently overwrite the recorded best state.
            best_vals = (
                alpha,
                copy.copy(beta_u),
                copy.copy(beta_i),
                copy.copy(gamma_u),
                copy.copy(gamma_i),
            )

    return alpha, beta_u, beta_i, gamma_u, gamma_i