Dataset: https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database?select=rating.csv

## Librerias

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import numpy as np
import torch

## Recolección de la data.

In [2]:
ratings = pd.read_csv("rating.csv")
animes = pd.read_csv("anime.csv")

## Preparación de la data/ preprocesamiento la data

In [3]:
ratings.head(6)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
5,1,355,-1


In [4]:
animes.head(6)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [5]:
animes_titles = animes[["anime_id", "name"]]
animes_titles.head(6)

Unnamed: 0,anime_id,name
0,32281,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°
3,9253,Steins;Gate
4,9969,Gintama&#039;
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...


In [6]:
anime_ratings = ratings.merge(animes_titles, on="anime_id")
anime_ratings.head(6)

Unnamed: 0,user_id,anime_id,rating,name
0,1,20,-1,Naruto
1,3,20,8,Naruto
2,5,20,6,Naruto
3,6,20,-1,Naruto
4,10,20,-1,Naruto
5,21,20,8,Naruto


In [7]:
animes_titles[animes_titles["name"] == "Naruto"]

Unnamed: 0,anime_id,name
841,20,Naruto


In [8]:
anime_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813727 entries, 0 to 7813726
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   anime_id  int64 
 2   rating    int64 
 3   name      object
dtypes: int64(3), object(1)
memory usage: 238.5+ MB


In [9]:
anime_ratings.isna().any()

user_id     False
anime_id    False
rating      False
name        False
dtype: bool

In [10]:
anime_ratings.loc[anime_ratings['rating'] == -1, 'rating'] = 0

In [11]:
anime_ratings = anime_ratings.sample(10000)

## Entrenamiento del modelo.

In [12]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)  

In [14]:
def test_loss(model, unsqueeze=False):
    model.eval()

    users = torch.LongTensor(df_val.user_id.values)
    items = torch.LongTensor(df_val.anime_id.values)
    ratings = torch.FloatTensor(df_val.rating.values)

    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    
    print("test loss %.3f " % loss.item())

In [13]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()

    for i in range(epochs):
        users = torch.LongTensor(df_train.user_id.values)
        items = torch.LongTensor(df_train.anime_id.values)
        ratings = torch.FloatTensor(df_train.rating.values)

        if unsqueeze:
            ratings = ratings.unsqueeze(1)

        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(loss.item()) 

    test_loss(model, unsqueeze)

In [15]:
encoder = LabelEncoder()

In [16]:
encoder.fit(anime_ratings['user_id'])
anime_ratings.loc[:, 'user_id'] = sorted(encoder.transform(anime_ratings['user_id']))

encoder.fit(anime_ratings['anime_id'])
anime_ratings.loc[:, 'anime_id'] = encoder.transform(anime_ratings['anime_id'])

In [17]:
x = anime_ratings.iloc[:, :-2]
y = anime_ratings.iloc[:, -2]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)

In [19]:
df_train = x_train.join(y_train)
df_val = x_test.join(y_test)

In [20]:
num_users = len(anime_ratings.user_id.unique())
num_items = len(anime_ratings.anime_id.unique())
emb_size = 10000

In [21]:
model = MF(num_users, num_items, emb_size=emb_size)

In [22]:
train_epocs(model, epochs=10, lr=0.6)

11.388671875
9312218.0
2630335.0
827604.5625
3196565.25
5653263.0
1976290.25
543000.3125
881518.0
847236.9375
test loss 142278.875 
