# Setup

In [2]:
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from pathlib import Path
import wandb
from kaggle_secrets import UserSecretsClient

dataset_path = Path('/kaggle/input/myanimelist-dataset-animes-profiles-reviews')



Here, we setup Weights and Biases for model tracking.

In [3]:
# Get appropriate device for training
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [4]:
# user_secrets = UserSecretsClient()
# wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")
# wandb.login(key=wandb_api_key)

# wandb.init(
#     project="anime-collaborative-filtering-system",
#     config={
#         "architecture": "Collaborative Filtering"
#     }
# )

In [5]:
reviews = pd.read_csv(dataset_path/'reviews.csv', usecols=['profile', 'anime_uid', 'score'])
reviews.head()

Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10
2,skrn,28891,7
3,edgewalker00,2904,9
4,aManOfCulture99,4181,10


In [6]:
animes = pd.read_csv(dataset_path/'animes.csv', usecols=['uid', 'title'])
animes.head()

Unnamed: 0,uid,title
0,28891,Haikyuu!! Second Season
1,23273,Shigatsu wa Kimi no Uso
2,34599,Made in Abyss
3,5114,Fullmetal Alchemist: Brotherhood
4,31758,Kizumonogatari III: Reiketsu-hen


In [7]:
# Add anime titles to dataframe
reviews = pd.merge(reviews, animes, left_on='anime_uid', right_on='uid')
reviews

Unnamed: 0,profile,anime_uid,score,uid,title
0,DesolatePsyche,34096,8,34096,Gintama.
1,DesolatePsyche,34096,8,34096,Gintama.
2,claudinou,34096,8,34096,Gintama.
3,claudinou,34096,8,34096,Gintama.
4,PeterFromRussia,34096,8,34096,Gintama.
...,...,...,...,...,...
317474,Kuromizue,9751,9,9751,Strike Witches Movie
317475,ryanxwonbin,9751,8,9751,Strike Witches Movie
317476,AobaSuzukaze,9751,10,9751,Strike Witches Movie
317477,7jaws7,9751,9,9751,Strike Witches Movie


# Data Exploration

In [8]:
cross_tabulation = pd.crosstab(reviews.profile, reviews.title, reviews.score, aggfunc=np.sum)

Here, we preview the cross-tabulated data with the users and animes with the most ratings.

In [9]:
user_groups = reviews.groupby('profile').score.count()
top_users = user_groups.sort_values(ascending=False)[:10].keys()

anime_groups = reviews.groupby('title').score.count()
top_animes = anime_groups.sort_values(ascending=False)[:10].keys()

cross_tabulation.loc[top_users, top_animes]

title,Death Note,Steins;Gate,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Clannad: After Story,Toradora!,Mahou Shoujo Madoka★Magica,Mirai Nikki,Tengen Toppa Gurren Lagann,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.
profile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stark700,,,,,,,,36.0,,
Sidewinder51,,,,,,,,,,
ktulu007,,28.0,,32.0,,20.0,36.0,,24.0,28.0
LegendAqua,,,40.0,,,,,,,
ggultra2764,,,24.0,24.0,,,32.0,16.0,,
literaturenerd,24.0,,,28.0,,,20.0,16.0,,
BanjoTheBear,,,,,,,,,,
BabyGirl06301,36.0,32.0,36.0,40.0,,,,28.0,,
PyraXadon,28.0,,,,40.0,,36.0,,,36.0
angelsreview,,,,,,24.0,32.0,,,


# Create DataLoaders

We must create indexes for each user and anime to correspond to.

In [10]:
unique_users = reviews.profile.unique()
user_to_index = {}
for index, user in enumerate(unique_users):
    user_to_index[user] = index
        
unique_animes = reviews.title.unique()
anime_to_index = {}
index_to_anime = {}
for index, anime in enumerate(unique_animes):
    anime_to_index[anime] = index
    index_to_anime[index] = anime

In [11]:
class ReviewDataset(Dataset):
    """
    A class for a Pytorch dataset that stores users, animes, and scores.
    """
    
    def __init__(self, dataframe, user_to_index, anime_to_index):
        # Convert users to integers
        user_indexes = dataframe.profile.map(user_to_index)
        
        # Convert animes to integers
        anime_indexes = dataframe.title.map(anime_to_index)
            
        self.X = pd.DataFrame({'user_index': user_indexes, 'anime_index': anime_indexes})
        self.y = dataframe.score.astype(np.intc)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        X = torch.tensor(self.X.iloc[index], dtype=torch.int32).to(device)
        y = torch.tensor([self.y.iloc[index]], dtype=torch.float32).to(device)
        return X, y

In [12]:
train_reviews, validation_reviews = train_test_split(reviews, test_size=0.2, random_state=42)
train_reviews

Unnamed: 0,profile,anime_uid,score,uid,title
196752,samuel_sfx,14741,9,14741,Chuunibyou demo Koi ga Shitai!
210305,Popaglockin,120,5,120,Fruits Basket
213579,Zyzoxing,31859,9,31859,Hai to Gensou no Grimgar
204607,azuslu7jpg,10357,9,10357,Jinrui wa Suitai Shimashita
29003,JyoStar,400,10,400,Seihou Bukyou Outlaw Star
...,...,...,...,...,...
119879,ratchet573,5081,8,5081,Bakemonogatari
259178,Lord_Odous,10793,6,10793,Guilty Crown
131932,LacePendragon,13601,9,13601,Psycho-Pass
146867,Tozzy,32,9,32,Neon Genesis Evangelion: The End of Evangelion


In [13]:
train_dataset = ReviewDataset(train_reviews, user_to_index, anime_to_index)
validation_dataset = ReviewDataset(validation_reviews, user_to_index, anime_to_index)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=64, shuffle=True)

# Create Model

In [22]:
class CollaborativeFilteringNeuralNetwork(nn.Module):
    """
    Creates a neural network with embedding layers.
    
    Arguments:
        num_users:
            Number of unique users
            
        num_items:
            Number of unique items
            
        num_factors:
            Number of latent factors for each user and item
            
        hiddens:
            A list of integers defining the number of units in each hidden layer.
    """
    
    def __init__(self, num_users, num_items, num_factors, hiddens):
        super().__init__()
        
        def generate_layers(num_in):
            """
            Generator that creates layers
            """
            
            for num_out in hiddens:
                yield nn.Linear(num_in, num_out)
                yield nn.ReLU()
                num_in = num_out
                
            # Output layer
            yield nn.Linear(num_in, 1)
            yield nn.Sigmoid()
                
        self.user_embeddings = nn.Embedding(num_users, num_factors)
        self.item_embeddings = nn.Embedding(num_items, num_factors)
        self.linear_relu_stack = nn.Sequential(*list(generate_layers(num_factors * 2)))
        
    def forward(self, x):
        user_embeddings = self.user_embeddings(x[:, 0])
        item_embeddings = self.item_embeddings(x[:, 1])
        model_input = torch.cat((user_embeddings, item_embeddings), dim=1)
        model_output = self.linear_relu_stack(model_input)
        return model_output

In [23]:
num_users = len(unique_users)
num_items = len(unique_animes)
num_factors = 5
hiddens = [5]

model = CollaborativeFilteringNeuralNetwork(num_users, num_items, num_factors, hiddens).to(device)

# Train Model

In [21]:
def train(dataloader, model, loss_function, optimizer):
    """
    Training loop.
    """
    total_rows = len(dataloader.dataset)
    # Put model in training mode. Important for batch normalization and dropout
    model.train()
    
    for batch, (X, y) in enumerate(dataloader):
        # Move tensors to device
        X, y = X.to(device), y.to(device)
        
        # Compute prediction error
        predictions = model(X) * 10
        loss = loss_function(predictions, y)
        
        # Backpropogation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        # Print metrics every 100 batches
        if batch % 100 == 0:
            # Get loss value as a float
            loss = loss.item()
            current_row = (batch + 1) * len(X)
            print(f"Training loss: {loss:>7f}  [{current_row:>5d}/{total_rows:>5d}]")

In [24]:
def validate(dataloader, model, loss_function):
    """
    Validation loop.
    """
    
    total_rows = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    model.eval()
    validation_loss = 0
    
    with torch.no_grad():
        for X, y in dataloader:
            # Move tensors to device
            X, y = X.to(device), y.to(device)
            
            # Compute prediction error
            predictions = model(X) * 10
            
            # Compute loss and accuracy
            validation_loss += loss_function(predictions, y).item()
            
            
    validation_loss /= num_batches
    print(f"Validation Error: \n Validation loss: {validation_loss:>8f} \n")

In [26]:
epochs = 2
loss_function = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for i in range(epochs):
    print(f"Epoch {i+1}\n-------------------------------")
    train(train_dataloader, model, loss_function, optimizer)
    validate(validation_dataloader, model, loss_function)
    
print("トレーニングが完了しました!")

Epoch 1
-------------------------------
Training loss: 4.010642  [   64/253983]
Training loss: 4.473638  [ 6464/253983]
Training loss: 4.047721  [12864/253983]
Training loss: 6.673236  [19264/253983]
Training loss: 4.235540  [25664/253983]
Training loss: 4.318337  [32064/253983]
Training loss: 5.976287  [38464/253983]
Training loss: 4.167884  [44864/253983]
Training loss: 4.551960  [51264/253983]
Training loss: 5.800469  [57664/253983]
Training loss: 6.384316  [64064/253983]
Training loss: 4.593410  [70464/253983]
Training loss: 3.852396  [76864/253983]
Training loss: 6.158123  [83264/253983]
Training loss: 4.219616  [89664/253983]
Training loss: 5.545725  [96064/253983]
Training loss: 5.358496  [102464/253983]
Training loss: 4.039722  [108864/253983]
Training loss: 3.699881  [115264/253983]
Training loss: 5.440631  [121664/253983]
Training loss: 5.322474  [128064/253983]
Training loss: 4.558142  [134464/253983]
Training loss: 4.110984  [140864/253983]
Training loss: 4.934152  [147264/