In [2]:
### Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import warnings
warnings.filterwarnings(action='ignore')

# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Model Training
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Model Training with PyTorch Lightning
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor

## Import necessary modules for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# Read the user-rating table, keeping only the three columns used below
df = pd.read_csv(
    '../datasets/users-score-2023.csv',
    usecols=["user_id", "anime_id", "rating"],
)
print("Shape of the Dataset:", df.shape)
df.head()

Shape of the Dataset: (24325191, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


In [4]:
# Collect every row that exactly repeats an earlier row (first occurrences are not flagged)
duplicated_rows = df.loc[df.duplicated()]
print("Duplicated Rows:", duplicated_rows, sep="\n")

: 

In [None]:
# Mean of the raw (unscaled) "rating" column
avg_score = df['rating'].mean()
print('Average Score:', avg_score)

In [None]:
# Rescale the "rating" column into the [0, 1] range.
# The scaler is fitted on the ratings first and kept in `scaler` so the same
# mapping can be reused later.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(df[['rating']])

# Store the rescaled ratings next to the raw ones
df['scaled_score'] = scaler.transform(df[['rating']])

In [None]:
# Encoding categorical data
# LabelEncoder maps each distinct ID to an integer code in [0, n_classes);
# those codes are what the embedding layers index with later on.

## Encoding user IDs
user_encoder = LabelEncoder()
df["user_encoded"] = user_encoder.fit_transform(df["user_id"])
num_users = len(user_encoder.classes_)

## Encoding anime IDs
anime_encoder = LabelEncoder()
df["anime_encoded"] = anime_encoder.fit_transform(df["anime_id"])
num_animes = len(anime_encoder.classes_)

# Printing dataset information
print("Number of unique users: {}, Number of unique anime: {}".format(num_users, num_animes))
# Series.min()/.max() are vectorised; the builtin min()/max() would iterate
# over every row of the column in Python.
print("Minimum rating: {}, Maximum rating: {}".format(df['rating'].min(), df['rating'].max()))

In [None]:
# Randomise the row order (fixed seed, so the order is reproducible)
df = shuffle(df, random_state=100)

# Features: the (user, anime) integer codes; target: the rescaled rating
X = df[['user_encoded', 'anime_encoded']].to_numpy()
y = df["scaled_score"].to_numpy()

# Report the resulting array shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

In [None]:
# An integer test_size is an absolute number of held-out samples, not a fraction
test_set_size = 10000

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=73,
    test_size=test_set_size,
)

print("Number of samples in the training set:", y_train.shape[0])
print("Number of samples in the test set:", y_test.shape[0])

In [None]:
# Split each feature matrix into its two columns: [user codes, anime codes]
X_train_array = [X_train[:, column] for column in (0, 1)]
X_test_array = [X_test[:, column] for column in (0, 1)]

In [None]:
# Convert arrays into PyTorch tensors:
# integer codes as int64 (embedding lookups), targets as float32 (loss input)
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Each dataset item is a (user_code, anime_code, scaled_rating) triple;
# unbind(dim=1) yields the two feature columns in order.
train_dataset = TensorDataset(*X_train_tensor.unbind(dim=1), y_train_tensor)
val_dataset = TensorDataset(*X_test_tensor.unbind(dim=1), y_test_tensor)

# Only the training loader reshuffles between epochs
batch_size = 80000
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
class RecommenderNet(pl.LightningModule):
    """Embedding-based rating predictor.

    A user embedding and an anime embedding are L2-normalised, their dot
    product (a cosine similarity) is passed through a small two-layer MLP and
    squashed with a sigmoid, giving one value in (0, 1) per (user, anime) pair.
    Training minimises binary cross-entropy against the target passed in each
    batch.

    Args:
        num_users: Number of rows in the user embedding table.
        num_animes: Number of rows in the anime embedding table.
        embedding_size: Width of both embedding tables.
    """

    def __init__(self, num_users, num_animes, embedding_size=128):
        super().__init__()
        self.embedding_size = embedding_size
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.anime_embedding = nn.Embedding(num_animes, embedding_size)

        # The MLP consumes the scalar similarity, hence in_features=1
        self.fc1 = nn.Linear(1, 64)
        self.fc2 = nn.Linear(64, 1)

        # Parameters for LR scheduler (all are absolute learning rates)
        self.start_lr = 0.00001
        self.min_lr = 0.00001
        self.max_lr = 0.0004
        self.rampup_epochs = 5
        self.sustain_epochs = 0
        self.exp_decay = .8

    def forward(self, user_input, anime_input):
        """Predict a score in (0, 1) for each (user, anime) index pair.

        Assumes both inputs are 1-D integer tensors of equal length, so the
        embedded tensors are (batch, embedding_size) -- TODO confirm for any
        caller that passes a different shape.

        Returns:
            Tensor of shape (batch, 1).
        """
        # Embedding layers
        user_embedded = self.user_embedding(user_input)
        anime_embedded = self.anime_embedding(anime_input)

        # Normalize embeddings so the dot product below is a cosine similarity
        user_embedded_norm = F.normalize(user_embedded, p=2, dim=1)
        anime_embedded_norm = F.normalize(anime_embedded, p=2, dim=1)

        # Dot product; keepdim keeps a trailing feature axis for fc1
        dot_product = torch.sum(user_embedded_norm * anime_embedded_norm, dim=1, keepdim=True)

        # Dense layers
        x = F.relu(self.fc1(dot_product))
        x = torch.sigmoid(self.fc2(x))
        return x

    def lr_lambda(self, epoch):
        """Return the target (absolute) learning rate for ``epoch``.

        Linear ramp from ``start_lr`` to ``max_lr`` over ``rampup_epochs``,
        an optional plateau of ``sustain_epochs`` at ``max_lr``, then an
        exponential decay towards ``min_lr``.
        """
        if epoch < self.rampup_epochs:
            return (self.max_lr - self.start_lr) / self.rampup_epochs * epoch + self.start_lr
        elif epoch < self.rampup_epochs + self.sustain_epochs:
            return self.max_lr
        else:
            return (self.max_lr - self.min_lr) * self.exp_decay**(epoch - self.rampup_epochs - self.sustain_epochs) + self.min_lr

    def configure_optimizers(self):
        """Create the Adam optimizer and the per-epoch learning-rate schedule."""
        # LambdaLR sets lr = base_lr * lr_lambda(epoch), i.e. the lambda is a
        # multiplicative factor. lr_lambda() returns an absolute learning rate,
        # so the base lr is 1.0 to make the product equal that rate. (With a
        # base of start_lr the effective rate would be ~1e-10 .. 4e-9.)
        # LambdaLR applies the epoch-0 value on construction, so training
        # starts at start_lr.
        optimizer = torch.optim.Adam(self.parameters(), lr=1.0)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, self.lr_lambda)

        return [optimizer], [scheduler]

    def _shared_step(self, batch, metric_name):
        """Compute the BCE loss for one batch and log it under ``metric_name``."""
        user_input, anime_input, labels = batch
        predictions = self(user_input, anime_input)
        # Predictions are (batch, 1); reshape the targets to match
        loss = F.binary_cross_entropy(predictions, labels.view(-1, 1))
        self.log(metric_name, loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged per epoch as 'train_loss')."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for ``batch`` (logged per epoch as 'val_loss')."""
        return self._shared_step(batch, 'val_loss')

model = RecommenderNet(num_users, num_animes)


In [None]:
# Callbacks: keep the single best checkpoint by validation loss, stop after
# 3 epochs without improvement, and log the learning rate once per epoch
lr_monitor = LearningRateMonitor(logging_interval='epoch')
early_stopping_callback = EarlyStopping(mode='min', monitor='val_loss', patience=3)
checkpoint_callback = ModelCheckpoint(mode='min', monitor='val_loss', save_top_k=1)

trainer = pl.Trainer(
    max_epochs=20,
    callbacks=[checkpoint_callback, early_stopping_callback, lr_monitor],
)

trainer.fit(model, train_loader, val_loader)

# Save the model
# NOTE(review): this stores the weights as they are when fit() returns, which
# is presumably the last epoch rather than the best checkpoint tracked by
# checkpoint_callback -- confirm that is intended.
torch.save(model.state_dict(), "anime_recommender_weights.pt")

In [None]:
# Utility functions
def extract_weights(name, model):
    """Return the row-wise L2-normalised weights of a layer as a NumPy array.

    Args:
        name: Attribute name of the layer on ``model`` (e.g. an ``nn.Embedding``).
        model: Object exposing that layer; the layer must have a 2-D ``.weight``.

    Returns:
        ``numpy.ndarray`` with the same shape as the weight matrix, each row
        scaled to unit L2 norm. All-zero rows are returned as zeros.
    """
    # Access the layer directly by name
    weight_layer = getattr(model, name)

    # detach() gives a gradient-free view of the parameter (preferred over .data)
    weights = weight_layer.weight.detach()

    # Per-row L2 norm; keepdim so it broadcasts against the weight matrix
    norms = torch.norm(weights, p=2, dim=1, keepdim=True)

    # Clamp so an all-zero row yields zeros instead of 0/0 = NaN, which would
    # otherwise poison every similarity computed from the result
    normalized_weights = weights / norms.clamp_min(1e-12)

    # Move to host memory before converting to NumPy
    return normalized_weights.cpu().numpy()

In [None]:
# Anime metadata table (read below via its Name, anime_id, Members, ... columns)
df_anime=pd.read_csv('../datasets/anime-dataset-2023.csv')

# find_similar_animes() reads a module-level `anime_weights`, which no earlier
# cell defines; build it from the trained model's anime embedding table so the
# notebook runs top to bottom on a fresh kernel.
anime_weights = extract_weights('anime_embedding', model)


In [None]:
# Keep only titles with at least this many members
popularity_threshold = 50
df_anime = df_anime[df_anime['Members'] >= popularity_threshold]
print(df_anime.shape)
df_anime.head(3)

In [None]:
def find_similar_animes(name, n=10, return_dist=False, neg=False):
    """Find the animes whose embeddings are most (or least) similar to ``name``.

    Similarity is the dot product between rows of the module-level
    ``anime_weights`` matrix (presumably L2-normalised, which would make it a
    cosine similarity -- confirm where ``anime_weights`` is built). Metadata
    comes from the module-level ``df_anime``; IDs are mapped with
    ``anime_encoder``.

    Args:
        name: Exact value of the ``Name`` column of the anime to query.
        n: Number of neighbours to return.
        return_dist: If True, return the raw ``(dists, closest)`` arrays
            instead of a DataFrame. ``closest`` then holds ``n + 1`` encoded
            indices and may include the query itself.
        neg: If True, return the least similar animes instead of the most
            similar ones.

    Returns:
        A DataFrame with columns Name, Similarity, Genres, Synopsis, ordered
        from highest to lowest similarity; or ``(dists, closest)`` when
        ``return_dist`` is True; or ``None`` (after printing a message) when
        the anime is unknown.
    """
    not_found_message = '{} not found in Anime list'.format(name)

    matches = df_anime[df_anime['Name'] == name]
    if matches.empty:
        print(not_found_message)
        return None

    anime_id = matches.iloc[0]['anime_id']
    try:
        encoded_index = anime_encoder.transform([anime_id])[0]
    except ValueError:
        # The title has metadata but the encoder never saw its ID, so there is
        # no embedding row for it. Any other error is a real bug and propagates.
        print(not_found_message)
        return None

    weights = anime_weights
    dists = np.dot(weights, weights[encoded_index])
    sorted_dists = np.argsort(dists)  # ascending similarity

    print('Animes closest to {}'.format(name))
    if return_dist:
        # Raw mode keeps the historical contract: n + 1 indices, query included
        raw_count = n + 1
        closest = sorted_dists[:raw_count] if neg else sorted_dists[-raw_count:]
        return dists, closest

    # Walk candidates from the relevant end until n usable neighbours are found
    candidate_order = sorted_dists if neg else sorted_dists[::-1]

    scored_rows = []
    for candidate in candidate_order:
        if len(scored_rows) >= n:
            break
        if candidate == encoded_index:
            # Exclude the query by index, not by comparing display names
            continue

        decoded_id = anime_encoder.inverse_transform([candidate])[0]
        anime_frame = df_anime[df_anime['anime_id'] == decoded_id]
        if anime_frame.empty:
            # No metadata row for this ID in df_anime; skip it rather than
            # abort the whole lookup
            continue

        anime_name = anime_frame['Name'].values[0]
        english_name = anime_frame['English name'].values[0]
        display_name = english_name if english_name != "UNKNOWN" else anime_name
        scored_rows.append((
            float(dists[candidate]),
            {
                "Name": display_name,
                "Genres": anime_frame['Genres'].values[0],
                "Synopsis": anime_frame['Synopsis'].values[0],
            },
        ))

    # Sort on the numeric score; sorting the formatted "xx.xx%" strings would
    # be lexicographic (e.g. "9.50%" ahead of "85.00%")
    scored_rows.sort(key=lambda item: item[0], reverse=True)

    SimilarityArr = [
        {
            "Name": info["Name"],
            "Similarity": "{:.2f}%".format(score * 100),
            "Genres": info["Genres"],
            "Synopsis": info["Synopsis"],
        }
        for score, info in scored_rows
    ]
    return pd.DataFrame(SimilarityArr, columns=["Name", "Similarity", "Genres", "Synopsis"])

pd.set_option('display.max_colwidth', None)

In [None]:
# Example query: the 5 most similar titles (neg defaults to False)
find_similar_animes('Tensei shitara Slime Datta Ken', n=5)