In [1]:
import os
import surprise
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from typing import Tuple
from tqdm.auto import tqdm
from pytorch_lightning.loggers import WandbLogger
import wandb

In [2]:
# Load the built-in MovieLens 100k ratings; prompt=False skips the interactive
# download confirmation so the cell can run unattended.
movielens = surprise.Dataset.load_builtin(name='ml-100k', prompt=False)

In [3]:
# Put the raw rating records into a dataframe and convert the id and rating
# columns to numeric dtypes in one chained step. "Timestamp" is left untouched.
df = pd.DataFrame(
    movielens.raw_ratings,
    columns=["User", "Movie", "Rating", "Timestamp"],
).assign(
    User=lambda frame: pd.to_numeric(frame["User"]),
    Movie=lambda frame: pd.to_numeric(frame["Movie"]),
    Rating=lambda frame: pd.to_numeric(frame["Rating"]),
)

In [4]:
# Contiguous 0-based indices are assigned in order of first appearance.
# The inverse maps (index -> original id) are built first, then flipped to
# obtain the forward maps (original id -> index).
inv_user_id_mapping = dict(enumerate(df["User"].unique()))
user_id_mapping = {raw_id: position for position, raw_id in inv_user_id_mapping.items()}

inv_movie_id_mapping = dict(enumerate(df["Movie"].unique()))
movie_id_mapping = {raw_id: position for position, raw_id in inv_movie_id_mapping.items()}

In [5]:
# Replace the original ids with their contiguous indices. An id missing from
# the mapping raises KeyError.
# NOTE: this overwrites the columns in place, so re-running the cell on an
# already-remapped `df` would look the indices up as if they were original ids.
df["User"] = [user_id_mapping[raw_user] for raw_user in df["User"]]
df["Movie"] = [movie_id_mapping[raw_movie] for raw_movie in df["Movie"]]

In [6]:
# Number of rated movies per user, largest counts first.
(
    df.groupby("User")[["Movie"]]
    .count()
    .rename(columns={"Movie": "Count"})
    .reset_index()
    .sort_values("Count", ascending=False)
)

Unnamed: 0,User,Count
402,402,737
650,650,685
58,58,636
442,442,540
37,37,518
...,...,...
475,475,20
135,135,20
813,813,20
903,903,20


In [7]:
class MovieLensTrainDataset(Dataset):
    """Tensor-backed PyTorch dataset over a MovieLens ratings table.

    Every sample is a ``(user, item, rating)`` triple taken from one row of
    the source dataframe.

    Args:
        df (pd.DataFrame): Ratings table with "User", "Movie" and "Rating" columns
    """

    def __init__(self, df: pd.DataFrame):
        tensors = self.get_dataset(df)
        self.users = tensors[0]
        self.items = tensors[1]
        self.ratings = tensors[2]

    def __len__(self):
        # All three tensors are built from the same dataframe, so the user
        # tensor's length is the number of samples.
        return self.users.shape[0]

    def __getitem__(self, idx: int):
        return tuple(column[idx] for column in (self.users, self.items, self.ratings))

    def get_dataset(self, df: pd.DataFrame) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Convert the "User", "Movie" and "Rating" columns into tensors.

        Returns:
            Tuple of ``(users, items, ratings)`` tensors, one entry per row of ``df``.
        """
        user_tensor = torch.tensor(df["User"].values)
        item_tensor = torch.tensor(df["Movie"].values)
        rating_tensor = torch.tensor(df["Rating"].values)
        return user_tensor, item_tensor, rating_tensor

In [8]:
class DeepRecommender(pl.LightningModule):
    """Feed-forward rating predictor over learned user and item embeddings.

    A user index and an item index are each looked up in their own embedding
    table. The two vectors are concatenated and passed through two
    ReLU-activated linear layers (64 and 32 units) followed by a single-unit
    linear output. Training and validation both use mean squared error
    against the observed rating.

    Args:
        num_users (int): Number of rows in the user embedding table
        num_items (int): Number of rows in the item embedding table
        embedding_dim (int): Width of each user / item embedding vector.
            Defaults to 8.
    """

    def __init__(self, num_users: int, num_items: int, embedding_dim: int = 8):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        # The first dense layer consumes the concatenated user + item vectors,
        # so its input width is twice the embedding width.
        self.fc1 = nn.Linear(in_features=2 * embedding_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)

    def forward(self, user_input, item_input):
        """Predict a rating for each (user index, item index) pair.

        Args:
            user_input: Integer tensor of user indices.
            item_input: Integer tensor of item indices, same shape as ``user_input``.

        Returns:
            Tensor of predictions whose last dimension has size 1.
        """
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        features = torch.cat([user_embedded, item_embedded], dim=-1)
        features = torch.relu(self.fc1(features))
        features = torch.relu(self.fc2(features))

        return self.output(features)

    def _mse_step(self, batch, log_name: str):
        """Compute, log and return the MSE loss for one (users, items, ratings) batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # Predictions have a trailing dimension of size 1, so the labels are
        # reshaped to a column of matching shape before comparison.
        loss = nn.functional.mse_loss(predicted_labels, labels.view(-1, 1).float())
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training MSE for ``batch`` and log it as 'train_loss'."""
        return self._mse_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation MSE for ``batch`` and log it as 'val_loss'."""
        return self._mse_step(batch, 'val_loss')

    def configure_optimizers(self):
        """Optimise all parameters with Adam at its default settings."""
        return torch.optim.Adam(self.parameters())

In [9]:
# Seed Python, NumPy and PyTorch up front so that the weight initialisation,
# the random train/validation split and the batch shuffling are repeatable.
pl.seed_everything(42)

wandb.init()
try:
    num_users = df["User"].nunique()
    num_movie = df["Movie"].nunique()

    recommender = DeepRecommender(num_users, num_movie)
    dataset = MovieLensTrainDataset(df)

    # 80 / 20 random split; the held-out part is used as the validation set.
    train_size = int(0.8 * len(dataset))
    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, len(dataset) - train_size]
    )

    # Reshuffle the training samples every epoch; validation order is irrelevant.
    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=512, num_workers=0)

    wandb_logger = WandbLogger()

    trainer = pl.Trainer(
        # Use one GPU when available, otherwise fall back to CPU instead of failing.
        gpus=1 if torch.cuda.is_available() else 0,
        max_epochs=10,
        logger=wandb_logger,
    )
    # Dataloaders are passed positionally (model, train, val) because the
    # keyword for the training loader differs between Lightning releases.
    trainer.fit(recommender, train_loader, test_loader)
finally:
    # Close the Weights & Biases run even if training raised.
    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: calychas (use `wandb login --relogin` to force relogin)


In [10]:
# Row i of an embedding table's weight matrix is the vector returned for index i,
# so the full table can be read directly instead of looking up every index.
# detach() drops the autograd link, cpu() makes this work wherever the model
# currently lives, and copy() decouples the arrays from the model parameters.
user_embeddings = recommender.user_embedding.weight.detach().cpu().numpy().copy()
movie_embeddings = recommender.item_embedding.weight.detach().cpu().numpy().copy()

In [18]:
import sklearn
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import umap
import plotly.express as px

# Cluster the users on their raw embedding vectors using cosine distance.
X = user_embeddings
db = DBSCAN(metric="cosine", eps=0.2).fit(X)

# 2-D projection for plotting only; the clustering above ran on the full
# embeddings. A fixed random_state keeps the layout repeatable between runs.
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X)

df_cluster = pd.DataFrame(X_umap, columns=["x", "y"])
# Cluster ids are categories, not magnitudes: string labels make Plotly assign
# one discrete colour per cluster instead of a continuous colour scale.
# DBSCAN labels noise points as -1.
df_cluster["cluster_id"] = db.labels_.astype(str)
fig = px.scatter(
    df_cluster,
    x="x",
    y="y",
    color="cluster_id",
    title="User embeddings: UMAP projection coloured by DBSCAN cluster",
)

fig.show()

In [21]:
# Cluster the movies on their raw embedding vectors using cosine distance.
X = movie_embeddings
db = DBSCAN(metric="cosine", eps=0.15).fit(X)

# 2-D projection for plotting only; the clustering above ran on the full
# embeddings. A fixed random_state keeps the layout repeatable between runs.
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X)

df_cluster = pd.DataFrame(X_umap, columns=["x", "y"])
# Cluster ids are categories, not magnitudes: string labels make Plotly assign
# one discrete colour per cluster instead of a continuous colour scale.
# DBSCAN labels noise points as -1.
df_cluster["cluster_id"] = db.labels_.astype(str)
fig = px.scatter(
    df_cluster,
    x="x",
    y="y",
    color="cluster_id",
    title="Movie embeddings: UMAP projection coloured by DBSCAN cluster",
)

fig.show()