In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import Callback
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [3]:
users = pd.read_csv('processed_dataset/MovieLens-1M/users/users_movielens.csv')
movies = pd.read_csv('processed_dataset/MovieLens-1M/movies/movies_movielens.csv')
full_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/fulldata_movielens.csv')
train_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/traindata_movielens.csv')
val_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/valdata_movielens.csv')
test_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/testdata_movielens.csv')

In [4]:
# Combine user features into a single string for each user
users['user_features'] = 'occupation: ' + users['occupation'] + ' [SEP] age: ' + users['age'].astype(str) + ' [SEP] gender: ' + users['gender'].astype(str)

# Combine movie features into a single string for each movie
movies['movie_features'] = 'title: ' + movies['title'] + ' [SEP] genres: ' + movies['genres']

In [5]:
# Create a dictionary for fast lookup
user_features_dict = users.set_index('user_id')['user_features'].to_dict()
movie_features_dict = movies.set_index('item_id')['movie_features'].to_dict()

# Create lists of user and item texts
user_texts = [user_features_dict[userId] for userId in full_ratings['user_id'].unique()]
item_texts = [movie_features_dict[movieId] for movieId in full_ratings['item_id'].unique()]

# Create a mapping from userId and movieId to indices
user_id_to_idx = {userId: idx for idx, userId in enumerate(full_ratings['user_id'].unique())}
movie_id_to_idx = {movieId: idx for idx, movieId in enumerate(full_ratings['item_id'].unique())}

# Map userId and movieId in ratings_df to indices
train_ratings['user_idx'] = train_ratings['user_id'].map(user_id_to_idx)
train_ratings['movie_idx'] = train_ratings['item_id'].map(movie_id_to_idx)

# Map userId and movieId in ratings_val to indices
val_ratings['user_idx'] = val_ratings['user_id'].map(user_id_to_idx)
val_ratings['movie_idx'] = val_ratings['item_id'].map(movie_id_to_idx)

# Extract user indices, item indices, and ratings
train_user_indices = torch.LongTensor(train_ratings['user_idx'].values).to(device)
train_item_indices = torch.LongTensor(train_ratings['movie_idx'].values).to(device)
train_labels = torch.FloatTensor(train_ratings['rating'].values).to(device)

# Extract user indices, item indices, and ratings for validation
val_user_indices = torch.LongTensor(val_ratings['user_idx'].values).to(device)
val_item_indices = torch.LongTensor(val_ratings['movie_idx'].values).to(device)
val_labels = torch.FloatTensor(val_ratings['rating'].values).to(device)

In [6]:
item_texts[train_item_indices[0].item()]

'title: Silence of the Lambs, The (1991) [SEP] genres: Drama|Thriller'

In [7]:
user_texts[train_user_indices[0].item()]

'occupation: doctor/health care [SEP] age: 25-34 [SEP] gender: Male'

In [8]:
train_labels[0].item()

5.0

In [9]:
# Create DataLoader for training data
train_dataset = TensorDataset(train_user_indices, train_item_indices, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)

# Create DataLoader for training data
val_dataset = TensorDataset(val_user_indices, val_item_indices, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True, drop_last=True)

In [10]:
class TwoTowerModel(pl.LightningModule):
    def __init__(self, user_model_name, item_model_name, embedding_size=384):
        super(TwoTowerModel, self).__init__()
        self.user_model = SentenceTransformer(user_model_name)
        self.item_model = SentenceTransformer(item_model_name)

        self.user_fc = nn.Linear(embedding_size, embedding_size)
        self.item_fc = nn.Linear(embedding_size, embedding_size)

        self.criterion = nn.MSELoss()
        self.epoch_losses = {'train_loss': [], 'val_loss': []}

    def forward(self, user_text, item_text):
        user_embedding = self.user_model.encode(user_text, convert_to_tensor=True).to(device)
        item_embedding = self.item_model.encode(item_text, convert_to_tensor=True).to(device)

        user_output = self.user_fc(user_embedding)
        item_output = self.item_fc(item_embedding)

        # dot_product = torch.sum(user_output * item_output, dim=1)
        dot_product = torch.matmul(user_output.unsqueeze(1), item_output.unsqueeze(2)).squeeze()
        # Apply sigmoid transformation and scaling here

        dot_product = 4 * torch.sigmoid(dot_product) + 1

        return dot_product

    def training_step(self, batch, batch_idx):
        users, items, ratings = batch

        users = [user_texts[i] for i in users.tolist()]
        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        users, items, ratings = batch

        users = [user_texts[i] for i in users.tolist()]
        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-5)

class PrintLossesCallback(Callback):
    def on_train_epoch_end(self, trainer, pl_module):
        train_loss = trainer.callback_metrics.get('train_loss')
        if train_loss is not None:
            pl_module.epoch_losses['train_loss'].append(train_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Train Loss: {train_loss.item()}")

    def on_validation_epoch_end(self, trainer, pl_module):
        val_loss = trainer.callback_metrics.get('val_loss')
        if val_loss is not None:
            pl_module.epoch_losses['val_loss'].append(val_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Val Loss: {val_loss.item()}")

model = TwoTowerModel(user_model_name='paraphrase-MiniLM-L6-v2', item_model_name='paraphrase-MiniLM-L6-v2')

trainer = pl.Trainer(max_epochs=5, log_every_n_steps=1, callbacks=[PrintLossesCallback()], enable_progress_bar=True)
trainer.fit(model, train_dataloader, val_dataloader)

# Print losses after training completes
print("Epoch losses:")
for epoch in range(trainer.max_epochs):
    train_loss = model.epoch_losses['train_loss'][epoch] if epoch < len(model.epoch_losses['train_loss']) else 'N/A'
    val_loss = model.epoch_losses['val_loss'][epoch] if epoch < len(model.epoch_losses['val_loss']) else 'N/A'
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss}, Val Loss: {val_loss}")


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                | Params | Mode 
-----------------------------------------------------------
0 | user_model | SentenceTransformer | 22.7 M | train
1 | item_model | SentenceTransformer | 22.7 M | train
2 | user_fc    | Linear              | 147 K  | train
3 | item_fc    | Linear              | 147 K  | train
4 | criterion  | MSELoss             | 0      | train
-----------------------------------------------------------
45.7 M    Trainable pa

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:475: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  6.83it/s]Epoch 1: Val Loss: 1.5445998907089233
                                                                           

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 18753/18753 [12:34<00:00, 24.84it/s, v_num=20]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6251 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6251 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/6251 [00:00<04:22, 23.77it/s][A
Validation DataLoader 0:   0%|          | 2/6251 [00:00<04:05, 25.44it/s][A
Validation DataLoader 0:   0%|          | 3/6251 [00:00<04:07, 25.28it/s][A
Validation DataLoader 0:   0%|          | 4/6251 [00:00<04:03, 25.61it/s][A
Validation DataLoader 0:   0%|          | 5/6251 [00:00<04:00, 25.93it/s][A
Validation DataLoader 0:   0%|          | 6/6251 [00:00<04:00, 25.93it/s][A
Validation DataLoader 0:   0%|          | 7/6251 [00:00<04:06, 25.32it/s][A
Validation DataLoader 0:   0%|          | 8/6251 [00:00<04:03, 25.68it/s][A
Validation DataLoader 0:   0%|          | 9/6251 [00:00<04:01, 25.86it/s][A
Validation DataLoader 0:   0%|          | 10/6251 [00:00<03:

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 18753/18753 [16:41<00:00, 18.73it/s, v_num=20]
Epoch losses:
Epoch 1: Train Loss: 0.67975914478302, Val Loss: 1.5445998907089233
Epoch 2: Train Loss: 1.2870962619781494, Val Loss: 1.1172562837600708
Epoch 3: Train Loss: 0.786232054233551, Val Loss: 1.1137503385543823
Epoch 4: Train Loss: 0.5968674421310425, Val Loss: 1.1077935695648193
Epoch 5: Train Loss: 1.380179524421692, Val Loss: 1.1011466979980469


In [18]:
model.epoch_losses

{'train_loss': [0.67975914478302,
  1.2870962619781494,
  0.786232054233551,
  0.5968674421310425,
  1.380179524421692],
 'val_loss': [1.5445998907089233,
  1.1172562837600708,
  1.1137503385543823,
  1.1077935695648193,
  1.1011466979980469,
  1.0977823734283447]}

In [11]:
full_items_embeddings = torch.stack([model.item_model.encode(item_text, convert_to_tensor=True) for item_text in item_texts])

In [12]:
def get_top_n_items(model, userId, n):
    # Ensure the model is in evaluation mode
    model.eval()

    # Get the user text for the given userId
    user_text = user_features_dict[userId]

    # Encode the user text
    user_embedding = model.user_model.encode(user_text, convert_to_tensor=True)

    # Compute the scores (dot product between user embedding and each item embedding)
    user_output = model.user_fc(user_embedding)
    item_output = model.item_fc(full_items_embeddings)
    scores = torch.matmul(user_output, item_output.t()).squeeze()

    # Get the top n item indices and their scores
    top_n_scores, top_n_indices = torch.topk(scores, n)

    # Map indices back to item IDs and convert scores to a pandas Series
    top_n_item_ids = [list(movie_id_to_idx.keys())[list(movie_id_to_idx.values()).index(idx)] for idx in top_n_indices.tolist()]
    top_n_scores = top_n_scores.cpu().detach().numpy()

    top_n_series = pd.Series(data=top_n_scores, index=top_n_item_ids)

    return top_n_series

In [14]:
# Example usage
user_id = 6039  # Replace with the userId you want to get recommendations for
top_n = 5  # Number of top items to retrieve
top_items = get_top_n_items(model, user_id, top_n)

print(f"Top {top_n} items for user {user_id}: {top_items}")

Top 5 items for user 6039: 2019    2.126097
3382    2.087105
2612    2.040844
917     1.952969
3532    1.932746
dtype: float32


In [15]:
user_ids = test_ratings['user_id'].unique()  # List of user IDs in the test dataset
n = 5  # Number of recommendations per user
recommendations = {}

for user_id in user_ids:
    recommendations[user_id] = get_top_n_items(model, user_id, n)

In [16]:
recommendations

{6040: 2019    2.234239
 2313    2.226366
 3741    2.212635
 684     2.168626
 2612    2.109877
 dtype: float32,
 6039: 2019    2.126097
 3382    2.087105
 2612    2.040844
 917     1.952969
 3532    1.932746
 dtype: float32,
 6038: 3382    2.397695
 1104    2.236118
 2686    2.210438
 2019    2.190321
 1099    2.188079
 dtype: float32,
 6037: 2019    2.152579
 3382    2.128674
 1099    2.102513
 154     2.066690
 2612    2.062244
 dtype: float32,
 6036: 1099    2.170006
 3382    2.147869
 3430    2.121928
 1315    2.113326
 1104    2.112581
 dtype: float32,
 6035: 2612    2.143137
 2019    2.104325
 3382    2.068414
 1099    2.051682
 154     2.039857
 dtype: float32,
 6034: 796     2.295027
 2019    2.273199
 3741    2.198731
 2940    2.166033
 2612    2.153715
 dtype: float32,
 6033: 1104    2.261466
 2019    2.253831
 2612    2.248802
 913     2.192010
 946     2.066678
 dtype: float32,
 6032: 2019    2.318330
 2940    2.019801
 3382    1.979060
 1104    1.907766
 2218    1.889243


In [17]:
from evaluator import EvaluateMetrics

# Evaluate the recommendations
evaluator = EvaluateMetrics(test_ratings)
evaluation_results = evaluator.evaluate_recommendations(recommendations, k=n)

print(f"NDCG@{n}: {evaluation_results['NDCG@k']:.4f}")
print(f"MRR@{n}: {evaluation_results['MRR@k']:.4f}")
print(f"HR@{n}: {evaluation_results['HR@k']:.4f}")

NDCG@5: 0.6500
MRR@5: 0.5389
HR@5: 1.0000
