In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from pytorch_lightning.callbacks import Callback
import pandas as pd
import pickle
from pytorch_lightning.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [3]:
users = pd.read_csv('processed_dataset/MovieLens-1M/users/users_movielens.csv')
movies = pd.read_csv('processed_dataset/MovieLens-1M/movies/movies_movielens_modified.csv')
full_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_full_movielens.csv')
train_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_train_movielens.csv')
val_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_val_movielens.csv')
test_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_test_movielens.csv')

In [8]:
movies

Unnamed: 0,item_id,title,genres,year
0,1,Toy Story,"Animation, Children's, Comedy",1995
1,2,Jumanji,"Adventure, Children's, Fantasy",1995
2,3,Grumpier Old Men,"Comedy, Romance",1995
3,4,Waiting to Exhale,"Comedy, Drama",1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000
3879,3949,Requiem for a Dream,Drama,2000
3880,3950,Tigerland,Drama,2000
3881,3951,Two Family House,Drama,2000


In [18]:
def generate_user_texts_with_history_bad_rating(users, movies, ratings):
    user_histories = {user_id: {'positive': [], 'negative': []} for user_id in users['user_id'].unique()}
    user_texts = []

    # Convert relevant columns to dictionaries for faster access
    user_features_dict = users.set_index('user_id').to_dict('index')
    movie_details_dict = movies.set_index('item_id').to_dict('index')

    for _, row in ratings.iterrows():
        user_id = row['user_id']
        movie_id = row['item_id']
        rating = row['rating']

        # Get user features
        user = user_features_dict[user_id]
        user_features = f"occupation: {user['occupation']} [SEP] age: {user['age']} [SEP] gender: {user['gender']}"

        # Function to get movie details
        def get_movie_details(movie_id):
            movie = movie_details_dict[movie_id]
            return f"title: {movie['title']} [SEP] genres: {movie['genres']} [SEP] year: {movie['year']}"

        # Append the user's positive and negative histories (only the last 3 movies)
        positive_history = [get_movie_details(mid) for mid in user_histories[user_id]['positive'][-3:]]
        negative_history = [get_movie_details(mid) for mid in user_histories[user_id]['negative'][-3:]]

        positive_history_str = " [SEP] ".join(positive_history)
        negative_history_str = " [SEP] ".join(negative_history)

        # Combine user features and history
        history_str = ""
        if positive_history_str:
            history_str += f"positively rated movies: {positive_history_str}"
        if negative_history_str:
            if history_str:
                history_str += " [SEP] "
            history_str += f"negatively rated movies: {negative_history_str}"

        combined_features = f"{user_features} [SEP] {history_str}".strip(" [SEP] ")

        user_texts.append(combined_features)

        # Update the user history after generating combined features
        if rating > 2.5:
            user_histories[user_id]['positive'].append(movie_id)
        else:
            user_histories[user_id]['negative'].append(movie_id)

    return user_texts

In [19]:
def generate_last_user_texts_with_history(users, movies, ratings):
    user_histories = {user_id: {'positive': [], 'negative': []} for user_id in users['user_id'].unique()}
    last_user_texts = {}

    # Convert relevant columns to dictionaries for faster access
    user_features_dict = users.set_index('user_id').to_dict('index')
    movie_details_dict = movies.set_index('item_id').to_dict('index')

    for _, row in ratings.iterrows():
        user_id = row['user_id']
        movie_id = row['item_id']
        rating = row['rating']

        # Get user features
        user = user_features_dict[user_id]
        user_features = f"occupation: {user['occupation']} [SEP] age: {user['age']} [SEP] gender: {user['gender']}"

        # Function to get movie details
        def get_movie_details(movie_id):
            movie = movie_details_dict[movie_id]
            return f"title: {movie['title']} [SEP] genres: {movie['genres']} [SEP] year: {movie['year']}"

        # Append the user's positive and negative histories (only the last 3 movies)
        positive_history = [get_movie_details(mid) for mid in user_histories[user_id]['positive'][-3:]]
        negative_history = [get_movie_details(mid) for mid in user_histories[user_id]['negative'][-3:]]

        positive_history_str = " [SEP] ".join(positive_history)
        negative_history_str = " [SEP] ".join(negative_history)

        # Combine user features and history
        history_str = ""
        if positive_history_str:
            history_str += f"positively rated movies: {positive_history_str}"
        if negative_history_str:
            if history_str:
                history_str += " [SEP] "
            history_str += f"negatively rated movies: {negative_history_str}"

        combined_features = f"{user_features} [SEP] {history_str}".strip(" [SEP] ")

        # Update the dictionary to keep the last text for each user
        last_user_texts[user_id] = combined_features

        # Update the user history after generating combined features
        if rating > 2.5:
            user_histories[user_id]['positive'].append(movie_id)
        else:
            user_histories[user_id]['negative'].append(movie_id)

    return last_user_texts

# Generate the last user texts for the validation data
val_last_user_texts = generate_last_user_texts_with_history(users, movies, val_ratings)
# Display the dictionary of the last user texts
# import ace_tools as tools; tools.display_dataframe_to_user(name="Validation User Texts", dataframe=pd.DataFrame(val_last_user_texts.items(), columns=["User ID", "Last Text"]))


In [20]:
train_user_texts = generate_user_texts_with_history_bad_rating(users, movies, train_ratings)
val_user_texts = generate_user_texts_with_history_bad_rating(users, movies, val_ratings)
test_user_texts = generate_user_texts_with_history_bad_rating(users, movies, test_ratings)

In [21]:
print(val_last_user_texts.get(1))

occupation: K-12 student [SEP] age: Under 18 [SEP] gender: Female [SEP] positively rated movies: title: Beauty and the Beast [SEP] genres: Animation, Children's, Musical [SEP] year: 1991 [SEP] title: Aladdin [SEP] genres: Animation, Children's, Comedy, Musical [SEP] year: 1992 [SEP] title: Toy Story [SEP] genres: Animation, Children's, Comedy [SEP] year: 1995


In [None]:
[POSITIVE_MOVIES]
Title: Inception, Genre: Sci-Fi, Year: 2010
Title: The Matrix, Genre: Action, Year: 1999
Title: Interstellar, Genre: Sci-Fi, Year: 2014
[NEGATIVE_MOVIES]
Title: Twilight, Genre: Romance, Year: 2008
Title: Fifty Shades of Grey, Genre: Romance, Year: 2015

In [6]:
# # Save user embeddings locally
# with open('train_user_texts.pkl', 'wb') as f:
#     pickle.dump(train_user_texts, f)
#
# print("Train user embeddings saved successfully.")
#
# with open('val_user_texts.pkl', 'wb') as f:
#     pickle.dump(val_user_texts, f)
#
# print("Validation user embeddings saved successfully.")
#
# with open('test_user_texts.pkl', 'wb') as f:
#     pickle.dump(test_user_texts, f)
#
# print("Test user embeddings saved successfully.")

In [7]:
# # Load user texts from file
# with open('./text_for_embeddings/last_three_history/train_user_texts.pkl', 'rb') as f:
#     train_user_texts = pickle.load(f)
# print("Train user text loaded successfully.")
#
# with open('./text_for_embeddings/last_three_history/val_user_texts.pkl', 'rb') as f:
#     val_user_texts = pickle.load(f)
# print("Validation user text loaded successfully.")
#
# with open('./text_for_embeddings/last_three_history/test_user_texts.pkl', 'rb') as f:
#     test_user_texts = pickle.load(f)
# print("Test user text loaded successfully.")

Train user text loaded successfully.
Validation user text loaded successfully.
Test user text loaded successfully.


In [22]:
movies['year'] = movies['year'].fillna('').astype(str)

# Combine movie features into a single string for each movie
movies['movie_features'] = 'title: ' + movies['title'] + ' [SEP] genres: ' + movies['genres'] + ' [SEP] year: ' + movies['year']

In [23]:
movies

Unnamed: 0,item_id,title,genres,year,movie_features
0,1,Toy Story,"Animation, Children's, Comedy",1995,"title: Toy Story [SEP] genres: Animation, Chil..."
1,2,Jumanji,"Adventure, Children's, Fantasy",1995,"title: Jumanji [SEP] genres: Adventure, Childr..."
2,3,Grumpier Old Men,"Comedy, Romance",1995,"title: Grumpier Old Men [SEP] genres: Comedy, ..."
3,4,Waiting to Exhale,"Comedy, Drama",1995,"title: Waiting to Exhale [SEP] genres: Comedy,..."
4,5,Father of the Bride Part II,Comedy,1995,title: Father of the Bride Part II [SEP] genre...
...,...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000,title: Meet the Parents [SEP] genres: Comedy [...
3879,3949,Requiem for a Dream,Drama,2000,title: Requiem for a Dream [SEP] genres: Drama...
3880,3950,Tigerland,Drama,2000,title: Tigerland [SEP] genres: Drama [SEP] yea...
3881,3951,Two Family House,Drama,2000,title: Two Family House [SEP] genres: Drama [S...


In [24]:
# Create a dictionary for fast lookup
movie_features_dict = movies.set_index('item_id')['movie_features'].to_dict()

# Create lists of user and item texts
item_texts = [movie_features_dict[movieId] for movieId in full_ratings['item_id'].unique()]

# Create a mapping from userId and movieId to indices
movie_id_to_idx = {movieId: idx for idx, movieId in enumerate(full_ratings['item_id'].unique())}

# Map userId and movieId in ratings_df to indices
train_ratings['movie_idx'] = train_ratings['item_id'].map(movie_id_to_idx)

# Map userId and movieId in ratings_val to indices
val_ratings['movie_idx'] = val_ratings['item_id'].map(movie_id_to_idx)

# Map userId and movieId in ratings_test to indices
test_ratings['movie_idx'] = test_ratings['item_id'].map(movie_id_to_idx)

# Extract user indices, item indices, and ratings
train_item_indices = torch.LongTensor(train_ratings['movie_idx'].values).to(device)
train_labels = torch.FloatTensor(train_ratings['rating'].values).to(device)

# Extract user indices, item indices, and ratings for validation
val_item_indices = torch.LongTensor(val_ratings['movie_idx'].values).to(device)
val_labels = torch.FloatTensor(val_ratings['rating'].values).to(device)

# Extract user indices, item indices, and ratings for test
test_item_indices = torch.LongTensor(test_ratings['movie_idx'].values).to(device)
test_labels = torch.FloatTensor(test_ratings['rating'].values).to(device)


In [25]:
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):
    def __init__(self, users, item_ids, ratings):
        self.users = users
        self.item_ids = item_ids
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        users = self.users[idx]
        item_id = self.item_ids[idx]
        rating = self.ratings[idx]
        return users, item_id, rating

In [26]:
# Create DataLoader for training data
train_dataset = CustomTextDataset(train_user_texts, train_item_indices, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

# Create DataLoader for validation data
val_dataset = CustomTextDataset(val_user_texts, val_item_indices, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True, drop_last=True)

# Create DataLoader for test data
test_dataset = CustomTextDataset(test_user_texts, test_item_indices, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True, drop_last=True)

In [12]:
'occupation: doctor/health care [SEP] age: 25-34 [SEP] gender: Male [SEP] positively rated movies: ... [SEP] negatively rated movies: ...'

'occupation: doctor/health care [SEP] age: 25-34 [SEP] gender: Male [SEP] positively rated movies: ... [SEP] negatively rated movies: ...'

In [27]:
class TwoTowerModel(pl.LightningModule):
    def __init__(self, user_model_name, item_model_name, embedding_size=384):
        super(TwoTowerModel, self).__init__()
        self.user_model = SentenceTransformer(user_model_name)
        self.item_model = SentenceTransformer(item_model_name)

        self.user_fc = nn.Linear(embedding_size, embedding_size)
        self.item_fc = nn.Linear(embedding_size, embedding_size)

        self.criterion = nn.MSELoss()
        self.epoch_losses = {'train_loss': [], 'val_loss': []}

    def forward(self, user_text, item_text):
        user_embedding = self.user_model.encode(user_text, convert_to_tensor=True).to(device)
        item_embedding = self.item_model.encode(item_text, convert_to_tensor=True).to(device)

        user_output = self.user_fc(user_embedding)
        item_output = self.item_fc(item_embedding)

        # dot_product = torch.sum(user_output * item_output, dim=1)
        dot_product = torch.matmul(user_output.unsqueeze(1), item_output.unsqueeze(2)).squeeze()
        # Apply sigmoid transformation and scaling here
        # dot_product = 4 * torch.sigmoid(dot_product) + 1

        return dot_product

    def training_step(self, batch, batch_idx):
        users, items, ratings = batch

        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        users, items, ratings = batch

        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-5)

    def full_predict(self, user_texts, item_ids):

        items = [item_texts[i] for i in item_ids]

        with torch.no_grad():
            user_embedding = self.user_model.encode(user_texts, convert_to_tensor=True).to(device)
            item_embeddings = torch.stack([self.item_model.encode(item_text, convert_to_tensor=True) for item_text in items]).to(self.device)

            user_output = self.user_fc(user_embedding)
            item_output = self.item_fc(item_embeddings)

            # dot_product = torch.sum(user_output * item_output, dim=1)
            dot_product = torch.matmul(user_output, item_output.T)
            # dot_product = 4 * torch.sigmoid(dot_product) + 1

        return dot_product.cpu()

class PrintLossesCallback(Callback):
    def on_train_epoch_end(self, trainer, pl_module):
        train_loss = trainer.callback_metrics.get('train_loss')
        if train_loss is not None:
            pl_module.epoch_losses['train_loss'].append(train_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Train Loss: {train_loss.item()}")

    def on_validation_epoch_end(self, trainer, pl_module):
        val_loss = trainer.callback_metrics.get('val_loss')
        if val_loss is not None:
            pl_module.epoch_losses['val_loss'].append(val_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Val Loss: {val_loss.item()}")

In [28]:
model = TwoTowerModel(user_model_name='paraphrase-MiniLM-L6-v2', item_model_name='paraphrase-MiniLM-L6-v2')

# Define the ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # Metric to monitor
    dirpath='checkpoints/',  # Directory to save the checkpoints
    filename='with-history-best-checkpoint',  # Filename for the best model
    save_top_k=1,  # Save only the top 1 model
    mode='min'  # Mode to save the best model (min for validation loss)
)

trainer = pl.Trainer(max_epochs=5, log_every_n_steps=1, callbacks=[PrintLossesCallback()], enable_progress_bar=True)
trainer.fit(model, train_dataloader, val_dataloader)

# Print losses after training completes
print("Epoch losses:")
for epoch in range(trainer.max_epochs):
    train_loss = model.epoch_losses['train_loss'][epoch] if epoch < len(model.epoch_losses['train_loss']) else 'N/A'
    val_loss = model.epoch_losses['val_loss'][epoch] if epoch < len(model.epoch_losses['val_loss']) else 'N/A'
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss}, Val Loss: {val_loss}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                | Params | Mode 
-----------------------------------------------------------
0 | user_model | SentenceTransformer | 22.7 M | train
1 | item_model | SentenceTransformer | 22.7 M | train
2 | user_fc    | Linear              | 147 K  | train
3 | item_fc    | Linear              | 147 K  | train
4 | criterion  | MSELoss             | 0      | train
-----------------------------------------------------------
45.7 M    Trainable pa

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:475: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  2.41it/s]Epoch 1: Val Loss: 15.00890064239502
                                                                           

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 12464/12464 [37:17<00:00,  5.57it/s, v_num=13]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1557 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1557 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/1557 [00:00<04:00,  6.47it/s][A
Validation DataLoader 0:   0%|          | 2/1557 [00:00<04:13,  6.14it/s][A
Validation DataLoader 0:   0%|          | 3/1557 [00:00<04:16,  6.06it/s][A
Validation DataLoader 0:   0%|          | 4/1557 [00:00<04:20,  5.96it/s][A
Validation DataLoader 0:   0%|          | 5/1557 [00:00<04:19,  5.97it/s][A
Validation DataLoader 0:   0%|          | 6/1557 [00:01<04:22,  5.91it/s][A
Validation DataLoader 0:   0%|          | 7/1557 [00:01<04:23,  5.88it/s][A
Validation DataLoader 0:   1%|          | 8/1557 [00:01<04:21,  5.92it/s][A
Validation DataLoader 0:   1%|          | 9/1557 [00:01<04:22,  5.90it/s][A
Validation DataLoader 0:   1%|          | 10/1557 [00:01<04:

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 12464/12464 [45:32<00:00,  4.56it/s, v_num=13]
Epoch losses:
Epoch 1: Train Loss: 1.1365914344787598, Val Loss: 15.00890064239502
Epoch 2: Train Loss: 0.867249071598053, Val Loss: 1.1272138357162476
Epoch 3: Train Loss: 1.126298189163208, Val Loss: 1.1123785972595215
Epoch 4: Train Loss: 0.9519960880279541, Val Loss: 1.1183810234069824
Epoch 5: Train Loss: 1.016836404800415, Val Loss: 1.1283931732177734


In [19]:
# model.epoch_losses

# Evaluation

In [29]:
# Assuming the training part has been done already, load the best model checkpoint
# best_model_path = './lightning_logs/history_paraphrase-MiniLM-L6-v2_5-epochs_lr-1e-5/checkpoints/epoch=4-step=93765.ckpt'  # Path where the best model is saved
best_model_path = './lightning_logs/version_13/checkpoints/epoch=4-step=62320.ckpt'  # Path where the best model is saved

best_model = TwoTowerModel.load_from_checkpoint(best_model_path, user_model_name='paraphrase-MiniLM-L6-v2', item_model_name='paraphrase-MiniLM-L6-v2').to(device)



In [14]:
def get_top_n_items_with_history(model, user_text, n):
    # Ensure the model is in evaluation mode
    model.eval()

    # Encode the user text
    user_embedding = model.user_model.encode(user_text, convert_to_tensor=True)

    # Compute the scores (dot product between user embedding and each item embedding)
    user_output = model.user_fc(user_embedding)
    item_output = model.item_fc(full_items_embeddings).to(device)
    dot_product = torch.matmul(user_output, item_output.t()).squeeze()
    # scores =  4 * torch.sigmoid(dot_product) + 1

    # Get the top n item indices and their scores
    top_n_scores, top_n_indices = torch.topk(dot_product, n)

    # Map indices back to item IDs and convert scores to a pandas Series
    top_n_item_ids = [list(movie_id_to_idx.keys())[list(movie_id_to_idx.values()).index(idx)] for idx in top_n_indices.tolist()]
    top_n_scores = top_n_scores.cpu().detach().numpy()

    top_n_series = pd.Series(data=top_n_scores, index=top_n_item_ids)

    return top_n_series

In [10]:
test_ratings['user_text'] = test_user_texts

# Create a variable that pairs userIds with their corresponding texts
test_user_text_pairs = test_ratings[['user_id', 'user_text']].to_dict(orient='records')

In [17]:
full_items_embeddings = torch.stack([best_model.item_model.encode(item_text, convert_to_tensor=True) for item_text in item_texts]).to(device)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [18]:
n = 5  # Number of recommendations per user
recommendations = []

for index, item in enumerate(test_user_text_pairs):
    recommendations.append((item['user_id'], get_top_n_items_with_history(best_model, item['user_text'], n)))

In [175]:
recommendations_dict = {user_id: recs for user_id, recs in recommendations}

## Evaluation with groupby

In [176]:
# Prepare test data for NDCG calculation
test_ratings['predicted_rating_with_history'] = test_ratings.apply(
    lambda row: recommendations_dict.get(row['user_id'], pd.Series()).get(row['item_id'], 0), axis=1
)
test_ratings_grouped = test_ratings.groupby('user_id')

In [178]:
from sklearn.metrics import ndcg_score

# Calculate NDCG scores
ndcg_scores_with_history = []

for user, group in test_ratings_grouped:
    if len(group) > 1:  # Filter users with more than one rating
        true_ratings = group['rating'].values
        pred_ratings = group['predicted_rating_with_history'].values
        ndcg_scores_with_history.append(ndcg_score([true_ratings], [pred_ratings], k=5))


## Evaluation full from github

In [24]:
import torch
import numpy as np

class Metric(object):
    def __init__(self, k):
        self.k = k

    def recall(self, test_data, r, k):
        # right_pred = r[:, :k].sum(1)
        right_pred = r[:, :k].sum(1)

        test_data_cpu = test_data.cpu() if test_data.is_cuda else test_data
        recall_n = np.array([test_data_cpu[i].sum().item() if test_data_cpu[i].sum().item() > 0 else 1 for i in range(len(test_data_cpu))])

        # recall_n = np.array([test_data[i].sum() if test_data[i].sum() > 0 else 1 for i in range(len(test_data))])
        recall = np.sum(right_pred / recall_n)
        return recall

    def precision(self, r, k):
        # right_pred = r[:, :k].sum(1)
        right_pred = r[:, :k].sum(1)
        precis_n = k
        precision = np.sum(right_pred) / (precis_n * len(r))
        return precision

    def mrr(self, r, k):
        # pred_data = r[:, :k]
        pred_data = r[:, :k]
        scores = 1. / np.arange(1, k + 1)
        pred_data = pred_data * scores
        pred_data = pred_data.sum(1)
        return np.sum(pred_data) / len(r)

    def ndcg(self, test_data, r, k):
        assert len(r) == len(test_data)
        # pred_data = r[:, :k]
        pred_data = r[:, :k]
        test_data_cpu = test_data.cpu() if test_data.is_cuda else test_data

        # test_matrix = np.zeros((len(pred_data), k))
        # for i, items in enumerate(test_data):
        #     for idx, rating in enumerate(items):
        #         if idx < k:
        #             test_matrix[i, idx] = rating / 5.0
        test_matrix = np.zeros((len(pred_data), k))
        for i, items in enumerate(test_data_cpu):
            for idx, rating in enumerate(items):
                if idx < k:
                    test_matrix[i, idx] = rating.item() / 5.0  # Normalize ratings from 1 to 5 to 0 to 1

        max_r = test_matrix
        idcg = np.sum(max_r * 1. / np.log2(np.arange(2, k + 2)), axis=1)
        dcg = pred_data * (1. / np.log2(np.arange(2, k + 2)))
        dcg = np.sum(dcg, axis=1)
        idcg[idcg == 0.] = 1.
        ndcg = dcg / idcg
        ndcg[np.isnan(ndcg)] = 0.
        return np.sum(ndcg)

    def get_label(self, test_data, pred_data):
        r = []
        for i in range(len(test_data)):

            ground_true = test_data[i]
            predict_topk = pred_data[i]
            pred = list(map(lambda x: x in ground_true, predict_topk))
            pred = np.array(pred).astype("float")
            r.append(pred)
        return np.array(r).astype('float')

    def eval_batch(self, data, topks):
        sorted_items = data[0]
        ground_true = data[1]
        r = self.get_label(ground_true, sorted_items)
        result = {}
        for k in topks:
            result[f'recall@{k}'] = self.recall(ground_true, r, k)
            result[f'precision@{k}'] = self.precision(r, k)
            result[f'mrr@{k}'] = self.mrr(r, k)
            result[f'ndcg@{k}'] = self.ndcg(ground_true, r, k)
        return result

    def eval(self, model, test_dataloader, test_user_text_pairs, topks=[5]):
        result = {f'recall@{k}': 0 for k in topks}
        result.update({f'precision@{k}': 0 for k in topks})
        result.update({f'mrr@{k}': 0 for k in topks})
        result.update({f'ndcg@{k}': 0 for k in topks})

        batch_ratings = []
        ground_truths = []
        test_user_num = len(test_dataloader.dataset)
        user_text_to_id = {pair['user_text']: pair['user_id'] for pair in test_user_text_pairs}

        for tem in test_dataloader:
            user_texts, item_ids, _ = tem

            item_ids = item_ids.cpu().numpy().tolist()
            user_texts = list(user_texts)  # Ensure user_texts is a list

            with torch.no_grad():
                batch_pred = model.full_predict(user_texts, item_ids)

            _, batch_rate = torch.topk(batch_pred, k=max(topks))
            batch_ratings.append(batch_rate.cpu().numpy())

            # Extract ground truth based on user texts from test_user_text_pairs
            ground_truth = []
            for user_text in user_texts:
                user_id = user_text_to_id[user_text]
                ground_truth.append(test_labels[user_id])
            ground_truths.append(ground_truth)

        data_pair = zip(batch_ratings, ground_truths)
        eval_results = []
        for _data in data_pair:
            eval_results.append(self.eval_batch(_data, topks))

        for batch_result in eval_results:
            for metric in batch_result:
                result[metric] += batch_result[metric] / test_user_num

        return result


## Old Evaluation high

In [36]:
def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

def mrr_at_k(relevance_scores, k):
    for i, rel in enumerate(relevance_scores[:k]):
        if rel > 0:
            return 1 / (i + 1)
    return 0.

def hr_at_k(relevance_scores, k):
    return int(np.any(np.asarray(relevance_scores)[:k] > 0))

In [37]:
import numpy as np

def evaluate_recommendations(recommendations, k):
    ndcg_scores = []
    mrr_scores = []
    hr_scores = []

    for i, recommendation in enumerate(recommendations):
        user_id, user_recommendations = recommendation

        # Get the true ratings for the recommended items
        true_ratings = test_ratings[
            (test_ratings['user_id'] == user_id) & (
                test_ratings['item_id'].isin(list(user_recommendations.index)))]

        if true_ratings.empty:
            continue

        true_ratings = true_ratings.set_index('item_id').reindex(list(user_recommendations.index))['rating'].fillna(0)

        relevance = true_ratings.values / 5  # Assuming ratings are from 1 to 5

        # Calculate metrics
        ndcg = ndcg_at_k(relevance, k)
        mrr = mrr_at_k(relevance, k)
        hr = hr_at_k(relevance, k)

        ndcg_scores.append(ndcg)
        mrr_scores.append(mrr)
        hr_scores.append(hr)

    average_ndcg = np.mean(ndcg_scores)
    average_mrr = np.mean(mrr_scores)
    average_hr = np.mean(hr_scores)

    return {
        'NDCG@k': average_ndcg,
        'MRR@k': average_mrr,
        'HR@k': average_hr
    }

## Old Evaluation low

In [38]:
def evaluate_recommendation_with_history(recommendations, k):
    user_metrics = {}

    for user_id, user_recommendations in recommendations:
        # Filter test ratings for the current user
        user_test_ratings = test_ratings[test_ratings['user_id'] == user_id]

        relevance_scores = []
        for item_id in user_recommendations.index:
            if item_id in user_test_ratings['item_id'].values:
                relevance_score = user_test_ratings[user_test_ratings['item_id'] == item_id]['rating'].values[0]
                relevance_scores.append(relevance_score / 5)  # Normalize if ratings are 1 to 5
            else:
                relevance_scores.append(0)

        # Calculate metrics
        ndcg = ndcg_at_k(relevance_scores, k)
        mrr = mrr_at_k(relevance_scores, k)
        hr = hr_at_k(relevance_scores, k)

        if user_id not in user_metrics:
            user_metrics[user_id] = {
                'ndcg': [],
                'mrr': [],
                'hr': []
            }

        user_metrics[user_id]['ndcg'].append(ndcg)
        user_metrics[user_id]['mrr'].append(mrr)
        user_metrics[user_id]['hr'].append(hr)

    # Aggregate metrics for each user
    ndcg_scores = [np.mean(metrics['ndcg']) for metrics in user_metrics.values()]
    mrr_scores = [np.mean(metrics['mrr']) for metrics in user_metrics.values()]
    hr_scores = [np.mean(metrics['hr']) for metrics in user_metrics.values()]

    average_ndcg = np.mean(ndcg_scores)
    average_mrr = np.mean(mrr_scores)
    average_hr = np.mean(hr_scores)

    return {
        'NDCG@k': average_ndcg,
        'MRR@k': average_mrr,
        'HR@k': average_hr
    }

## Calculations

In [179]:
ndcg_with_history = np.mean(ndcg_scores_with_history)
print("Two-Tower Model with History NDCG:", ndcg_with_history)

Two-Tower Model with History NDCG: 0.7962613276593588


In [25]:
# Initialize the metric evaluator
metric_evaluator = Metric(k=[5])

# Evaluate the model
eval_result = metric_evaluator.eval(best_model, test_dataloader, test_user_text_pairs, topks=[5])

for metric, values in eval_result.items():
    print(f"{metric}: {values}")

AttributeError: 'list' object has no attribute 'is_cuda'

In [22]:
# Initialize the metric evaluator
metric_evaluator = Metric(k=[5])

# Evaluate the model
eval_result = metric_evaluator.eval(best_model, test_dataloader, test_user_text_pairs, topks=[5])

for metric, values in eval_result.items():
    print(f"{metric}: {values}")

recall@5: 0.1598014416972462
precision@5: 0.0009987590106077511
mrr@5: 0.0023174169882658234
ndcg@5: 0.03226853357186639


In [150]:
# from evaluator import EvaluateMetrics

# Evaluate the recommendations
# evaluator = EvaluateMetrics(test_ratings)
# evaluation_results = evaluator.evaluate_recommendations_with_history(recommendations, k=n)
evaluation_results = evaluate_recommendations(recommendations, k=n)

print(f"NDCG@{n}: {evaluation_results['NDCG@k']:.4f}")
print(f"MRR@{n}: {evaluation_results['MRR@k']:.4f}")
print(f"HR@{n}: {evaluation_results['HR@k']:.4f}")

NDCG@5: 0.6406
MRR@5: 0.5281
HR@5: 1.0000


In [168]:
evaluation_results_with_history = evaluate_recommendation_with_history(recommendations, k=5)
print(f"With History - NDCG@{n}: {evaluation_results_with_history['NDCG@k']:.4f}")
print(f"With History - MRR@{n}: {evaluation_results_with_history['MRR@k']:.4f}")
print(f"With History - HR@{n}: {evaluation_results_with_history['HR@k']:.4f}")

With History - NDCG@5: 0.0772
With History - MRR@5: 0.0639
With History - HR@5: 0.1196


paraphrase-MiniLM-L6-v2 5 epochs

NDCG@5: 0.6459
MRR@5: 0.5347
HR@5: 1.0000

In [30]:
def get_top_n_items_without_history_unseen_items(model, userId, n):
    # Ensure the model is in evaluation mode
    model.eval()

    # Get the user text for the given userId
    user_text = val_last_user_texts[userId]

    # Encode the user text
    user_embedding = model.user_model.encode(user_text, convert_to_tensor=True).to(device)

    # Compute the scores (dot product between user embedding and each item embedding)
    user_output = model.user_fc(user_embedding).to(device)
    item_output = model.item_fc(full_items_embeddings).to(device)
    dot_product = torch.matmul(user_output, item_output.t()).squeeze()

    # Get items the user has seen in the training and validation data
    seen_items_train = train_ratings[train_ratings['user_id'] == userId]['item_id'].values
    seen_items_val = val_ratings[val_ratings['user_id'] == userId]['item_id'].values
    seen_items = set(np.concatenate((seen_items_train, seen_items_val)))

    # Get the top n + len(seen_items) item indices and their scores
    top_n_scores, top_n_indices = torch.topk(dot_product, n + len(seen_items))

    # Map indices back to item IDs
    top_n_item_ids = [list(movie_id_to_idx.keys())[list(movie_id_to_idx.values()).index(idx.item())] for idx in top_n_indices]

    # Filter out seen items
    unseen_top_n_item_ids = [item for item in top_n_item_ids if item not in seen_items]

    return unseen_top_n_item_ids[:n]

In [31]:
# Assuming full_items_embeddings is already defined
full_items_embeddings = torch.stack([best_model.item_model.encode(item_text, convert_to_tensor=True) for item_text in item_texts]).to(device)

In [32]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        y_score = [1 if item in test_items else 0 for item in recommended_items]
        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6
7
8
9


  return dcg(labels, k) / dcg(ideal_labels, k)


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
28

In [None]:
{'NDCG@5': 0.6890022642171051}

In [33]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        # y_score = [1 if item in test_items else 0 for item in recommended_items]

        y_score = [
            user_test_data[user_test_data['item_id'] == item]['rating'].values[0] if item in test_items else 2.5
            for item in recommended_items
        ]

        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [34]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        # y_score = [1 if item in test_items else 0 for item in recommended_items]

        y_score = [
            user_test_data[user_test_data['item_id'] == item]['rating'].values[0] if item in test_items else 0
            for item in recommended_items
        ]

        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6
7
8
9


  return dcg(labels, k) / dcg(ideal_labels, k)


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
28