In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from pytorch_lightning.callbacks import Callback
import pandas as pd
import pickle
from pytorch_lightning.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [3]:
users = pd.read_csv('processed_dataset/MovieLens-1M/users/users_movielens.csv')
movies = pd.read_csv('processed_dataset/MovieLens-1M/movies/movies_movielens_modified.csv')
full_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_full_movielens.csv')
train_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_train_movielens.csv')
val_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_val_movielens.csv')
test_ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_test_movielens.csv')

In [4]:
def generate_user_texts_with_history(users, movies, ratings):
    user_histories = {user_id: [] for user_id in users['user_id'].unique()}
    user_texts = []

    user_features_dict = users.set_index('user_id').to_dict('index')
    movie_dict = movies.set_index('item_id').to_dict('index')

    for _, row in ratings.iterrows():
        user_id = row['user_id']
        movie_id = row['item_id']

        user = user_features_dict[user_id]
        user_features = f"occupation: {user['occupation']} [SEP] gender: {user['gender']}"

        history_movies = [
            {"title": movie_dict[mid]['title'], "genres": movie_dict[mid]['genres']}
            for mid in user_histories[user_id][-3:]
        ]

        user_text = {
            "occupation_gender": user_features,
            "history": history_movies
        }

        user_texts.append(user_text)
        user_histories[user_id].append(movie_id)

    return user_texts

In [5]:
def generate_last_user_texts_with_history(users, movies, ratings):
    user_histories = {user_id: [] for user_id in users['user_id'].unique()}
    last_user_texts = {}

    # Convert relevant columns to dictionaries for faster access
    user_features_dict = users.set_index('user_id').to_dict('index')
    movie_dict = movies.set_index('item_id').to_dict('index')

    for _, row in ratings.iterrows():
        user_id = row['user_id']
        movie_id = row['item_id']

        # Get user features
        user = user_features_dict[user_id]
        user_features = f"occupation: {user['occupation']} [SEP] gender: {user['gender']}"

        # Append the user's history (only the last 3 movies)
        history_movies = [
            {"title": movie_dict[mid]['title'], "genres": movie_dict[mid]['genres']}
            for mid in user_histories[user_id][-3:]
        ]

        # Combine user features and history
        user_text = {
            "occupation_gender": user_features,
            "history": history_movies
        }

        # Update the dictionary to keep the last text for each user
        last_user_texts[user_id] = user_text

        # Update the user history after generating combined features
        user_histories[user_id].append(movie_id)

    return last_user_texts

In [7]:
# # Use smaller subsets of the data
# train_ratings_subset = train_ratings
# val_ratings_subset = val_ratings[:
# test_ratings_subset = test_ratings[:10]

In [6]:
train_user_texts = generate_user_texts_with_history(users, movies, train_ratings)
val_user_texts = generate_user_texts_with_history(users, movies, val_ratings)
test_user_texts = generate_user_texts_with_history(users, movies, test_ratings)

In [7]:
train_user_texts[:5]

[{'occupation_gender': 'occupation: K-12 student [SEP] gender: Female',
  'history': []},
 {'occupation_gender': 'occupation: K-12 student [SEP] gender: Female',
  'history': [{'title': 'Girl, Interrupted', 'genres': 'Drama'}]},
 {'occupation_gender': 'occupation: K-12 student [SEP] gender: Female',
  'history': [{'title': 'Girl, Interrupted', 'genres': 'Drama'},
   {'title': 'Cinderella', 'genres': "Animation, Children's, Musical"}]},
 {'occupation_gender': 'occupation: K-12 student [SEP] gender: Female',
  'history': [{'title': 'Girl, Interrupted', 'genres': 'Drama'},
   {'title': 'Cinderella', 'genres': "Animation, Children's, Musical"},
   {'title': 'Titanic', 'genres': 'Drama, Romance'}]},
 {'occupation_gender': 'occupation: K-12 student [SEP] gender: Female',
  'history': [{'title': 'Cinderella',
    'genres': "Animation, Children's, Musical"},
   {'title': 'Titanic', 'genres': 'Drama, Romance'},
   {'title': 'Back to the Future', 'genres': 'Comedy, Sci-Fi'}]}]

In [8]:
val_last_user_texts = generate_last_user_texts_with_history(users, movies, val_ratings)

In [9]:
print(val_last_user_texts.get(1))

{'occupation_gender': 'occupation: K-12 student [SEP] gender: Female', 'history': [{'title': 'Beauty and the Beast', 'genres': "Animation, Children's, Musical"}, {'title': 'Aladdin', 'genres': "Animation, Children's, Comedy, Musical"}, {'title': 'Toy Story', 'genres': "Animation, Children's, Comedy"}]}


In [10]:
# Define weights for the parts of user texts and movie texts
weights1 = {"occupation_gender": 0.1, "history": 0.9}
history_weights = [0.2, 0.3, 0.5]
weights2 = {"title": 0.2, "genres": 0.8}

In [11]:
movie_dict = movies.set_index('item_id').to_dict('index')
item_texts = [
    {"title": movie_dict[movie_id]['title'], "genres": movie_dict[movie_id]['genres']}
    for movie_id in full_ratings['item_id'].unique()
]

movie_id_to_idx = {movieId: idx for idx, movieId in enumerate(full_ratings['item_id'].unique())}

In [12]:
# Add movie indices to ratings
train_ratings['movie_idx'] = train_ratings['item_id'].map(movie_id_to_idx)
val_ratings['movie_idx'] = val_ratings['item_id'].map(movie_id_to_idx)
test_ratings['movie_idx'] = test_ratings['item_id'].map(movie_id_to_idx)

train_item_indices = torch.LongTensor(train_ratings['movie_idx'].values)
train_labels = torch.FloatTensor(train_ratings['rating'].values)
val_item_indices = torch.LongTensor(val_ratings['movie_idx'].values)
val_labels = torch.FloatTensor(val_ratings['rating'].values)
test_item_indices = torch.LongTensor(test_ratings['movie_idx'].values)
test_labels = torch.FloatTensor(test_ratings['rating'].values)

In [15]:
val_ratings

Unnamed: 0,user_id,item_id,rating,timestamp,movie_idx
0,1,745,3,978824268,369
1,1,595,5,978824268,1534
2,1,588,4,978824268,140
3,1,1,5,978824268,655
4,1,2687,3,978824268,904
...,...,...,...,...,...
99687,6040,3083,4,963272132,1551
99688,6040,2366,3,963272166,555
99689,6040,3819,5,963272166,3235
99690,6040,1900,5,964828352,2400


In [13]:
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):
    def __init__(self, users, item_ids, ratings):
        self.users = users
        self.item_ids = item_ids
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        user_text = self.users[idx]  # This should be a dictionary
        item_id = self.item_ids[idx]
        rating = self.ratings[idx]
        return user_text, item_id, rating

def custom_collate_fn(batch):
    users, item_ids, ratings = zip(*batch)

    # users should remain as a list of dictionaries
    item_ids = torch.tensor(item_ids)
    ratings = torch.tensor(ratings)

    return users, item_ids, ratings
# def custom_collate_fn(batch):
#     users, item_ids, ratings = zip(*batch)
#     return list(users), torch.tensor(item_ids), torch.tensor(ratings)

In [14]:
# Create DataLoader for training data
train_dataset = CustomTextDataset(train_user_texts, train_item_indices, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=custom_collate_fn)

# Create DataLoader for validation data
val_dataset = CustomTextDataset(val_user_texts, val_item_indices, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=custom_collate_fn)

# Create DataLoader for test data
test_dataset = CustomTextDataset(test_user_texts, test_item_indices, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=custom_collate_fn)

In [17]:
'occupation: doctor/health care [SEP] age: 25-34 [SEP] gender: Male [SEP] positively rated movies: ... [SEP] negatively rated movies: ...'

'occupation: doctor/health care [SEP] age: 25-34 [SEP] gender: Male [SEP] positively rated movies: ... [SEP] negatively rated movies: ...'

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback
from sentence_transformers import SentenceTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TwoTowerModel(pl.LightningModule):
    def __init__(self, user_model_name, item_model_name, embedding_size=384):
        super(TwoTowerModel, self).__init__()
        self.user_model = SentenceTransformer(user_model_name)
        self.item_model = SentenceTransformer(item_model_name)

        self.user_fc = nn.Linear(embedding_size, embedding_size)
        self.item_fc = nn.Linear(embedding_size, embedding_size)

        self.criterion = nn.MSELoss()
        self.epoch_losses = {'train_loss': [], 'val_loss': []}

    # def compute_weighted_user_embedding(self, user_text):
    #     occupation_gender_weight = 0.1
    #     history_weight = 0.9
    #     default_history_weights = [0.2, 0.3, 0.5]
    #     # print(user_text)
    #     occupation_gender_embedding = self.user_model.encode(user_text["occupation_gender"], convert_to_tensor=True).to(device)
    #
    #     history_texts = [f"title: {movie['title']} [SEP] genres: {movie['genres']}" for movie in user_text["history"]]
    #
    #     if history_texts:
    #         num_history_items = len(history_texts)
    #         adjusted_weights = default_history_weights[-num_history_items:]  # Use the last n weights
    #         history_weights = torch.tensor(adjusted_weights, device=device)
    #
    #         history_embeddings = self.user_model.encode(history_texts, convert_to_tensor=True).to(device)
    #         weighted_history_embedding = torch.matmul(history_weights, history_embeddings)
    #     else:
    #         weighted_history_embedding = torch.zeros_like(occupation_gender_embedding)
    #
    #     weighted_embedding = (occupation_gender_weight * occupation_gender_embedding +
    #                           history_weight * weighted_history_embedding)
    #
    #     return weighted_embedding
    def compute_weighted_user_embedding(self, user_texts):
        occupation_gender_weight = 0.1
        history_weight = 0.9
        default_history_weights = [0.2, 0.3, 0.5]

        all_user_embeddings = []

        for user_text in user_texts:
            # Compute occupation_gender embedding
            occupation_gender_embedding = self.user_model.encode(
                user_text["occupation_gender"],
                convert_to_tensor=True
            ).to(device)

            # Extract and compute embeddings for history
            history_texts = [
                f"title: {movie['title']} [SEP] genres: {movie['genres']}"
                for movie in user_text["history"]
            ]

            if history_texts:
                num_history_items = len(history_texts)
                adjusted_weights = default_history_weights[-num_history_items:]  # Use the last n weights
                history_weights = torch.tensor(adjusted_weights, device=device)

                history_embeddings = self.user_model.encode(
                    history_texts,
                    convert_to_tensor=True
                ).to(device)
                weighted_history_embedding = torch.matmul(history_weights, history_embeddings)
            else:
                weighted_history_embedding = torch.zeros_like(occupation_gender_embedding)

            # Combine the embeddings with their respective weights
            weighted_embedding = (
                occupation_gender_weight * occupation_gender_embedding +
                history_weight * weighted_history_embedding
            )

            all_user_embeddings.append(weighted_embedding)

        # Stack all embeddings into a tensor
        return torch.stack(all_user_embeddings)

    # def compute_weighted_item_embedding(self, item_text):
    #     title_weight = 0.2
    #     genres_weight = 0.8
    #
    #     title_embedding = self.item_model.encode(f"title: {item_text['title']}", convert_to_tensor=True).to(device)
    #     genres_embedding = self.item_model.encode(f"genres: {item_text['genres']}", convert_to_tensor=True).to(device)
    #
    #     weighted_embedding = (title_weight * title_embedding +
    #                           genres_weight * genres_embedding)
    #
    #     return weighted_embedding

    def compute_weighted_item_embedding(self, item_texts):
        title_weight = 0.2
        genres_weight = 0.8

        all_item_embeddings = []

        for item_text in item_texts:
            title_embedding = self.item_model.encode(
                f"title: {item_text['title']}",
                convert_to_tensor=True
            ).to(device)
            genres_embedding = self.item_model.encode(
                f"genres: {item_text['genres']}",
                convert_to_tensor=True
            ).to(device)

            weighted_embedding = (
                title_weight * title_embedding +
                genres_weight * genres_embedding
            )

            all_item_embeddings.append(weighted_embedding)

        # Stack all embeddings into a tensor
        return torch.stack(all_item_embeddings)

    def forward(self, user_text, item_text):
        user_embedding = self.compute_weighted_user_embedding(user_text)
        # user_embedding = self.user_model.encode(user_text, convert_to_tensor=True).to(device)
        item_embedding = self.compute_weighted_item_embedding(item_text)

        user_output = self.user_fc(user_embedding)
        item_output = self.item_fc(item_embedding)

        dot_product = torch.matmul(user_output, item_output.T)
        dot_product = 4 * torch.sigmoid(dot_product) + 1

        return dot_product

    def training_step(self, batch, batch_idx):
        users, items, ratings = batch

        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        users, items, ratings = batch

        items = [item_texts[i] for i in items.tolist()]

        preds = self(users, items)

        loss = self.criterion(preds, ratings)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-5)

    def full_predict(self, user_texts, item_ids):

        items = [item_texts[i] for i in item_ids]

        with torch.no_grad():
            user_embeddings = torch.stack([self.compute_weighted_user_embedding(user_text) for user_text in user_texts]).to(device)
            item_embeddings = torch.stack([self.compute_weighted_item_embedding(item_text) for item_text in items]).to(device)

            user_output = self.user_fc(user_embeddings)
            item_output = self.item_fc(item_embeddings)

            dot_product = torch.matmul(user_output.squeeze(), item_output.T)
            dot_product = 4 * torch.sigmoid(dot_product) + 1

        return dot_product.cpu()

class PrintLossesCallback(Callback):
    def on_train_epoch_end(self, trainer, pl_module):
        train_loss = trainer.callback_metrics.get('train_loss')
        if train_loss is not None:
            pl_module.epoch_losses['train_loss'].append(train_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Train Loss: {train_loss.item()}")

    def on_validation_epoch_end(self, trainer, pl_module):
        val_loss = trainer.callback_metrics.get('val_loss')
        if val_loss is not None:
            pl_module.epoch_losses['val_loss'].append(val_loss.item())
            print(f"Epoch {trainer.current_epoch + 1}: Val Loss: {val_loss.item()}")


In [17]:
# Initialize the model
model = TwoTowerModel(user_model_name='paraphrase-MiniLM-L6-v2', item_model_name='paraphrase-MiniLM-L6-v2')

# Define the ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # Metric to monitor
    dirpath='checkpoints/',  # Directory to save the checkpoints
    filename='weight-history-best-checkpoint',  # Filename for the best model
    save_top_k=1,  # Save only the top 1 model
    mode='min'  # Mode to save the best model (min for validation loss)
)

# Initialize the trainer
trainer = pl.Trainer(max_epochs=2, log_every_n_steps=1, callbacks=[PrintLossesCallback(), checkpoint_callback], enable_progress_bar=True)
trainer.fit(model, train_dataloader, val_dataloader)

# Print losses after training completes
print("Epoch losses:")
for epoch in range(trainer.max_epochs):
    train_loss = model.epoch_losses['train_loss'][epoch] if epoch < len(model.epoch_losses['train_loss']) else 'N/A'
    val_loss = model.epoch_losses['val_loss'][epoch] if epoch < len(model.epoch_losses['val_loss']) else 'N/A'
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss}, Val Loss: {val_loss}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                | Params | Mode 
-----------------------------------------------------------
0 | user_model | SentenceTransformer | 22.7 M | train
1 | item_model | SentenceTransformer | 22.7 M | train
2 | user_fc    | Linear              | 147 K  | train
3 | item_fc    | Linear              | 147 K  | train
4 | criterion  | MSELoss             | 0      | train
-----------------------------------------------------------
45.7 M    Trainable params
0         Non-trainable params
45.7 M    Total params
182.888   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:475: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.75it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:02<00:00,  0.77it/s]Epoch 1: Val Loss: 1.3298068046569824
                                                                           

D:\Anaconda\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 24929/24929 [8:49:05<00:00,  0.79it/s, v_num=16]  
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/3115 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/3115 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/3115 [00:01<1:05:19,  0.79it/s][A
Validation DataLoader 0:   0%|          | 2/3115 [00:02<1:07:03,  0.77it/s][A
Validation DataLoader 0:   0%|          | 3/3115 [00:03<1:06:33,  0.78it/s][A
Validation DataLoader 0:   0%|          | 4/3115 [00:05<1:06:13,  0.78it/s][A
Validation DataLoader 0:   0%|          | 5/3115 [00:06<1:06:03,  0.78it/s][A
Validation DataLoader 0:   0%|          | 6/3115 [00:07<1:05:36,  0.79it/s][A
Validation DataLoader 0:   0%|          | 7/3115 [00:08<1:05:50,  0.79it/s][A
Validation DataLoader 0:   0%|          | 8/3115 [00:10<1:05:32,  0.79it/s][A
Validation DataLoader 0:   0%|          | 9/3115 [00:11<1:05:11,  0.79it/s][A
Validation DataLoader 0:   0%|        

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 24929/24929 [10:01:57<00:00,  0.69it/s, v_num=16]
Epoch losses:
Epoch 1: Train Loss: 0.9619219303131104, Val Loss: 1.3298068046569824
Epoch 2: Train Loss: 1.5898168087005615, Val Loss: 1.1831309795379639


In [None]:
# model.epoch_losses

# Evaluation

In [36]:
# Assuming the training part has been done already, load the best model checkpoint
# best_model_path = './lightning_logs/history_paraphrase-MiniLM-L6-v2_5-epochs_lr-1e-5/checkpoints/epoch=4-step=93765.ckpt'  # Path where the best model is saved
# best_model_path = './lightning_logs/paraphrase-MiniLM-L6-v2/binarized/history_5-epochs_lr-1e-5/checkpoints/epoch=4-step=62320.ckpt'  # Path where the best model is saved
# best_model_path = './lightning_logs/paraphrase-MiniLM-L6-v2/binarized/history_5-epochs_lr-1e-5_BCEloss/checkpoints/epoch=4-step=62320.ckpt'  # Path where the best model is saved
best_model_path = './lightning_logs/version_16/weight-history-best-checkpoint.ckpt'  # Path where the best model is saved

best_model = TwoTowerModel.load_from_checkpoint(best_model_path, user_model_name='paraphrase-MiniLM-L6-v2', item_model_name='paraphrase-MiniLM-L6-v2').to(device)



## Calculations

In [37]:
def get_top_n_items_without_history_unseen_items(model, userId, n):
    # Ensure the model is in evaluation mode
    model.eval()

    # Get the user text for the given userId
    user_text = val_last_user_texts[userId]

    # Encode the user text
    # user_embedding = model.user_model.encode(user_text, convert_to_tensor=True).to(device)
    user_embedding = model.compute_weighted_user_embedding([user_text]).squeeze(0).to(device)

    # Compute the scores (dot product between user embedding and each item embedding)
    # user_output = model.user_fc(user_embedding).to(device)
    user_output = model.user_fc(user_embedding).unsqueeze(0).to(device)
    item_output = model.item_fc(full_items_embeddings).to(device)
    # dot_product = torch.matmul(user_output, item_output.t()).squeeze()
    dot_product = torch.matmul(user_output, item_output.T).squeeze()

    # Get items the user has seen in the training and validation data
    seen_items_train = train_ratings[train_ratings['user_id'] == userId]['item_id'].values
    seen_items_val = val_ratings[val_ratings['user_id'] == userId]['item_id'].values
    seen_items = set(np.concatenate((seen_items_train, seen_items_val)))

    # Get the top n + len(seen_items) item indices and their scores
    top_n_scores, top_n_indices = torch.topk(dot_product, n + len(seen_items))

    # Map indices back to item IDs
    top_n_item_ids = [list(movie_id_to_idx.keys())[list(movie_id_to_idx.values()).index(idx.item())] for idx in top_n_indices]

    # Filter out seen items
    unseen_top_n_item_ids = [item for item in top_n_item_ids if item not in seen_items]

    return unseen_top_n_item_ids[:n]

In [38]:
item_texts[:2]

[{'title': 'Godfather, The', 'genres': 'Action, Crime, Drama'},
 {'title': 'Babe: Pig in the City', 'genres': "Children's, Comedy"}]

In [39]:
# Assuming full_items_embeddings is already defined
# full_items_embeddings = torch.stack([best_model.item_model.encode(item_text, convert_to_tensor=True) for item_text in item_texts]).to(device)
full_items_embeddings = torch.stack([
    best_model.compute_weighted_item_embedding(item_texts)
    # for item_text in item_texts
]).to(device)

In [45]:
full_items_embeddings

tensor([[[ 0.3453, -0.5064, -0.2076,  ..., -0.1414,  0.2801, -0.2178],
         [ 0.1718, -0.3473, -0.2005,  ..., -0.0419,  0.3893, -0.2938],
         [ 0.1622, -0.4982, -0.2743,  ..., -0.4158,  0.3266, -0.1165],
         ...,
         [ 0.3931, -0.4381,  0.0859,  ..., -0.1343,  0.3520,  0.0758],
         [ 0.3975, -0.5015, -0.1554,  ..., -0.2721,  0.3811, -0.0092],
         [ 0.3716, -0.5835, -0.2343,  ..., -0.3305,  0.3911,  0.0148]]],
       device='cuda:0')

In [41]:
def get_top_n_items_without_history_unseen_items(model, userId, n):
    # Ensure the model is in evaluation mode
    model.eval()

    # Get the user text for the given userId
    user_text = val_last_user_texts[userId]

    # Compute the user embedding using the updated method
    user_embedding = model.compute_weighted_user_embedding([user_text]).squeeze(0).to(device)

    # Compute user output from the fully connected layer
    user_output = model.user_fc(user_embedding).to(device)  # user_output shape: [384]

    # Compute item outputs from the fully connected layer
    item_output = model.item_fc(full_items_embeddings).to(device)  # item_output shape: [3706, 384]

    # Reshape user_output to be [384, 1] so it can be multiplied with item_output
    user_output = user_output.unsqueeze(1)  # Now user_output shape is [384, 1]

    # Compute the dot product between user output and item outputs
    dot_product = torch.matmul(item_output, user_output).squeeze()  # Result shape: [3706]

    # Get items the user has seen in the training and validation data
    seen_items_train = train_ratings[train_ratings['user_id'] == userId]['item_id'].values
    seen_items_val = val_ratings[val_ratings['user_id'] == userId]['item_id'].values
    seen_items = set(np.concatenate((seen_items_train, seen_items_val)))

    # Get the top n + len(seen_items) item indices and their scores
    top_n_scores, top_n_indices = torch.topk(dot_product, n + len(seen_items))

    # Map indices back to item IDs
    top_n_item_ids = [list(movie_id_to_idx.keys())[list(movie_id_to_idx.values()).index(idx.item())] for idx in top_n_indices]

    # Filter out seen items
    unseen_top_n_item_ids = [item for item in top_n_item_ids if item not in seen_items]

    return unseen_top_n_item_ids[:n]


In [42]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        y_score = [1 if item in test_items else 0 for item in recommended_items]
        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6


  return dcg(labels, k) / dcg(ideal_labels, k)


7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280


In [None]:
{'NDCG@5': 0.6700433102185901}

In [47]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        # y_score = [1 if item in test_items else 0 for item in recommended_items]

        y_score = [
            user_test_data[user_test_data['item_id'] == item]['rating'].values[0] if item in test_items else 2.5
            for item in recommended_items
        ]

        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [48]:
def dcg(scores, k):
    scores = np.asfarray(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(labels, k):
    ideal_labels = sorted(labels, reverse=True)
    return dcg(labels, k) / dcg(ideal_labels, k)

def evaluate_user_cf_model(model, test_data, train_data, val_data, all_items, k):
    ndcg_scores = []

    # Get unique users
    unique_users = test_data['user_id'].unique()

    for user in unique_users:
        # Get the top N items for the user, filtering out seen items
        recommended_items = get_top_n_items_without_history_unseen_items(model, user, k)

        user_test_data = test_data[test_data['user_id'] == user]
        test_items = user_test_data['item_id'].values
        print(user)
        # y_score = [1 if item in test_items else 0 for item in recommended_items]

        y_score = [
            user_test_data[user_test_data['item_id'] == item]['rating'].values[0] if item in test_items else 0
            for item in recommended_items
        ]

        ndcg = ndcg_at_k(y_score, k)
        ndcg_scores.append(ndcg)

    # avg_ndcg = np.mean(np.nan_to_num(ndcg_scores, nan=0.0))
    avg_ndcg = np.nanmean(ndcg_scores)

    return {
        'NDCG@{}'.format(k): avg_ndcg
    }

all_items = movies['item_id'].unique()
# Evaluate the model
eval_result = evaluate_user_cf_model(best_model, test_ratings, train_ratings, val_ratings, all_items, k=5)
print(eval_result)

1
2
3
4
5
6
7


  return dcg(labels, k) / dcg(ideal_labels, k)


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
28