In [2]:
!pip install sentence_transformers

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Define the Two-Tower Model with SBERT fine-tuning
class TwoTowerSBERTModel(nn.Module):
    def __init__(self, sbert_model_name, device):
        super(TwoTowerSBERTModel, self).__init__()
        self.user_model = SentenceTransformer(sbert_model_name).to(device)
        self.item_model = SentenceTransformer(sbert_model_name).to(device)
        self.embedding_dim = self.user_model.get_sentence_embedding_dimension()
        self.fc = nn.Linear(self.embedding_dim * 2, 1).to(device)
        # remove linear

    def forward(self, user_texts, item_texts):
        user_embeddings = self.user_model.encode(user_texts, convert_to_tensor=True).to(device)
        item_embeddings = self.item_model.encode(item_texts, convert_to_tensor=True).to(device)
        combined = torch.cat((user_embeddings, item_embeddings), dim=1)
        # remove concat
        # get dot product
        output = self.fc(combined).squeeze()
        # output = dot product
        return torch.clamp(4 * torch.sigmoid(output) + 1, min=1, max=5)

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [4]:
# Load SBERT model
sbert_model_name = "all-MiniLM-L6-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TwoTowerSBERTModel(sbert_model_name, device).to(device)



In [5]:
# Load datasets
users = pd.read_csv('processed_dataset/MovieLens-1M/users/users_full_movielens.csv')
movies = pd.read_csv('processed_dataset/MovieLens-1M/movies/movies_full_movielens.csv')
ratings = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_train_movielens.csv')
ratings_test = pd.read_csv('processed_dataset/MovieLens-1M/ratings/ml_1m_test_movielens.csv')

In [6]:
# Combine user features into a single string for each user
users['user_features'] = 'occupation: ' + users['occupation'] + ' [SEP] age: ' + users['age'].astype(str) + ' [SEP] gender: ' + users['gender'].astype(str)

# Combine movie features into a single string for each movie
movies['movie_features'] = 'title: ' + movies['title'] + ' [SEP] genres: ' + movies['genres']

In [7]:
# Create a dictionary for fast lookup
user_features_dict = users.set_index('user_id')['user_features'].to_dict()
movie_features_dict = movies.set_index('item_id')['movie_features'].to_dict()

# Create lists of user and item texts
user_texts = [user_features_dict[userId] for userId in ratings['user_id'].unique()]
item_texts = [movie_features_dict[movieId] for movieId in ratings['item_id'].unique()]

# Create a mapping from userId and movieId to indices
user_id_to_idx = {userId: idx for idx, userId in enumerate(ratings['user_id'].unique())}
movie_id_to_idx = {movieId: idx for idx, movieId in enumerate(ratings['item_id'].unique())}

# Map userId and movieId in ratings_df to indices
ratings['user_idx'] = ratings['user_id'].map(user_id_to_idx)
ratings['movie_idx'] = ratings['item_id'].map(movie_id_to_idx)

# Extract user indices, item indices, and ratings
user_indices = torch.LongTensor(ratings['user_idx'].values).to(device)
item_indices = torch.LongTensor(ratings['movie_idx'].values).to(device)
labels = torch.FloatTensor(ratings['rating'].values).to(device)

In [8]:
# Identify missing entries
missing_item_ids = set(ratings['item_id'].unique()) - set(movie_features_dict.keys())
print("Missing item IDs:", missing_item_ids)

Missing item IDs: set()


In [91]:
ratings

Unnamed: 0,user_id,item_id,rating,user_idx,movie_idx
0,2124,1035,4,0,0
1,2493,3730,5,1,1
2,955,3479,5,2,2
3,5950,1746,3,3,3
4,2857,366,2,4,4
...,...,...,...,...,...
800162,4507,2947,5,1904,143
800163,801,2916,3,2516,58
800164,3358,318,5,2029,142
800165,146,2291,4,777,145


In [9]:
# Reverse the user_id_to_idx dictionary to map indices back to user IDs
idx_to_user_id = {idx: userId for userId, idx in user_id_to_idx.items()}

# Get the real user ID for user_idx 1
real_user_id = idx_to_user_id[1]
print(f"Real user ID for user_idx 1: {real_user_id}")
# Get the text features for the identified real user ID
user_text_features = user_features_dict[real_user_id]
print(f"Text features for user ID {real_user_id}: {user_text_features}")

Real user ID for user_idx 1: 2493
Text features for user ID 2493: occupation: artist [SEP] age: 45-49 [SEP] gender: Male


In [10]:
# Reverse the user_id_to_idx dictionary to map indices back to user IDs
idx_to_item_id = {idx: itemId for itemId, idx in movie_id_to_idx.items()}

# Get the real user ID for user_idx 1
real_user_id = idx_to_item_id[1338]
print(f"Real user ID for user_idx 1: {real_user_id}")
# Get the text features for the identified real user ID
user_text_features = movie_features_dict[real_user_id]
print(f"Text features for user ID {real_user_id}: {user_text_features}")

Real user ID for user_idx 1: 1937
Text features for user ID 1937: title: Going My Way (1944) [SEP] genres: Comedy


In [12]:
# Create DataLoader
dataset = TensorDataset(user_indices, item_indices, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [13]:
dataset.tensors

(tensor([   0,    1,    2,  ..., 2029,  777,   28], device='cuda:0'),
 tensor([   0,    1,    2,  ...,  142,  145, 1715], device='cuda:0'),
 tensor([4., 5., 5.,  ..., 5., 4., 1.], device='cuda:0'))

In [14]:
# Define optimizer and loss function
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [71]:
movie_features_dict[1512]

KeyError: 1512

In [16]:
# Training loop
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        # print(batch)
        user_batch, item_batch, rating_batch = batch
        user_batch, item_batch, rating_batch = user_batch.to('cuda'), item_batch.to('cuda'), rating_batch.to('cuda')

        # Reverse the user_id_to_idx dictionary to map indices back to user IDs
        idx_to_user_id = {idx: userId for userId, idx in user_id_to_idx.items()}
        idx_to_movie_id = {idx: movieId for movieId, idx in movie_id_to_idx.items()}

        # Get the user text features for the batch
        user_texts_batch = [user_features_dict[idx_to_user_id[user_id.item()]] for user_id in user_batch]
        item_texts_batch = [movie_features_dict[idx_to_movie_id[item_id.item()]] for item_id in item_batch]

        optimizer.zero_grad()
        outputs = model(user_texts_batch, item_texts_batch)

        loss = criterion(outputs, rating_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Average Loss: {total_loss / len(dataloader)}")

Epoch 1, Average Loss: 1.123010394578551


In [68]:
# Save the model
model_path = "two_tower_sbert_model_with_embeddings.pth"
torch.save({
    'model_state_dict': model.state_dict()
}, model_path)

print("Model saved.")

Model saved.


In [17]:
def get_top_n_recommendations(model, user_text, item_texts, top_n=5):
    model.eval()
    with torch.no_grad():
        # Get user embedding
        user_embedding = model.user_model.encode([user_text], convert_to_tensor=True).to(device)

        # Compute scores for all items
        item_embeddings = model.item_model.encode(item_texts, convert_to_tensor=True).to(device)

        # Expand user embedding to match the number of items
        user_embedding_batch = user_embedding.expand(item_embeddings.shape[0], -1)

        combined_embeddings = torch.cat((user_embedding_batch, item_embeddings), dim=1)
        scores = model.fc(combined_embeddings).squeeze()
        scores = torch.clamp(4 * torch.sigmoid(scores) + 1, min=1, max=5)

        # Get top N item indices and their scores
        top_n_scores, top_n_indices = torch.topk(scores, top_n)

        # Convert to a pandas Series for the required format
        top_n_series = pd.Series(data=top_n_scores.cpu().tolist(), index=top_n_indices.cpu().tolist())

        return top_n_series

In [None]:
# Generate recommendations and evaluate
recommendations = {}
top_n = 5
total_users = len(ratings_test['user_id'].unique())
counter = 0

for user_id in ratings_test['user_id'].unique():
    counter += 1
    print(f"Processing user {counter}/{total_users}")
    user_text = user_features_dict[user_id]
    item_texts = [movie_features_dict[movie_id] for movie_id in ratings['item_id'].unique()]
    top_n_series = get_top_n_recommendations(model, user_text, item_texts, top_n=top_n)
    recommendations[user_id] = top_n_series

Processing user 1/6037
Processing user 2/6037
Processing user 3/6037
Processing user 4/6037
Processing user 5/6037
Processing user 6/6037
Processing user 7/6037
Processing user 8/6037
Processing user 9/6037
Processing user 10/6037
Processing user 11/6037
Processing user 12/6037
Processing user 13/6037
Processing user 14/6037
Processing user 15/6037
Processing user 16/6037
Processing user 17/6037
Processing user 18/6037
Processing user 19/6037
Processing user 20/6037
Processing user 21/6037
Processing user 22/6037
Processing user 23/6037
Processing user 24/6037
Processing user 25/6037
Processing user 26/6037
Processing user 27/6037
Processing user 28/6037
Processing user 29/6037
Processing user 30/6037
Processing user 31/6037
Processing user 32/6037
Processing user 33/6037
Processing user 34/6037
Processing user 35/6037
Processing user 36/6037
Processing user 37/6037
Processing user 38/6037
Processing user 39/6037
Processing user 40/6037
Processing user 41/6037
Processing user 42/6037
P

In [21]:
recommendations

{1579: 2851    4.514713
 3119    4.494742
 3264    4.456530
 570     4.442110
 3620    4.420434
 dtype: float64,
 5627: 2851    4.514713
 3119    4.494742
 3264    4.456530
 570     4.442110
 3620    4.420434
 dtype: float64,
 3780: 2851    4.545275
 3119    4.526392
 3264    4.490224
 570     4.476563
 3620    4.456014
 dtype: float64,
 5547: 2851    4.549943
 3119    4.531229
 3264    4.495378
 570     4.481834
 3620    4.461460
 dtype: float64,
 1059: 2851    4.527365
 3119    4.507842
 3264    4.470470
 570     4.456362
 3620    4.435149
 dtype: float64,
 2823: 2851    4.530161
 3119    4.510738
 3264    4.473553
 570     4.459515
 3620    4.438404
 dtype: float64,
 2026: 2851    4.544794
 3119    4.525894
 3264    4.489694
 570     4.476021
 3620    4.455454
 dtype: float64,
 4540: 2851    4.544410
 3119    4.525496
 3264    4.489269
 570     4.475587
 3620    4.455005
 dtype: float64,
 3518: 2851    4.551325
 3119    4.532660
 3264    4.496902
 570     4.483394
 3620    4.463072


In [None]:
from evaluator import EvaluateMetrics

# Evaluate the recommendations
evaluator = EvaluateMetrics(ratings_test)
results = evaluator.evaluate_recommendations(recommendations, top_n)

print(f"NDCG@{top_n}: {results['NDCG@k']:.4f}")
print(f"MRR@{top_n}: {results['MRR@k']:.4f}")
print(f"HR@{top_n}: {results['HR@k']:.4f}")

In [None]:
from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset
# from sentence_transformers import SentenceTransformer
print('hi')

hi


In [None]:
# Load pre-trained SBERT model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
print(sbert_model)

In [None]:
!pip install rank_eval