In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from tqdm import tqdm

In [3]:
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv', 
                      parse_dates=['timestamp'])

In [4]:
rand_userIds = np.random.choice(ratings['userId'].unique(), 
                                size=int(len(ratings['userId'].unique())*0.3), 
                                replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 6054025 rows of data from 41547 users


In [5]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

In [6]:
train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

In [7]:
from collections import defaultdict

In [8]:
# ordered_dict_of_all_movies = []
# ordered_dict_of_all_users = []
# ratings = defaultdict(int)

# for _,row in train_ratings.iterrows():
#     u,m,r = row
#     ordered_dict_of_all_movies.append(m)
#     ordered_dict_of_all_users.append(u)
#     ratings[(u,m)] = r

# ordered_dict_of_all_movies = list(set(ordered_dict_of_all_movies))
# ordered_dict_of_all_movies.sort()
# ordered_dict_of_all_users = list(set(ordered_dict_of_all_users))
# ordered_dict_of_all_users.sort()

# print(f'Unique users : {len(ordered_dict_of_all_users)}')
# print(f'Unique movies : {len(ordered_dict_of_all_movies)}')

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict
import numpy

In [11]:
latent_dim = 32

# gamma_user = numpy.random.random((len(ordered_dict_of_all_users),latent_dim))
# gamma_movies = numpy.random.random((len(ordered_dict_of_all_movies),latent_dim))

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
class RatingPredictor(nn.Module):
    def __init__(self, n_users, n_movies,latent_dim = 32):
        super(RatingPredictor, self).__init__()
        
        # Properly register parameters using nn.Parameter and move to device
        self.alpha = nn.Parameter(torch.tensor([3.0], device=device))
        self.beta_user = nn.Parameter(torch.randn(n_users, 1, device=device))
        self.beta_movies = nn.Parameter(torch.randn(n_movies, 1, device=device))
        self.gamma_user = nn.Parameter(torch.randn(n_users, latent_dim, device=device))
        self.gamma_movies = nn.Parameter(torch.randn(n_movies, latent_dim, device=device))
        
        # Initialize parameters with normal distribution
        with torch.no_grad():
            nn.init.normal_(self.beta_user, std=0.01)
            nn.init.normal_(self.beta_movies, std=0.01)
            nn.init.normal_(self.gamma_user, std = 1/np.sqrt(latent_dim))
            nn.init.normal_(self.gamma_movies, std = 1/np.sqrt(latent_dim))

    def forward(self, user_indices, movie_indices):
        user_biases = self.beta_user[user_indices]
        movie_biases = self.beta_movies[movie_indices]

        user_preferences = self.gamma_user[user_indices]
        movies_preferences = self.gamma_movies[movie_indices]

        interaction = torch.sum(user_preferences * movies_preferences, dim=1)
        predictions = self.alpha + user_biases + movie_biases + interaction
        return predictions.squeeze()

In [14]:
def prepare_data(ratingsTrain):
    """
    Prepare data for PyTorch training
    """
    # Get unique users and movies
    users = np.array([u for u, _, _ in ratingsTrain])
    movies = np.array([b for _, b, _ in ratingsTrain])
    ratings = np.array([r for _, _, r in ratingsTrain])
    
    # Get unique sorted arrays
    ordered_dict_of_all_users = np.unique(users)
    ordered_dict_of_all_movies = np.unique(movies)
    
    # Create mapping dictionaries
    user_to_idx = {u: i for i, u in enumerate(ordered_dict_of_all_users)}
    movie_to_idx = {b: i for i, b in enumerate(ordered_dict_of_all_movies)}
    
    # Convert to indices
    user_indices = np.array([user_to_idx[u] for u in users])
    movie_indices = np.array([movie_to_idx[b] for b in movies])
    
    # Convert to PyTorch tensors
    user_indices = torch.LongTensor(user_indices).to(device)
    movie_indices = torch.LongTensor(movie_indices).to(device)
    ratings = torch.FloatTensor(ratings).to(device)
    
    return (user_indices, movie_indices, ratings, 
            user_to_idx, movie_to_idx, 
            len(ordered_dict_of_all_users), len(ordered_dict_of_all_movies))

In [15]:
from torch.utils.data import DataLoader, Dataset

class RatingsDataset(Dataset):
    def __init__(self, ratings, user_to_idx, movie_to_idx):
        """
        Initialize the dataset with ratings and mapping dictionaries.
        """
        self.user_indices = torch.LongTensor([user_to_idx[u] for u, _, _ in ratings])
        self.movie_indices = torch.LongTensor([movie_to_idx[m] for _, m, _ in ratings])
        self.ratings = torch.FloatTensor([r for _, _, r in ratings])
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        """
        Retrieve a single sample from the dataset.
        """
        return self.user_indices[idx], self.movie_indices[idx], self.ratings[idx]

In [16]:
def train_model(ratingsTrain, n_epochs=5, lambda_reg=1.0, learning_rate=0.01, batch_size=4096,latent_dim = 32):
    # Prepare data
    users = np.array([u for u, _, _ in ratingsTrain])
    movies = np.array([b for _, b, _ in ratingsTrain])
    ratings = np.array([r for _, _, r in ratingsTrain])

    print(f'Unique users : {len(np.unique(users))}')
    
    
    # Get unique sorted arrays
    ordered_dict_of_all_users = np.unique(users)
    ordered_dict_of_all_movies = np.unique(movies)

    print('Ordered sets created')
    
    # Create mapping dictionaries
    user_to_idx = {u: i for i, u in enumerate(ordered_dict_of_all_users)}
    movie_to_idx = {b: i for i, b in enumerate(ordered_dict_of_all_movies)}

    print('Mapping dictionaries created')
    
    # Create dataset and data loader
    dataset = RatingsDataset(ratingsTrain, user_to_idx, movie_to_idx)

    print('Dataset created')
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    print('Data loader created')

    
    # Create model and move to device
    model = RatingPredictor(len(ordered_dict_of_all_users), len(ordered_dict_of_all_movies), latent_dim).to(device)

    print('Model initialized and moved to device')
    print(f"Model parameters: {[p.shape for p in model.parameters()]}")  # Debug print
    
    # Define loss function and optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Verify optimizer has parameters
    if len(list(model.parameters())) == 0:
        raise ValueError("Model has no parameters!")
    
    # Training loop
    for epoch in range(n_epochs):
        epoch_loss = 0
        for batch_idx, (user_batch, movie_batch, rating_batch) in enumerate(data_loader):
            user_batch = user_batch.to(device)
            movie_batch = movie_batch.to(device)
            rating_batch = rating_batch.to(device)
            
            optimizer.zero_grad()
            predictions = model(user_batch, movie_batch)
            sse_loss = torch.sum((predictions - rating_batch) ** 2)
            l2_reg = lambda_reg * (
                torch.sum(model.beta_user**2) +
                torch.sum(model.beta_movies**2) +
                torch.sum(model.gamma_user**2) +
                torch.sum(model.gamma_movies**2)
            )
            loss = sse_loss + l2_reg
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()/len(user_batch):.4f}")
        
        if epoch % 1 == 0:
            print(f'Epoch {epoch}, Loss: {epoch_loss/len(ratingsTrain):.4f}')
    
    return model, user_to_idx, movie_to_idx

In [17]:
def predict_rating(model, user, movie, user_to_idx, movie_to_idx):
    """
    Predict rating for a given user-movie pair
    """
    try:
        user_idx = torch.LongTensor([user_to_idx[user]]).to(device)
        movie_idx = torch.LongTensor([movie_to_idx[movie]]).to(device)
        
        with torch.no_grad():
            prediction = model(user_idx, movie_idx)
        
        return prediction.item()
    except:
        return model.alpha.item()

In [18]:
ratingsTrain = train_ratings.values.tolist()
ratingsValid = test_ratings.values.tolist()

In [19]:
model, user_to_idx, movie_to_idx = train_model(ratingsTrain ,n_epochs=10, lambda_reg=2.0, learning_rate=0.005,latent_dim = 128)

Unique users : 41547
Ordered sets created
Mapping dictionaries created
Dataset created
Data loader created
Model initialized and moved to device
Model parameters: [torch.Size([1]), torch.Size([41547, 1]), torch.Size([21707, 1]), torch.Size([41547, 128]), torch.Size([21707, 128])]
Epoch 0, Batch 0, Loss: 5819.6270
Epoch 0, Batch 100, Loss: 4467.9072
Epoch 0, Batch 200, Loss: 4238.9248
Epoch 0, Batch 300, Loss: 3342.1125
Epoch 0, Batch 400, Loss: 3110.7031
Epoch 0, Batch 500, Loss: 3303.4419
Epoch 0, Batch 600, Loss: 3073.1208
Epoch 0, Batch 700, Loss: 3035.5696
Epoch 0, Batch 800, Loss: 2965.3176
Epoch 0, Batch 900, Loss: 2891.3103
Epoch 0, Batch 1000, Loss: 2984.1079
Epoch 0, Batch 1100, Loss: 2857.4832
Epoch 0, Batch 1200, Loss: 2949.9280
Epoch 0, Batch 1300, Loss: 2864.2766
Epoch 0, Batch 1400, Loss: 2774.6577
Epoch 0, Loss: 3283.4659
Epoch 1, Batch 0, Loss: 2054.8208
Epoch 1, Batch 100, Loss: 2041.6888
Epoch 1, Batch 200, Loss: 1932.9877
Epoch 1, Batch 300, Loss: 2096.6670
Epoch 1, 

In [20]:
# The model parameters can be accessed as:
alpha = model.alpha.item()
beta_user = model.beta_user.detach().cpu().numpy()
beta_movies = model.beta_movies.detach().cpu().numpy()

validMSE = 0

for u,b,r in tqdm(ratingsValid):
    prediction = predict_rating(model,u,b,user_to_idx, movie_to_idx)
    validMSE += (r - prediction)**2

validMSE /= len(ratingsValid)
print(validMSE)

100%|██████████| 41547/41547 [00:09<00:00, 4594.78it/s]

1.0130723440501455



