In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pre-cleaned ratings table from the CSV that sits next to the notebook.
RATINGS_CSV = 'cleaned_ratings.csv'
df_ratings = pd.read_csv(RATINGS_CSV)

In [None]:
# Display the loaded ratings frame as this cell's output
# (columns shown below: userId, movieId, rating, timestamp).
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [None]:
# Count how many ratings each user submitted; the result is a Series indexed by
# userId whose values are the per-user rating counts.
user_counts = df_ratings['userId'].value_counts()


In [None]:
# Keep only ratings from users who have at least `min_ratings` ratings in total,
# then display the filtered frame.
min_ratings = 5
eligible_users = user_counts[user_counts >= min_ratings].index
is_eligible = df_ratings['userId'].isin(eligible_users)
df_ratings = df_ratings[is_eligible]
df_ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [None]:
from sklearn.model_selection import train_test_split

# Random 80/20 split of the rating rows into training and validation frames.
# The fixed random_state makes the split reproducible across runs.
train_df, val_df = train_test_split(
    df_ratings,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [None]:
def mapping_movies(dataset):
    """Build a dictionary from each distinct movie ID to a contiguous integer index.

    Args:
        dataset: DataFrame-like object with a "movieId" column.

    Returns:
        dict mapping each unique value of ``dataset["movieId"]`` to an index in
        ``0 .. n_unique - 1``, numbered in the order the IDs first appear.
    """
    unique_movie_ids = dataset["movieId"].unique()
    return {movie_id: position for position, movie_id in enumerate(unique_movie_ids)}

def mapping_users(dataset):
    """Build a dictionary from each distinct user ID to a contiguous integer index.

    Args:
        dataset: DataFrame-like object with a "userId" column.

    Returns:
        dict mapping each unique value of ``dataset["userId"]`` to an index in
        ``0 .. n_unique - 1``, numbered in the order the IDs first appear.
    """
    unique_user_ids = dataset["userId"].unique()
    return {user_id: position for position, user_id in enumerate(unique_user_ids)}

# Raw-ID -> index dictionaries built from the training split.
train_user2idx = mapping_users(train_df)
train_movie2idx = mapping_movies(train_df)

# Raw-ID -> index dictionaries built independently from the validation split.
# NOTE(review): each mapping numbers IDs by order of first appearance in its own
# frame, so an index in a val_* mapping does not in general refer to the same
# user/movie as the same index in the corresponding train_* mapping.
val_user2idx = mapping_users(val_df)
val_movie2idx = mapping_movies(val_df)



In [None]:
# Map raw user/movie IDs to contiguous embedding indices.
#
# Both splits MUST be encoded with the mappings built from the training split:
# the model's embedding rows are learned for train indices, so validation rows
# only make sense if the same raw ID resolves to the same index. Encoding the
# validation split with its own, independently numbered mappings would make
# every validation lookup hit the embedding of an unrelated user/movie.
#
# `.assign` returns new frames instead of writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
# Note: this cell is not idempotent (re-running it would re-map already mapped
# indices); re-run the split cell first if it needs to be executed again.
train_df = train_df.assign(
    userId=train_df["userId"].map(train_user2idx),
    movieId=train_df["movieId"].map(train_movie2idx),
)

# Sets of indices that have a trained embedding row.
train_users = set(train_df['userId'].unique())
train_movies = set(train_df['movieId'].unique())

# Series.map yields NaN for IDs that are missing from the train mappings, i.e.
# users/movies that never occur in the training split.
val_user_idx = val_df["userId"].map(train_user2idx)
val_movie_idx = val_df["movieId"].map(train_movie2idx)

# Keep only validation rows whose user AND movie were both seen during training,
# then restore integer dtype (the NaNs forced the mapped columns to float).
seen_in_train = val_user_idx.notna() & val_movie_idx.notna()
val_df = (
    val_df.assign(userId=val_user_idx, movieId=val_movie_idx)
    .loc[seen_in_train]
    .astype({"userId": "int64", "movieId": "int64"})
)

# Convert to list of lists (faster than looping with iloc)
train_data = train_df[["userId", "movieId", "rating"]].values.tolist()
valid_data = val_df[["userId", "movieId", "rating"]].values.tolist()



In [None]:
from torch.utils.data import Dataset, DataLoader

class MoviesDataset(Dataset):
    """Map-style dataset over rating rows of the form [user_index, movie_index, rating]."""

    def __init__(self, data):
        """Store the indexable collection of rows; no copy is made."""
        self.data = data

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as a (user, movie, rating) tuple of scalar tensors.

        The user and movie entries are converted to ``torch.long`` and the
        rating to ``torch.float32``.
        """
        row = self.data[idx]
        user_id_tensor = torch.tensor(row[0], dtype=torch.long)
        movie_id_tensor = torch.tensor(row[1], dtype=torch.long)
        rating_tensor = torch.tensor(row[2], dtype=torch.float32)
        return (user_id_tensor, movie_id_tensor, rating_tensor)




In [None]:
# Batch iterators over the raw [user, movie, rating] row lists.
# Despite their names these are DataLoader objects; the training loop unpacks
# every batch they yield into (user_ids, movie_ids, ratings).
# Only the training loader reshuffles its rows each epoch.
BATCH_SIZE = 1024
train_dataset = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_dataset = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MovieRecommender(nn.Module):
    """Embedding + MLP model that predicts a rating for each (user, movie) pair.

    Each user index and movie index is looked up in its own embedding table,
    the two vectors are concatenated, and a small MLP reduces the result to a
    single predicted rating.

    Args:
        num_users: number of rows in the user embedding table; every user
            index passed to ``forward`` must be in ``[0, num_users)``.
        num_movies: number of rows in the movie embedding table; every movie
            index passed to ``forward`` must be in ``[0, num_movies)``.
        embedding_dim: size of each user and each movie embedding vector.
    """

    def __init__(self, num_users, num_movies, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        # The MLP consumes the concatenated user+movie vector, so its input
        # width must follow embedding_dim (64 for the default of 32) rather
        # than being hard-coded.
        self.mlp = nn.Sequential(
            nn.Linear(2 * embedding_dim, 32),  # concatenated embeddings -> 32
            nn.ReLU(),                         # add nonlinearity
            nn.Linear(32, 16),                 # 32 -> 16
            nn.ReLU(),
            nn.Linear(16, 1),                  # final output: one number (predicted rating)
        )

    def forward(self, user_ids, movie_ids):
        """Return one predicted rating per (user, movie) pair.

        Args:
            user_ids: 1-D integer tensor of user indices.
            movie_ids: 1-D integer tensor of movie indices, same length as
                ``user_ids``.

        Returns:
            Tensor of predictions with the trailing size-1 dimension removed.
        """
        user_vecs = self.user_embedding(user_ids)
        movie_vecs = self.movie_embedding(movie_ids)

        # Concatenate along the feature axis: (batch, 2 * embedding_dim).
        combined_vec = torch.cat((user_vecs, movie_vecs), dim=1)

        return self.mlp(combined_vec).squeeze(-1)






In [None]:
#training loop
# Trains MovieRecommender with MSE loss and Adam, and after every epoch reports
# the mean per-batch loss on the training and validation loaders.

num_epochs = 9
train_losses, val_losses = [], []

# Seed torch so weight initialisation and the training loader's shuffling are
# reproducible from run to run.
torch.manual_seed(42)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Size the embedding tables from the training ID mappings instead of
# hard-coded counts: training indices run from 0 to len(mapping) - 1, so this
# always matches the data actually being trained on.
model = MovieRecommender(len(train_user2idx), len(train_movie2idx))
model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003, weight_decay=1e-3)
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch in train_dataset:
        user_ids, movie_ids, ratings = batch
        # Embedding lookups need integer indices; the loss needs float targets.
        user_ids = user_ids.long().to(device)
        movie_ids = movie_ids.long().to(device)
        ratings = ratings.float().to(device)

        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # accumulate

    # Mean per batch; NaN (rather than ZeroDivisionError) if the loader is empty.
    num_train_batches = len(train_dataset)
    avg_loss = running_loss / num_train_batches if num_train_batches else float("nan")
    train_losses.append(avg_loss)


    model.eval()  # evaluation mode (no dropout, no batchnorm updates)
    val_loss = 0.0
    with torch.no_grad():  # disable gradient calculation
        for batch in valid_dataset:  # <-- use validation data
            user_ids, movie_ids, ratings = batch
            user_ids = user_ids.long().to(device)
            movie_ids = movie_ids.long().to(device)
            ratings = ratings.float().to(device)

            predictions = model(user_ids, movie_ids)
            loss = criterion(predictions, ratings)
            val_loss += loss.item()

    # Same per-batch averaging and empty-loader guard as for the training loss.
    num_val_batches = len(valid_dataset)
    avg_val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
    val_losses.append(avg_val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {avg_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}")




Epoch [1/9], Train Loss: 1.0268, Val Loss: 1.3984
Epoch [2/9], Train Loss: 0.7868, Val Loss: 1.4242
Epoch [3/9], Train Loss: 0.7830, Val Loss: 1.4206
Epoch [4/9], Train Loss: 0.7826, Val Loss: 1.4245
