In [16]:
import torch
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
import optuna
import time

In [2]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Used device: {device}")

Used device: mps


In [3]:
df = pd.read_csv("movies_dataset.csv")
x = df[["user_id", "movie_id"]]
y = df["rating"]

X_f, X_test, y_f, y_test = train_test_split(
    x, y, test_size=0.15, random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_f, y_f, test_size=0.12, random_state=42
)

### Baseline - average of ratings

In [4]:
average_rating = df["rating"].mean()
y_pred_baseline = [average_rating] * len(y_test)
baseline_mse = mean_squared_error(y_test, y_pred_baseline)

print(f"Ratings average global: {average_rating:.2f}")
print(f"Baseline mse: {baseline_mse:.4f}")

Ratings average global: 3.50
Baseline mse: 1.1329


### Neural Network

using pytorch Dataset to create custom dataset for neural network and DataLoaders

In [5]:
class MovieDataset(Dataset):
    def __init__(self, df, targets):
        self.users = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.movies = torch.tensor(df["movie_id"].values, dtype=torch.long)
        self.ratings = torch.tensor(targets.values, dtype=torch.float32)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        return self.users[index], self.movies[index], self.ratings[index]

In [6]:
train_set = MovieDataset(X_train, y_train)
valid_set = MovieDataset(X_valid, y_valid)
test_set = MovieDataset(X_test, y_test)

In [7]:
BATCH_SIZE = 1024
torch.manual_seed(42)

train_loader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    num_workers=0,
    shuffle=True,
    pin_memory=True
)

valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, pin_memory=True)

Model architecture

In [None]:
class Recommender(nn.Module):
    def __init__(self, n_users, n_movies, embedding_dim=32, dropout_rate=0.3):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 1)
        )

        self.user_embedding.weight.data.uniform_(0, 0.05)
        self.movie_embedding.weight.data.uniform_(0, 0.05)
    
    def forward(self, user, movie):
        user_vector = self.user_embedding(user)
        movie_vector = self.movie_embedding(movie)
        x = torch.cat([user_vector, movie_vector], dim=-1)

        return self.mlp(x).squeeze()

In [9]:
n_users = df['user_id'].nunique()
n_movies = df['movie_id'].nunique()

train model and find hyperparameters using optuna

In [10]:
def objective(trial):
    emb_dim = trial.suggest_categorical("embedding_dim", [16, 32, 64])
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    dropout = trial.suggest_float("dropout", 0.2, 0.5)

    model = Recommender(n_users, n_movies, emb_dim, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    n_epochs = 5

    for epoch in range(n_epochs):
        model.train()
        for users, movies, ratings in train_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            optimizer.zero_grad()
            pred = model(users, movies)
            loss = criterion(pred, ratings)
            loss.backward()
            optimizer.step()
        
        model.eval()
        valid_mse = 0
        with torch.inference_mode():
            for users, movies, ratings in valid_loader:
                users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
                preds = model(users, movies)
                valid_mse += criterion(preds, ratings).item()
        
        avg_valid_mse = valid_mse / len(valid_loader)

        trial.report(avg_valid_mse, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return avg_valid_mse

In [11]:
torch.manual_seed(42)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=optuna.pruners.MedianPruner()
)

study.optimize(objective, n_trials=20)

[I 2026-01-28 11:05:39,189] A new study created in memory with name: no-name-0160c46a-522e-4b42-97ea-69ead40c57fb
[I 2026-01-28 11:06:11,439] Trial 0 finished with value: 0.8687361645698547 and parameters: {'embedding_dim': 32, 'lr': 0.0015751320499779737, 'dropout': 0.24680559213273096}. Best is trial 0 with value: 0.8687361645698547.
[I 2026-01-28 11:06:47,006] Trial 1 finished with value: 0.8247564500570297 and parameters: {'embedding_dim': 64, 'lr': 0.0015930522616241021, 'dropout': 0.41242177333881364}. Best is trial 1 with value: 0.8247564500570297.
[I 2026-01-28 11:07:16,876] Trial 2 finished with value: 0.8555025804042816 and parameters: {'embedding_dim': 32, 'lr': 0.00026587543983272726, 'dropout': 0.2545474901621302}. Best is trial 1 with value: 0.8247564500570297.
[I 2026-01-28 11:07:52,628] Trial 3 finished with value: 0.8752407956123353 and parameters: {'embedding_dim': 64, 'lr': 0.0007309539835912913, 'dropout': 0.2873687420594126}. Best is trial 1 with value: 0.824756450

### Train model more thouroughly on the best params found

In [18]:
# evaluate model function

def evaluate_tm(model, data_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    with torch.inference_mode():
        for users, movies, ratings in data_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            y_pred = model(users, movies)
            loss = loss_fn(y_pred, ratings)
            total_loss += loss.item()
    
    return total_loss / len(data_loader)

In [23]:
# training loop

def train_with_early_stopping(model, optimizer, loss_fn, train_loader,
                              valid_loader, n_epochs, patience=10):
    history = {"train_losses": [], "valid_losses": []}
    best_valid_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        total_loss = 0.0
        model.train()
        t0 = time.time()

        for users, movies, ratings in train_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            optimizer.zero_grad()
            y_pred = model(users, movies)
            loss = loss_fn(y_pred, ratings)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        avg_valid_loss = evaluate_tm(model, valid_loader, loss_fn)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model_tmp.pth")
            best = " (best)"
            patience_counter = 0
        else:
            patience_counter += 1
            best = ""
        
        t1 = time.time()
        history["train_losses"].append(avg_train_loss)
        history["valid_losses"].append(avg_valid_loss)

        print(f"Epoch {epoch + 1}/{n_epochs} | "
              f"Train MSE: {avg_train_loss:.4f} | "
              f"Valid MSE: {avg_valid_loss:.4f}{best} | "
              f"Time: {t1 - t0:.1f}s")

        if patience_counter >= patience:
            print("Early stopping")
            model.load_state_dict(torch.load("best_model_tmp.pth"))
            break
    
    return history

In [26]:
def plot_train_valid_loss(history):
    epochs = range(1, len(history["train_losse"]) + 1)
    plt.figure(figsize=(10, 7))
    plt.plot(epochs, history["train_losses"], "bo-", label="training")
    plt.plot(epochs, history["valid_losses"], "ro-", label="validation")

    plt.title("Training vs validation loss")
    plt.xlabel("epoch")
    plt.ylabel("loss value")
    plt.legend()
    plt.grid()
    plt.show()

In [12]:
params = study.best_params

print(f"Best learning rate: {params['lr']}")
print(f"Best dropout: {params['dropout']}")
print(f"Best embedding dimension: {params['embedding_dim']}")

Best learning rate: 0.007340778207430317
Best dropout: 0.4908611332363598
Best embedding dimension: 16


In [29]:
final_recommender = Recommender(
    n_users, n_movies,
    embedding_dim=64,
    dropout_rate=0.3
).to(device)

optimizer = torch.optim.Adam(final_recommender.parameters(), lr=params["lr"])
criterion = nn.MSELoss()

In [30]:
torch.manual_seed(42)

n_epochs = 100
history = train_with_early_stopping(
    final_recommender, 
    optimizer,
    criterion,
    train_loader,
    valid_loader,
    n_epochs,
    patience=10
)

Epoch 1/100 | Train MSE: 1.0977 | Valid MSE: 0.8369 (best) | Time: 12.0s
Epoch 2/100 | Train MSE: 0.7990 | Valid MSE: 0.8174 (best) | Time: 11.8s
Epoch 3/100 | Train MSE: 0.7062 | Valid MSE: 0.8106 (best) | Time: 11.8s
Epoch 4/100 | Train MSE: 0.6412 | Valid MSE: 0.8232 | Time: 11.8s
Epoch 5/100 | Train MSE: 0.5819 | Valid MSE: 0.8413 | Time: 11.5s
Epoch 6/100 | Train MSE: 0.5237 | Valid MSE: 0.8603 | Time: 11.8s
Epoch 7/100 | Train MSE: 0.4736 | Valid MSE: 0.8585 | Time: 11.8s
Epoch 8/100 | Train MSE: 0.4330 | Valid MSE: 0.8949 | Time: 11.8s
Epoch 9/100 | Train MSE: 0.3974 | Valid MSE: 0.9012 | Time: 11.8s
Epoch 10/100 | Train MSE: 0.3700 | Valid MSE: 0.8971 | Time: 11.9s
Epoch 11/100 | Train MSE: 0.3455 | Valid MSE: 0.9558 | Time: 11.7s
Epoch 12/100 | Train MSE: 0.3275 | Valid MSE: 0.9403 | Time: 11.8s
Epoch 13/100 | Train MSE: 0.3109 | Valid MSE: 0.9436 | Time: 11.9s
Early stopping


  model.load_state_dict(torch.load("best_model_tmp.pth"))
