In [44]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

class RatingDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = torch.LongTensor(users)
        self.movies = torch.LongTensor(movies)
        self.ratings = torch.FloatTensor(ratings)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]


class GMF(nn.Module):
    def __init__(self, n_users, n_movies, embedding_dim, global_mean=0.0):
        super().__init__()

        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_bias = nn.Embedding(n_movies, 1)

        self.affine_output = nn.Linear(embedding_dim, 1, bias=False)
        self.global_mean = global_mean

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.movie_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.movie_bias.weight)
        nn.init.xavier_uniform_(self.affine_output.weight)

    def forward(self, user_ids, movie_ids):
        u = self.user_embedding(user_ids)
        m = self.movie_embedding(movie_ids)

        interaction = self.affine_output(u * m).squeeze(-1)
        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.movie_bias(movie_ids).squeeze(-1)

        return interaction + bu + bi + self.global_mean
        
class DMF(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim, user_hidden_dims, item_hidden_dims, dropout, global_mean=0.0):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_mean = global_mean

        u_layers = []
        in_dim = embedding_dim
        for h in user_hidden_dims:
            u_layers += [
                nn.Linear(in_dim, h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            in_dim = h
        self.user_mlp = nn.Sequential(*u_layers)

        i_layers = []
        in_dim = embedding_dim
        for h in item_hidden_dims:
            i_layers += [
                nn.Linear(in_dim, h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            in_dim = h
        self.item_mlp = nn.Sequential(*i_layers)
        self.user_proj = nn.Linear(user_hidden_dims[-1], embedding_dim)
        self.item_proj = nn.Linear(item_hidden_dims[-1], embedding_dim)
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, user_ids, item_ids):
        u = self.user_embedding(user_ids)
        i = self.item_embedding(item_ids)

        u = self.user_mlp(u)
        i = self.item_mlp(i)

        u = self.user_proj(u)
        i = self.item_proj(i)

        interaction = (u * i).sum(dim=-1)

        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.item_bias(item_ids).squeeze(-1)

        return interaction + bu + bi + self.global_mean
        

class AttentionBlock(nn.Module):
    def __init__(self, embed_dim, num_heads=4, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + self.relu(self.dropout(attn_out)))
        x = self.norm2(x + self.dropout(self.relu(self.fc(x))))
        return x.squeeze(1)


class NeuMF(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim=64, mlp_hidden_dims=(128, 64), dropout=0.1, global_mean=0.0):
        super().__init__()

        self.user_embedding_gmf = nn.Embedding(n_users, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(n_items, embedding_dim)

        self.user_embedding_mlp = nn.Embedding(n_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(n_items, embedding_dim)

        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_mean = global_mean

        layers = []
        input_dim = embedding_dim * 2
        for h in mlp_hidden_dims:
            layers += [
                nn.Linear(input_dim, h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            input_dim = h
        self.mlp = nn.Sequential(*layers)

        self.output_layer = nn.Linear(embedding_dim + input_dim, 1)

        self._init_weights()

    def _init_weights(self):
        for emb in [
            self.user_embedding_gmf,
            self.item_embedding_gmf,
            self.user_embedding_mlp,
            self.item_embedding_mlp
        ]:
            nn.init.normal_(emb.weight, std=0.01)

        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, user_ids, item_ids):
        u_gmf = self.user_embedding_gmf(user_ids)
        i_gmf = self.item_embedding_gmf(item_ids)
        gmf_out = u_gmf * i_gmf

        u_mlp = self.user_embedding_mlp(user_ids)
        i_mlp = self.item_embedding_mlp(item_ids)
        mlp_input = torch.cat([u_mlp, i_mlp], dim=-1)
        mlp_out = self.mlp(mlp_input)

        fusion = torch.cat([gmf_out, mlp_out], dim=-1)
        pred = self.output_layer(fusion).squeeze(-1)

        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.item_bias(item_ids).squeeze(-1)

        return pred + bu + bi + self.global_mean


class AttentionNet(nn.Module):
    def __init__(self, n_users, n_movies, embedding_dim, hidden_dims,
                 n_attention_blocks, dropout, num_heads=4, global_mean=0.0):
        super().__init__()

        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        self.feature_dim = hidden_dims[-1]
        self.user_bias = nn.Embedding(n_users, 1)
        self.movie_bias = nn.Embedding(n_movies, 1)

        self.global_mean = global_mean

        self.attention_blocks = nn.ModuleList([
            AttentionBlock(embedding_dim * 2, num_heads, dropout)
            for _ in range(n_attention_blocks)
        ])

        layers = []
        dim = embedding_dim * 2
        for h in hidden_dims:
            layers += [
                nn.Linear(dim, h),
                nn.ReLU(),
                nn.LayerNorm(h),
                nn.Dropout(dropout)
            ]
            dim = h

        self.mlp_features = nn.Sequential(*layers)
        self.final_layer = nn.Linear(dim, 1)

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.movie_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.movie_bias.weight)

    def forward(self, user_ids, movie_ids, return_features=False):
        u = self.user_embedding(user_ids)
        m = self.movie_embedding(movie_ids)

        x = torch.cat([u, m], dim=-1)
        for block in self.attention_blocks:
            x = block(x)

        features = self.mlp_features(x)

        if return_features:
            return features

        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.movie_bias(movie_ids).squeeze(-1)

        out = self.final_layer(features).squeeze(-1)
        return out + bu + bi + self.global_mean


class FusionNCF(nn.Module):
    def __init__(self, gmf_model, attn_model, dropout=0.1, global_mean=0.0):
        super().__init__()

        self.gmf = gmf_model
        self.attn_net = attn_model
        self.global_mean = global_mean

        self.user_bias = gmf_model.user_bias
        self.movie_bias = gmf_model.movie_bias

        gmf_dim = gmf_model.user_embedding.embedding_dim
        attn_dim = attn_model.feature_dim

        self.attn_layer = nn.MultiheadAttention(
            embed_dim=gmf_dim + attn_dim,
            num_heads=4,
            batch_first=True
        )

        self.fusion_layer = nn.Linear(gmf_dim + attn_dim, 1)
        self.dropout = nn.Dropout(dropout)

        nn.init.xavier_uniform_(self.fusion_layer.weight)

    def forward(self, user_ids, movie_ids):
        u = self.gmf.user_embedding(user_ids)
        m = self.gmf.movie_embedding(movie_ids)
        gmf_vec = u * m

        attn_vec = self.attn_net(user_ids, movie_ids, return_features=True)

        x = torch.cat([gmf_vec, attn_vec], dim=-1).unsqueeze(1)
        x, _ = self.attn_layer(x, x, x)
        x = self.dropout(x.squeeze(1))

        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.movie_bias(movie_ids).squeeze(-1)

        out = self.fusion_layer(x).squeeze(-1)
        return out + bu + bi + self.global_mean
        

class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim, n_layers, edge_index, global_mean=0.0, learn_global_mean=False):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers

        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)

        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

        if learn_global_mean:
            self.global_mean = nn.Parameter(torch.tensor(global_mean))
        else:
            self.register_buffer("global_mean", torch.tensor(global_mean))

        self.register_buffer("norm_adj", self.build_norm_adj(edge_index))

    def build_norm_adj(self, edge_index):
        device = edge_index.device
        num_nodes = self.n_users + self.n_items

        u = edge_index[0]
        i = edge_index[1] + self.n_users

        row = torch.cat([u, i])
        col = torch.cat([i, u])

        values = torch.ones(row.size(0), device=device)

        adj = torch.sparse_coo_tensor(
            torch.stack([row, col]),
            values,
            (num_nodes, num_nodes)
        ).coalesce()

        deg = torch.sparse.sum(adj, dim=1).to_dense()
        deg_inv_sqrt = torch.pow(deg + 1e-8, -0.5)
        deg_inv_sqrt[torch.isinf(deg_inv_sqrt)] = 0.0

        r, c = adj.indices()
        norm_values = deg_inv_sqrt[r] * adj.values() * deg_inv_sqrt[c]

        norm_adj = torch.sparse_coo_tensor(
            adj.indices(),
            norm_values,
            adj.size()
        )

        return norm_adj

    def propagate(self):
        all_embeddings = torch.cat(
            [self.user_embedding.weight, self.item_embedding.weight], dim=0
        )

        embeddings = [all_embeddings]

        for _ in range(self.n_layers):
            all_embeddings = torch.sparse.mm(self.norm_adj, all_embeddings)
            embeddings.append(all_embeddings)

        final_embeddings = torch.mean(torch.stack(embeddings, dim=0), dim=0)
        users, items = torch.split(
            final_embeddings, [self.n_users, self.n_items]
        )

        return users, items
        
    def propagate_perturbed(self, eps=0.1):
        all_embeddings = torch.cat(
            [self.user_embedding.weight, self.item_embedding.weight], dim=0
        )
    
        embeddings = [all_embeddings]
    
        for _ in range(self.n_layers):
            all_embeddings = torch.sparse.mm(self.norm_adj, all_embeddings)
            noise = F.normalize(torch.rand_like(all_embeddings), dim=-1)
            all_embeddings = all_embeddings + eps * noise
            embeddings.append(all_embeddings)
    
        final_embeddings = torch.mean(torch.stack(embeddings, dim=0), dim=0)
        return torch.split(final_embeddings, [self.n_users, self.n_items])

    def forward(self, user_ids, item_ids):
        user_emb, item_emb = self.propagate()

        u = user_emb[user_ids]
        i = item_emb[item_ids]

        interaction = (u * i).sum(dim=-1)

        bu = self.user_bias(user_ids).squeeze(-1)
        bi = self.item_bias(item_ids).squeeze(-1)

        return interaction + bu + bi + self.global_mean


class LightGCNPP(LightGCN):
    def __init__(self, *args, residual=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.residual = residual
        self.layer_weights = nn.Parameter(
            torch.ones(self.n_layers + 1)
        )

    def propagate(self):
        all_embeddings = torch.cat(
            [self.user_embedding.weight, self.item_embedding.weight], dim=0
        )

        embeddings = [all_embeddings]

        for _ in range(self.n_layers):
            neigh = torch.sparse.mm(self.norm_adj, all_embeddings)
            all_embeddings = neigh + all_embeddings if self.residual else neigh
            embeddings.append(all_embeddings)

        stack = torch.stack(embeddings, dim=0)
        alpha = torch.softmax(self.layer_weights, dim=0)
        final_embeddings = torch.sum(alpha[:, None, None] * stack, dim=0)

        return torch.split(final_embeddings, [self.n_users, self.n_items])

class SimGCL(LightGCN):
    def __init__(self, *args, eps=0.1, temperature=0.2, lambda_cl=0.1, **kwargs):
        super().__init__(*args, **kwargs)
        self.eps = eps
        self.temperature = temperature
        self.lambda_cl = lambda_cl
    
    def contrastive_loss(self, users, items):
        u1, i1 = self.propagate_perturbed(self.eps)
        u2, i2 = self.propagate_perturbed(self.eps)
        
        loss_u = info_nce_loss(u1[users], u2[users], self.temperature)
        loss_i = info_nce_loss(i1[items], i2[items], self.temperature)
        return loss_u + loss_i


def info_nce_loss(z1, z2, temperature=0.2):
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    
    
    pos = torch.exp(torch.sum(z1 * z2, dim=1) / temperature)
    ttl = torch.exp(torch.matmul(z1, z2.t()) / temperature).sum(dim=1)
    return -torch.log(pos / ttl).mean()

def build_edge_index(df):
    users = torch.LongTensor(df['user_idx'].values)
    items = torch.LongTensor(df['movie_idx'].values)
    return torch.stack([users, items], dim=0)

class Ensemble(nn.Module):
    def __init__(self, models, learn_weights=True):
        super().__init__()

        self.models = nn.ModuleList(models)

        for m in self.models:
            for p in m.parameters():
                p.requires_grad = False

        n_models = len(models)

        if learn_weights:
            self.weights = nn.Parameter(torch.ones(n_models) / n_models)
        else:
            self.register_buffer("weights", torch.ones(n_models) / n_models)

    def forward(self, users, items):
        preds = []
        for model in self.models:
            preds.append(model(users, items))

        preds = torch.stack(preds, dim=0)
        weights = torch.softmax(self.weights, dim=0)

        return (weights[:, None] * preds).sum(dim=0)

In [45]:
def preprocessing(data_path, test_size=0.02, random_state=42):
    if not os.path.exists(data_path):
        print("Data file not found. Generating dummy data...")
        users = np.random.randint(0, 100, 10000)
        movies = np.random.randint(0, 200, 10000)
        ratings = np.random.randint(1, 6, 10000)
        df = pd.DataFrame({'userId': users, 'movieId': movies, 'rating': ratings})
    else:
        df = pd.read_csv(data_path, sep='\t', names=['userId', 'movieId', 'rating'])

    user_ids = df['userId'].unique()
    movie_ids = df['movieId'].unique()

    min_rating = df['rating'].min()
    max_rating = df['rating'].max()
    print(f"Scaling ratings from [{min_rating}, {max_rating}] to [0, 1].")
    df['rating'] = (df['rating'] - min_rating) / (max_rating - min_rating)

    user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}

    df['user_idx'] = df['userId'].map(user_map)
    df['movie_idx'] = df['movieId'].map(movie_map)

    n_users = len(user_map)
    n_movies = len(movie_map)

    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    val_size_relative = test_size / (1 - test_size)
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_relative, random_state=random_state)

    return train_df, val_df, test_df, user_map, movie_map, n_users, n_movies, min_rating, max_rating


def train(model, train_df, val_df, batch_size, epochs, learning_rate, weight_decay, name, device, min_rating, max_rating, lambda_cl=0.1, temperature=0.2):
    print(f"\nTraining Model: {name}")

    train_dataset = RatingDataset(
        train_df['user_idx'].values,
        train_df['movie_idx'].values,
        train_df['rating'].values
    )
    val_dataset = RatingDataset(
        val_df['user_idx'].values,
        val_df['movie_idx'].values,
        val_df['rating'].values
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    mse_criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    best_val_loss = float('inf')
    scaling_factor = max_rating - min_rating

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for users, pos_items, ratings in train_loader:
            users = users.to(device)
            pos_items = pos_items.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()

            predictions = model(users, pos_items)
            rating_loss = mse_criterion(predictions, ratings)

            cl_loss = (
                model.contrastive_loss(users, pos_items)
                if hasattr(model, "contrastive_loss")
                else 0.0
            )
            
            loss = rating_loss + (
                model.lambda_cl * cl_loss
                if hasattr(model, "lambda_cl")
                else 0.0
            )

            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for users, items, ratings in val_loader:
                users = users.to(device)
                items = items.to(device)
                ratings = ratings.to(device)

                preds = model(users, items)

                loss = mse_criterion(preds, ratings)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        val_rmse = np.sqrt(val_loss) * scaling_factor

        print(
            f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val Loss: {val_loss:.4f} | "
            f"Val RMSE: {val_rmse:.4f}"
        )

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), name + ".pth")

    model.load_state_dict(torch.load(name + ".pth", map_location=device))
    print(f"Best Val RMSE: {np.sqrt(best_val_loss) * scaling_factor:.4f}")
    return model


def eval_model(model, test_df, batch_size, device, min_rating, max_rating):
    test_dataset = RatingDataset(test_df['user_idx'].values, test_df['movie_idx'].values, test_df['rating'].values)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    criterion = nn.MSELoss()
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for users, movies, ratings in test_loader:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            predictions = model(users, movies)
            test_loss += criterion(predictions, ratings).item()
    test_loss /= len(test_loader)

    scaling_factor = max_rating - min_rating
    test_rmse_scaled = np.sqrt(test_loss)
    test_rmse = test_rmse_scaled * scaling_factor

    print(f"Test RMSE: {test_rmse:.4f}")
    return test_rmse

def make_model(model_type, config, n_users, n_movies, pretrained_models=None):
    device = config['device']
    global_mean = data_bundle[0]['rating'].mean()

    if model_type == 'gmf':
        model = GMF(n_users, n_movies, config['embedding_dim'], global_mean=global_mean)
        
    elif model_type == 'lightgbm':
            if not pretrained_models or 'gmf' not in pretrained_models:
                 raise ValueError("LightGBM requires a pretrained 'gmf' model for embeddings.")
                 
            model = LightGBMRec(
                n_users=n_users, 
                n_movies=n_movies, 
                gmf_model=pretrained_models['gmf'],
                train_df=config['train_df'],
                device=device
            )
            return model
        
    elif model_type == 'attention':
        model = AttentionNet(
            n_users=n_users,
            n_movies=n_movies,
            embedding_dim=config['embedding_dim'],
            hidden_dims=config['hidden_dims'],
            n_attention_blocks=config['n_attention_blocks'],
            num_heads=config['n_heads'],
            dropout=config['dropout'],
            global_mean=global_mean
        )
        
    elif model_type == 'dmf':
        model = DMF(
            n_users=n_users,
            n_items=n_movies,
            embedding_dim=config['embedding_dim'],
            user_hidden_dims=config.get('user_hidden_dims', [128, 64]),
            item_hidden_dims=config.get('item_hidden_dims', [128, 64]),
            dropout=config.get('dropout', 0.1),
            global_mean=global_mean
        )

    elif model_type == 'ncf':
        if not pretrained_models or 'gmf' not in pretrained_models or 'attention' not in pretrained_models:
            raise ValueError("NCF requires pretrained 'gmf' and 'attention' models passed in 'pretrained_models'")

        gmf_pretrained = pretrained_models['gmf']
        attn_pretrained = pretrained_models['attention']

        model = FusionNCF(gmf_pretrained, attn_pretrained, dropout=config.get('dropout', 0.1))

        if config.get('freeze_pretrained', False):
            for param in model.gmf.parameters():
                param.requires_grad = False
            for param in model.attn_net.parameters():
                param.requires_grad = False
                
    elif model_type == 'nmf':
        model = NeuMF(
            n_users=n_users,
            n_items=n_movies,
            embedding_dim=config['embedding_dim'],
            mlp_hidden_dims=config.get('mlp_hidden_dims', [128, 64]),
            dropout=config.get('dropout', 0.1),
            global_mean=global_mean
        )

    elif model_type == 'lightgcn':
        edge_index = build_edge_index(config['train_df']).to(device)
    
        model = LightGCN(
            n_users=n_users,
            n_items=n_movies,
            embedding_dim=config['embedding_dim'],
            n_layers=config['n_layers'],
            edge_index=edge_index,
            global_mean=global_mean
        )
        
    elif model_type == 'lightgcnpp':
        edge_index = build_edge_index(config['train_df']).to(device)
    
        model = LightGCNPP(
            n_users=n_users,
            n_items=n_movies,
            embedding_dim=config['embedding_dim'],
            n_layers=config['n_layers'],
            edge_index=edge_index,
            global_mean=global_mean,
            residual=True
        )

    elif model_type == 'simgcl':
        edge_index = build_edge_index(config['train_df']).to(device)
    
        model = SimGCL(
            n_users=n_users,
            n_items=n_movies,
            embedding_dim=config['embedding_dim'],
            n_layers=config['n_layers'],
            edge_index=edge_index,
            global_mean=global_mean,
            eps=config.get('eps', 0.1),
            temperature=config.get('temperature', 0.2),
            lambda_cl=config.get('lambda_cl', 0.1)
        )
        
    elif model_type == 'ensemble':
        if not pretrained_models:
            raise ValueError("Ensemble requires pretrained_models")
    
        model_list = []
        for name in config['ensemble_models']:
            model_list.append(pretrained_models[name])
    
        model = Ensemble(
            models=model_list,
            learn_weights=config.get('learn_weights', True)
        )

    else:
        raise ValueError(f"Unknown model_type: {model_type}")

    return model.to(device)
    
def sample(model, test_df, n_samples, device):
    if n_samples > len(test_df):
        n_samples = len(test_df)

    sample_df = test_df.sample(n=n_samples).copy()

    model.eval()
    with torch.no_grad():
        users = torch.LongTensor(sample_df['user_idx'].values).to(device)
        movies = torch.LongTensor(sample_df['movie_idx'].values).to(device)
        predictions = 1 + model(users, movies) * 4
        predictions = torch.clamp(predictions, 1.0, 5.0).cpu().numpy()

    sample_df['predicted_rating'] = predictions
    sample_df['rating'] = 1 + 4 * sample_df['rating']
    sample_df['error'] = sample_df['predicted_rating'] - sample_df['rating']
    sample_df['absolute_error'] = np.abs(sample_df['error'])

    sample_rmse = np.sqrt(np.mean(sample_df['error'] ** 2))
    sample_mae = np.mean(sample_df['absolute_error'])

    display_df = sample_df[['userId', 'movieId', 'rating', 'predicted_rating', 'error', 'absolute_error']].head(10)
    print(display_df.round(2).to_string(index=False))

    print("\n" + "-" * 100)
    print(f"RMSE: {sample_rmse:.4f}")
    print(f"MAE: {sample_mae:.4f}")

    return sample_df, sample_rmse

def pipeline(model_type, config, data_bundle, pretrained_models=None):
    train_df, val_df, test_df, _, _, n_users, n_movies, min_rating, max_rating = data_bundle

    model = make_model(model_type, config, n_users, n_movies, pretrained_models)

    model = train(
        model=model,
        train_df=train_df,
        val_df=val_df,
        batch_size=config['batch_size'],
        epochs=config['epochs'],
        learning_rate=config['learning_rate'],
        weight_decay=config['weight_decay'],
        name=f"best_{model_type}",
        device=config['device'],
        min_rating=min_rating,
        max_rating=max_rating
    )

    eval_model(model, test_df, config['batch_size'], config['device'], min_rating, max_rating)

    return model

In [15]:
def generate_submission(movie_map, user_map, model, prompt_path, train_df, output_path="submission.csv", device='cuda'):
    prompt_df = pd.read_csv(prompt_path, sep='\t', names=['UserId', 'MovieId'])
    prompt_df['UserIdx'] = prompt_df['UserId'].map(user_map)
    prompt_df['MovieIdx'] = prompt_df['MovieId'].map(movie_map)

    model.eval()
    model = model.to(device)

    preds = []
    with torch.no_grad():
        for _, row in prompt_df.iterrows():
            user_idx = row['UserIdx']
            movie_idx = row['MovieIdx']

            if pd.notna(movie_idx):
                user_tensor = torch.LongTensor([user_idx]).to(device)
                movie_tensor = torch.LongTensor([movie_idx]).to(device)
                output = model(user_tensor, movie_tensor).item()
                pred = np.clip(1 + output * 4, 1.0, 5.0)
            else:
                pred = 4.0

            preds.append(pred)

    submission = pd.DataFrame({
        "Id": np.arange(1, len(preds) + 1),
        "Score": preds
    })

    submission.to_csv(output_path, index=False)
    print(f"Saved submission file to: {output_path}")
    print(submission.head())

    return submission


In [4]:
def check_coverage(movie_map, user_map, prompt_path):
    prompt_df = pd.read_csv(prompt_path, sep='\t', names=['UserId', 'MovieId'])

    missing_users = set(prompt_df['UserId']) - set(user_map.keys())
    missing_movies = set(prompt_df['MovieId']) - set(movie_map.keys())

    ok = True
    if missing_users:
        print(f"Missing {len(missing_users)} users not found in user_map:")
        print(list(missing_users)[:10], "..." if len(missing_users) > 10 else "")
        ok = False

    if missing_movies:
        print(f"Missing {len(missing_movies)} movies not found in movie_map:")
        print(list(missing_movies)[:10], "..." if len(missing_movies) > 10 else "")
        ok = False

    if ok:
        print("All users and movies are covered.")

    return ok

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_PATH = '/kaggle/input/recsys/train.txt'

data_bundle = preprocessing(DATA_PATH)
train_df, val_df, test_df, user_map, movie_map, n_users, n_movies, min_value, max_value = data_bundle
print(f"Users: {n_users}, Movies: {n_movies}")
trained_models = {}

Scaling ratings from [1, 5] to [0, 1].
Users: 943, Movies: 1680


In [46]:
gmf_config = {
    'epochs': 7,
    'batch_size': 512,
    'learning_rate': 0.00065,
    'weight_decay': 1e-5,
    'embedding_dim': 64,
    'device': device
}
trained_models['gmf'] = pipeline('gmf', gmf_config, data_bundle)


Training Model: best_gmf
Epoch 1/7 | Train Loss: 0.0734 | Val Loss: 0.0669 | Val RMSE: 1.0344
Epoch 2/7 | Train Loss: 0.0635 | Val Loss: 0.0581 | Val RMSE: 0.9642
Epoch 3/7 | Train Loss: 0.0549 | Val Loss: 0.0524 | Val RMSE: 0.9152
Epoch 4/7 | Train Loss: 0.0495 | Val Loss: 0.0498 | Val RMSE: 0.8923
Epoch 5/7 | Train Loss: 0.0458 | Val Loss: 0.0488 | Val RMSE: 0.8832
Epoch 6/7 | Train Loss: 0.0422 | Val Loss: 0.0480 | Val RMSE: 0.8766
Epoch 7/7 | Train Loss: 0.0387 | Val Loss: 0.0479 | Val RMSE: 0.8754
Best Val RMSE: 0.8754
Test RMSE: 0.9173


In [63]:
dmf_config = {
    'epochs': 20,
    'batch_size': 512,
    'learning_rate': 0.003,
    'weight_decay': 1e-6,
    'embedding_dim': 64,
    'user_hidden_dims': [64, 32],
    'item_hidden_dims': [64, 32],
    'dropout': 0.25,
    'device': device
}
trained_models['dmf'] = pipeline('dmf', dmf_config, data_bundle)


Training Model: best_dmf
Epoch 1/20 | Train Loss: 0.0712 | Val Loss: 0.0555 | Val RMSE: 0.9427
Epoch 2/20 | Train Loss: 0.0570 | Val Loss: 0.0533 | Val RMSE: 0.9234
Epoch 3/20 | Train Loss: 0.0553 | Val Loss: 0.0523 | Val RMSE: 0.9147
Epoch 4/20 | Train Loss: 0.0547 | Val Loss: 0.0524 | Val RMSE: 0.9157
Epoch 5/20 | Train Loss: 0.0540 | Val Loss: 0.0533 | Val RMSE: 0.9237
Epoch 6/20 | Train Loss: 0.0538 | Val Loss: 0.0518 | Val RMSE: 0.9107
Epoch 7/20 | Train Loss: 0.0536 | Val Loss: 0.0516 | Val RMSE: 0.9085
Epoch 8/20 | Train Loss: 0.0533 | Val Loss: 0.0519 | Val RMSE: 0.9113
Epoch 9/20 | Train Loss: 0.0533 | Val Loss: 0.0515 | Val RMSE: 0.9075
Epoch 10/20 | Train Loss: 0.0531 | Val Loss: 0.0512 | Val RMSE: 0.9050
Epoch 11/20 | Train Loss: 0.0526 | Val Loss: 0.0511 | Val RMSE: 0.9039
Epoch 12/20 | Train Loss: 0.0525 | Val Loss: 0.0515 | Val RMSE: 0.9074
Epoch 13/20 | Train Loss: 0.0524 | Val Loss: 0.0511 | Val RMSE: 0.9045
Epoch 14/20 | Train Loss: 0.0521 | Val Loss: 0.0527 | Val RM

In [47]:
attn_config = {
    'epochs': 11,
    'batch_size': 512,
    'learning_rate': 0.00085,
    'weight_decay': 1e-3,
    'embedding_dim': 64,
    'hidden_dims': [128, 64],
    'n_attention_blocks': 2,
    'n_heads': 4,
    'dropout': 0.3,
    'device': device
}
trained_models['attention'] = pipeline('attention', attn_config, data_bundle)


Training Model: best_attention
Epoch 1/11 | Train Loss: 0.1083 | Val Loss: 0.0665 | Val RMSE: 1.0312
Epoch 2/11 | Train Loss: 0.0650 | Val Loss: 0.0588 | Val RMSE: 0.9703
Epoch 3/11 | Train Loss: 0.0585 | Val Loss: 0.0547 | Val RMSE: 0.9358
Epoch 4/11 | Train Loss: 0.0560 | Val Loss: 0.0548 | Val RMSE: 0.9364
Epoch 5/11 | Train Loss: 0.0549 | Val Loss: 0.0531 | Val RMSE: 0.9220
Epoch 6/11 | Train Loss: 0.0539 | Val Loss: 0.0524 | Val RMSE: 0.9157
Epoch 7/11 | Train Loss: 0.0536 | Val Loss: 0.0527 | Val RMSE: 0.9187
Epoch 8/11 | Train Loss: 0.0530 | Val Loss: 0.0532 | Val RMSE: 0.9222
Epoch 9/11 | Train Loss: 0.0527 | Val Loss: 0.0521 | Val RMSE: 0.9128
Epoch 10/11 | Train Loss: 0.0523 | Val Loss: 0.0519 | Val RMSE: 0.9112
Epoch 11/11 | Train Loss: 0.0520 | Val Loss: 0.0512 | Val RMSE: 0.9054
Best Val RMSE: 0.9054
Test RMSE: 0.9388


In [48]:
ncf_config = {
    'epochs': 15,
    'batch_size': 256,
    'learning_rate': 0.00035,
    'weight_decay': 0.001,
    'dropout': 0.3,
    'freeze_pretrained': False,
    'device': device
}
trained_models['ncf'] = pipeline('ncf', ncf_config, data_bundle, pretrained_models=trained_models)


Training Model: best_ncf
Epoch 1/15 | Train Loss: 0.0638 | Val Loss: 0.0493 | Val RMSE: 0.8880
Epoch 2/15 | Train Loss: 0.0528 | Val Loss: 0.0483 | Val RMSE: 0.8788
Epoch 3/15 | Train Loss: 0.0515 | Val Loss: 0.0483 | Val RMSE: 0.8789
Epoch 4/15 | Train Loss: 0.0505 | Val Loss: 0.0486 | Val RMSE: 0.8822
Epoch 5/15 | Train Loss: 0.0500 | Val Loss: 0.0486 | Val RMSE: 0.8816
Epoch 6/15 | Train Loss: 0.0480 | Val Loss: 0.0480 | Val RMSE: 0.8766
Epoch 7/15 | Train Loss: 0.0471 | Val Loss: 0.0484 | Val RMSE: 0.8804
Epoch 8/15 | Train Loss: 0.0467 | Val Loss: 0.0483 | Val RMSE: 0.8789
Epoch 9/15 | Train Loss: 0.0463 | Val Loss: 0.0488 | Val RMSE: 0.8841
Epoch 10/15 | Train Loss: 0.0447 | Val Loss: 0.0498 | Val RMSE: 0.8929
Epoch 11/15 | Train Loss: 0.0442 | Val Loss: 0.0495 | Val RMSE: 0.8895
Epoch 12/15 | Train Loss: 0.0439 | Val Loss: 0.0501 | Val RMSE: 0.8955
Epoch 13/15 | Train Loss: 0.0430 | Val Loss: 0.0501 | Val RMSE: 0.8956
Epoch 14/15 | Train Loss: 0.0426 | Val Loss: 0.0507 | Val RM

In [66]:
nmf_config = {
    'epochs': 9,
    'batch_size': 1024,
    'learning_rate': 0.0005,
    'weight_decay': 1e-6,
    'embedding_dim': 128,
    'mlp_hidden_dims': [128, 64, 32],
    'dropout': 0.1,
    'device': device
}
trained_models['nmf'] = pipeline('nmf', nmf_config, data_bundle)


Training Model: best_nmf
Epoch 1/9 | Train Loss: 0.2065 | Val Loss: 0.0586 | Val RMSE: 0.9681
Epoch 2/9 | Train Loss: 0.0619 | Val Loss: 0.0550 | Val RMSE: 0.9378
Epoch 3/9 | Train Loss: 0.0587 | Val Loss: 0.0547 | Val RMSE: 0.9359
Epoch 4/9 | Train Loss: 0.0569 | Val Loss: 0.0522 | Val RMSE: 0.9143
Epoch 5/9 | Train Loss: 0.0523 | Val Loss: 0.0502 | Val RMSE: 0.8961
Epoch 6/9 | Train Loss: 0.0464 | Val Loss: 0.0494 | Val RMSE: 0.8893
Epoch 7/9 | Train Loss: 0.0406 | Val Loss: 0.0481 | Val RMSE: 0.8775
Epoch 8/9 | Train Loss: 0.0350 | Val Loss: 0.0489 | Val RMSE: 0.8842
Epoch 9/9 | Train Loss: 0.0297 | Val Loss: 0.0486 | Val RMSE: 0.8817
Best Val RMSE: 0.8775
Test RMSE: 0.9013


In [67]:
lightgcn_config = {
    'epochs': 30,
    'batch_size': 512,
    'learning_rate': 0.002,
    'weight_decay': 1e-6,
    'embedding_dim': 128,
    'n_layers': 6,
    'device': device,
    'train_df': data_bundle[0]
}
trained_models['lightgcn'] = pipeline('lightgcn', lightgcn_config, data_bundle)


Training Model: best_lightgcn
Epoch 1/30 | Train Loss: 0.1484 | Val Loss: 0.1030 | Val RMSE: 1.2837
Epoch 2/30 | Train Loss: 0.0895 | Val Loss: 0.0847 | Val RMSE: 1.1641
Epoch 3/30 | Train Loss: 0.0768 | Val Loss: 0.0751 | Val RMSE: 1.0962
Epoch 4/30 | Train Loss: 0.0698 | Val Loss: 0.0693 | Val RMSE: 1.0532
Epoch 5/30 | Train Loss: 0.0656 | Val Loss: 0.0662 | Val RMSE: 1.0289
Epoch 6/30 | Train Loss: 0.0628 | Val Loss: 0.0633 | Val RMSE: 1.0062
Epoch 7/30 | Train Loss: 0.0610 | Val Loss: 0.0616 | Val RMSE: 0.9930
Epoch 8/30 | Train Loss: 0.0597 | Val Loss: 0.0603 | Val RMSE: 0.9822
Epoch 9/30 | Train Loss: 0.0588 | Val Loss: 0.0592 | Val RMSE: 0.9736
Epoch 10/30 | Train Loss: 0.0581 | Val Loss: 0.0587 | Val RMSE: 0.9693
Epoch 11/30 | Train Loss: 0.0575 | Val Loss: 0.0581 | Val RMSE: 0.9642
Epoch 12/30 | Train Loss: 0.0571 | Val Loss: 0.0575 | Val RMSE: 0.9595
Epoch 13/30 | Train Loss: 0.0567 | Val Loss: 0.0572 | Val RMSE: 0.9570
Epoch 14/30 | Train Loss: 0.0563 | Val Loss: 0.0566 | V

In [11]:
lightgcnpp_config = {
    'epochs': 12,
    'batch_size': 1024,
    'learning_rate': 0.001,
    'weight_decay': 1e-6,

    'embedding_dim': 48,
    'n_layers': 3,

    'device': device,
    'train_df': data_bundle[0]
}
trained_models['lightgcnpp'] = pipeline('lightgcnpp', lightgcnpp_config, data_bundle)


Training Model: best_lightgcnpp
Epoch 1/12 | Train Loss: 0.0702 | Val Loss: 0.0628 | Val RMSE: 1.0020
Epoch 2/12 | Train Loss: 0.0597 | Val Loss: 0.0568 | Val RMSE: 0.9535
Epoch 3/12 | Train Loss: 0.0536 | Val Loss: 0.0533 | Val RMSE: 0.9235
Epoch 4/12 | Train Loss: 0.0494 | Val Loss: 0.0512 | Val RMSE: 0.9050
Epoch 5/12 | Train Loss: 0.0460 | Val Loss: 0.0496 | Val RMSE: 0.8908
Epoch 6/12 | Train Loss: 0.0429 | Val Loss: 0.0485 | Val RMSE: 0.8813
Epoch 7/12 | Train Loss: 0.0399 | Val Loss: 0.0482 | Val RMSE: 0.8782
Epoch 8/12 | Train Loss: 0.0368 | Val Loss: 0.0480 | Val RMSE: 0.8767
Epoch 9/12 | Train Loss: 0.0338 | Val Loss: 0.0481 | Val RMSE: 0.8774
Epoch 10/12 | Train Loss: 0.0308 | Val Loss: 0.0486 | Val RMSE: 0.8817
Epoch 11/12 | Train Loss: 0.0279 | Val Loss: 0.0493 | Val RMSE: 0.8880
Epoch 12/12 | Train Loss: 0.0250 | Val Loss: 0.0496 | Val RMSE: 0.8910
Best Val RMSE: 0.8767
Test RMSE: 0.8973


In [38]:
simgcl_config = {
    'epochs': 30,
    'batch_size': 1024,
    'learning_rate': 0.0017,
    'weight_decay': 1e-6,
    'embedding_dim': 64,
    'n_layers': 3,

    'eps': 0.1,
    'temperature': 0.2,
    'lambda_cl': 0.1,

    'device': device,
    'train_df': data_bundle[0]
}
trained_models['simgcl'] = pipeline('simgcl', simgcl_config, data_bundle)


Training Model: best_simgcl
Epoch 1/30 | Train Loss: 0.7550 | Val Loss: 0.0628 | Val RMSE: 1.0023
Epoch 2/30 | Train Loss: 0.5769 | Val Loss: 0.0581 | Val RMSE: 0.9644
Epoch 3/30 | Train Loss: 0.5605 | Val Loss: 0.0563 | Val RMSE: 0.9488
Epoch 4/30 | Train Loss: 0.5525 | Val Loss: 0.0551 | Val RMSE: 0.9394
Epoch 5/30 | Train Loss: 0.5478 | Val Loss: 0.0546 | Val RMSE: 0.9349
Epoch 6/30 | Train Loss: 0.5446 | Val Loss: 0.0542 | Val RMSE: 0.9310
Epoch 7/30 | Train Loss: 0.5424 | Val Loss: 0.0539 | Val RMSE: 0.9290
Epoch 8/30 | Train Loss: 0.5406 | Val Loss: 0.0537 | Val RMSE: 0.9269
Epoch 9/30 | Train Loss: 0.5394 | Val Loss: 0.0536 | Val RMSE: 0.9262
Epoch 10/30 | Train Loss: 0.5383 | Val Loss: 0.0534 | Val RMSE: 0.9244
Epoch 11/30 | Train Loss: 0.5373 | Val Loss: 0.0535 | Val RMSE: 0.9248
Epoch 12/30 | Train Loss: 0.5364 | Val Loss: 0.0534 | Val RMSE: 0.9243
Epoch 13/30 | Train Loss: 0.5358 | Val Loss: 0.0533 | Val RMSE: 0.9234
Epoch 14/30 | Train Loss: 0.5352 | Val Loss: 0.0534 | Val

In [68]:
ensemble_models = ['lightgcn', 'ncf', 'gmf', 'simgcl']
for model_name in ensemble_models:
    state = torch.load(f"best_{model_name}.pth", map_location="cpu")
    trained_models[model_name].load_state_dict(state)

ensemble_config = {
    'epochs': 8,
    'batch_size': 1024,
    'learning_rate': 0.04,
    'weight_decay': 0.0001,
    'ensemble_models': ensemble_models,
    'learn_weights': True,
    'device': device
}
trained_models['ensemble'] = pipeline('ensemble', ensemble_config, data_bundle, trained_models)


Training Model: best_ensemble
Epoch 1/8 | Train Loss: 0.0468 | Val Loss: 0.0503 | Val RMSE: 0.8969
Epoch 2/8 | Train Loss: 0.0467 | Val Loss: 0.0503 | Val RMSE: 0.8968
Epoch 3/8 | Train Loss: 0.0467 | Val Loss: 0.0502 | Val RMSE: 0.8961
Epoch 4/8 | Train Loss: 0.0467 | Val Loss: 0.0502 | Val RMSE: 0.8959
Epoch 5/8 | Train Loss: 0.0467 | Val Loss: 0.0502 | Val RMSE: 0.8964
Epoch 6/8 | Train Loss: 0.0467 | Val Loss: 0.0502 | Val RMSE: 0.8962
Epoch 7/8 | Train Loss: 0.0467 | Val Loss: 0.0502 | Val RMSE: 0.8963
Epoch 8/8 | Train Loss: 0.0467 | Val Loss: 0.0501 | Val RMSE: 0.8956
Best Val RMSE: 0.8956
Test RMSE: 0.9129


In [None]:
check_coverage(movie_map, user_map, "/kaggle/input/recsys/test.txt")

In [49]:
model_name = 'ncf'
state = torch.load(f"best_{model_name}.pth", map_location="cpu")
trained_models[model_name].load_state_dict(state)
trained_models[model_name].eval()
sample(
    model=trained_models[model_name],
    test_df=test_df,
    n_samples=700,
    device=device
)
None

 userId  movieId  rating  predicted_rating  error  absolute_error
    250      271     4.0              3.36  -0.64            0.64
    222      145     2.0              1.81  -0.19            0.19
    889       81     4.0              3.45  -0.55            0.55
    208      523     4.0              3.80  -0.20            0.20
    312      614     4.0              4.08   0.08            0.08
    293      419     3.0              3.22   0.22            0.22
    422      672     3.0              2.74  -0.26            0.26
    220      289     4.0              3.31  -0.69            0.69
    151      736     4.0              4.22   0.22            0.22
    884      640     1.0              3.44   2.44            2.44

----------------------------------------------------------------------------------------------------
RMSE: 0.9383
MAE: 0.7351


In [50]:
submission = generate_submission(
    movie_map=movie_map,
    user_map=user_map,
    train_df=train_df,
    model=trained_models[model_name],
    prompt_path="/kaggle/input/recsys/test.txt",
    output_path=f"{model_name}.csv",
    device=device
)

Saved submission file to: ncf.csv
   Id     Score
0   1  3.882270
1   2  3.730960
2   3  4.067957
3   4  3.261101
4   5  2.164522
