## This notebook contains model architectures

In [None]:
from typing import Callable, Any

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import os
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from pathlib import Path
from torch.utils.data import DataLoader
export_dir = os.getcwd()
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import plot
import random
import math
import heapq
from scipy.special import expit  # Sigmoid function
import itertools
from IPython.display import Latex, display
import pickle
import warnings

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
torch.set_printoptions(sci_mode=False)


In [None]:
pip install ipynb

In [None]:
from ipynb.fs.defs.utils import *
from ipynb.fs.defs.data_processing import *


## SAE NCF Architecture

In [None]:
class TopKActivation(nn.Module):
    """Keep the ``k`` largest entries of every row and zero out the rest.

    Args:
        k (int): Number of activations to keep per sample.
    """

    def __init__(self, k):
        super().__init__()
        self.k = k

    def forward(self, x):
        """Apply the top-k mask along dimension 1.

        Args:
            x (torch.Tensor): Activations laid out as (batch, hidden_dim).

        Returns:
            torch.Tensor: Tensor shaped like ``x`` in which exactly ``k``
            positions per row keep their original value and every other
            position is zero.

        Raises:
            RuntimeError: Propagated from ``torch.topk`` when ``k`` exceeds
            the size of dimension 1.
        """
        # Select by *index* rather than by comparing against the k-th value:
        # a `x >= threshold` test lets every entry tied with the threshold
        # through, so more than k activations could survive.
        _, topk_indices = torch.topk(x, self.k, dim=1)
        # The mask carries no gradient; gradients flow through `x` for the
        # kept positions only.
        mask = torch.zeros_like(x).scatter_(1, topk_indices, 1.0)
        return x * mask

# -----------------------------------------------------------------------------
# 2. Custom Tied Transpose Module for Decoder
class TiedTranspose(nn.Module):
    """Decoder layer that reuses another linear layer's weight, transposed.

    The wrapped layer is stored as ``tied_layer``; this module creates no
    parameters of its own and applies no bias.

    NOTE: a second class named ``TiedTranspose`` is defined further down in
    this notebook; once that cell has run, the name refers to that class
    instead of this one.
    """

    def __init__(self, tied_layer):
        super().__init__()
        self.tied_layer = tied_layer

    def forward(self, x):
        # For a tied layer with weight W, this evaluates x @ W.
        transposed_weight = self.tied_layer.weight.t()
        return F.linear(x, transposed_weight)


In [None]:
class SparseAutoencoderNCF(nn.Module):
    """One-hidden-layer autoencoder: Linear -> ReLU -> decoder.

    Args:
        input_dim (int): Size of each input vector (default 100).
        hidden_dim (int): Size of the latent code (default 70).
        topk (int): Currently unused. It was meant for a top-k activation,
            but the encoder non-linearity is a plain ``nn.ReLU``.
        tie_weights (bool): When True the decoder is a ``TiedTranspose`` of
            the encoder layer; otherwise it is an independent bias-free
            ``nn.Linear(hidden_dim, input_dim)``.

    NOTE(review): the constructor reads the notebook-level names
    ``test_subset_users`` and ``test_subset_items``; they must already be
    defined when an instance is created.
    """

    def __init__(self, input_dim=100, hidden_dim=70, topk=5, tie_weights=True):
        super().__init__()

        # Encoder. The attribute keeps its historical name even though the
        # activation is ReLU, because the name is part of the saved state.
        self.encoder_linear = nn.Linear(input_dim, hidden_dim)
        self.topk_activation = nn.ReLU()

        # Decoder: tied to the encoder weight, or a separate linear map.
        self.tie_weights = tie_weights
        self.decoder = (
            TiedTranspose(self.encoder_linear)
            if tie_weights
            else nn.Linear(hidden_dim, input_dim, bias=False)
        )

        # Bookkeeping containers; nothing in this class writes to them.
        self.loss, self.weights_loss = [], []
        self.test_subset_users_ind = test_subset_users
        self.test_subset_items_ind = test_subset_items

    def forward(self, x):
        """Return ``(decoded, encoded)`` for a batch ``x``.

        ``encoded`` is the ReLU of the encoder output, (batch, hidden_dim);
        ``decoded`` is the reconstruction, (batch, input_dim).
        """
        encoded = self.topk_activation(self.encoder_linear(x))
        return self.decoder(encoded), encoded




-

## MF SAE Architecture

In [None]:
class Autoencoder(nn.Module):
    """Sparse autoencoder with a shared input bias.

        latents = activation(encoder(x - pre_bias) + latent_bias)
        recons  = decoder(latents) + pre_bias

    Args:
        n_latents: Number of latent units.
        n_inputs: Dimensionality of the input vectors.
        activation: Callable applied to the encoder pre-activations.
        tied: If True the decoder is a ``TiedTranspose`` of the encoder,
            otherwise an independent bias-free ``nn.Linear``.
        normalize: If True inputs go through ``LN`` before encoding and the
            reconstruction is rescaled with the returned ``mu`` / ``std``.

    NOTE(review): the constructor reads the notebook-level names
    ``test_subset_users``, ``test_subset_items`` and ``test_flag``, and
    ``preprocess`` calls ``LN``; none of them is defined in this class, so
    they must be available in the surrounding namespace.
    """

    def __init__(
        self, n_latents: int, n_inputs: int, activation: Callable = nn.ReLU(), tied: bool = True,
        normalize: bool = False
    ) -> None:
        super().__init__()

        self.pre_bias = nn.Parameter(torch.zeros(n_inputs))
        self.encoder: nn.Module = nn.Linear(n_inputs, n_latents, bias=False)
        self.latent_bias = nn.Parameter(torch.zeros(n_latents))
        self.activation = activation
        if tied:
            self.decoder: nn.Linear | TiedTranspose = TiedTranspose(self.encoder)
        else:
            self.decoder = nn.Linear(n_latents, n_inputs, bias=False)
        self.normalize = normalize

        # Bookkeeping attributes; nothing in this class updates them.
        self.loss = []
        self.test_subset_users_ind = test_subset_users
        self.test_subset_items_ind = test_subset_items
        self.weights_loss = []
        self.test = test_flag
        self.activation_rate = {}

    def encode_pre_act(self, x: torch.Tensor, latent_slice: slice = slice(None)) -> torch.Tensor:
        """Return the encoder pre-activations for ``x``.

        Args:
            x: Input tensor whose last dimension is ``n_inputs``. A NumPy
                array is also accepted and converted to a tensor.
            latent_slice: Restricts the computation to a subset of latents.
        """
        if isinstance(x, np.ndarray):
            # NumPy defaults to float64 while the parameters are float32;
            # align dtype and device with the model so F.linear does not
            # fail on a dtype mismatch.
            x = torch.from_numpy(x).to(self.pre_bias)
        x = x - self.pre_bias
        latents_pre_act = F.linear(
            x, self.encoder.weight[latent_slice], self.latent_bias[latent_slice]
        )
        return latents_pre_act

    def preprocess(self, x: torch.Tensor) -> tuple[torch.Tensor, dict[str, Any]]:
        """Optionally normalise ``x``; returns the tensor and the stats needed to undo it."""
        if not self.normalize:
            return x, dict()
        x, mu, std = LN(x)
        return x, dict(mu=mu, std=std)

    def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, dict[str, Any]]:
        """Return ``(latents, info)`` where ``info`` comes from ``preprocess``."""
        x, info = self.preprocess(x)
        return self.activation(self.encode_pre_act(x)), info

    def decode(self, latents: torch.Tensor, info: dict[str, Any] | None = None) -> torch.Tensor:
        """Reconstruct inputs from ``latents``.

        ``info`` must hold ``mu`` and ``std`` when the model normalises its
        inputs; it is ignored otherwise.
        """
        ret = self.decoder(latents) + self.pre_bias
        if self.normalize:
            assert info is not None
            ret = ret * info["std"] + info["mu"]
        return ret

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(latents_pre_act, latents, recons)`` for input ``x``."""
        x, info = self.preprocess(x)
        latents_pre_act = self.encode_pre_act(x)
        latents = self.activation(latents_pre_act)
        recons = self.decode(latents, info)
        return latents_pre_act, latents, recons

    @classmethod
    def from_state_dict(
        cls, state_dict: dict[str, torch.Tensor], strict: bool = True
    ) -> "Autoencoder":
        """Rebuild an autoencoder from a dict produced by ``state_dict``.

        The layer sizes come from ``encoder.weight``; the activation is
        looked up in ``ACTIVATIONS_CLASSES`` (falling back to ReLU), and
        normalisation is enabled exactly when the activation is ``TopK``.
        """
        # Work on a shallow copy: the bookkeeping entries are popped below
        # and the caller's dict must stay reusable.
        state_dict = dict(state_dict)
        n_latents, d_model = state_dict["encoder.weight"].shape

        # Retrieve activation
        activation_class_name = state_dict.pop("activation", "ReLU")
        activation_class = ACTIVATIONS_CLASSES.get(activation_class_name, nn.ReLU)
        normalize = activation_class_name == "TopK"
        activation_state_dict = state_dict.pop("activation_state_dict", {})
        if hasattr(activation_class, "from_state_dict"):
            activation = activation_class.from_state_dict(
                activation_state_dict, strict=strict
            )
        else:
            activation = activation_class()
            if hasattr(activation, "load_state_dict"):
                activation.load_state_dict(activation_state_dict, strict=strict)

        # An untied decoder owns a `decoder.weight` entry; a tied one only
        # exposes the encoder weight under a nested key. Rebuild the same
        # kind of decoder so that strict loading can succeed.
        tied = "decoder.weight" not in state_dict
        autoencoder = cls(
            n_latents, d_model, activation=activation, tied=tied, normalize=normalize
        )
        autoencoder.load_state_dict(state_dict, strict=strict)
        return autoencoder

    def state_dict(self, destination=None, prefix="", keep_vars=False):
        """Standard module state plus the activation's class name and state."""
        # Keyword arguments: passing these positionally to
        # nn.Module.state_dict is deprecated in recent PyTorch releases.
        sd = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
        sd[prefix + "activation"] = self.activation.__class__.__name__
        if hasattr(self.activation, "state_dict"):
            sd[prefix + "activation_state_dict"] = self.activation.state_dict()
        return sd


class TiedTranspose(nn.Module):
    """Bias-free linear map that applies the transpose of ``linear``'s weight.

    This definition replaces the earlier ``TiedTranspose`` in this notebook
    once its cell has run, so it also serves ``SparseAutoencoderNCF``, whose
    encoder layer has a bias. The wrapped layer's bias is therefore
    tolerated and simply not applied.

    Args:
        linear: The layer whose weight is shared.
    """

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.linear = linear

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # No bias is applied: the wrapped layer's bias (if any) is sized for
        # that layer's outputs, not for this transposed map. Asserting that
        # it is None would make every tied decoder built on a biased encoder
        # fail here.
        return F.linear(x, self.linear.weight.t(), None)

    @property
    def weight(self) -> torch.Tensor:
        """The wrapped layer's weight, transposed."""
        return self.linear.weight.t()

    @property
    def bias(self) -> "torch.Tensor | None":
        """The wrapped layer's bias (``None`` for a bias-free layer).

        Exposed for inspection only; ``forward`` never applies it.
        """
        return self.linear.bias


class TopK(nn.Module):
    """Activation that keeps the ``k`` largest entries along the last dimension.

    The selected values are passed through ``postact_fn`` and written back
    at their original positions; every other position is zero.

    Args:
        k: Number of entries to keep.
        postact_fn: Callable applied to the selected values.
    """

    def __init__(self, k: int, postact_fn: Callable = nn.ReLU()) -> None:
        super().__init__()
        self.k = k
        self.postact_fn = postact_fn

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        topk = torch.topk(x, k=self.k, dim=-1)
        values = self.postact_fn(topk.values)
        result = torch.zeros_like(x)
        result.scatter_(-1, topk.indices, values)
        return result

    def state_dict(self, destination=None, prefix="", keep_vars=False):
        """Module state extended with ``k`` and the post-activation class name."""
        # Keyword arguments: passing these positionally to
        # nn.Module.state_dict is deprecated in recent PyTorch releases.
        state_dict = super().state_dict(
            destination=destination, prefix=prefix, keep_vars=keep_vars
        )
        state_dict.update({prefix + "k": self.k, prefix + "postact_fn": self.postact_fn.__class__.__name__})
        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict: dict[str, torch.Tensor], strict: bool = True) -> "TopK":
        """Rebuild a ``TopK`` from the un-prefixed ``k`` / ``postact_fn`` entries.

        ``strict`` is accepted for signature compatibility and not used.
        """
        k = state_dict["k"]
        postact_fn = ACTIVATIONS_CLASSES[state_dict["postact_fn"]]()
        return cls(k=k, postact_fn=postact_fn)


# Registry used by the `from_state_dict` constructors to turn a stored
# activation class name back into the class itself. Keys are the class
# names, matching what the `state_dict` methods store via
# `__class__.__name__`.
ACTIVATIONS_CLASSES = {
    activation_cls.__name__: activation_cls
    for activation_cls in (nn.ReLU, nn.Identity, TopK)
}


## SAE Matryoshka Architecture

In [None]:
# -----------------------------------
class MatryoshkaAutoencoder(Autoencoder):
    """Autoencoder that reconstructs the input from nested latent prefixes."""

    def __init__(self,
                 latent_dim: int,
                 input_dim: int,
                 group_sizes: list[int],
                 activation: nn.Module = nn.ReLU(),
                 tied: bool = True,
                 normalize: bool = True):
        """
        Args:
            latent_dim:  total number of latent units (D)
            input_dim:   dimension of the input features
            group_sizes: prefix lengths [m1, m2, ..., D]; each entry is the
                         number of leading latents kept for one
                         reconstruction, and the last entry must equal
                         latent_dim
            activation:  nonlinearity applied to the encoder pre-activation
            tied:        whether decoder weights are tied to the encoder
            normalize:   whether to use the base class's input normalisation
        """
        super().__init__(latent_dim, input_dim,
                         activation=activation,
                         tied=tied, normalize=normalize)
        assert group_sizes[-1] == latent_dim, (
            "the last prefix length in group_sizes must equal latent_dim"
        )
        # Copy so later edits to the caller's list do not change the model.
        self.group_sizes = list(group_sizes)

    def forward(self, x: torch.Tensor):
        """Encode ``x`` once and decode it once per prefix length.

        Returns:
            ``(z_pre, z, recons)``: the encoder pre-activations, the
            activated latents, and a list holding one reconstruction per
            entry of ``group_sizes`` (same order).
        """
        x_norm, info = self.preprocess(x)
        z_pre = self.encode_pre_act(x_norm)
        z = self.activation(z_pre)

        recons = []
        for m in self.group_sizes:
            z_masked = z.clone()
            # Mask along the latent (last) axis. Indexing with `[:, m:]`
            # would hit the wrong axis for inputs with extra leading
            # dimensions and fail outright for a single un-batched vector.
            z_masked[..., m:] = 0
            recons.append(self.decode(z_masked, info))
        return z_pre, z, recons

-

## MF recommender Architecture

In [None]:
class MatrixFactorization:
    """Biased matrix-factorisation recommender with sigmoid-squashed scores.

    The predicted score for user ``u`` and item ``i`` is
    ``sigmoid(b_u[u] + b_i[i] + P[u] . Q[i])``.

    NOTE(review): ``P``, ``Q``, ``b_u`` and ``b_i`` are created as NumPy
    arrays, but every scoring/training method below uses torch operations
    (``torch.sigmoid``, ``index_add_``). Presumably the training code
    replaces them with torch tensors before calling those methods — confirm.
    """

    def __init__(self, R, R_df, pos_idx_ex_use, neg_idx_ex_use, neg_ex_hidden, neg_ex, pos_ex_num, K, alpha, beta, iterations):
        """
        Args:
            R: User-item matrix; used through ``.shape``, ``.index``,
                ``.columns`` and ``.loc`` (i.e. as a pandas DataFrame).
            R_df: DataFrame whose index/columns label the score matrix.
            pos_idx_ex_use, neg_idx_ex_use, pos_ex_num: Stored unchanged.
            neg_ex_hidden: Per-user collection of held-out negatives,
                indexed by user position ``0..num_users-1``.
            neg_ex: Per-user collection of negatives, indexed the same way.
            K: Number of latent dimensions.
            alpha: Step size used by ``sgd_batch_tensor``.
            beta: Regularisation weight used by ``sgd_batch_tensor``.
            iterations: Stored unchanged; not used inside this class.
        """
        self.R = R
        self.ratings = R_df
        self.num_users = R.shape[0]
        self.num_items = R.shape[1]
        self.K = K  # latent dimensions (e.g., 100)
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)

        # Use the constructor arguments. These attributes used to be read
        # from underscore-suffixed notebook globals, which silently ignored
        # whatever the caller passed in.
        self.pos_idx_ex_use = pos_idx_ex_use
        self.neg_idx_ex_use = neg_idx_ex_use
        self.neg_ex = neg_ex
        self.neg_ex_hidden = neg_ex_hidden
        # Negatives available for use: all negatives minus the hidden ones.
        self.neg_ex_use = {
            row: [x for x in self.neg_ex[row] if x not in self.neg_ex_hidden[row]]
            for row in range(self.num_users)
        }
        self.pos_ex_num = pos_ex_num

        # test set to track validation error through training:
        self.rmse_train = {}
        self.rmse_test = {}

        self.columns = list(self.ratings.columns)
        # To store batch logs across epochs if needed.
        self.batch_history = []

    def sgd_batch_tensor(self, u_batch, i_batch, r_batch):
        """Apply one in-place update step for a batch of (user, item, label).

        ``u_batch`` / ``i_batch`` are index tensors and ``r_batch`` holds
        the targets. Updates are accumulated with ``index_add_``, so an
        index that occurs several times in the batch receives the sum of
        its updates.
        """
        pred = torch.sigmoid(self.b_u[u_batch] + self.b_i[i_batch] +
                             torch.sum(self.P[u_batch] * self.Q[i_batch], dim=1))
        error = r_batch - pred

        # Steps already scaled by alpha; all are computed from the
        # pre-update parameters before anything is written back.
        grad_b_u = self.alpha * (error - self.beta * self.b_u[u_batch])
        grad_b_i = self.alpha * (error - self.beta * self.b_i[i_batch])
        grad_P = self.alpha * (error.unsqueeze(1) * self.Q[i_batch] - self.beta * self.P[u_batch])
        grad_Q = self.alpha * (error.unsqueeze(1) * self.P[u_batch] - self.beta * self.Q[i_batch])

        # Gradient accumulation
        self.b_u.index_add_(0, u_batch, grad_b_u)
        self.b_i.index_add_(0, i_batch, grad_b_i)
        self.P.index_add_(0, u_batch, grad_P)
        self.Q.index_add_(0, i_batch, grad_Q)

    def rmse_tensor(self, user_ids, item_ids, labels):
        """RMSE between predictions and labels, over entries with label 1 only.

        Returns ``float('nan')`` when no label equals 1.
        """
        user_ids = user_ids.long()
        item_ids = item_ids.long()

        pred = torch.sigmoid(
            self.b_u[user_ids] +
            self.b_i[item_ids] +
            torch.sum(self.P[user_ids] * self.Q[item_ids], dim=1)
        )

        # validation examples
        mask = labels == 1
        if mask.sum() == 0:
            return float('nan')  # no validation examples

        error = labels[mask] - pred[mask]
        mse = (error ** 2).mean()
        return torch.sqrt(mse).item()

    def get_rating(self, i, j):
        """Predicted score of user position ``i`` for item position ``j`` (Python float)."""
        return torch.sigmoid(
            self.b_u[i] + self.b_i[j] + torch.sum(self.P[i] * self.Q[j])
        ).item()

    def full_matrix(self):
        """Predicted scores for every (user, item) pair as one tensor."""
        return torch.sigmoid(
            self.b_u[:, None] + self.b_i[None, :] + self.P @ self.Q.T
        )

    @staticmethod
    def _score_frame(scores, labels):
        """Wrap a score matrix in a DataFrame labelled like ``labels``."""
        if torch.is_tensor(scores):
            # Explicit conversion so tensors on another device, or tensors
            # that track gradients, can still be handed to pandas.
            scores = scores.detach().cpu().numpy()
        return pd.DataFrame(scores, index=labels.index, columns=labels.columns)

    @staticmethod
    def _rank_items(user_scores, candidate_items, top_n):
        """Return the ``top_n`` ``(item, score)`` pairs, highest score first."""
        ranked = [(item, user_scores.loc[item]) for item in candidate_items]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:top_n]

    def _recommend_cached(self, user_id, top_n, filter_listened, labels):
        """Shared body of the recommenders that reuse ``predicted_matrix``.

        The score matrix is computed on first use and cached on the
        instance; it is not refreshed if the parameters change afterwards.
        """
        if not hasattr(self, 'predicted_matrix'):
            self.predicted_matrix = self.full_matrix()

        df_user_ratings = self._score_frame(self.predicted_matrix, labels)
        user_scores = df_user_ratings.loc[user_id]

        if filter_listened:
            # Drop items with a positive entry for this user in R.
            already_listened = set(self.R.columns[self.R.loc[user_id] > 0])
            candidate_items = [i for i in df_user_ratings.columns if i not in already_listened]
        else:
            candidate_items = df_user_ratings.columns
        return self._rank_items(user_scores, candidate_items, top_n)

    def recommend(self, user_id, top_n):
        """Top ``top_n`` ``(item, score)`` pairs for ``user_id``, from freshly computed scores."""
        df_user_ratings = self._score_frame(self.full_matrix(), self.ratings)
        return self._rank_items(df_user_ratings.loc[user_id], df_user_ratings.columns, top_n)

    def recommend_pop_regularized(self, user_id, top_n, filter_listened=False):
        """Like ``recommend`` but using the cached scores (labelled by ``ratings``).

        With ``filter_listened=True`` items the user already has a positive
        entry for in ``R`` are excluded.
        """
        return self._recommend_cached(user_id, top_n, filter_listened, self.ratings)

    def recommend_norm(self, user_id, top_n):
        """``recommend`` passed through ``normalize_val`` (defined outside this class)."""
        return normalize_val(self.recommend(user_id, top_n))

    def recommend_unknown(self, user_id, top_n, filter_listened=True):
        """Recommend from the cached scores (labelled by ``R``), by default
        skipping items the user already has a positive entry for."""
        return self._recommend_cached(user_id, top_n, filter_listened, self.R)

-

## NCF recommender Architecture

In [None]:
class NeuralCollaborativeFiltering(nn.Module):
    """MLP-based collaborative filtering model.

    User and item embeddings are concatenated, passed through an MLP that
    ends in a sigmoid, and per-user / per-item bias terms are added to the
    result.

    NOTE(review): the constructor reads the notebook-level names
    ``pos_idx_ex_use_``, ``neg_idx_ex_use_``, ``neg_ex_``,
    ``neg_ex_hidden_``, ``pos_ex_num_`` and ``neg_ex_use_``; they must be
    defined before an instance is created.
    """

    def __init__(self, num_users=None, num_items=None, embedding_dim=100, hidden_layers=[64, 32, 16]):
        """
        num_users: total number of users; defaults to
            ``user_artist_matrix.shape[0]`` (looked up when the model is
            built, so the class can be defined before the matrix exists)
        num_items: total number of items; defaults to
            ``user_artist_matrix.shape[1]`` in the same way
        embedding_dim: dimensionality of the latent representation
        hidden_layers: list containing the sizes of the hidden layers of the MLP
        """
        super().__init__()

        if num_users is None:
            num_users = user_artist_matrix.shape[0]
        if num_items is None:
            num_items = user_artist_matrix.shape[1]

        # Randomly initialised embeddings. The constructor used to overwrite
        # them with `user_embedding` / `item_embedding`, names that only
        # existed in commented-out code, which raised NameError.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        nn.init.normal_(self.user_embedding.weight, mean=0.0, std=0.1)
        nn.init.normal_(self.item_embedding.weight, mean=0.0, std=0.1)

        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.num_users = num_users
        self.num_items = num_items

        # Example bookkeeping taken from notebook-level globals.
        self.pos_idx_ex_use = pos_idx_ex_use_
        self.neg_idx_ex_use = neg_idx_ex_use_
        self.neg_ex = neg_ex_
        self.neg_ex_hidden = neg_ex_hidden_
        self.pos_ex_num = pos_ex_num_
        self.neg_ex_use = neg_ex_use_

        # MLP:
        layers = []
        input_dim = embedding_dim * 2  # Concatenation of user and item embeddings
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            input_dim = hidden_dim
        # Final output layer to produce the rating or score
        layers.append(nn.Linear(input_dim, 1))
        layers.append(nn.Sigmoid())
        self.fc_layers = nn.Sequential(*layers)

        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_indices, item_indices):
        """Score each (user, item) index pair.

        Returns a tensor with one score per pair.

        NOTE(review): the bias terms are added *after* the sigmoid, so once
        they move away from zero the output is no longer confined to
        (0, 1) — confirm this is intended for the loss in use.
        """
        # Look up embeddings
        user_emb = self.user_embedding(user_indices)
        item_emb = self.item_embedding(item_indices)
        # Concatenate embeddings instead of taking inner product.
        x = torch.cat([user_emb, item_emb], dim=-1)
        # Pass through the MLP to obtain prediction.
        output = self.fc_layers(x).squeeze(-1) + self.user_bias(user_indices).squeeze(-1) \
                        + self.item_bias(item_indices).squeeze(-1)
        return output