## ML Sklearn

In [None]:
# Load RS models from Github
url = f"https://raw.githubusercontent.com/statmlben/CUHK-STAT3009/main/src/TabRS.py"
!wget --no-cache --backups=1 {url}

from TabRS import rmse, GlobalMeanRS, UserMeanRS, ItemMeanRS, SVD
# rmse: truth, pred
# UserMeanRS: n_users, min_data
# SVD: n_users, n_items, lam, K, iterNum, tol, verbose

--2025-11-19 15:01:00--  https://raw.githubusercontent.com/statmlben/CUHK-STAT3009/main/src/TabRS.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9001 (8.8K) [text/plain]
Failed to rename TabRS.py to TabRS.py.1: (2) No such file or directory
Saving to: ‘TabRS.py’


2025-11-19 15:01:00 (19.7 MB/s) - ‘TabRS.py’ saved [9001/9001]



In [None]:
# Data Preprocessing
# numpy/pandas are imported here so the cell runs on a fresh kernel
# (the original used `pd` and `np` before any cell imported them).
import numpy as np
import pandas as pd

# Prepare Data: Type(1) -- one file that we split into train/test ourselves
from sklearn.model_selection import train_test_split

data = pd.read_csv('url_to_data')
X, y = data.values[:,0:2], data.values[:,2] # From df to np array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Prepare Data: Type(2) -- train/test files are already provided
# NOTE: this overrides the Type(1) split above; keep only the variant you need.
train = pd.read_csv('url_to_train')
test = pd.read_csv('url_to_test')
X_train, y_train = train.values[:,0:2], train.values[:,2]
X_test = test.values  # assumes the test file holds exactly the two id columns -- TODO confirm

# Feature Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# For a specific feature: work on a float copy so that the standardized values are
# not truncated when X has an integer dtype, and so that X itself is left untouched.
X_std = X.astype(float)
X_std[:, 0] = StandardScaler().fit_transform(X_std[:, 0].reshape(-1, 1)).flatten()

# Label Standardization
from sklearn.preprocessing import LabelEncoder

# Fit the encoders on train + test so every id seen at prediction time has a code
X = np.concatenate([X_train, X_test], axis=0)
user_le, item_le = LabelEncoder(), LabelEncoder()

user_le.fit(X[:,0]) # fit the encoder
item_le.fit(X[:,1])

# Build fresh integer arrays [user_code, item_code]. Writing the codes back into the
# original arrays would keep their float/object dtype, which cannot be used for
# NumPy fancy indexing inside the models.
X_train = np.column_stack([user_le.transform(X_train[:,0]),   # user
                           item_le.transform(X_train[:,1])])  # item
X_test = np.column_stack([user_le.transform(X_test[:,0]),
                          item_le.transform(X_test[:,1])])

n_users = len(user_le.classes_)
n_items = len(item_le.classes_)

In [None]:
# Basic method: User/Item Mean
from sklearn.base import BaseEstimator

class UserItemAverage(BaseEstimator):
    """
    Baseline recommender: average of the user mean and the item mean.

    Parameters
    ----------
    feats : sequence of two column names
        ``feats[1]`` is used as the user column and ``feats[0]`` as the item
        column (this ordering is what the code indexes; verify against caller).
    target : str
        Name of the rating column.
    min_data : int
        Minimum number of ratings a user/item needs for its own mean to be
        used; otherwise the global mean is used instead.
    """

    def __init__(self, feats, target, min_data):
        self.feats = feats
        self.target = target
        self.min_data = min_data

    def fit(self, X, y=None):
        """
        Compute the global, per-user and per-item mean ratings.

        X : DataFrame containing the two id columns and the target column.
        y : ignored; accepted only for scikit-learn API compatibility.
        Returns self.
        """
        self.glb_avg = X[self.target].mean()

        # Use the columns configured on this instance (not a global `feats`).
        user_col, item_col = self.feats[1], self.feats[0]
        user_stats = X.groupby(user_col)[self.target].agg(['mean','count'])
        item_stats = X.groupby(item_col)[self.target].agg(['mean','count'])

        # Groups with fewer than `min_data` ratings fall back to the global mean
        self.user_avg = user_stats['mean'].where(user_stats['count'] >= self.min_data, self.glb_avg).to_dict()
        self.item_avg = item_stats['mean'].where(item_stats['count'] >= self.min_data, self.glb_avg).to_dict()

        return self

    def predict(self, X):
        """
        Predict ``(user mean + item mean) / 2`` for every row of X.

        Ids that were not seen during ``fit`` use the global mean.
        """
        user_col, item_col = self.feats[1], self.feats[0]
        user_avg = X[user_col].map(self.user_avg).fillna(self.glb_avg)
        item_avg = X[item_col].map(self.item_avg).fillna(self.glb_avg)

        return (user_avg + item_avg)/2

# For the implementation I would use dataframe instead of numpy arrays
class UserMedianRS(BaseEstimator):
    """
    Baseline recommender that predicts each user's median rating.

    Expects DataFrames with a 'user_id' column and (for fitting) a 'rating' column.
    """

    def fit(self, X_train, y=None):
        """
        Store the global median and the per-user median rating.

        y is ignored; it is accepted only for scikit-learn API compatibility.
        Returns self so calls can be chained (``UserMedianRS().fit(df).predict(df)``).
        """
        self.glb_median = X_train['rating'].median()
        self.user_median = X_train.groupby('user_id')['rating'].median()
        return self

    def predict(self, X_test):
        """Return the user's median rating, or the global median for unseen users."""
        return X_test['user_id'].map(self.user_median).fillna(self.glb_median)

In [None]:
# Advanced method: SVD
import numpy as np
from collections import defaultdict
from sklearn.base import BaseEstimator
from sklearn.linear_model import Ridge

class SVD(BaseEstimator):
    """
    Matrix Factorization (MF) class for collaborative filtering using ALS with Static Bias.

    Prediction for a (user, item) pair: ``mu + a[user] + b[item] + P[user] . Q[item]``.

    Parameters
    ----------
    n_users, n_items : int
        Number of users / items; ids must be integers in ``[0, n_users)`` / ``[0, n_items)``.
    lam : float
        L2 regularization weight on the latent factors.
    K : int
        Number of latent factors.
    iterNum : int
        Maximum number of ALS sweeps.
    tol : float
        Stop when the objective changes by less than this between sweeps.
    verbose : int
        If truthy, print progress.
    """

    def __init__(self, n_users, n_items, lam=.01, K=10, iterNum=10, tol=1e-4, verbose=1):
        self.n_users = n_users
        self.n_items = n_items
        self.lam = lam
        self.K = K
        self.iterNum = iterNum
        self.tol = tol
        self.verbose = verbose

        # Parameters initialization (kept here so the attributes exist before fit)
        self._init_params()

    def _init_params(self):
        """(Re)allocate mu, a, b, P, Q from the current hyper-parameters."""
        self.mu = 0.0
        self.a = np.zeros(self.n_users)
        self.b = np.zeros(self.n_items)
        self.P = np.random.normal(scale=1./self.K, size=(self.n_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.n_items, self.K))

    @staticmethod
    def _as_index_pairs(X):
        """Return X as an integer ndarray so its columns can index a, b, P and Q."""
        return np.asarray(X).astype(int)

    def fit(self, X, y):
        """
        Fits the matrix factorization model to the given data.
        X: (n_samples, 2) -> pairs of [user_id, item_id]
        y: (n_samples,) -> ratings
        Returns self.
        """
        # Float/object id columns cannot be used for fancy indexing, and a pandas
        # Series would be indexed by label instead of position: normalize both.
        X = self._as_index_pairs(X)
        y = np.asarray(y, dtype=float)
        n_obs = len(X)

        # Re-allocate the parameters here: after `set_params(K=...)` (e.g. inside
        # GridSearchCV) the arrays created in __init__ still have the old shape.
        self._init_params()

        if self.verbose:
            print(f'Fitting Reg-SVD: K={self.K}, lam={self.lam}')

        # =========================================
        # 1. Pre-compute Indices (O(N) Optimization)
        # =========================================
        # Avoid repeated np.where calls in the loop.
        # Build an adjacency list for fast lookup: item -> [row_indices], user -> [row_indices]
        item_indices_dict = defaultdict(list)
        user_indices_dict = defaultdict(list)

        for idx, (u, i) in enumerate(X):
            user_indices_dict[u].append(idx)
            item_indices_dict[i].append(idx)

        # Convert to arrays for numpy indexing support
        self.index_item = {k: np.array(v) for k, v in item_indices_dict.items()}
        self.index_user = {k: np.array(v) for k, v in user_indices_dict.items()}

        # =========================================
        # 2. Initialize Biases (Static Strategy)
        # =========================================
        # Compute Global Bias
        self.mu = np.mean(y)

        # Compute Item Bias: b[i] = mean(y - mu) for items
        # We calculate this once and fix it.
        for i in range(self.n_items):
            idx = self.index_item.get(i, [])
            if len(idx) > 0:
                self.b[i] = np.mean(y[idx] - self.mu)

        # Compute User Bias: a[u] = mean(y - mu - b[i]) for users
        for u in range(self.n_users):
            idx = self.index_user.get(u, [])
            if len(idx) > 0:
                # Items rated by this user
                items_rated = X[idx, 1]
                self.a[u] = np.mean(y[idx] - self.mu - self.b[items_rated])

        # =========================================
        # 3. ALS Loop (Update P and Q only)
        # =========================================
        # Note: Ridge alpha needs to match the objective function scale.
        # Obj = MSE + lam * Penalty = (1/N)*RSS + lam * Penalty
        # Target function for Ridge is RSS + alpha * Penalty
        # So, alpha should be lam * n_obs
        ridge_alpha = self.lam * n_obs
        # One solver instance is enough: every call to .fit() refits from scratch.
        clf = Ridge(alpha=ridge_alpha, fit_intercept=False)

        # Objective before the first sweep; afterwards the previous sweep's value is reused.
        obj_old = self.obj(X, y)

        for l in range(self.iterNum):
            # Update Item Latent Factors (Q)
            for item_id in range(self.n_items):
                idx = self.index_item.get(item_id, [])
                if len(idx) == 0: continue

                # Get data relevant to this item
                y_subset = y[idx]         # Actual ratings
                u_subset = X[idx, 0]      # Users who rated this item

                # Target residual for matrix factorization part: y - (mu + a + b)
                # We want P[u] * Q[i]^T approx (y - bias)
                bias_part = self.mu + self.a[u_subset] + self.b[item_id]
                residual_target = y_subset - bias_part

                # Features: The User latent factors P for these users
                P_features = self.P[u_subset]

                # Solve for Q[item_id]
                clf.fit(X=P_features, y=residual_target)
                self.Q[item_id, :] = clf.coef_

            # Update User Latent Factors (P)
            for user_id in range(self.n_users):
                idx = self.index_user.get(user_id, [])
                if len(idx) == 0: continue

                # Get data relevant to this user
                y_subset = y[idx]
                i_subset = X[idx, 1]      # Items rated by this user

                # Target residual
                bias_part = self.mu + self.a[user_id] + self.b[i_subset]
                residual_target = y_subset - bias_part

                # Features: The Item latent factors Q for these items
                Q_features = self.Q[i_subset]

                # Solve for P[user_id]
                clf.fit(X=Q_features, y=residual_target)
                self.P[user_id, :] = clf.coef_

            # Check convergence
            obj_new = self.obj(X, y)
            diff = abs(obj_old - obj_new)

            if self.verbose:
                rmse_val = np.sqrt(self.mse(X, y))
                print(f"RegSVD-ALS: {l+1}; obj: {obj_new:.3f}; rmse: {rmse_val:.3f}; diff: {diff:.5f}")

            if diff < self.tol:
                break

            obj_old = obj_new

        return self

    def predict(self, X):
        """
        Vectorized prediction for user-item pairs.
        X: (n_samples, 2) -> pairs of [user_id, item_id]; returns (n_samples,) predictions.
        """
        X = self._as_index_pairs(X)
        users = X[:, 0]
        items = X[:, 1]

        # Vectorized formula: mu + a[u] + b[i] + (P[u] . Q[i])
        # np.sum(A * B, axis=1) computes the dot product row-wise efficiently
        interaction = np.sum(self.P[users] * self.Q[items], axis=1)

        return self.mu + self.a[users] + self.b[items] + interaction

    def mse(self, X, y):
        """Mean squared error of the predictions for X against the ratings y."""
        pred_y = self.predict(X)
        return np.mean((pred_y - np.asarray(y, dtype=float))**2)

    def obj(self, X, y):
        """
        Computes objective function: MSE + Regularization
        """
        mse_tmp = self.mse(X, y)
        # Regularization term (L2 norm of P and Q)
        # Note: Biases are static now, so we technically only penalize P and Q updates
        pen_tmp = np.sum(self.P**2) + np.sum(self.Q**2)
        return mse_tmp + self.lam * pen_tmp

In [None]:
# Cross Validation: GridSearchCV
from sklearn.model_selection import GridSearchCV

# Estimator to tune. The grid keys below ('lam', 'K') are SVD hyper-parameters;
# verbose=0 keeps the per-iteration ALS log out of the CV output.
svd_model = SVD(n_users=n_users, n_items=n_items, verbose=0)

param_grid = {
    'lam': [0.1, 1],  # Regularization parameter
    'K': [5, 10, 15],
}

grid_search = GridSearchCV(
    estimator=svd_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    # Ratings are continuous, so a regression metric is required ('accuracy' is a
    # classification metric). sklearn maximizes scores, hence the negated RMSE.
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,  # Use all available CPU cores
    verbose=2  # Print progress
)

print("Starting grid search...")
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("\nBest parameters found:")
print(grid_search.best_params_)
# The score is the negated RMSE: values closer to 0 are better.
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

## DL Keras

In [None]:
# Standardization
from sklearn.preprocessing import StandardScaler # Value
from sklearn.preprocessing import LabelEncoder # Label

# TODO: replace None with the feature table (a pandas DataFrame that contains
# the columns listed in `cate` and `dense`); the code below cannot run on None.
feats = None
cate = ['cate1', 'cate2']
dense = ['dense1', 'dense2']

# StandardScaler can input numpy array or dataframe
scaler = StandardScaler()
feats[dense] = scaler.fit_transform(feats[dense])

# Label Standardization
# Keep every fitted encoder so the same mapping can be applied to new data
# (or inverted) later; previously each encoder was overwritten by the next one.
cate_encoders = {}
for cate_tmp in cate:
  cate_le = LabelEncoder()
  feats[cate_tmp] = cate_le.fit_transform(feats[cate_tmp])
  cate_encoders[cate_tmp] = cate_le

# Data Preparation
# TODO: build the model inputs from `feats` here (the original line was an
# unfinished assignment, `feats =`, which is a syntax error).


In [None]:
# Build the model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RSModel(keras.Model):
  """
  Skeleton recommender model built by subclassing ``keras.Model``.

  The constructor creates a user embedding, a ReLU dense layer, a dot-product
  layer and a concatenation layer; ``call`` currently returns only the dot
  product of the two inputs it receives.
  """

  def __init__(self, n_users, embedding_dim, hidden_dim):
    super().__init__()
    # Embedding layers
    self.user_embed = layers.Embedding(input_dim=n_users, output_dim=embedding_dim)

    # Dense layers (an output layer would not need an activation function)
    self.dense = layers.Dense(units=hidden_dim, activation='relu')

    # Other layers
    self.dot = layers.Dot(axes=1)
    self.concat = layers.Concatenate()

  def call(self, inputs):
    """Return the dot product (along axis 1) of the two tensors in ``inputs``."""
    left, right = inputs
    pair = [left, right]
    # The concatenation is evaluated as in the original skeleton, but its result
    # is not part of the returned value.
    _merged = self.concat(pair)
    return self.dot(pair)


In [None]:
# Training
# NOTE: `model` must be a built keras.Model instance (e.g. an RSModel) created
# before this cell runs; it is not defined in this cell.
opt = keras.optimizers.Adam(1e-3)

# Use the metric object rather than a string alias: the class is available in
# every Keras version, whereas the string lookup is not portable.
model.compile(optimizer=opt,
      loss='mean_squared_error',
      metrics=[keras.metrics.RootMeanSquaredError()])

## callback
# Stop when the validation loss has not improved for `patience` epochs and
# roll back to the best weights seen so far.
callback = [keras.callbacks.EarlyStopping(
      monitor='val_loss',
      patience=10,
      mode='min',
      restore_best_weights=True,
  )]

## fit with early stopping
model.fit(x=X_train,
    y=y_train,
    epochs=20,
    batch_size=512,
    validation_split=0.3,
    callbacks=callback
    )

print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=512)
# `results` is [loss (MSE), RMSE] -- the compiled metric is RMSE, not accuracy.
print("test loss, test rmse:", results)