# 3. Model Training

**Goal:** Train the recommendation models based on the multi-stage hybrid approach:
1.  **Stage 1: Candidate Generation Model:** Alternating Least Squares (ALS) for collaborative filtering to learn user and item embeddings.
2.  **Stage 2: Ranking Model:** LightGBM using rich features (user profile, item metadata, interaction counts, and ALS embeddings) to predict the likelihood of a positive interaction.

## Setup
Import libraries, define paths, and create necessary directories.

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit # Library for ALS
import lightgbm as lgb # Library for Ranking Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib # For saving scikit-learn style models
import pickle # For saving Python objects (like encoders, feature lists)
import os
import ast # For parsing list strings from CSV
import warnings

warnings.filterwarnings('ignore')

# --- Configuration ---
DATA_DIR = "../data/"
RAW_DATA_DIR = "../raw_data/KuaiRec/data/"
MODEL_DIR = "../models/"

# Input file paths
INTERACTIONS_TRAIN_PATH = os.path.join(DATA_DIR, "interactions_train.csv")
USER_FEATURES_PATH = os.path.join(RAW_DATA_DIR, "user_features.csv")
VIDEO_METADATA_PATH = os.path.join(DATA_DIR, "video_metadata.csv")

# Output file paths
USER_ENCODER_PATH = os.path.join(MODEL_DIR, 'user_encoder.pkl')
ITEM_ENCODER_PATH = os.path.join(MODEL_DIR, 'item_encoder.pkl')
ALS_MODEL_PATH = os.path.join(MODEL_DIR, 'als_model.joblib')
USER_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, 'user_embeddings.npy')
ITEM_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, 'item_embeddings.npy')
LGBM_MODEL_PATH = os.path.join(MODEL_DIR, 'lgbm_ranker_model.joblib')
LGBM_FEATURES_PATH = os.path.join(MODEL_DIR, 'lgbm_feature_cols.pkl')

# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

## Stage 1: Candidate Generation Model (ALS)
Train ALS on user-item interactions to generate embeddings.

### Load and Prepare Interaction Data

In [2]:
print("Loading training interactions...")
train_interactions = pd.read_csv(INTERACTIONS_TRAIN_PATH)
print(f"Loaded {len(train_interactions)} training interactions.")

# Create label encoders for mapping original IDs to contiguous 0-based indices
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train_interactions['user_idx'] = user_encoder.fit_transform(train_interactions['user_id'])
train_interactions['item_idx'] = item_encoder.fit_transform(train_interactions['item_id'])

n_users = len(user_encoder.classes_)
n_items = len(item_encoder.classes_)
print(f"Number of unique users (mapped): {n_users}")
print(f"Number of unique items (mapped): {n_items}")

# Save the fitted encoders for later use (e.g., in prediction/evaluation)
with open(USER_ENCODER_PATH, 'wb') as f:
    pickle.dump(user_encoder, f)
with open(ITEM_ENCODER_PATH, 'wb') as f:
    pickle.dump(item_encoder, f)
print(f"Saved user encoder to {USER_ENCODER_PATH}")
print(f"Saved item encoder to {ITEM_ENCODER_PATH}")

# Create sparse interaction matrix (users x items) for implicit library
# Using 'positive_interaction' (1 for positive, 0 otherwise) as implicit feedback strength.
print("Creating sparse interaction matrix (users x items)...")
sparse_interaction_matrix_user_item = sparse.csr_matrix((
    train_interactions['positive_interaction'],
    (train_interactions['user_idx'], train_interactions['item_idx'])
), shape=(n_users, n_items))

print(f"Created sparse interaction matrix with shape: {sparse_interaction_matrix_user_item.shape}")

Loading training interactions...
Loaded 3741256 training interactions.
Number of unique users (mapped): 1411
Number of unique items (mapped): 2993
Saved user encoder to ../models/user_encoder.pkl
Saved item encoder to ../models/item_encoder.pkl
Creating sparse interaction matrix (users x items)...
Created sparse interaction matrix with shape: (1411, 2993)


### Train ALS Model
Using the `implicit` library.

In [3]:
# ALS Hyperparameters
factors = 64        # Embedding dimensionality
regularization = 0.01 # L2 regularization
iterations = 20     # Number of optimization iterations

print(f"Training ALS model (factors={factors}, regularization={regularization}, iterations={iterations})...")
als_model = implicit.als.AlternatingLeastSquares(
    factors=factors,
    regularization=regularization,
    iterations=iterations,
    calculate_training_loss=True,
    random_state=42,
    use_gpu=implicit.gpu.HAS_CUDA # Automatically use GPU if available
)

# Train the model on the user-item matrix
als_model.fit(sparse_interaction_matrix_user_item)

print("ALS model training complete.")

Training ALS model (factors=64, regularization=0.01, iterations=20)...


  0%|          | 0/20 [00:00<?, ?it/s]

ALS model training complete.


### Save ALS Model and Embeddings
Save the trained model object and the learned user/item embeddings (factors).

In [4]:
# Save the ALS model object
joblib.dump(als_model, ALS_MODEL_PATH)
print(f"Saved ALS model object to {ALS_MODEL_PATH}")

# Extract embeddings, converting from GPU if necessary
try:
    user_factors_np = als_model.user_factors.to_numpy()
    item_factors_np = als_model.item_factors.to_numpy()
    print("Converted factors from GPU/Implicit format using .to_numpy()")
except AttributeError:
    print("Factors don't have .to_numpy(), attempting direct conversion/use.")
    user_factors_np = np.array(als_model.user_factors)
    item_factors_np = np.array(als_model.item_factors)

# Verify shapes before saving
if user_factors_np.shape != (n_users, factors) or item_factors_np.shape != (n_items, factors):
    print(f"ERROR: Unexpected embedding shapes! User: {user_factors_np.shape} (Expected {(n_users, factors)}), Item: {item_factors_np.shape} (Expected {(n_items, factors)})")
else:
    # Save the NumPy arrays
    np.save(USER_EMBEDDINGS_PATH, user_factors_np)
    np.save(ITEM_EMBEDDINGS_PATH, item_factors_np)
    print(f"Saved user embeddings to {USER_EMBEDDINGS_PATH} (shape: {user_factors_np.shape})")
    print(f"Saved item embeddings to {ITEM_EMBEDDINGS_PATH} (shape: {item_factors_np.shape})")

Saved ALS model object to ../models/als_model.joblib
Converted factors from GPU/Implicit format using .to_numpy()
Saved user embeddings to ../models/user_embeddings.npy (shape: (1411, 64))
Saved item embeddings to ../models/item_embeddings.npy (shape: (2993, 64))


## Stage 2: Ranking Model (LightGBM)
Prepare features and train the LightGBM classifier to predict `positive_interaction`.

### Feature Engineering for Ranker
Combine various features into a single DataFrame suitable for LightGBM.

In [5]:
# --- 1. Load necessary raw/processed feature data ---
print("Loading feature data for ranker...")
try:
    user_features = pd.read_csv(USER_FEATURES_PATH)
    user_features = user_features.set_index('user_id')
    # Define user feature columns to use
    user_cat_cols = ['user_active_degree'] + [f'onehot_feat{i}' for i in range(18)]
    user_num_cols = ['is_lowactive_period', 'is_live_streamer', 'is_video_author', 
                     'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days']
    user_feature_cols = user_cat_cols + user_num_cols
    user_features = user_features[user_feature_cols]
    print(f"Loaded user features for {len(user_features)} users.")
except FileNotFoundError:
    print("User features file not found. Proceeding without them.")
    user_features = None
    user_cat_cols, user_num_cols, user_feature_cols = [], [], []

item_metadata = pd.read_csv(VIDEO_METADATA_PATH)
def parse_list_string(s):
    try: return ast.literal_eval(s) if isinstance(s, str) else []
    except: return []
item_metadata['feat_list'] = item_metadata['feat'].apply(parse_list_string)
item_metadata['num_categories'] = item_metadata['feat_list'].apply(len) # Example item feature
item_metadata = item_metadata.set_index('item_id')[['num_categories']] 
print(f"Loaded item metadata for {len(item_metadata)} items.")

# Load ALS embeddings (already saved as NumPy arrays)
try:
    user_embeddings = np.load(USER_EMBEDDINGS_PATH)
    item_embeddings = np.load(ITEM_EMBEDDINGS_PATH)
    factors = user_embeddings.shape[1]
    print(f"Loaded ALS embeddings with {factors} factors.")
except FileNotFoundError:
    print("Error: ALS embedding files not found. Cannot proceed with ranker training.")
    exit()

# --- 2. Create base DataFrame with interactions and target ---
print("Creating base training data for ranker...")
# Use interactions from train_interactions which already has user_idx, item_idx
ranker_train_df = train_interactions[['user_id', 'item_id', 'user_idx', 'item_idx', 'positive_interaction']].copy()

# --- 3. Merge features onto the base DataFrame ---
print("Merging features...")
# User Features
if user_features is not None:
    ranker_train_df = ranker_train_df.merge(user_features, on='user_id', how='left')

# Item Features
ranker_train_df = ranker_train_df.merge(item_metadata, on='item_id', how='left')

# ALS Embeddings (using user_idx/item_idx)
user_emb_df = pd.DataFrame(user_embeddings, index=pd.RangeIndex(n_users))
item_emb_df = pd.DataFrame(item_embeddings, index=pd.RangeIndex(n_items))
user_emb_df.columns = [f'user_emb_{i}' for i in range(factors)]
item_emb_df.columns = [f'item_emb_{i}' for i in range(factors)]
ranker_train_df = ranker_train_df.merge(user_emb_df, left_on='user_idx', right_index=True, how='left')
ranker_train_df = ranker_train_df.merge(item_emb_df, left_on='item_idx', right_index=True, how='left')

# Interaction Count Features (calculated on the fly from the merged df)
user_counts = ranker_train_df.groupby('user_id')['item_id'].transform('size')
item_counts = ranker_train_df.groupby('item_id')['user_id'].transform('size')
ranker_train_df['user_interaction_count'] = user_counts
ranker_train_df['item_interaction_count'] = item_counts

print("Finished merging features.")

# --- 4. Impute Missing Values --- 
print(f"NaN count before imputation: {ranker_train_df.isnull().sum().sum()}")
# Impute numeric features (e.g., counts, item metadata) potentially missing due to 'left' merge
num_cols_to_impute = ['num_categories', 'user_interaction_count', 'item_interaction_count'] + user_num_cols
for col in num_cols_to_impute:
    if col in ranker_train_df.columns:
        ranker_train_df[col].fillna(0, inplace=True) # Impute numerical with 0

# Impute categorical user features with a placeholder (e.g., -1 or 'Missing')
if user_features is not None:
    for col in user_cat_cols:
        if col in ranker_train_df.columns:
            ranker_train_df[col].fillna(-1, inplace=True) # Use -1 for missing categories

# Impute embeddings (should be rare if data prep is correct)
emb_cols = [f'user_emb_{i}' for i in range(factors)] + [f'item_emb_{i}' for i in range(factors)]
ranker_train_df[emb_cols] = ranker_train_df[emb_cols].fillna(0)

print(f"NaN count after imputation: {ranker_train_df.isnull().sum().sum()}")
if ranker_train_df.isnull().sum().sum() > 0:
    print("Warning: NaNs remain after imputation! Check columns:")
    print(ranker_train_df.isnull().sum()[ranker_train_df.isnull().sum() > 0])

# --- 5. Define Feature Set and Convert Types ---
target = 'positive_interaction'
exclude_cols = ['user_id', 'item_id', 'user_idx', 'item_idx', target, 'feat_list'] # Exclude IDs and intermediate lists
feature_cols = [col for col in ranker_train_df.columns if col not in exclude_cols]

# Convert identified categorical columns to 'category' dtype for LightGBM
categorical_features_lgbm = []
if user_features is not None:
    for col in user_cat_cols:
        if col in ranker_train_df.columns:
             ranker_train_df[col] = ranker_train_df[col].astype('category')
             categorical_features_lgbm.append(col)

# Add other known categorical item features if loaded (e.g., 'author_id')
# Example: if 'author_id' was loaded and merged:
# if 'author_id' in ranker_train_df.columns:
#    ranker_train_df['author_id'] = ranker_train_df['author_id'].fillna(-1) # Impute first
#    ranker_train_df['author_id'] = ranker_train_df['author_id'].astype('category')
#    categorical_features_lgbm.append('author_id')

print(f"\nCreated ranker training data with {len(feature_cols)} features.")
print(f"Shape: {ranker_train_df.shape}")
print(f"Categorical features for LightGBM: {categorical_features_lgbm}")

Loading feature data for ranker...
Loaded user features for 7176 users.
Loaded item metadata for 10728 items.
Loaded ALS embeddings with 64 factors.
Creating base training data for ranker...
Merging features...
Finished merging features.
NaN count before imputation: 284158
NaN count after imputation: 0

Created ranker training data with 157 features.
Shape: (3741256, 162)
Categorical features for LightGBM: ['user_active_degree', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17']


### Split Data and Train LightGBM Model

In [6]:
# Prepare data for LightGBM training
X = ranker_train_df[feature_cols]
y = ranker_train_df[target]

# Split ranker data for training and validation (e.g., 80/20 random split)
# Using a validation set helps tune parameters and prevents overfitting via early stopping.
X_train_lgbm, X_val_lgbm, y_train_lgbm, y_val_lgbm = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y # Stratify ensures similar target distribution
)

print(f"LightGBM training set size: {len(X_train_lgbm)}")
print(f"LightGBM validation set size: {len(X_val_lgbm)}")

# LightGBM Parameters (Example values - tuning is crucial for performance)
params = {
    'objective': 'binary',
    'metric': 'auc',         # Area Under ROC Curve - good for ranking tasks
    'boosting_type': 'gbdt', # Standard Gradient Boosted Decision Trees
    'n_estimators': 1000,    # Max number of trees (use early stopping to find optimal)
    'learning_rate': 0.05,
    'num_leaves': 31,        # Controls complexity of trees
    'max_depth': -1,         # No limit on depth
    'seed': 42,
    'n_jobs': -1,            # Use all available CPU cores
    'verbose': -1,           # Suppress verbose training output
    'colsample_bytree': 0.8, # Fraction of features used per tree
    'subsample': 0.8,        # Fraction of data used per tree (requires boosting_type='gbdt')
    'reg_alpha': 0.1,        # L1 regularization
    'reg_lambda': 0.1,       # L2 regularization
    'max_bin': 128
}

print("\nTraining LightGBM ranking model...")
lgbm_ranker = lgb.LGBMClassifier(**params)

lgbm_ranker.fit(
    X_train_lgbm,
    y_train_lgbm,
    eval_set=[(X_val_lgbm, y_val_lgbm)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)], # Stop if validation AUC doesn't improve
    categorical_feature=categorical_features_lgbm # Pass names of categorical features
)

print("LightGBM model training complete.")

LightGBM training set size: 2993004
LightGBM validation set size: 748252

Training LightGBM ranking model...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.852387
LightGBM model training complete.


### Save Trained Ranker Model and Features

In [7]:
# Save the trained LightGBM model
joblib.dump(lgbm_ranker, LGBM_MODEL_PATH)
print(f"Saved LightGBM model to {LGBM_MODEL_PATH}")

# Save the list of feature columns used by the model
# This is crucial for ensuring the same features are used during prediction/evaluation
with open(LGBM_FEATURES_PATH, 'wb') as f:
    pickle.dump(feature_cols, f)
print(f"Saved LightGBM feature column list to {LGBM_FEATURES_PATH}")

print("\n--- Model Training Complete ---")

Saved LightGBM model to ../models/lgbm_ranker_model.joblib
Saved LightGBM feature column list to ../models/lgbm_feature_cols.pkl

--- Model Training Complete ---
