First, load the data files.

In [None]:
import numpy as np 
import pandas as pd 

# Load the three competition CSVs in a fixed order: train, test, extra training data.
train_df, test_df, train_extra_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'training_extra.csv')
)

Next, check the basic information of each dataset with `info()`.

In [None]:
# Print the column dtypes and non-null counts of every loaded frame.
for frame in (train_df, train_extra_df, test_df):
    frame.info()

Using the `info()` function, we can observe that the columns Brand, Material, Size, Laptop Compartment, Waterproof, Style, Color, and Weight Capacity (kg) contain a small number of missing values in the dataset.

Second, data preprocessing

In [None]:
num_columns = ['Compartments', 'Weight Capacity (kg)']
object_columns = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# Numeric columns: impute with the median learned from the training set and
# apply that same value to the test set, so both frames are transformed
# identically and no statistic is derived from the test data.
for col in num_columns:
    train_median = train_df[col].median()
    train_df[col] = train_df[col].fillna(train_median)
    test_df[col] = test_df[col].fillna(train_median)

# Categorical columns: mark missing entries with an explicit 'unknown' level.
for col in object_columns:
    train_df[col] = train_df[col].fillna('unknown')
    test_df[col] = test_df[col].fillna('unknown')

To ensure that the training and test sets have consistent dimensions, missing values are filled in this step. Alternatively, missing values in the training set could be dropped while filling them in the test set to maintain alignment. Additionally, the `train_extra_df` dataset is not used at this stage due to its large size, which could slow down computation—especially since stacking is being applied. Next, we proceed to encode the object-type (categorical) features.

In [None]:
# Show the distinct values of each object-typed column to judge its cardinality.
for name, series in train_df.select_dtypes(include=['object']).items():
    print(f"Unique values in '{name}': {series.unique()}")

Based on the values in the object-type columns, one-hot encoding can be safely applied without causing a dimensionality explosion.

In [None]:
# Encode both frames against the category levels observed in the training
# set. Encoding each frame independently would produce different dummy
# columns (or drop a different "first" level) whenever the two frames do not
# contain exactly the same categories.
category_dtypes = {
    col: pd.CategoricalDtype(sorted(train_df[col].dropna().unique()))
    for col in object_columns
}

# astype() returns a converted copy, so train_df / test_df stay untouched.
# A test value never seen in training becomes NaN and therefore an all-zero
# row across that column's dummies.
train = pd.get_dummies(train_df.astype(category_dtypes), columns=object_columns, drop_first=True, dtype=int)
test = pd.get_dummies(test_df.astype(category_dtypes), columns=object_columns, drop_first=True, dtype=int)


train.shape,test.shape

Third, model training

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import optuna
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data preparation: separate the target from the features (the 'id' column is
# excluded from the features) and hold out 20% of the rows.
y = train['Price']
x = train.drop(columns=['Price', 'id'])
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Define hyperparameter optimization functions for the three models

# LightGBM optimization function
def objective_lgb(trial):
    """Optuna objective for LightGBM: validation RMSE of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE on a validation split carved out of ``X_train``/``y_train``.
    """
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.05),
        # NOTE(review): LightGBM applies `subsample` (bagging_fraction) only
        # when `subsample_freq` (bagging_freq) is > 0; it is left at its
        # default here, so this parameter may have no effect - confirm.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 50.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 50.0),
        'random_state': 42
    }
    # Tune on an inner validation split of the training data. X_test/y_test is
    # the hold-out used to score the final stacked model, so selecting
    # hyperparameters on it would make that final RMSE optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    model = lgb.LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    pred = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, pred))

# XGBoost optimization function
def objective_xgb(trial):
    """Optuna objective for XGBoost: validation RMSE of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE on a validation split carved out of ``X_train``/``y_train``.
    """
    params = {
        'objective': 'reg:squarederror',
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-2, 50.0),
        'lambda': trial.suggest_float('lambda', 1e-2, 50.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'random_state': 42
    }
    # Tune on an inner validation split of the training data. X_test/y_test is
    # the hold-out used to score the final stacked model, so selecting
    # hyperparameters on it would make that final RMSE optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    model = xgb.XGBRegressor(**params)
    model.fit(X_fit, y_fit)
    pred = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, pred))

# CatBoost optimization function
def objective_cat(trial):
    """Optuna objective for CatBoost: validation RMSE of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE on a validation split carved out of ``X_train``/``y_train``.
    """
    params = {
        'loss_function': 'RMSE',
        'depth': trial.suggest_int('depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 50.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'verbose': False,
        'random_state': 42
    }
    # Tune on an inner validation split of the training data. X_test/y_test is
    # the hold-out used to score the final stacked model, so selecting
    # hyperparameters on it would make that final RMSE optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    model = CatBoostRegressor(**params)
    model.fit(X_fit, y_fit)
    pred = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, pred))

# Hyperparameter search: one 50-trial Optuna study per base model, each
# minimising the RMSE returned by its objective. The fixed settings are
# appended to the winning parameters so the final models are reproducible.

# LightGBM
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=50)
best_lgb = {**study_lgb.best_params, 'random_state': 42}

# XGBoost
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50)
best_xgb = {**study_xgb.best_params, 'random_state': 42}

# CatBoost (training output stays silenced for the final model as well)
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=50)
best_cat = {**study_cat.best_params, 'verbose': False, 'random_state': 42}

# Base learners built from the best parameters of each study
lgb_model = lgb.LGBMRegressor(**best_lgb)
xgb_model = xgb.XGBRegressor(**best_xgb)
cat_model = CatBoostRegressor(**best_cat)

# Generate stacking features
def generate_stacking_features(model, X_train, y_train, X_test, n_splits=5):
    """Build out-of-fold and fold-averaged predictions for one base model.

    The model is refitted on each shuffled K-fold training part. Its
    predictions for the held-out part fill the out-of-fold vector, and its
    predictions for ``X_test`` are averaged over all folds.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features, indexed positionally via ``.iloc``.
        y_train: Training target, indexed positionally via ``.iloc``.
        X_test: Features to predict with every fold's model.
        n_splits: Number of K-fold splits.

    Returns:
        Tuple ``(oof_train, oof_test)`` of column vectors with one row per
        row of ``X_train`` and ``X_test`` respectively.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_train = np.zeros(X_train.shape[0])
    test_sum = np.zeros(X_test.shape[0])

    for fit_rows, holdout_rows in splitter.split(X_train):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
        # Each training row is predicted only by the model that never saw it.
        oof_train[holdout_rows] = model.predict(X_train.iloc[holdout_rows])
        test_sum += model.predict(X_test)

    test_mean = test_sum / n_splits
    return oof_train.reshape(-1, 1), test_mean.reshape(-1, 1)

# Level 1: out-of-fold predictions (training part) and fold-averaged
# predictions (hold-out part) for each base model, in the order LGB, XGB, Cat.
(lgb_oof, lgb_test), (xgb_oof, xgb_test), (cat_oof, cat_test) = (
    generate_stacking_features(base_model, X_train, y_train, X_test)
    for base_model in (lgb_model, xgb_model, cat_model)
)

# Level 2 inputs: one column per base model
stacked_X_train = np.hstack((lgb_oof, xgb_oof, cat_oof))
stacked_X_test = np.hstack((lgb_test, xgb_test, cat_test))

# Meta model: linear regression fitted on the out-of-fold predictions
meta_model = LinearRegression().fit(stacked_X_train, y_train)

# Score the stacked ensemble on the hold-out split
stacked_pred = meta_model.predict(stacked_X_test)
rmse = np.sqrt(mean_squared_error(y_test, stacked_pred))
# Predicted vs. actual values
price_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_test, y=stacked_pred)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted vs Actual - Stacked Model")
plt.grid(True)
# Dashed diagonal marks perfect predictions (predicted == actual).
plt.plot(price_range, price_range, 'r--')
plt.show()

# Residual distribution: residuals are computed once and the histogram is
# drawn once, with both axes labelled.
residuals = y_test - stacked_pred
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True, bins=30, color="orange")
# Plain-text title: Matplotlib's default font has no emoji glyphs, so an
# emoji here renders as an empty box and emits a missing-glyph warning.
plt.title("Residual Distribution (Stacked Model)")
plt.xlabel("Prediction Error")
plt.ylabel("Count")
plt.grid(True)
plt.show()


# Report the winning hyperparameters of each base model, then the ensemble score.
for model_label, model_params in (
    ("LightGBM", best_lgb),
    ("XGBoost", best_xgb),
    ("CatBoost", best_cat),
):
    print(f"Best parameters for {model_label}:", model_params)
print(f"RMSE of the stacked model: {rmse:.5f}")


To save you the time of running the notebook, the output is shown below:



In [None]:
Best parameters for LightGBM: {'max_depth': 6, 'num_leaves': 102, 'min_child_samples': 29, 'learning_rate': 0.04811541008025368, 'subsample': 0.6373168094008929, 'colsample_bytree': 0.6287351039931749, 'reg_alpha': 24.94216574124474, 'reg_lambda': 13.3381875824349, 'random_state': 42}
Best parameters for XGBoost: {'max_depth': 4, 'learning_rate': 0.0242976335510153, 'subsample': 0.7392383751300301, 'colsample_bytree': 0.7081780053291772, 'alpha': 1.3548669745281217, 'lambda': 0.1300423102945354, 'n_estimators': 445, 'random_state': 42}
Best parameters for CatBoost: {'depth': 4, 'learning_rate': 0.06257815809830002, 'l2_leaf_reg': 23.148679122356665, 'subsample': 0.6957264471620155, 'iterations': 410, 'verbose': False, 'random_state': 42}
RMSE of the stacked model: 38.90021

This implementation applies a multi-model stacking approach, combining three popular machine learning models: LightGBM, XGBoost, and CatBoost. The Optuna library is used to perform hyperparameter tuning for each of these models. The predictions from these base models are then used as features, which are fed into a linear regression model acting as the meta-model for a second-level prediction. 