## Imports and Setup

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split



from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import StackingRegressor

import matplotlib.pyplot as plt

from scipy import stats
import numpy as np
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
preprocessed_dataset = pd.read_csv("../data/pre-processed/preprocessed_youtube_data.csv")

In [6]:
# Convert to datetime if not already
preprocessed_dataset['published_time'] = pd.to_datetime(preprocessed_dataset['published_time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute
preprocessed_dataset['published_hour'] = preprocessed_dataset['published_time'].dt.hour
preprocessed_dataset['published_minute'] = preprocessed_dataset['published_time'].dt.minute

# Drop the original time column
preprocessed_dataset.drop(columns=['published_time'], inplace=True)

In [7]:
# Initialize label encoder
le = LabelEncoder()

# Apply on 'published_day_of_week'
preprocessed_dataset['published_day_of_week'] = le.fit_transform(preprocessed_dataset['published_day_of_week'])

In [8]:
missing_mask = preprocessed_dataset[['like_count_initial', 'like_count_final', 'view_count_initial', 'view_count_final']].isnull()

# Check rows where all four columns are NaN
rows_all_nan = missing_mask.all(axis=1)

print("Number of rows with all four columns NaN:", rows_all_nan.sum())

# Optionally, see those rows
print(preprocessed_dataset[rows_all_nan])

Number of rows with all four columns NaN: 11
     category_id  view_count_initial  like_count_initial  \
22           NaN                 NaN                 NaN   
28           NaN                 NaN                 NaN   
32           NaN                 NaN                 NaN   
61           NaN                 NaN                 NaN   
70           NaN                 NaN                 NaN   
83           NaN                 NaN                 NaN   
130          NaN                 NaN                 NaN   
137          NaN                 NaN                 NaN   
179          NaN                 NaN                 NaN   
239          NaN                 NaN                 NaN   
307          NaN                 NaN                 NaN   

     c_view_count_initial  c_subscriber_count_initial  view_count_final  \
22                    NaN                         NaN               NaN   
28                    NaN                         NaN               NaN   
32       

In [7]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 11


In [9]:
preprocessed_dataset = preprocessed_dataset.dropna(subset=[
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
])

# Define target columns
target_columns = [
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
]

In [10]:
# Choose columns to check for outliers, e.g. target columns
cols_to_check = target_columns  # or other numeric features

z_scores = np.abs(stats.zscore(preprocessed_dataset[cols_to_check]))
# Define threshold, e.g. 3 std deviations
threshold = 3
# Keep only rows where all z-scores are below threshold (no outlier)
non_outliers = (z_scores < threshold).all(axis=1)


print(f"Rows before outlier removal: {len(preprocessed_dataset)}")
preprocessed_dataset = preprocessed_dataset[non_outliers]
print(f"Rows after outlier removal: {len(preprocessed_dataset)}")

Rows before outlier removal: 339
Rows after outlier removal: 332


In [11]:
# Separate features and targets
X = preprocessed_dataset.drop(columns=target_columns)
y = preprocessed_dataset[target_columns]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        # Remove GPU parameters
        # 'tree_method': 'gpu_hist',
        # 'predictor': 'gpu_predictor',
        'random_state': 42,
        'n_jobs': -1
    }
    
    model = xgb.XGBRegressor(**params)
    try:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
    except TypeError:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='mape',
            verbose=False
        )
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run XGBoost optimization
xgb_study = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study.optimize(xgb_objective, n_trials=100)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study.best_trial.params)
print("Best MAPE:", xgb_study.best_value)


[I 2025-07-31 01:26:50,192] A new study created in memory with name: XGBoost_Optimization
[I 2025-07-31 01:27:05,038] Trial 0 finished with value: 7.310185520640819e+17 and parameters: {'n_estimators': 1660, 'learning_rate': 0.10050620421504669, 'max_depth': 14, 'min_child_weight': 10, 'subsample': 0.8936149826152475, 'colsample_bytree': 0.806609379135165, 'gamma': 0.5756535718983102, 'reg_alpha': 0.8295225481309385, 'reg_lambda': 0.47007076941611725}. Best is trial 0 with value: 7.310185520640819e+17.
[I 2025-07-31 01:27:14,030] Trial 1 finished with value: 2.9543430761742336e+17 and parameters: {'n_estimators': 2446, 'learning_rate': 0.11947110517929159, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.6750270018006592, 'colsample_bytree': 0.617878266050494, 'gamma': 0.8267389517780553, 'reg_alpha': 0.5530010339135898, 'reg_lambda': 0.7687622004939227}. Best is trial 1 with value: 2.9543430761742336e+17.
[I 2025-07-31 01:27:21,922] Trial 2 finished with value: 3.275702837182464e

=== XGBoost Results ===
Best parameters: {'n_estimators': 784, 'learning_rate': 0.191250616505773, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.7466204266818607, 'colsample_bytree': 0.6995836895407798, 'gamma': 0.08640139232883039, 'reg_alpha': 0.6812304406102406, 'reg_lambda': 0.46164757532757517}
Best MAPE: 4.31709754448937e+16


In [12]:
y.shape

(332, 4)

In [13]:
import optuna
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor  # Missing import
from sklearn.metrics import mean_absolute_percentage_error

def lgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Remove 'device': 'gpu' to avoid potential issues
        'objective': 'regression',
        'metric': 'mape',
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1  # Suppress LightGBM output
    }
    
    base_model = LGBMRegressor(**params)
    model = MultiOutputRegressor(base_model)
    
    model.fit(X_train, y_train)
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run LightGBM optimization
lgb_study = optuna.create_study(direction="minimize", study_name="LightGBM_Optimization")
lgb_study.optimize(lgb_objective, n_trials=50)

print("=== LightGBM Results ===")
print("Best parameters:", lgb_study.best_trial.params)
print("Best MAPE:", lgb_study.best_value)


[I 2025-07-31 01:38:24,607] A new study created in memory with name: LightGBM_Optimization
[I 2025-07-31 01:38:28,826] Trial 0 finished with value: 9.335465711791182e+17 and parameters: {'n_estimators': 710, 'learning_rate': 0.1089839889793862, 'max_depth': 13, 'reg_alpha': 0.021144831541040188, 'reg_lambda': 0.10370539171717519, 'min_child_samples': 16, 'colsample_bytree': 0.6088063094896762, 'subsample': 0.5537228186928318}. Best is trial 0 with value: 9.335465711791182e+17.
[I 2025-07-31 01:38:39,413] Trial 1 finished with value: 6.97364608316543e+17 and parameters: {'n_estimators': 3677, 'learning_rate': 0.1798641553826299, 'max_depth': 4, 'reg_alpha': 0.4999929936759937, 'reg_lambda': 0.4754690022615088, 'min_child_samples': 28, 'colsample_bytree': 0.9624639642598086, 'subsample': 0.8034747218660597}. Best is trial 1 with value: 6.97364608316543e+17.
[I 2025-07-31 01:38:43,199] Trial 2 finished with value: 7.181331174438459e+17 and parameters: {'n_estimators': 1683, 'learning_rate

=== LightGBM Results ===
Best parameters: {'n_estimators': 515, 'learning_rate': 0.015868178136511008, 'max_depth': 8, 'reg_alpha': 0.5685007712004838, 'reg_lambda': 0.4390457505213317, 'min_child_samples': 39, 'colsample_bytree': 0.7474441269621528, 'subsample': 0.7153149090468353}
Best MAPE: 4.418948105618819e+17


In [14]:
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

def catboost_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'depth': trial.suggest_int('depth', 4, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 64),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0, 1.0),
        'loss_function': 'MAPE',
        'eval_metric': 'MAPE',
        'random_state': 42,
        # Remove 'task_type': 'GPU' - this is causing the error
        'verbose': False
    }
    
    # For multi-output regression with CatBoost
    base_model = CatBoostRegressor(**params)
    model = MultiOutputRegressor(base_model)
    
    # MultiOutputRegressor doesn't support eval_set and early_stopping_rounds
    model.fit(X_train, y_train)
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run CatBoost optimization
catboost_study = optuna.create_study(direction="minimize", study_name="CatBoost_Optimization")
catboost_study.optimize(catboost_objective, n_trials=50)

print("=== CatBoost Results ===")
print("Best parameters:", catboost_study.best_trial.params)
print("Best MAPE:", catboost_study.best_value)


[I 2025-07-31 01:42:08,244] A new study created in memory with name: CatBoost_Optimization
[I 2025-07-31 01:42:16,988] Trial 0 finished with value: 2.474546333598519e+17 and parameters: {'n_estimators': 2320, 'learning_rate': 0.0796424995377985, 'depth': 8, 'min_data_in_leaf': 13, 'l2_leaf_reg': 6.831244847551595, 'bagging_temperature': 0.2617109050923243, 'random_strength': 0.6094459177168263}. Best is trial 0 with value: 2.474546333598519e+17.
[I 2025-07-31 01:42:30,588] Trial 1 finished with value: 4.3127936119612096e+17 and parameters: {'n_estimators': 3625, 'learning_rate': 0.134347254005275, 'depth': 8, 'min_data_in_leaf': 44, 'l2_leaf_reg': 1.6623625880155706, 'bagging_temperature': 0.8729218860237574, 'random_strength': 0.6045760678726404}. Best is trial 0 with value: 2.474546333598519e+17.
[I 2025-07-31 01:42:31,953] Trial 2 finished with value: 3.3998036515258976e+17 and parameters: {'n_estimators': 801, 'learning_rate': 0.11069982308211943, 'depth': 6, 'min_data_in_leaf': 20

=== CatBoost Results ===
Best parameters: {'n_estimators': 516, 'learning_rate': 0.19707349390291037, 'depth': 4, 'min_data_in_leaf': 29, 'l2_leaf_reg': 9.882302498800481, 'bagging_temperature': 0.7858361996124579, 'random_strength': 0.011407743327057757}
Best MAPE: 1593063934845118.0


In [15]:
results = {
    'XGBoost': xgb_study.best_value,
    'LightGBM': lgb_study.best_value,
    'CatBoost': catboost_study.best_value
}

print("\n=== Final Comparison ===")
for model_name, best_mape in results.items():
    print(f"{model_name}: {best_mape:.6f}")

best_model = min(results, key=results.get)
print(f"\nBest performing model: {best_model} with MAPE: {results[best_model]:.6f}")


=== Final Comparison ===
XGBoost: 43170975444893696.000000
LightGBM: 441894810561881920.000000
CatBoost: 1593063934845118.000000

Best performing model: CatBoost with MAPE: 1593063934845118.000000


In [16]:
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_valid.shape, y_valid.shape)
print("Any NaN values in targets:", y_train.isnull().sum().sum())


Training set shape: (265, 16) (265, 4)
Validation set shape: (67, 16) (67, 4)
Any NaN values in targets: 0


In [12]:
xgb_model = xgb.XGBRegressor(n_estimators=1062, 
                             learning_rate = 0.11964854632695483,
                             max_depth = 5,
                             min_child_weight= 1, 
                             subsample = 0.9696472459477042,
                             colsample_bytree=0.7315673461781189,
                             gamma=0.16262350633828251,
                             reg_alpha=0.6610390411463459,
                             reg_lambda=0.2257522016837052)
                             

In [13]:
lgbm_model = LGBMRegressor(n_estimators = 539,
              learning_rate=0.012781639959659507,
              max_depth=6,
              reg_alpha = 0.2456866217457347,
              reg_lambda=0.27018262454406133,
              min_child_samples = 10,
              colsample_bytree= 0.9832056829779426,
              subsample=0.5050938384557115)

In [14]:
catboost_model = CatBoostRegressor(n_estimators = 672, 
                                   learning_rate=0.04529720295810693,
                                   depth = 4,
                                   min_data_in_leaf=46,
                                   l2_leaf_reg=2.225210839773297,
                                   bagging_temperature=0.7082493403055143,
                                   random_strength=0.9450096159938354)

In [20]:
xgb_model2 = MultiOutputRegressor(xgb_model)
xgb_model2.fit(X_train, y_train)
preds =xgb_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

# Manual MAPE calculation to debug
# manual_mape = np.mean(np.abs((y_valid - preds) / y_valid)) * 100
# print("Manual MAPE:", manual_mape)


Overall MAE: 253.74639892578125

MAE for each output column:
Column 0: 24.2099
Column 1: 24.5227
Column 2: 430.0874
Column 3: 536.1654


In [21]:
lgbm_model2 = MultiOutputRegressor(lgbm_model)
lgbm_model2.fit(X_train, y_train)
preds =lgbm_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

Overall MAE: 432.7911449049382

MAE for each output column:
Column 0: 38.9437
Column 1: 41.7671
Column 2: 924.5382
Column 3: 725.9156


In [22]:
catboost_model2 = MultiOutputRegressor(catboost_model)
catboost_model2.fit(X_train, y_train)
preds =catboost_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

0:	learn: 123.5955350	total: 656us	remaining: 440ms
1:	learn: 122.0131635	total: 1.08ms	remaining: 361ms
2:	learn: 120.0859564	total: 1.34ms	remaining: 299ms
3:	learn: 119.2477444	total: 1.76ms	remaining: 294ms
4:	learn: 117.2304153	total: 2.25ms	remaining: 300ms
5:	learn: 115.4603752	total: 2.61ms	remaining: 289ms
6:	learn: 113.8833913	total: 2.86ms	remaining: 272ms
7:	learn: 112.7732854	total: 3.08ms	remaining: 256ms
8:	learn: 111.0265452	total: 3.31ms	remaining: 244ms
9:	learn: 109.8288818	total: 3.58ms	remaining: 237ms
10:	learn: 108.2995450	total: 3.9ms	remaining: 234ms
11:	learn: 107.4634076	total: 4.19ms	remaining: 230ms
12:	learn: 106.0590760	total: 4.39ms	remaining: 223ms
13:	learn: 105.1293946	total: 4.58ms	remaining: 216ms
14:	learn: 103.8750997	total: 4.77ms	remaining: 209ms
15:	learn: 102.3502061	total: 5ms	remaining: 205ms
16:	learn: 101.6493410	total: 5.19ms	remaining: 200ms
17:	learn: 100.4516794	total: 5.38ms	remaining: 196ms
18:	learn: 99.6417624	total: 5.59ms	remaini

In [23]:
meta_model = LinearRegression()

In [24]:
stacking_model2 = StackingRegressor(
    estimators=[
        ('xgb', xgb_model), 
        ('lgb', lgbm_model), 
        ('catboost', catboost_model)
    ],
    final_estimator=meta_model,
    n_jobs=-1

)
stacking_model2 = MultiOutputRegressor(stacking_model2)
# stacking_model2.fit(X_scaled, y)

In [25]:
stacking_model2.fit(X_train, y_train)
preds =stacking_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

0:	learn: 123.5955350	total: 56.9ms	remaining: 38.1s
1:	learn: 122.0131635	total: 57.1ms	remaining: 19.1s
2:	learn: 120.0859564	total: 57.3ms	remaining: 12.8s
3:	learn: 119.2477444	total: 57.4ms	remaining: 9.59s
4:	learn: 117.2304153	total: 57.6ms	remaining: 7.68s
5:	learn: 115.4603752	total: 57.9ms	remaining: 6.42s
6:	learn: 113.8833913	total: 58.1ms	remaining: 5.52s
7:	learn: 112.7732854	total: 58.3ms	remaining: 4.84s
8:	learn: 111.0265452	total: 58.6ms	remaining: 4.31s
9:	learn: 109.8288818	total: 58.8ms	remaining: 3.89s
10:	learn: 108.2995450	total: 59.1ms	remaining: 3.55s
11:	learn: 107.4634076	total: 59.3ms	remaining: 3.26s
12:	learn: 106.0590760	total: 59.5ms	remaining: 3.02s
13:	learn: 105.1293946	total: 59.7ms	remaining: 2.81s
14:	learn: 103.8750997	total: 59.9ms	remaining: 2.62s
15:	learn: 102.3502061	total: 60.1ms	remaining: 2.46s
16:	learn: 101.6493410	total: 60.3ms	remaining: 2.32s
17:	learn: 100.4516794	total: 60.4ms	remaining: 2.2s
18:	learn: 99.6417624	total: 60.6ms	rem

In [15]:
# 1. Create a blend split from training data
X_base, X_blend, y_base, y_blend = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 2. Wrap models in MultiOutputRegressor
xgb_model_blend = MultiOutputRegressor(xgb_model)
lgbm_model_blend = MultiOutputRegressor(lgbm_model)
catboost_model_blend = MultiOutputRegressor(catboost_model)

# 3. Train base models on base set
xgb_model_blend.fit(X_base, y_base)
lgbm_model_blend.fit(X_base, y_base)
catboost_model_blend.fit(X_base, y_base)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514
[LightGBM] [Info] Number of data points in the train set: 265, number of used features: 11
[LightGBM] [Info] Start training from score 43.498113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514
[LightGBM] [Info] Number of data points in the train set: 265, number of used features: 11
[LightGBM] [Info] Start training from score 90.626415
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 514
[LightGBM] [Info] Number of data points in the train set: 265, number of used features: 11
[LightGBM] [Info] Start training f

0,1,2
,estimator,<catboost.cor...t 0x12fab97f0>
,n_jobs,


In [16]:
# 4. Predict on blend set to create meta features
xgb_preds_blend = xgb_model_blend.predict(X_blend)
lgbm_preds_blend = lgbm_model_blend.predict(X_blend)
catboost_preds_blend = catboost_model_blend.predict(X_blend)

# 5. Stack base model predictions horizontally for meta input
X_meta_blend = np.hstack([xgb_preds_blend, lgbm_preds_blend, catboost_preds_blend])

# 6. Train meta-model on blend predictions
meta_model_blend = MultiOutputRegressor(LinearRegression())
meta_model_blend.fit(X_meta_blend, y_blend)

# 7. Predict on validation set using base models
xgb_preds_valid = xgb_model_blend.predict(X_valid)
lgbm_preds_valid = lgbm_model_blend.predict(X_valid)
catboost_preds_valid = catboost_model_blend.predict(X_valid)




In [17]:
# 8. Stack base predictions for final meta input
X_meta_valid = np.hstack([xgb_preds_valid, lgbm_preds_valid, catboost_preds_valid])

# 9. Final prediction from meta-model
final_preds = meta_model_blend.predict(X_meta_valid)

# 10. Evaluate
overall_mae = mean_absolute_error(y_valid, final_preds)
print("🔁 Blending - Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], final_preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

🔁 Blending - Overall MAE: 156.34115276887053

MAE for each output column:
Column 0: 21.7776
Column 1: 25.6286
Column 2: 288.5542
Column 3: 289.4042
