In [36]:
!pip install lightgbm optuna xgboost catboost



In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import StackingRegressor

import matplotlib.pyplot as plt

from scipy import stats
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
import pickle
import json


import warnings
warnings.filterwarnings("ignore")

In [38]:
preprocessed_dataset = pd.read_csv("../data/pre-processed/preprocessed_youtube_data.csv")

In [39]:
# Convert to datetime if not already
preprocessed_dataset['published_time'] = pd.to_datetime(preprocessed_dataset['published_time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute
preprocessed_dataset['published_hour'] = preprocessed_dataset['published_time'].dt.hour
preprocessed_dataset['published_minute'] = preprocessed_dataset['published_time'].dt.minute

# Drop the original time column
preprocessed_dataset.drop(columns=['published_time'], inplace=True)

In [40]:
# Separate label encoders for each column
le_day = LabelEncoder()
le_def = LabelEncoder()

preprocessed_dataset['published_day_of_week'] = le_day.fit_transform(preprocessed_dataset['published_day_of_week'])
preprocessed_dataset['definition'] = le_def.fit_transform(preprocessed_dataset['definition'])


In [None]:
missing_mask = preprocessed_dataset[['like_count_initial', 'like_count_final', 'view_count_initial', 'view_count_final']].isnull()

# Check rows where all four columns are NaN
rows_all_nan = missing_mask.all(axis=1)

print("Number of rows with all four columns NaN:", rows_all_nan.sum())

# Optionally, see those rows
print(preprocessed_dataset[rows_all_nan])

Number of rows with all four columns NaN: 5
       category_id country  definition logged_at_initial  view_count_initial  \
4465           NaN     NaN           2               NaN                 NaN   
8504           NaN     NaN           2               NaN                 NaN   
17296          NaN     NaN           2               NaN                 NaN   
18396          NaN     NaN           2               NaN                 NaN   
22842          NaN     NaN           2               NaN                 NaN   

       like_count_initial  c_view_count_initial logged_at_final  \
4465                  NaN                   NaN             NaN   
8504                  NaN                   NaN             NaN   
17296                 NaN                   NaN             NaN   
18396                 NaN                   NaN             NaN   
22842                 NaN                   NaN             NaN   

       view_count_final  like_count_final  ...  channel_avg_views  \
446

In [42]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 705


In [43]:
preprocessed_dataset.columns

Index(['category_id', 'country', 'definition', 'logged_at_initial',
       'view_count_initial', 'like_count_initial', 'c_view_count_initial',
       'logged_at_final', 'view_count_final', 'like_count_final',
       'published_year', 'published_month', 'published_day_of_week',
       'avg_view_diff_per_category', 'avg_likes_diff_per_category',
       'channel_avg_views', 'channel_growth_potential',
       'channel_virality_score', 'log_channel_subs',
       'relative_views_to_category', 'relative_likes_to_category',
       'logged_at_initial_hour', 'logged_at_final_hour', 'published_hour',
       'published_minute'],
      dtype='object')

In [44]:
# Step 1: Convert to category
preprocessed_dataset['country'] = preprocessed_dataset['country'].astype('category')

# Step 2: Save mapping BEFORE encoding
categories = list(preprocessed_dataset['country'].cat.categories)
country_to_code = {country: idx for idx, country in enumerate(categories)}

with open("country_encoding.json", "w") as f:
    json.dump(country_to_code, f)

# Step 3: Encode with .cat.codes
preprocessed_dataset['country_encoded'] = preprocessed_dataset['country'].cat.codes


preprocessed_dataset['category_id'] = (
    preprocessed_dataset['category_id']
    .fillna(-1)  # or any placeholder like 999
    .astype(int)
)


In [45]:
preprocessed_dataset.drop(columns=['logged_at_final', 'logged_at_initial','country'], inplace=True)

In [46]:
preprocessed_dataset = preprocessed_dataset.dropna(subset=[
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
])

# Define target columns
target_columns = [
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
]

In [71]:
from scipy import stats
import numpy as np

cols_to_check = target_columns  # e.g. ['views']
z_scores = np.abs(stats.zscore(preprocessed_dataset[cols_to_check]))

threshold = 1
higher_threshold = 2
non_outliers_mask = (z_scores < threshold).all(axis=1)
# outliers_mask = ~non_outliers_mask  # Invert mask
outliers_mask = ((z_scores >= threshold) & (z_scores < higher_threshold)).any(axis=1)

df_normal = preprocessed_dataset[non_outliers_mask]
df_outliers = preprocessed_dataset[outliers_mask]

print(f"Normal data: {len(df_normal)} rows")
print(f"Outlier data: {len(df_outliers)} rows")

Normal data: 24086 rows
Outlier data: 388 rows


In [72]:
X_normal = df_normal.drop(columns=target_columns)
y_normal = df_normal[target_columns]

X_outlier = df_outliers.drop(columns=target_columns)
y_outlier = df_outliers[target_columns]

X_normal_train, X_normal_valid, y_normal_train, y_normal_valid = train_test_split(
    X_normal, y_normal, test_size=0.2, random_state=42
)

X_outlier_train, X_outlier_valid, y_outlier_train, y_outlier_valid = train_test_split(
    X_outlier, y_outlier, test_size=0.2, random_state=42
)

In [49]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        # Remove GPU parameters
        # 'tree_method': 'gpu_hist',
        # 'predictor': 'gpu_predictor',
        'random_state': 42,
        'n_jobs': -1
    }
    
    model = xgb.XGBRegressor(**params)
    try:
        model.fit(
            X_normal_train, y_normal_train,
            eval_set=[(X_normal_valid, y_normal_valid)],
            verbose=False
        )
    except TypeError:
        model.fit(
            X_normal_train, y_normal_train,
            eval_set=[(X_normal_valid, y_normal_valid)],
            eval_metric='mape',
            verbose=False
        )
    
    preds = model.predict(X_normal_valid)
    mape = mean_absolute_percentage_error(y_normal_valid, preds)
    return mape

# Run XGBoost optimization
xgb_study_normal = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study_normal.optimize(xgb_objective, n_trials=100)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study_normal.best_trial.params)
print("Best MAPE:", xgb_study_normal.best_value)

[I 2025-08-04 11:16:07,167] A new study created in memory with name: XGBoost_Optimization
[I 2025-08-04 11:16:17,495] Trial 0 finished with value: 7.218871079954022e+16 and parameters: {'n_estimators': 841, 'learning_rate': 0.0901512419534924, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.8094881199409084, 'colsample_bytree': 0.6286139260165897, 'gamma': 0.14457515471152083, 'reg_alpha': 0.5551757188853466, 'reg_lambda': 0.1713974438832463}. Best is trial 0 with value: 7.218871079954022e+16.
[I 2025-08-04 11:16:47,216] Trial 1 finished with value: 8.338905779457229e+16 and parameters: {'n_estimators': 3339, 'learning_rate': 0.1729139619245368, 'max_depth': 7, 'min_child_weight': 9, 'subsample': 0.8414280021343044, 'colsample_bytree': 0.9360991396549828, 'gamma': 0.6604468749679767, 'reg_alpha': 0.5164244539875931, 'reg_lambda': 0.8163681905827547}. Best is trial 0 with value: 7.218871079954022e+16.
[I 2025-08-04 11:17:09,111] Trial 2 finished with value: 7.282830873932595e+16 a

=== XGBoost Results ===
Best parameters: {'n_estimators': 1141, 'learning_rate': 0.010187544257566544, 'max_depth': 12, 'min_child_weight': 8, 'subsample': 0.9524473133196641, 'colsample_bytree': 0.6529316656169715, 'gamma': 0.9216431453838577, 'reg_alpha': 0.35463171197526433, 'reg_lambda': 0.9975000256368836}
Best MAPE: 6.027959971138765e+16


In [50]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        # Remove GPU parameters
        # 'tree_method': 'gpu_hist',
        # 'predictor': 'gpu_predictor',
        'random_state': 42,
        'n_jobs': -1
    }
    
    model = xgb.XGBRegressor(**params)
    try:
        model.fit(
            X_outlier_train, y_outlier_train,
            eval_set=[(X_outlier_valid, y_outlier_valid)],
            verbose=False
        )
    except TypeError:
        model.fit(
            X_outlier_train, y_outlier_train,
            eval_set=[(X_outlier_valid, y_outlier_valid)],
            eval_metric='mape',
            verbose=False
        )
    
    preds = model.predict(X_outlier_valid)
    mape = mean_absolute_percentage_error(y_outlier_valid, preds)
    return mape

# Run XGBoost optimization
xgb_study_outlier = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study_outlier.optimize(xgb_objective, n_trials=100)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study_outlier.best_trial.params)
print("Best MAPE:", xgb_study_outlier.best_value)


[I 2025-08-04 11:44:06,031] A new study created in memory with name: XGBoost_Optimization
[I 2025-08-04 11:44:19,299] Trial 0 finished with value: 1.4394081240363827e+17 and parameters: {'n_estimators': 3864, 'learning_rate': 0.1592383084619231, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.7933369639227391, 'colsample_bytree': 0.8744455282804677, 'gamma': 0.8955404323234655, 'reg_alpha': 0.49457126987038746, 'reg_lambda': 0.08341034771223266}. Best is trial 0 with value: 1.4394081240363827e+17.
[I 2025-08-04 11:44:29,385] Trial 1 finished with value: 1.7099405548689818e+17 and parameters: {'n_estimators': 1627, 'learning_rate': 0.1944153727664222, 'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.875809284940008, 'colsample_bytree': 0.8525714758035381, 'gamma': 0.9418454829585513, 'reg_alpha': 0.28637501919298824, 'reg_lambda': 0.6455284197811507}. Best is trial 0 with value: 1.4394081240363827e+17.
[I 2025-08-04 11:44:45,935] Trial 2 finished with value: 9.665976800483738

=== XGBoost Results ===
Best parameters: {'n_estimators': 2052, 'learning_rate': 0.06548921607255545, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.6887430074817801, 'colsample_bytree': 0.6187932880145807, 'gamma': 0.34704162051902365, 'reg_alpha': 0.8462663507619503, 'reg_lambda': 0.10463786219196965}
Best MAPE: 7.539949091107635e+16


In [73]:
xgb_model_normal = xgb.XGBRegressor(n_estimators=1141,
                             learning_rate=0.010187544257566544,
                             max_depth=12,
                             min_child_weight=8,
                             subsample=0.9524473133196641,
                             colsample_bytree=0.6529316656169715,
                             gamma=0.9216431453838577,
                             reg_alpha=0.35463171197526433,
                             reg_lambda=0.9975000256368836)

xgb_model_outliers = xgb.XGBRegressor(n_estimators=2052,
                                     learning_rate=0.06548921607255545,
                                     max_depth=3,
                                     min_child_weight=10,
                                     subsample=0.6887430074817801,
                                     colsample_bytree=0.6187932880145807,
                                     gamma=0.34704162051902365,
                                     reg_alpha=0.8462663507619503,
                                     reg_lambda=0.10463786219196965)
 
xgb_model_normal = MultiOutputRegressor(xgb_model_normal)

xgb_model_outliers = MultiOutputRegressor(xgb_model_outliers)

xgb_model_normal.fit(X_normal_train, y_normal_train)
xgb_model_outliers.fit(X_outlier_train, y_outlier_train)

xgb_model_normal.fit(X_normal_train, y_normal_train)
xgb_model_outliers.fit(X_outlier_train, y_outlier_train)

preds_normal = xgb_model_normal.predict(X_normal_valid)
preds_outlier = xgb_model_outliers.predict(X_outlier_valid)

print("=== Normal Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_normal_valid, preds_normal))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_normal_valid.iloc[:, i], preds_normal[:, i]):.4f}")

print("\n=== Outlier Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_outlier_valid, preds_outlier))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_outlier_valid.iloc[:, i], preds_outlier[:, i]):.4f}")

=== Normal Region MAE ===
Overall MAE: 379.6600646972656
like_count_initial: 19.4974
like_count_final: 35.4814
view_count_initial: 542.9210
view_count_final: 920.7404

=== Outlier Region MAE ===
Overall MAE: 19980.529296875
like_count_initial: 794.6646
like_count_final: 1334.1140
view_count_initial: 29022.0614
view_count_final: 48771.2760


In [74]:
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd

preds_normal = xgb_model_normal.predict(X_normal_valid)
preds_outlier = xgb_model_outliers.predict(X_outlier_valid)

all_preds = np.vstack([preds_normal, preds_outlier])
all_y_true = pd.concat([y_normal_valid, y_outlier_valid], axis=0).to_numpy()

overall_mae = mean_absolute_error(all_y_true, all_preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(all_y_true.shape[1]):
    col_mae = mean_absolute_error(all_y_true[:, col_idx], all_preds[:, col_idx])
    print(f"Column {target_columns[col_idx]}: {col_mae:.4f}")

Overall MAE: 691.9288006472693

MAE for each output column:
Column like_count_initial: 31.8469
Column like_count_final: 56.1704
Column view_count_initial: 996.6328
Column view_count_final: 1683.0651


In [57]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error

def create_lgb_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'max_depth': trial.suggest_int('max_depth', 3, 14),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'objective': 'regression',
            'metric': 'mape',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        base_model = LGBMRegressor(**params)
        model = MultiOutputRegressor(base_model)

        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        mape = mean_absolute_percentage_error(y_valid, preds)
        return mape

    return objective

In [58]:
lgb_study_normal = optuna.create_study(direction="minimize", study_name="LightGBM_Normal")
lgb_study_normal.optimize(create_lgb_objective(X_normal_train, y_normal_train, X_normal_valid, y_normal_valid), n_trials=100)

print("=== LightGBM Normal Results ===")
print("Best parameters:", lgb_study_normal.best_trial.params)
print("Best MAPE:", lgb_study_normal.best_value)

[I 2025-08-04 12:29:36,600] A new study created in memory with name: LightGBM_Normal
[I 2025-08-04 12:30:32,401] Trial 0 finished with value: 8.166858279221317e+16 and parameters: {'n_estimators': 2527, 'learning_rate': 0.1990990804459522, 'max_depth': 13, 'reg_alpha': 0.8638706343391723, 'reg_lambda': 0.8329827413060983, 'min_child_samples': 16, 'colsample_bytree': 0.898976907200319, 'subsample': 0.8676625931529345}. Best is trial 0 with value: 8.166858279221317e+16.
[I 2025-08-04 12:31:43,241] Trial 1 finished with value: 8.4218966271172e+16 and parameters: {'n_estimators': 3304, 'learning_rate': 0.19368257846615958, 'max_depth': 12, 'reg_alpha': 0.4808692805458178, 'reg_lambda': 0.1648082030995769, 'min_child_samples': 28, 'colsample_bytree': 0.9859916391389167, 'subsample': 0.7380065584041695}. Best is trial 0 with value: 8.166858279221317e+16.
[I 2025-08-04 12:32:42,897] Trial 2 finished with value: 7.67567354154004e+16 and parameters: {'n_estimators': 2910, 'learning_rate': 0.128

=== LightGBM Normal Results ===
Best parameters: {'n_estimators': 1289, 'learning_rate': 0.031217526804921125, 'max_depth': 14, 'reg_alpha': 0.2629943207965433, 'reg_lambda': 0.01019299949956498, 'min_child_samples': 15, 'colsample_bytree': 0.7843079729427788, 'subsample': 0.970356315270121}
Best MAPE: 6.317977851132298e+16


In [None]:
lgb_study_outlier = optuna.create_study(direction="minimize", study_name="LightGBM_Outlier")
lgb_study_outlier.optimize(create_lgb_objective(X_outlier_train, y_outlier_train, X_outlier_valid, y_outlier_valid), n_trials=100)

print("=== LightGBM Outlier Results ===")
print("Best parameters:", lgb_study_outlier.best_trial.params)
print("Best MAPE:", lgb_study_outlier.best_value)

[I 2025-08-04 13:34:34,691] A new study created in memory with name: LightGBM_Outlier
[I 2025-08-04 13:34:42,918] Trial 0 finished with value: 1.6938310567224813e+17 and parameters: {'n_estimators': 2922, 'learning_rate': 0.18001464963224795, 'max_depth': 3, 'reg_alpha': 0.583093109537375, 'reg_lambda': 0.820127668455808, 'min_child_samples': 27, 'colsample_bytree': 0.738422790017843, 'subsample': 0.5677182435064568}. Best is trial 0 with value: 1.6938310567224813e+17.
[I 2025-08-04 13:34:52,995] Trial 1 finished with value: 1.791005954357252e+17 and parameters: {'n_estimators': 572, 'learning_rate': 0.14617151441347803, 'max_depth': 12, 'reg_alpha': 0.689628193745789, 'reg_lambda': 0.16357775267495855, 'min_child_samples': 12, 'colsample_bytree': 0.7788486277094301, 'subsample': 0.8122570299282943}. Best is trial 0 with value: 1.6938310567224813e+17.
[I 2025-08-04 13:34:59,773] Trial 2 finished with value: 1.4918694398653907e+17 and parameters: {'n_estimators': 1737, 'learning_rate': 

=== LightGBM Outlier Results ===
Best parameters: {'n_estimators': 3087, 'learning_rate': 0.1951649986395337, 'max_depth': 4, 'reg_alpha': 0.9618691297082163, 'reg_lambda': 0.03033112072958473, 'min_child_samples': 33, 'colsample_bytree': 0.6886004856895882, 'subsample': 0.9559663043916242}
Best MAPE: 5.9899381100405e+16


In [67]:
lgbm_model_normal = LGBMRegressor(n_estimators=1289,
                                  learning_rate=0.031217526804921125,
                                  max_depth=14,
                                  reg_alpha=0.2629943207965433,
                                  reg_lambda=0.01019299949956498,
                                  min_child_samples=15,
                                  colsample_bytree=0.7843079729427788,
                                  subsample=0.970356315270121)

lgbm_model_outliers = LGBMRegressor(n_estimators=3087,
                                    learning_rate=0.1951649986395337,
                                    max_depth=4,
                                    reg_alpha=0.9618691297082163,
                                    reg_lambda=0.03033112072958473,
                                    min_child_samples=33,
                                    colsample_bytree=0.6886004856895882,
                                    subsample=0.9559663043916242)


lgbm_model_normal = MultiOutputRegressor(lgbm_model_normal)
lgbm_model_outliers = MultiOutputRegressor(lgbm_model_outliers)

lgbm_model_normal.fit(X_normal_train, y_normal_train)
lgbm_model_outliers.fit(X_outlier_train, y_outlier_train)

lgbm_model_normal.fit(X_normal_train, y_normal_train)
lgbm_model_outliers.fit(X_outlier_train, y_outlier_train)

preds_normal = lgbm_model_normal.predict(X_normal_valid)
preds_outlier = lgbm_model_outliers.predict(X_outlier_valid)

print("=== Normal Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_normal_valid, preds_normal))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_normal_valid.iloc[:, i], preds_normal[:, i]):.4f}")

print("\n=== Outlier Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_outlier_valid, preds_outlier))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_outlier_valid.iloc[:, i], preds_outlier[:, i]):.4f}")

=== Normal Region MAE ===
Overall MAE: 387.32461983880194
like_count_initial: 19.9493
like_count_final: 36.3620
view_count_initial: 554.4814
view_count_final: 938.5058

=== Outlier Region MAE ===
Overall MAE: 62781.6758614541
like_count_initial: 3201.8237
like_count_final: 4028.8394
view_count_initial: 97983.9694
view_count_final: 145912.0710


In [63]:
lgbm_preds_normal = lgbm_model_normal.predict(X_normal_valid)
lgbm_preds_outlier = lgbm_model_outliers.predict(X_outlier_valid)

lgbm_all_preds = np.vstack([lgbm_preds_normal, lgbm_preds_outlier])

overall_mae = mean_absolute_error(all_y_true, all_preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(all_y_true.shape[1]):
    col_mae = mean_absolute_error(all_y_true[:, col_idx], lgbm_all_preds[:, col_idx])
    print(f"Column {target_columns[col_idx]}: {col_mae:.4f}")

Overall MAE: 1756.4580436676624

MAE for each output column:
Column like_count_initial: 92.2353
Column like_count_final: 127.0633
Column view_count_initial: 2767.8896
Column view_count_final: 4232.0229
