In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import StackingRegressor

import matplotlib.pyplot as plt

from scipy import stats
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
import pickle
import json


import warnings
warnings.filterwarnings("ignore")

In [6]:
preprocessed_dataset = pd.read_csv("../data/pre-processed/preprocessed_youtube_data.csv")

In [7]:
# Convert to datetime if not already
preprocessed_dataset['published_time'] = pd.to_datetime(preprocessed_dataset['published_time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute
preprocessed_dataset['published_hour'] = preprocessed_dataset['published_time'].dt.hour
preprocessed_dataset['published_minute'] = preprocessed_dataset['published_time'].dt.minute

# Drop the original time column
preprocessed_dataset.drop(columns=['published_time'], inplace=True)

In [8]:
# Separate label encoders for each column
le_day = LabelEncoder()
le_def = LabelEncoder()

preprocessed_dataset['published_day_of_week'] = le_day.fit_transform(preprocessed_dataset['published_day_of_week'])
preprocessed_dataset['definition'] = le_def.fit_transform(preprocessed_dataset['definition'])


In [9]:
missing_mask = preprocessed_dataset[['like_count_initial', 'like_count_final', 'view_count_initial', 'view_count_final']].isnull()

# Check rows where all four columns are NaN
rows_all_nan = missing_mask.all(axis=1)

print("Number of rows with all four columns NaN:", rows_all_nan.sum())

# Optionally, see those rows
print(preprocessed_dataset[rows_all_nan])

Number of rows with all four columns NaN: 5
       category_id country  definition logged_at_initial  view_count_initial  \
4465           NaN     NaN           2               NaN                 NaN   
8504           NaN     NaN           2               NaN                 NaN   
17296          NaN     NaN           2               NaN                 NaN   
18396          NaN     NaN           2               NaN                 NaN   
22842          NaN     NaN           2               NaN                 NaN   

       like_count_initial  c_view_count_initial logged_at_final  \
4465                  NaN                   NaN             NaN   
8504                  NaN                   NaN             NaN   
17296                 NaN                   NaN             NaN   
18396                 NaN                   NaN             NaN   
22842                 NaN                   NaN             NaN   

       view_count_final  like_count_final  ...  channel_avg_views  \
446

In [10]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 705


In [11]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 705


In [12]:
# Step 1: Convert to category
preprocessed_dataset['country'] = preprocessed_dataset['country'].astype('category')

# Step 2: Save mapping BEFORE encoding
categories = list(preprocessed_dataset['country'].cat.categories)
country_to_code = {country: idx for idx, country in enumerate(categories)}

with open("country_encoding.json", "w") as f:
    json.dump(country_to_code, f)

# Step 3: Encode with .cat.codes
preprocessed_dataset['country_encoded'] = preprocessed_dataset['country'].cat.codes


preprocessed_dataset['category_id'] = (
    preprocessed_dataset['category_id']
    .fillna(-1)  # or any placeholder like 999
    .astype(int)
)


In [13]:
preprocessed_dataset.drop(columns=['logged_at_final', 'logged_at_initial','country'], inplace=True)

In [14]:
preprocessed_dataset = preprocessed_dataset.dropna(subset=[
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
])

# Define target columns
target_columns = [
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
]

In [None]:
# Choose columns to check for outliers, e.g. target columns
cols_to_check = target_columns  # or other numeric features

z_scores = np.abs(stats.zscore(preprocessed_dataset[cols_to_check]))
# Define threshold, e.g. 3 std deviations
threshold = 3
# Keep only rows where all z-scores are below threshold (no outlier)
non_outliers = (z_scores < threshold).all(axis=1)


print(f"Rows before outlier removal: {len(preprocessed_dataset)}")
preprocessed_dataset = preprocessed_dataset[non_outliers]
print(f"Rows after outlier removal: {len(preprocessed_dataset)}")

Rows before outlier removal: 24643
Rows after outlier removal: 24406


In [21]:
q25 = preprocessed_dataset['view_count_final'].quantile(0.25)
q75 = preprocessed_dataset['view_count_final'].quantile(0.75)

df_low = preprocessed_dataset[preprocessed_dataset['view_count_final'] <= q25]
df_mid = preprocessed_dataset[(preprocessed_dataset['view_count_final'] > q25) & (preprocessed_dataset['view_count_final'] <= q75)]
df_high = preprocessed_dataset[preprocessed_dataset['view_count_final'] > q75]

In [23]:
def describe_stats(df, cols):
    mean_vals = df[cols].mean()
    median_vals = df[cols].median()
    mode_vals = df[cols].mode().iloc[0]  # mode() returns DataFrame, take first mode row
    
    summary = pd.DataFrame({
        'mean': mean_vals,
        'median': median_vals,
        'mode': mode_vals
    })
    return summary

print("Low quantile stats:")
print(describe_stats(df_low, target_columns))

print("\nMid quantile stats:")
print(describe_stats(df_mid, target_columns))

print("\nHigh quantile stats:")
print(describe_stats(df_high, target_columns))

Low quantile stats:
                        mean  median  mode
like_count_initial  0.439024     0.0   0.0
like_count_final    0.554524     0.0   0.0
view_count_initial  2.686231     0.0   0.0
view_count_final    1.096459     0.0   0.0

Mid quantile stats:
                          mean  median  mode
like_count_initial    5.809118     2.0   0.0
like_count_final      9.325554     4.0   0.0
view_count_initial  180.674195   107.0   0.0
view_count_final    268.228022   220.0   8.0

High quantile stats:
                           mean  median   mode
like_count_initial   112.838419    20.0    0.0
like_count_final     210.416831    45.0    0.0
view_count_initial  2831.420932   984.5    0.0
view_count_final    5494.560039  1839.5  776.0


In [35]:
print(df_low.shape)
print(df_mid.shape)
print(df_high.shape)

(6355, 23)
(11955, 23)
(6096, 23)


In [None]:
X_low = df_low.drop(columns=target_columns)
y_low = df_low[target_columns]

X_mid = df_mid.drop(columns=target_columns)
y_mid = df_mid[target_columns]

X_high = df_high.drop(columns=target_columns)
y_high = df_high[target_columns]

X_low_train, X_low_valid, y_low_train, y_low_valid = train_test_split(
    X_low, y_low, test_size=0.2, random_state=42
)

X_mid_train, X_mid_valid, y_mid_train, y_mid_valid = train_test_split(
    X_mid, y_mid, test_size=0.2, random_state=42
)

X_high_train, X_high_valid, y_high_train, y_high_valid = train_test_split(
    X_high, y_high, test_size=0.2, random_state=42
)

In [None]:
y_high_train_log = np.log1p(y_high_train)
y_mid_train_log = np.log1p(y_mid_train)
y_low_train_log = np.log1p(y_low_train)

In [59]:
X = preprocessed_dataset.drop(columns=target_columns)
y = preprocessed_dataset[target_columns]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

y_train_log = np.log1p(y_train)

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**params)

    model.fit(
        X_train, y_train_log,
        eval_set=[(X_valid, np.log1p(y_valid))],
        verbose=False
    )

    preds_log = model.predict(X_valid)
    preds = np.expm1(preds_log)

    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run Optuna optimization
xgb_study = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study.optimize(xgb_objective, n_trials=10)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study.best_trial.params)
print("Best MAPE:", xgb_study.best_value)

[I 2025-08-04 21:02:07,987] A new study created in memory with name: XGBoost_Optimization
[I 2025-08-04 21:02:22,296] Trial 0 finished with value: 1.4874246987644928e+16 and parameters: {'n_estimators': 3947, 'learning_rate': 0.09461766381030756, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.8500051821023207, 'colsample_bytree': 0.9370362952602258, 'gamma': 0.6612943435562048, 'reg_alpha': 0.9334310625493863, 'reg_lambda': 0.1649915063463956}. Best is trial 0 with value: 1.4874246987644928e+16.
[I 2025-08-04 21:02:29,618] Trial 1 finished with value: 1.3172662549348352e+16 and parameters: {'n_estimators': 2029, 'learning_rate': 0.054490751955891044, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9771885834035087, 'colsample_bytree': 0.8887743013768867, 'gamma': 0.8226295644712508, 'reg_alpha': 0.21217204915181642, 'reg_lambda': 0.6375549417350144}. Best is trial 1 with value: 1.3172662549348352e+16.
[I 2025-08-04 21:02:40,835] Trial 2 finished with value: 1.585237210221

=== XGBoost Results ===
Best parameters: {'n_estimators': 2029, 'learning_rate': 0.054490751955891044, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9771885834035087, 'colsample_bytree': 0.8887743013768867, 'gamma': 0.8226295644712508, 'reg_alpha': 0.21217204915181642, 'reg_lambda': 0.6375549417350144}
Best MAPE: 1.3172662549348352e+16


In [60]:
xgb_model = xgb.XGBRegressor(**xgb_study.best_trial.params)

In [61]:
xgb_model = xgb.XGBRegressor(**xgb_study.best_trial.params)

xgb_model2 = MultiOutputRegressor(xgb_model)
xgb_model2.fit(X_train, np.log1p(y_train))

preds_log = xgb_model2.predict(X_valid)
preds = np.expm1(preds_log)

overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"{target_columns[col_idx]}: {col_mae:.4f}")

# Optional: Manual MAPE check
# manual_mape = np.mean(np.abs((y_valid.to_numpy() - preds) / y_valid.to_numpy())) * 100
# print("Manual MAPE:", manual_mape)

Overall MAE: 507.79034423828125

MAE for each output column:
like_count_initial: 28.5559
like_count_final: 47.5587
view_count_initial: 710.1768
view_count_final: 1244.8700


In [63]:
overall_smape = smape(y_valid.to_numpy(), preds)
print("\nOverall SMAPE:", overall_smape)

print("\nSMAPE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_smape = smape(y_valid.iloc[:, col_idx].values, preds[:, col_idx])
    print(f"{target_columns[col_idx]}: {col_smape:.4f}")


Overall SMAPE: 125.37239749873233

SMAPE for each output column:
like_count_initial: 129.2731
like_count_final: 125.2121
view_count_initial: 124.5875
view_count_final: 122.4170


In [52]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error

def xgb_objective(trial, X_train, y_train, X_valid, y_valid):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**params)
    try:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
    except TypeError:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='mape',
            verbose=False
        )

    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

def optimize_xgb(X_train, y_train, X_valid, y_valid, study_name, n_trials=10):
    study = optuna.create_study(direction="minimize", study_name=study_name)
    func = lambda trial: xgb_objective(trial, X_train, y_train, X_valid, y_valid)
    study.optimize(func, n_trials=n_trials)
    print(f"=== {study_name} Results ===")
    print("Best parameters:", study.best_trial.params)
    print("Best MAPE:", study.best_value)
    return study

xgb_study_low = optimize_xgb(X_low_train, y_low_train_log, X_low_valid, y_low_valid, "XGBoost_Optimization_Low")
xgb_study_mid = optimize_xgb(X_mid_train, y_mid_train_log, X_mid_valid, y_mid_valid, "XGBoost_Optimization_Mid")
xgb_study_high = optimize_xgb(X_high_train, y_high_train_log, X_high_valid, y_high_valid, "XGBoost_Optimization_High")

[I 2025-08-04 20:33:03,783] A new study created in memory with name: XGBoost_Optimization_Low
[I 2025-08-04 20:33:08,383] Trial 0 finished with value: 915232297844736.0 and parameters: {'n_estimators': 2250, 'learning_rate': 0.1691854099151752, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.8435201389178885, 'colsample_bytree': 0.6604896931640378, 'gamma': 0.22553540603993338, 'reg_alpha': 0.41834744570475657, 'reg_lambda': 0.6521175728286375}. Best is trial 0 with value: 915232297844736.0.
[I 2025-08-04 20:33:12,489] Trial 1 finished with value: 902540535267328.0 and parameters: {'n_estimators': 1734, 'learning_rate': 0.11517847732047454, 'max_depth': 10, 'min_child_weight': 9, 'subsample': 0.8676531438291367, 'colsample_bytree': 0.7036921115448141, 'gamma': 0.1727055865205792, 'reg_alpha': 0.7059185939028298, 'reg_lambda': 0.680001133075888}. Best is trial 1 with value: 902540535267328.0.
[I 2025-08-04 20:33:18,341] Trial 2 finished with value: 893799135969280.0 and parameters

=== XGBoost_Optimization_Low Results ===
Best parameters: {'n_estimators': 1636, 'learning_rate': 0.14934854012689025, 'max_depth': 14, 'min_child_weight': 10, 'subsample': 0.7473156230517488, 'colsample_bytree': 0.7477507854280089, 'gamma': 0.8482831699121116, 'reg_alpha': 0.4144573971958716, 'reg_lambda': 0.5304128061203928}
Best MAPE: 881786179551232.0


[I 2025-08-04 20:34:07,061] Trial 0 finished with value: 718735799222272.0 and parameters: {'n_estimators': 1520, 'learning_rate': 0.024185636239910203, 'max_depth': 6, 'min_child_weight': 3, 'subsample': 0.6438807502943326, 'colsample_bytree': 0.8499734888781426, 'gamma': 0.588030135373623, 'reg_alpha': 0.2746317908348729, 'reg_lambda': 0.4712887487397688}. Best is trial 0 with value: 718735799222272.0.
[I 2025-08-04 20:34:14,232] Trial 1 finished with value: 726191862448128.0 and parameters: {'n_estimators': 3132, 'learning_rate': 0.1546728659198377, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.8632822061713544, 'colsample_bytree': 0.7152130537566247, 'gamma': 0.9412184940318488, 'reg_alpha': 0.8189313637537742, 'reg_lambda': 0.6404119340273905}. Best is trial 0 with value: 718735799222272.0.
[I 2025-08-04 20:34:23,886] Trial 2 finished with value: 722564225695744.0 and parameters: {'n_estimators': 3974, 'learning_rate': 0.16575554888234423, 'max_depth': 12, 'min_child_weig

=== XGBoost_Optimization_Mid Results ===
Best parameters: {'n_estimators': 1713, 'learning_rate': 0.09637049526722055, 'max_depth': 7, 'min_child_weight': 7, 'subsample': 0.648534705782699, 'colsample_bytree': 0.7098509656211777, 'gamma': 0.3908758627283362, 'reg_alpha': 0.7169739574092223, 'reg_lambda': 0.0822594087841082}
Best MAPE: 718486959554560.0


[I 2025-08-04 20:35:19,904] Trial 0 finished with value: 492753242816512.0 and parameters: {'n_estimators': 984, 'learning_rate': 0.17853730322662031, 'max_depth': 13, 'min_child_weight': 5, 'subsample': 0.9918679022519797, 'colsample_bytree': 0.6525247243618869, 'gamma': 0.5762438953805084, 'reg_alpha': 0.09706995170091481, 'reg_lambda': 0.16310652200639542}. Best is trial 0 with value: 492753242816512.0.
[I 2025-08-04 20:35:22,746] Trial 1 finished with value: 514253144457216.0 and parameters: {'n_estimators': 801, 'learning_rate': 0.04455690556135171, 'max_depth': 12, 'min_child_weight': 2, 'subsample': 0.9562387450430431, 'colsample_bytree': 0.6156532670467968, 'gamma': 0.6537329456350439, 'reg_alpha': 0.7924832976997979, 'reg_lambda': 0.7258906135689638}. Best is trial 0 with value: 492753242816512.0.
[I 2025-08-04 20:35:35,001] Trial 2 finished with value: 495779886137344.0 and parameters: {'n_estimators': 3777, 'learning_rate': 0.02606472997387763, 'max_depth': 10, 'min_child_we

=== XGBoost_Optimization_High Results ===
Best parameters: {'n_estimators': 3981, 'learning_rate': 0.10366208203799912, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.9613644152310399, 'colsample_bytree': 0.6325034376956548, 'gamma': 0.13027905238536275, 'reg_alpha': 0.6928373599998466, 'reg_lambda': 0.49691245909852977}
Best MAPE: 475133609246720.0


In [53]:
xgb_model_low = xgb.XGBRegressor(**xgb_study_low.best_trial.params)
xgb_model_mid = xgb.XGBRegressor(**xgb_study_mid.best_trial.params)
xgb_model_high = xgb.XGBRegressor(**xgb_study_high.best_trial.params)

xgb_model_low = MultiOutputRegressor(xgb_model_low)
xgb_model_mid = MultiOutputRegressor(xgb_model_mid)
xgb_model_high = MultiOutputRegressor(xgb_model_high)

xgb_model_low.fit(X_low_train, y_low_train_log)
xgb_model_mid.fit(X_mid_train, y_mid_train_log)
xgb_model_high.fit(X_high_train, y_high_train_log)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6325034376956548
,device,
,early_stopping_rounds,
,enable_categorical,False


In [54]:
preds_low = np.expm1(xgb_model_low.predict(X_low_valid))
preds_mid = np.expm1(xgb_model_mid.predict(X_mid_valid))
preds_high = np.expm1(xgb_model_high.predict(X_high_valid))

print("=== Low Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_low_valid, preds_low))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_low_valid.iloc[:, i], preds_low[:, i]):.4f}")

print("\n=== Mid Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_mid_valid, preds_mid))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_mid_valid.iloc[:, i], preds_mid[:, i]):.4f}")

print("\n=== High Region MAE ===")
print("Overall MAE:", mean_absolute_error(y_high_valid, preds_high))
for i, col in enumerate(target_columns):
    print(f"{col}: {mean_absolute_error(y_high_valid.iloc[:, i], preds_high[:, i]):.4f}")

=== Low Region MAE ===
Overall MAE: 1.289104700088501
like_count_initial: 0.3581
like_count_final: 0.8197
view_count_initial: 2.9388
view_count_final: 1.0397

=== Mid Region MAE ===
Overall MAE: 87.70445251464844
like_count_initial: 4.9960
like_count_final: 7.8142
view_count_initial: 143.6910
view_count_final: 194.3166

=== High Region MAE ===
Overall MAE: 1519.567626953125
like_count_initial: 85.2049
like_count_final: 166.5257
view_count_initial: 2004.3020
view_count_final: 3822.2376


In [55]:
import numpy as np

def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    smape_value = np.mean(np.where(denominator == 0, 0, diff / denominator)) * 100
    return smape_value

print("=== Low Region SMAPE ===")
print("Overall SMAPE:", smape(y_low_valid.values, preds_low))
for i, col in enumerate(target_columns):
    print(f"{col}: {smape(y_low_valid.iloc[:, i].values, preds_low[:, i]):.4f}")

print("\n=== Mid Region SMAPE ===")
print("Overall SMAPE:", smape(y_mid_valid.values, preds_mid))
for i, col in enumerate(target_columns):
    print(f"{col}: {smape(y_mid_valid.iloc[:, i].values, preds_mid[:, i]):.4f}")
    

print("\n=== High Region SMAPE ===")
print("Overall SMAPE:", smape(y_high_valid.values, preds_high))
for i, col in enumerate(target_columns):
    print(f"{col}: {smape(y_high_valid.iloc[:, i].values, preds_high[:, i]):.4f}")

=== Low Region SMAPE ===
Overall SMAPE: 168.51488532083118
like_count_initial: 190.7897
like_count_final: 188.3678
view_count_initial: 152.6129
view_count_final: 142.2892

=== Mid Region SMAPE ===
Overall SMAPE: 102.38235678870046
like_count_initial: 112.9511
like_count_final: 103.8348
view_count_initial: 101.8631
view_count_final: 90.8805

=== High Region SMAPE ===
Overall SMAPE: 79.45835519876309
like_count_initial: 94.0973
like_count_final: 95.0935
view_count_initial: 67.7931
view_count_final: 60.8494


In [None]:
print("\n=== Overall SMAPE Across All Regions ===")
print("Overall SMAPE:", smape(y_all_valid.values, preds_all))

# SMAPE for each column
for i, col in enumerate(target_columns):
    print(f"{col}: {smape(y_all_valid.iloc[:, i].values, preds_all[:, i]):.4f}")

In [56]:
import numpy as np

def nrmse(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    range_ = np.max(y_true) - np.min(y_true)
    return (rmse / range_) * 100 if range_ != 0 else 0.0

print("=== Low Region nRMSE ===")
print("Overall nRMSE:", nrmse(y_low_valid.values, preds_low))
for i, col in enumerate(target_columns):
    print(f"{col}: {nrmse(y_low_valid.iloc[:, i].values, preds_low[:, i]):.4f}")

print("\n=== Mid Region nRMSE ===")
print("Overall nRMSE:", nrmse(y_mid_valid.values, preds_mid))
for i, col in enumerate(target_columns):
    print(f"{col}: {nrmse(y_mid_valid.iloc[:, i].values, preds_mid[:, i]):.4f}")

print("\n=== High Region nRMSE ===")
print("Overall nRMSE:", nrmse(y_high_valid.values, preds_high))
for i, col in enumerate(target_columns):
    print(f"{col}: {nrmse(y_high_valid.iloc[:, i].values, preds_high[:, i]):.4f}")

=== Low Region nRMSE ===
Overall nRMSE: 1.9778539517012668
like_count_initial: 3.1992
like_count_final: 2.8283
view_count_initial: 3.4944
view_count_final: 34.0807

=== Mid Region nRMSE ===
Overall nRMSE: 5.912502234213298
like_count_initial: 3.6898
like_count_final: 5.0511
view_count_initial: 7.5031
view_count_final: 33.8010

=== High Region nRMSE ===
Overall nRMSE: 5.490418634648258
like_count_initial: 7.8330
like_count_final: 11.6467
view_count_initial: 7.7678
view_count_final: 9.6796


In [57]:
preds_low = np.expm1(xgb_model_low.predict(X_low_valid))
preds_mid = np.expm1(xgb_model_mid.predict(X_mid_valid))
preds_high = np.expm1(xgb_model_high.predict(X_high_valid))

all_preds = np.vstack([preds_low, preds_mid, preds_high])

all_y_true = pd.concat([y_low_valid, y_mid_valid, y_high_valid], axis=0).to_numpy()

overall_mae = mean_absolute_error(all_y_true, all_preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(all_y_true.shape[1]):
    col_mae = mean_absolute_error(all_y_true[:, col_idx], all_preds[:, col_idx])
    print(f"Column {target_columns[col_idx]}: {col_mae:.4f}")

Overall MAE: 423.02585479589186

MAE for each output column:
Column like_count_initial: 23.8326
Column like_count_final: 45.6549
Column view_count_initial: 572.0092
Column view_count_final: 1050.6068


In [68]:
overall_smape = smape(all_y_true, all_preds)
print("Overall SMAPE:", overall_smape)

# SMAPE per column
print("\nSMAPE for each output column:")
for col_idx in range(all_y_true.shape[1]):
    col_smape = smape(all_y_true[:, col_idx], all_preds[:, col_idx])
    print(f"Column {target_columns[col_idx]}: {col_smape:.2f}%")

Overall SMAPE: 113.8709192271713

SMAPE for each output column:
Column like_count_initial: 128.50%
Column like_count_final: 123.66%
Column view_count_initial: 106.56%
Column view_count_final: 96.76%


In [30]:
def create_lgb_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'max_depth': trial.suggest_int('max_depth', 3, 14),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'objective': 'regression',
            'metric': 'mape',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        model = MultiOutputRegressor(LGBMRegressor(**params))
        model.fit(X_train, y_train)

        preds = model.predict(X_valid)
        mape = mean_absolute_percentage_error(y_valid, preds)
        return mape

    return objective

In [32]:
import optuna

# LOW region
study_low = optuna.create_study(direction='minimize')
study_low.optimize(create_lgb_objective(X_low_train, y_low_train, X_low_valid, y_low_valid), n_trials=10)

# MID region
study_mid = optuna.create_study(direction='minimize')
study_mid.optimize(create_lgb_objective(X_mid_train, y_mid_train, X_mid_valid, y_mid_valid), n_trials=10)

# HIGH region
study_high = optuna.create_study(direction='minimize')
study_high.optimize(create_lgb_objective(X_high_train, y_high_train, X_high_valid, y_high_valid), n_trials=10)

[I 2025-08-04 19:47:08,522] A new study created in memory with name: no-name-820a98af-6e94-4fc6-a8ae-2d661fd8eecc
[I 2025-08-04 19:47:54,213] Trial 0 finished with value: 7928848174820195.0 and parameters: {'n_estimators': 2692, 'learning_rate': 0.18257960244361432, 'max_depth': 11, 'reg_alpha': 0.7700945671267907, 'reg_lambda': 0.8052487407141806, 'min_child_samples': 30, 'colsample_bytree': 0.9236564522824311, 'subsample': 0.502212248870031}. Best is trial 0 with value: 7928848174820195.0.
[I 2025-08-04 19:48:06,201] Trial 1 finished with value: 6354468639947926.0 and parameters: {'n_estimators': 1249, 'learning_rate': 0.06614494188025122, 'max_depth': 5, 'reg_alpha': 0.9330710201903581, 'reg_lambda': 0.9463471349620218, 'min_child_samples': 40, 'colsample_bytree': 0.6751895979346652, 'subsample': 0.5624077341224145}. Best is trial 1 with value: 6354468639947926.0.
[W 2025-08-04 19:48:13,878] Trial 2 failed with parameters: {'n_estimators': 3162, 'learning_rate': 0.18418427084098773,

KeyboardInterrupt: 

In [None]:
from optuna import create_study

# Create and run the study
lgb_study_outlier = create_study(direction="minimize", study_name="LightGBM_Outlier")
objective_fn_outlier = create_lgb_objective(
    X_outlier_train, y_outlier_train, X_outlier_valid, y_outlier_valid
)
lgb_study_outlier.optimize(objective_fn_outlier, n_trials=100)

# Display results
print("=== LightGBM Outlier Results ===")
print("Best parameters:")
for key, value in lgb_study_outlier.best_trial.params.items():
    print(f"{key}: {value}")
print(f"Best MAPE: {lgb_study_outlier.best_value:.4f}")