## Imports and Setup

In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import numpy as np


from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import StackingRegressor

import matplotlib.pyplot as plt

from scipy import stats
import numpy as np
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [3]:
preprocessed_dataset = pd.read_csv("../data/pre-processed/preprocessed_youtube_data.csv")

In [4]:
# Convert to datetime if not already
preprocessed_dataset['published_time'] = pd.to_datetime(preprocessed_dataset['published_time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute
preprocessed_dataset['published_hour'] = preprocessed_dataset['published_time'].dt.hour
preprocessed_dataset['published_minute'] = preprocessed_dataset['published_time'].dt.minute

# Drop the original time column
preprocessed_dataset.drop(columns=['published_time'], inplace=True)

In [None]:
# Initialize label encoder
le = LabelEncoder()

# Apply on 'published_day_of_week'
preprocessed_dataset['published_day_of_week'] = le.fit_transform(preprocessed_dataset['published_day_of_week'])

In [6]:
missing_mask = preprocessed_dataset[['like_count_initial', 'like_count_final', 'view_count_initial', 'view_count_final']].isnull()

# Check rows where all four columns are NaN
rows_all_nan = missing_mask.all(axis=1)

print("Number of rows with all four columns NaN:", rows_all_nan.sum())

# Optionally, see those rows
print(preprocessed_dataset[rows_all_nan])

Number of rows with all four columns NaN: 11
     category_id  view_count_initial  like_count_initial  \
22           NaN                 NaN                 NaN   
28           NaN                 NaN                 NaN   
32           NaN                 NaN                 NaN   
61           NaN                 NaN                 NaN   
70           NaN                 NaN                 NaN   
83           NaN                 NaN                 NaN   
130          NaN                 NaN                 NaN   
137          NaN                 NaN                 NaN   
179          NaN                 NaN                 NaN   
239          NaN                 NaN                 NaN   
307          NaN                 NaN                 NaN   

     c_view_count_initial  c_subscriber_count_initial  view_count_final  \
22                    NaN                         NaN               NaN   
28                    NaN                         NaN               NaN   
32       

In [7]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 11


In [8]:
preprocessed_dataset = preprocessed_dataset.dropna(subset=[
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
])

# Define target columns
target_columns = [
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
]

In [9]:
# Choose columns to check for outliers, e.g. target columns
cols_to_check = target_columns  # or other numeric features

z_scores = np.abs(stats.zscore(preprocessed_dataset[cols_to_check]))
# Define threshold, e.g. 3 std deviations
threshold = 3
# Keep only rows where all z-scores are below threshold (no outlier)
non_outliers = (z_scores < threshold).all(axis=1)


print(f"Rows before outlier removal: {len(preprocessed_dataset)}")
preprocessed_dataset = preprocessed_dataset[non_outliers]
print(f"Rows after outlier removal: {len(preprocessed_dataset)}")

Rows before outlier removal: 339
Rows after outlier removal: 332


In [12]:
# Separate features and targets
X = preprocessed_dataset.drop(columns=target_columns)
y = preprocessed_dataset[target_columns]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [45]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        # Remove GPU parameters
        # 'tree_method': 'gpu_hist',
        # 'predictor': 'gpu_predictor',
        'random_state': 42,
        'n_jobs': -1
    }
    
    model = xgb.XGBRegressor(**params)
    try:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )
    except TypeError:
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='mape',
            verbose=False
        )
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run XGBoost optimization
xgb_study = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study.optimize(xgb_objective, n_trials=100)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study.best_trial.params)
print("Best MAPE:", xgb_study.best_value)


[I 2025-07-30 17:03:49,634] A new study created in memory with name: XGBoost_Optimization
[I 2025-07-30 17:04:05,310] Trial 0 finished with value: 1.8604313985574502e+17 and parameters: {'n_estimators': 2987, 'learning_rate': 0.05106071894145423, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.689467581064696, 'colsample_bytree': 0.8283061246245973, 'gamma': 0.5008432046251461, 'reg_alpha': 0.07103301134755069, 'reg_lambda': 0.2840480207424877}. Best is trial 0 with value: 1.8604313985574502e+17.
[I 2025-07-30 17:04:15,573] Trial 1 finished with value: 4.3166647835846246e+17 and parameters: {'n_estimators': 3103, 'learning_rate': 0.18610828263916201, 'max_depth': 5, 'min_child_weight': 8, 'subsample': 0.7887470011131147, 'colsample_bytree': 0.9984325715994076, 'gamma': 0.029430983295131274, 'reg_alpha': 0.2072872133728041, 'reg_lambda': 0.1599111136209196}. Best is trial 0 with value: 1.8604313985574502e+17.
[I 2025-07-30 17:04:21,555] Trial 2 finished with value: 5.6156178563150

=== XGBoost Results ===
Best parameters: {'n_estimators': 1062, 'learning_rate': 0.11964854632695483, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9696472459477042, 'colsample_bytree': 0.7315673461781189, 'gamma': 0.16262350633828251, 'reg_alpha': 0.6610390411463459, 'reg_lambda': 0.2257522016837052}
Best MAPE: 4.125767918367539e+16


In [49]:
y.shape

(332, 4)

In [None]:
import optuna
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor  # Missing import
from sklearn.metrics import mean_absolute_percentage_error

def lgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Remove 'device': 'gpu' to avoid potential issues
        'objective': 'regression',
        'metric': 'mape',
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1  # Suppress LightGBM output
    }
    
    base_model = LGBMRegressor(**params)
    model = MultiOutputRegressor(base_model)
    
    model.fit(X_train, y_train)
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run LightGBM optimization
lgb_study = optuna.create_study(direction="minimize", study_name="LightGBM_Optimization")
lgb_study.optimize(lgb_objective, n_trials=50)

print("=== LightGBM Results ===")
print("Best parameters:", lgb_study.best_trial.params)
print("Best MAPE:", lgb_study.best_value)


[I 2025-07-30 17:24:19,013] A new study created in memory with name: LightGBM_Optimization
[I 2025-07-30 17:24:28,740] Trial 0 finished with value: 8.684977357648224e+17 and parameters: {'n_estimators': 2953, 'learning_rate': 0.10046435021042623, 'max_depth': 7, 'reg_alpha': 0.7605928591785782, 'reg_lambda': 0.4159794769949411, 'min_child_samples': 34, 'colsample_bytree': 0.6607975266069634, 'subsample': 0.7911985095283368}. Best is trial 0 with value: 8.684977357648224e+17.
[I 2025-07-30 17:24:33,947] Trial 1 finished with value: 6.058697568775142e+17 and parameters: {'n_estimators': 1949, 'learning_rate': 0.03997816504630368, 'max_depth': 3, 'reg_alpha': 0.45476264811316036, 'reg_lambda': 0.9405044861190541, 'min_child_samples': 31, 'colsample_bytree': 0.8087973272779977, 'subsample': 0.7392943924202524}. Best is trial 1 with value: 6.058697568775142e+17.
[I 2025-07-30 17:24:37,610] Trial 2 finished with value: 8.120193817863882e+17 and parameters: {'n_estimators': 1713, 'learning_ra

=== LightGBM Results ===
Best parameters: {'n_estimators': 539, 'learning_rate': 0.012781639959659507, 'max_depth': 6, 'reg_alpha': 0.2456866217457347, 'reg_lambda': 0.27018262454406133, 'min_child_samples': 10, 'colsample_bytree': 0.9832056829779426, 'subsample': 0.5050938384557115}
Best MAPE: 2.1711750690616806e+17


In [57]:
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

def catboost_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'depth': trial.suggest_int('depth', 4, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 64),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0, 1.0),
        'loss_function': 'MAPE',
        'eval_metric': 'MAPE',
        'random_state': 42,
        # Remove 'task_type': 'GPU' - this is causing the error
        'verbose': False
    }
    
    # For multi-output regression with CatBoost
    base_model = CatBoostRegressor(**params)
    model = MultiOutputRegressor(base_model)
    
    # MultiOutputRegressor doesn't support eval_set and early_stopping_rounds
    model.fit(X_train, y_train)
    
    preds = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run CatBoost optimization
catboost_study = optuna.create_study(direction="minimize", study_name="CatBoost_Optimization")
catboost_study.optimize(catboost_objective, n_trials=50)

print("=== CatBoost Results ===")
print("Best parameters:", catboost_study.best_trial.params)
print("Best MAPE:", catboost_study.best_value)


[I 2025-07-30 17:33:22,412] A new study created in memory with name: CatBoost_Optimization
[I 2025-07-30 17:33:26,166] Trial 0 finished with value: 2.82291878982703e+17 and parameters: {'n_estimators': 1387, 'learning_rate': 0.07088703921539713, 'depth': 8, 'min_data_in_leaf': 12, 'l2_leaf_reg': 3.687286032630552, 'bagging_temperature': 0.6124573491407809, 'random_strength': 0.3834157845697843}. Best is trial 0 with value: 2.82291878982703e+17.
[I 2025-07-30 17:33:29,838] Trial 1 finished with value: 3.285725174828223e+17 and parameters: {'n_estimators': 3062, 'learning_rate': 0.1683504389489451, 'depth': 6, 'min_data_in_leaf': 35, 'l2_leaf_reg': 3.06304936647736, 'bagging_temperature': 0.4021190735358261, 'random_strength': 0.44045224148758344}. Best is trial 0 with value: 2.82291878982703e+17.
[I 2025-07-30 17:33:32,081] Trial 2 finished with value: 3.1588355189496636e+16 and parameters: {'n_estimators': 1844, 'learning_rate': 0.09219369637337485, 'depth': 6, 'min_data_in_leaf': 19, 

=== CatBoost Results ===
Best parameters: {'n_estimators': 672, 'learning_rate': 0.04529720295810693, 'depth': 4, 'min_data_in_leaf': 46, 'l2_leaf_reg': 2.225210839773297, 'bagging_temperature': 0.7082493403055143, 'random_strength': 0.9450096159938354}
Best MAPE: 2541866931034411.0


In [58]:
results = {
    'XGBoost': xgb_study.best_value,
    'LightGBM': lgb_study.best_value,
    'CatBoost': catboost_study.best_value
}

print("\n=== Final Comparison ===")
for model_name, best_mape in results.items():
    print(f"{model_name}: {best_mape:.6f}")

best_model = min(results, key=results.get)
print(f"\nBest performing model: {best_model} with MAPE: {results[best_model]:.6f}")


=== Final Comparison ===
XGBoost: 41257679183675392.000000
LightGBM: 217117506906168064.000000
CatBoost: 2541866931034411.000000

Best performing model: CatBoost with MAPE: 2541866931034411.000000


In [None]:
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_valid.shape, y_valid.shape)
print("Any NaN values in targets:", y_train.isnull().sum().sum())


Training set shape: (265, 16) (265, 4)
Validation set shape: (67, 16) (67, 4)


AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [155]:
xgb_model = xgb.XGBRegressor(n_estimators=1062, 
                             learning_rate = 0.11964854632695483,
                             max_depth = 5,
                             min_child_weight= 1, 
                             subsample = 0.9696472459477042,
                             colsample_bytree=0.7315673461781189,
                             gamma=0.16262350633828251,
                             reg_alpha=0.6610390411463459,
                             reg_lambda=0.2257522016837052)
                             

In [156]:
lgbm_model = LGBMRegressor(n_estimators = 539,
              learning_rate=0.012781639959659507,
              max_depth=6,
              reg_alpha = 0.2456866217457347,
              reg_lambda=0.27018262454406133,
              min_child_samples = 10,
              colsample_bytree= 0.9832056829779426,
              subsample=0.5050938384557115)

In [157]:
catboost_model = CatBoostRegressor(n_estimators = 672, 
                                   learning_rate=0.04529720295810693,
                                   depth = 4,
                                   min_data_in_leaf=46,
                                   l2_leaf_reg=2.225210839773297,
                                   bagging_temperature=0.7082493403055143,
                                   random_strength=0.9450096159938354)

In [172]:
xgb_model2 = MultiOutputRegressor(xgb_model)
xgb_model2.fit(X_train, y_train)
preds =xgb_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

# Manual MAPE calculation to debug
# manual_mape = np.mean(np.abs((y_valid - preds) / y_valid)) * 100
# print("Manual MAPE:", manual_mape)


Overall MAE: 253.74639892578125

MAE for each output column:
Column 0: 24.2099
Column 1: 24.5227
Column 2: 430.0874
Column 3: 536.1654


In [168]:
lgbm_model2 = MultiOutputRegressor(lgbm_model)
lgbm_model2.fit(X_train, y_train)
preds =lgbm_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

Overall MAE: 432.7911449049382

MAE for each output column:
Column 0: 38.9437
Column 1: 41.7671
Column 2: 924.5382
Column 3: 725.9156


In [167]:
catboost_model2 = MultiOutputRegressor(catboost_model)
catboost_model2.fit(X_train, y_train)
preds =catboost_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

0:	learn: 123.5955350	total: 240us	remaining: 161ms
1:	learn: 122.0131635	total: 615us	remaining: 206ms
2:	learn: 120.0859564	total: 805us	remaining: 180ms
3:	learn: 119.2477444	total: 1.02ms	remaining: 170ms
4:	learn: 117.2304153	total: 1.34ms	remaining: 179ms
5:	learn: 115.4603752	total: 1.72ms	remaining: 191ms
6:	learn: 113.8833913	total: 1.93ms	remaining: 183ms
7:	learn: 112.7732854	total: 2.1ms	remaining: 174ms
8:	learn: 111.0265452	total: 2.27ms	remaining: 167ms
9:	learn: 109.8288818	total: 2.43ms	remaining: 161ms
10:	learn: 108.2995450	total: 2.57ms	remaining: 155ms
11:	learn: 107.4634076	total: 2.73ms	remaining: 150ms
12:	learn: 106.0590760	total: 2.89ms	remaining: 146ms
13:	learn: 105.1293946	total: 3.02ms	remaining: 142ms
14:	learn: 103.8750997	total: 3.18ms	remaining: 139ms
15:	learn: 102.3502061	total: 3.32ms	remaining: 136ms
16:	learn: 101.6493410	total: 3.5ms	remaining: 135ms
17:	learn: 100.4516794	total: 3.66ms	remaining: 133ms
18:	learn: 99.6417624	total: 3.84ms	remaini

In [161]:
meta_model = LinearRegression()

In [170]:
stacking_model2 = StackingRegressor(
    estimators=[
        ('xgb', xgb_model), 
        ('lgb', lgbm_model), 
        ('catboost', catboost_model)
    ],
    final_estimator=meta_model,
    n_jobs=-1

)
stacking_model2 = MultiOutputRegressor(stacking_model2)
# stacking_model2.fit(X_scaled, y)

In [171]:
stacking_model2.fit(X_train, y_train)
preds =stacking_model2.predict(X_valid)
overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"Column {col_idx}: {col_mae:.4f}")

0:	learn: 123.5955350	total: 54.6ms	remaining: 36.7s
1:	learn: 122.0131635	total: 54.8ms	remaining: 18.3s
2:	learn: 120.0859564	total: 54.9ms	remaining: 12.2s
3:	learn: 119.2477444	total: 55.1ms	remaining: 9.2s
4:	learn: 117.2304153	total: 55.2ms	remaining: 7.37s
5:	learn: 115.4603752	total: 55.4ms	remaining: 6.15s
6:	learn: 113.8833913	total: 55.5ms	remaining: 5.28s
7:	learn: 112.7732854	total: 55.7ms	remaining: 4.62s
8:	learn: 111.0265452	total: 55.8ms	remaining: 4.11s
9:	learn: 109.8288818	total: 56ms	remaining: 3.71s
10:	learn: 108.2995450	total: 56.2ms	remaining: 3.38s
11:	learn: 107.4634076	total: 56.5ms	remaining: 3.1s
12:	learn: 106.0590760	total: 56.7ms	remaining: 2.87s
13:	learn: 105.1293946	total: 56.8ms	remaining: 2.67s
14:	learn: 103.8750997	total: 57ms	remaining: 2.5s
15:	learn: 102.3502061	total: 57.2ms	remaining: 2.34s
16:	learn: 101.6493410	total: 57.4ms	remaining: 2.21s
17:	learn: 100.4516794	total: 57.6ms	remaining: 2.09s
18:	learn: 99.6417624	total: 57.8ms	remaining

In [None]:
# stacking_models = []

# for col_idx in range(y.shape[1]):  
#     y_single = y.iloc[:, col_idx] 
    
#     stacking_model = StackingRegressor(
#         estimators=[('xgb', xgb_model), ('lgb', lgbm_model), ('catboost', catboost_model)],
#         final_estimator=LinearRegression(),
#         n_jobs=-1
#     )
    
#     stacking_model.fit(X_scaled, y_single)
#     stacking_models.append(stacking_model)

# def predict_multi_output(X_test):
#     predictions = []
#     for model in stacking_models:
#         pred = model.predict(X_test)
#         predictions.append(pred)
#     return np.column_stack(predictions)


0:	learn: 126.0501380	total: 55.8ms	remaining: 37.4s
1:	learn: 124.5292107	total: 55.9ms	remaining: 18.7s
2:	learn: 122.4611232	total: 56.1ms	remaining: 12.5s
3:	learn: 120.4531563	total: 56.3ms	remaining: 9.4s
4:	learn: 118.8184708	total: 56.4ms	remaining: 7.53s
5:	learn: 116.9123755	total: 56.6ms	remaining: 6.28s
6:	learn: 115.1229817	total: 56.8ms	remaining: 5.4s
7:	learn: 113.5780115	total: 57ms	remaining: 4.73s
8:	learn: 112.1550353	total: 57.2ms	remaining: 4.22s
9:	learn: 110.7388925	total: 57.5ms	remaining: 3.8s
10:	learn: 109.0569920	total: 57.6ms	remaining: 3.46s
11:	learn: 107.3550196	total: 57.9ms	remaining: 3.18s
12:	learn: 105.9760774	total: 58ms	remaining: 2.94s
13:	learn: 104.4734315	total: 58.2ms	remaining: 2.73s
14:	learn: 103.3364194	total: 58.4ms	remaining: 2.56s
15:	learn: 102.2154100	total: 58.5ms	remaining: 2.4s
16:	learn: 101.0567054	total: 58.7ms	remaining: 2.26s
17:	learn: 99.9424307	total: 58.8ms	remaining: 2.14s
18:	learn: 98.9765307	total: 58.9ms	remaining: 

In [None]:
# catboost_model2.fit(X_train, y_train)
# preds =catboost_model2.predict(X_valid)
# mae = mean_absolute_error(y_valid, preds)