In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import StackingRegressor

import matplotlib.pyplot as plt

from scipy import stats
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
import pickle
import json


import warnings
warnings.filterwarnings("ignore")

In [3]:
preprocessed_dataset = pd.read_csv("../data/pre-processed/preprocessed_youtube_data.csv")

In [4]:
# Convert to datetime if not already
preprocessed_dataset['published_time'] = pd.to_datetime(preprocessed_dataset['published_time'], format='%H:%M:%S', errors='coerce')

# Extract hour and minute
preprocessed_dataset['published_hour'] = preprocessed_dataset['published_time'].dt.hour
preprocessed_dataset['published_minute'] = preprocessed_dataset['published_time'].dt.minute

# Drop the original time column
preprocessed_dataset.drop(columns=['published_time'], inplace=True)

In [5]:
# Separate label encoders for each column
le_day = LabelEncoder()
le_def = LabelEncoder()

preprocessed_dataset['published_day_of_week'] = le_day.fit_transform(preprocessed_dataset['published_day_of_week'])
preprocessed_dataset['definition'] = le_def.fit_transform(preprocessed_dataset['definition'])


In [6]:
missing_mask = preprocessed_dataset[['like_count_initial', 'like_count_final', 'view_count_initial', 'view_count_final']].isnull()

# Check rows where all four columns are NaN
rows_all_nan = missing_mask.all(axis=1)

print("Number of rows with all four columns NaN:", rows_all_nan.sum())

# Optionally, see those rows
print(preprocessed_dataset[rows_all_nan])

Number of rows with all four columns NaN: 12
       category_id country  definition logged_at_initial  view_count_initial  \
13893          NaN     NaN           2               NaN                 NaN   
14898          NaN     NaN           2               NaN                 NaN   
18678          NaN     NaN           2               NaN                 NaN   
22052          NaN     NaN           2               NaN                 NaN   
28135          NaN     NaN           2               NaN                 NaN   
29196          NaN     NaN           2               NaN                 NaN   
31200          NaN     NaN           2               NaN                 NaN   
32603          NaN     NaN           2               NaN                 NaN   
38926          NaN     NaN           2               NaN                 NaN   
41143          NaN     NaN           2               NaN                 NaN   
43640          NaN     NaN           2               NaN                 Na

In [7]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 682


In [8]:
rows_any_nan = missing_mask.any(axis=1)
print("Rows with any of the four columns NaN:", rows_any_nan.sum())

Rows with any of the four columns NaN: 682


In [9]:
# Step 1: Convert to category
preprocessed_dataset['country'] = preprocessed_dataset['country'].astype('category')

# Step 2: Save mapping BEFORE encoding
categories = list(preprocessed_dataset['country'].cat.categories)
country_to_code = {country: idx for idx, country in enumerate(categories)}

with open("country_encoding.json", "w") as f:
    json.dump(country_to_code, f)

# Step 3: Encode with .cat.codes
preprocessed_dataset['country_encoded'] = preprocessed_dataset['country'].cat.codes


preprocessed_dataset['category_id'] = (
    preprocessed_dataset['category_id']
    .fillna(-1)  # or any placeholder like 999
    .astype(int)
)


In [10]:
preprocessed_dataset.drop(columns=['logged_at_final', 'logged_at_initial','country'], inplace=True)

In [11]:
preprocessed_dataset = preprocessed_dataset.dropna(subset=[
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
])

# Define target columns
target_columns = [
    'like_count_initial',
    'like_count_final',
    'view_count_initial',
    'view_count_final'
]

In [12]:
# Choose columns to check for outliers, e.g. target columns
cols_to_check = target_columns  # or other numeric features

z_scores = np.abs(stats.zscore(preprocessed_dataset[cols_to_check]))
# Define threshold, e.g. 3 std deviations
threshold = 3
# Keep only rows where all z-scores are below threshold (no outlier)
non_outliers = (z_scores < threshold).all(axis=1)


print(f"Rows before outlier removal: {len(preprocessed_dataset)}")
preprocessed_dataset = preprocessed_dataset[non_outliers]
print(f"Rows after outlier removal: {len(preprocessed_dataset)}")

Rows before outlier removal: 51862
Rows after outlier removal: 51398


In [13]:
q25 = preprocessed_dataset['view_count_final'].quantile(0.25)
q75 = preprocessed_dataset['view_count_final'].quantile(0.75)

df_low = preprocessed_dataset[preprocessed_dataset['view_count_final'] <= q25]
df_mid = preprocessed_dataset[(preprocessed_dataset['view_count_final'] > q25) & (preprocessed_dataset['view_count_final'] <= q75)]
df_high = preprocessed_dataset[preprocessed_dataset['view_count_final'] > q75]

In [14]:
def describe_stats(df, cols):
    mean_vals = df[cols].mean()
    median_vals = df[cols].median()
    mode_vals = df[cols].mode().iloc[0]  # mode() returns DataFrame, take first mode row
    
    summary = pd.DataFrame({
        'mean': mean_vals,
        'median': median_vals,
        'mode': mode_vals
    })
    return summary

print("Low quantile stats:")
print(describe_stats(df_low, target_columns))

print("\nMid quantile stats:")
print(describe_stats(df_mid, target_columns))

print("\nHigh quantile stats:")
print(describe_stats(df_high, target_columns))

Low quantile stats:
                        mean  median  mode
like_count_initial  0.744424     0.0   0.0
like_count_final    0.894701     0.0   0.0
view_count_initial  4.539665     0.0   0.0
view_count_final    1.100359     0.0   0.0

Mid quantile stats:
                          mean  median  mode
like_count_initial    6.130695     2.0   0.0
like_count_final      9.744404     4.0   0.0
view_count_initial  187.141888   116.0   0.0
view_count_final    275.472813   230.0   8.0

High quantile stats:
                           mean  median   mode
like_count_initial   120.337227    23.0    0.0
like_count_final     225.422664    49.0    0.0
view_count_initial  3045.790576  1067.0    0.0
view_count_final    5922.651246  2058.5  873.0


In [15]:
print(df_low.shape)
print(df_mid.shape)
print(df_high.shape)

(13362, 23)
(25196, 23)
(12840, 23)


In [16]:
X_low = df_low.drop(columns=target_columns)
y_low = df_low[target_columns]

X_mid = df_mid.drop(columns=target_columns)
y_mid = df_mid[target_columns]

X_high = df_high.drop(columns=target_columns)
y_high = df_high[target_columns]

X_low_train, X_low_valid, y_low_train, y_low_valid = train_test_split(
    X_low, y_low, test_size=0.2, random_state=42
)

X_mid_train, X_mid_valid, y_mid_train, y_mid_valid = train_test_split(
    X_mid, y_mid, test_size=0.2, random_state=42
)

X_high_train, X_high_valid, y_high_train, y_high_valid = train_test_split(
    X_high, y_high, test_size=0.2, random_state=42
)

In [17]:
y_high_train_log = np.log1p(y_high_train)
y_mid_train_log = np.log1p(y_mid_train)
y_low_train_log = np.log1p(y_low_train)

In [18]:
X = preprocessed_dataset.drop(columns=target_columns)
y = preprocessed_dataset[target_columns]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

y_train_log = np.log1p(y_train)

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**params)

    model.fit(
        X_train, y_train_log,
        eval_set=[(X_valid, np.log1p(y_valid))],
        verbose=False
    )

    preds_log = model.predict(X_valid)
    preds = np.expm1(preds_log)

    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape

# Run Optuna optimization
xgb_study = optuna.create_study(direction="minimize", study_name="XGBoost_Optimization")
xgb_study.optimize(xgb_objective, n_trials=10)

print("=== XGBoost Results ===")
print("Best parameters:", xgb_study.best_trial.params)
print("Best MAPE:", xgb_study.best_value)

[I 2025-08-08 20:39:10,303] A new study created in memory with name: XGBoost_Optimization


[I 2025-08-08 20:39:18,340] Trial 0 finished with value: 1.4029244384411648e+16 and parameters: {'n_estimators': 1710, 'learning_rate': 0.09663647934733707, 'max_depth': 14, 'min_child_weight': 8, 'subsample': 0.8911038928322303, 'colsample_bytree': 0.990767441078574, 'gamma': 0.56111803675693, 'reg_alpha': 0.7383602996523383, 'reg_lambda': 0.09322553539764222}. Best is trial 0 with value: 1.4029244384411648e+16.
[I 2025-08-08 20:39:24,531] Trial 1 finished with value: 1.2669946291027968e+16 and parameters: {'n_estimators': 1089, 'learning_rate': 0.12296026056864931, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.7111112351513763, 'colsample_bytree': 0.6861109751096797, 'gamma': 0.11969774023027546, 'reg_alpha': 0.23514175456406705, 'reg_lambda': 0.529858255177841}. Best is trial 1 with value: 1.2669946291027968e+16.
[I 2025-08-08 20:39:35,845] Trial 2 finished with value: 1.5157177958268928e+16 and parameters: {'n_estimators': 630, 'learning_rate': 0.08149588616401418, 'max_dep

=== XGBoost Results ===
Best parameters: {'n_estimators': 3314, 'learning_rate': 0.025746278531409112, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.9018818608280503, 'colsample_bytree': 0.7568780839066979, 'gamma': 0.550676890668629, 'reg_alpha': 0.33634325189798364, 'reg_lambda': 0.25745029524510865}
Best MAPE: 1.2668416208928768e+16


In [19]:
xgb_model = xgb.XGBRegressor(**xgb_study.best_trial.params)

In [20]:
xgb_model = xgb.XGBRegressor(**xgb_study.best_trial.params)

xgb_model2 = MultiOutputRegressor(xgb_model)
xgb_model2.fit(X_train, np.log1p(y_train))

preds_log = xgb_model2.predict(X_valid)
preds = np.expm1(preds_log)

overall_mae = mean_absolute_error(y_valid, preds)
print("Overall MAE:", overall_mae)

print("\nMAE for each output column:")
for col_idx in range(y_valid.shape[1]):
    col_mae = mean_absolute_error(y_valid.iloc[:, col_idx], preds[:, col_idx])
    print(f"{target_columns[col_idx]}: {col_mae:.4f}")

# Optional: Manual MAPE check
# manual_mape = np.mean(np.abs((y_valid.to_numpy() - preds) / y_valid.to_numpy())) * 100
# print("Manual MAPE:", manual_mape)

Overall MAE: 524.650146484375

MAE for each output column:
like_count_initial: 28.7724
like_count_final: 52.8419
view_count_initial: 692.4897
view_count_final: 1324.4967


In [21]:
import joblib

joblib.dump(xgb_model2, 'xgb_model.pkl')

['xgb_model.pkl']