In [18]:
from utils import visualise_gridsearch, fetch_top_predictions, make_top_3

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold, TimeSeriesSplit, GridSearchCV
from imblearn.over_sampling import SMOTE, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [6]:
df = pd.read_csv('data/OnlineNewsPopularity.csv')
df = df.rename(columns=lambda x: x.strip())
y_raw = df['shares']

POPULARITY_SPLIT = 0.9

high_thresh = y_raw.quantile(POPULARITY_SPLIT)
y_class = (y_raw >= high_thresh).astype(int)

print("Class distribution:")
print(y_class.value_counts())

df_sorted = df.copy().sort_values('timedelta', ascending=False)

TRAIN_SPLIT = 0.85
train_size = int(len(df_sorted) * TRAIN_SPLIT)

train_df = df_sorted.iloc[:train_size]
test_df = df_sorted.iloc[train_size:]

X_train = train_df.drop(columns=['url', 'timedelta', 'shares'])
y_train_class = (train_df['shares'] >= high_thresh).astype(int)
y_train_reg_log = np.log1p(train_df['shares'])

X_test_full = test_df.drop(columns=['url', 'timedelta', 'shares'])
y_test_full_class = (test_df['shares'] >= high_thresh).astype(int)
y_test_full_reg_log = np.log1p(test_df['shares'])

test_splits = np.array_split(test_df, 100)

X_test_sets = []
y_test_sets_class = []
y_test_sets_reg_log = []

for ts in test_splits:
    X_test_sets.append(ts.drop(columns=['url', 'timedelta', 'shares']))
    y_test_sets_class.append((ts['shares'] >= high_thresh).astype(int))
    y_test_sets_reg_log.append(np.log1p(ts['shares']))

Class distribution:
shares
0    35615
1     4029
Name: count, dtype: int64


**BASELINE CATBOOST-CLASSIFIER**

In [9]:
# ============================================================
# 1. TRAIN BASELINE CATBOOST CLASSIFIER
# ============================================================

model = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    loss_function="Logloss",
    verbose=False
)

model.fit(X_train, y_train_class)


<catboost.core.CatBoostClassifier at 0x20a92d3aad0>

In [10]:
# ============================================================
# 2. EVALUATION ON FULL TEST SET
# ============================================================

y_pred = model.predict(X_test_full)
y_proba = model.predict_proba(X_test_full)[:, 1]

print("\n=== CLASSIFICATION REPORT (FULL TEST SET) ===")
print(classification_report(y_test_full_class, y_pred, digits=3))

print("\n=== CONFUSION MATRIX ===")
print(confusion_matrix(y_test_full_class, y_pred))

auc = roc_auc_score(y_test_full_class, y_proba)
print(f"\nTest ROC-AUC: {auc:.3f}")

# ============================================================
# 3. DAILY TOP‑3 HIT EVALUATION (RANKING METRIC)
# ============================================================

daily_hits = []

for X_day, y_day_class in zip(X_test_sets, y_test_sets_class):

    # Skip empty days
    if len(X_day) == 0:
        daily_hits.append(0)
        continue

    # 1. Predict probabilities
    y_proba_day = model.predict_proba(X_day)[:, 1]

    # Ranking dataframe
    day_df = pd.DataFrame({
        "proba": y_proba_day,
        "true_popular": y_day_class.values
    })

    # 2. Top‑3 predicted
    top3_pred = day_df.sort_values("proba", ascending=False).head(3)

    # 3. Top‑3 true
    top3_true = day_df.sort_values("true_popular", ascending=False).head(3)

    pred_idx = set(top3_pred.index)
    true_idx = set(top3_true.index)

    # 4. Count hits
    hits = len(pred_idx & true_idx)
    daily_hits.append(hits)

# 5. Summary
average_hits = np.mean(daily_hits)

print("\n=== DAILY TOP‑3 HIT RESULTS ===")
for day, hits in enumerate(daily_hits, start=1):
    print(f"Day {day}: {hits} / 3 correct")

print(f"\nAverage Top‑3 Hit Rate Across 100 Days: {average_hits:.4f}")


=== CLASSIFICATION REPORT (FULL TEST SET) ===
              precision    recall  f1-score   support

           0      0.920     0.998     0.957      5466
           1      0.250     0.008     0.016       481

    accuracy                          0.918      5947
   macro avg      0.585     0.503     0.487      5947
weighted avg      0.865     0.918     0.881      5947


=== CONFUSION MATRIX ===
[[5454   12]
 [ 477    4]]

Test ROC-AUC: 0.733

=== DAILY TOP‑3 HIT RESULTS ===
Day 1: 0 / 3 correct
Day 2: 1 / 3 correct
Day 3: 1 / 3 correct
Day 4: 0 / 3 correct
Day 5: 0 / 3 correct
Day 6: 2 / 3 correct
Day 7: 0 / 3 correct
Day 8: 0 / 3 correct
Day 9: 0 / 3 correct
Day 10: 0 / 3 correct
Day 11: 1 / 3 correct
Day 12: 0 / 3 correct
Day 13: 0 / 3 correct
Day 14: 1 / 3 correct
Day 15: 0 / 3 correct
Day 16: 0 / 3 correct
Day 17: 0 / 3 correct
Day 18: 0 / 3 correct
Day 19: 0 / 3 correct
Day 20: 1 / 3 correct
Day 21: 1 / 3 correct
Day 22: 0 / 3 correct
Day 23: 0 / 3 correct
Day 24: 1 / 3 correct


**SMOTE + UNDERSAMPLING + BASELINE CATBOOST**

In [8]:
oversample = SMOTE(
    sampling_strategy=0.5,
    random_state=42
    )

undersample = RandomUnderSampler(
    sampling_strategy=0.8,
    random_state=42
)

cat_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=False,
    random_state=42
)

pipeline = ImbPipeline(steps=[
    ('smote', oversample),
    ('under', undersample),
    ('catboost', cat_model)
])

# ============================================================
# 6. TRAIN MODEL (resampling only applied to training data)
# ============================================================

pipeline.fit(X_train, y_train_class)

# ============================================================
# 7. EVALUATE ON FULL TEST SET
# ============================================================

y_pred = pipeline.predict(X_test_full)
y_proba = pipeline.predict_proba(X_test_full)[:, 1]

print("\n=== CLASSIFICATION REPORT (FULL TEST SET) ===")
print(classification_report(y_test_full_class, y_pred, digits=3))

print("\n=== CONFUSION MATRIX ===")
print(confusion_matrix(y_test_full_class, y_pred))

auc = roc_auc_score(y_test_full_class, y_proba)
print(f"\nTest ROC-AUC: {auc:.3f}")

# ============================================================
# 8. DAILY TOP‑3 HIT EVALUATION (YOUR METHOD)
# ============================================================

daily_hits = []

for X_day, y_day_class in zip(X_test_sets, y_test_sets_class):

    if len(X_day) == 0:
        daily_hits.append(0)
        continue

    # 1. Predict probabilities
    y_proba_day = pipeline.predict_proba(X_day)[:, 1]

    # Build ranking dataframe
    day_df = pd.DataFrame({
        "proba": y_proba_day,
        "true_popular": y_day_class.values
    })

    # 2. Top‑3 predicted
    top3_pred = day_df.sort_values("proba", ascending=False).head(3)

    # 3. Top‑3 true
    top3_true = day_df.sort_values("true_popular", ascending=False).head(3)

    pred_idx = set(top3_pred.index)
    true_idx = set(top3_true.index)

    # 4. Count hits
    hits = len(pred_idx & true_idx)
    daily_hits.append(hits)

# 5. Summary
average_hits = np.mean(daily_hits)

print("\n=== DAILY TOP‑3 HIT RESULTS ===")
for day, hits in enumerate(daily_hits, start=1):
    print(f"Day {day}: {hits} / 3 correct")

print(f"\nAverage Top‑3 Hit Rate Across 100 Days: {average_hits:.4f}")


=== CLASSIFICATION REPORT (FULL TEST SET) ===
              precision    recall  f1-score   support

           0      0.922     0.988     0.954      5466
           1      0.284     0.056     0.094       481

    accuracy                          0.912      5947
   macro avg      0.603     0.522     0.524      5947
weighted avg      0.871     0.912     0.884      5947


=== CONFUSION MATRIX ===
[[5398   68]
 [ 454   27]]

Test ROC-AUC: 0.733

=== DAILY TOP‑3 HIT RESULTS ===
Day 1: 0 / 3 correct
Day 2: 1 / 3 correct
Day 3: 1 / 3 correct
Day 4: 0 / 3 correct
Day 5: 0 / 3 correct
Day 6: 1 / 3 correct
Day 7: 0 / 3 correct
Day 8: 0 / 3 correct
Day 9: 0 / 3 correct
Day 10: 0 / 3 correct
Day 11: 0 / 3 correct
Day 12: 0 / 3 correct
Day 13: 0 / 3 correct
Day 14: 0 / 3 correct
Day 15: 1 / 3 correct
Day 16: 0 / 3 correct
Day 17: 0 / 3 correct
Day 18: 0 / 3 correct
Day 19: 1 / 3 correct
Day 20: 1 / 3 correct
Day 21: 0 / 3 correct
Day 22: 0 / 3 correct
Day 23: 0 / 3 correct
Day 24: 1 / 3 correct


**BASELINE CATBOOST REGRESSOR**

In [13]:

# ============================================================
# 1. TRAIN CATBOOST REGRESSOR ON LOG-SHARES
# ============================================================

reg_model = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function="RMSE",
    verbose=False,
    random_state=42
)

# y_train_reg_log = np.log1p(train_df['shares'])
reg_model.fit(X_train, y_train_reg_log)

# ============================================================
# 2. PREDICT ON FULL TEST SET
# ============================================================

y_pred_log = reg_model.predict(X_test_full)
y_pred_shares = np.expm1(y_pred_log)   # convert back to original scale

# ============================================================
# 3. CONVERT REGRESSION OUTPUT → CLASSIFICATION LABELS
# ============================================================

y_pred_class = (y_pred_shares >= high_thresh).astype(int)

print("\n=== CLASSIFICATION REPORT (REGRESSION MODEL) ===")
print(classification_report(y_test_full_class, y_pred_class, digits=3))

print("\n=== CONFUSION MATRIX ===")
print(confusion_matrix(y_test_full_class, y_pred_class))

# For ROC-AUC we need probabilities → normalize predicted shares
y_pred_proba = (y_pred_shares - y_pred_shares.min()) / (y_pred_shares.max() - y_pred_shares.min())

auc = roc_auc_score(y_test_full_class, y_pred_proba)
print(f"\nTest ROC-AUC (regression-based): {auc:.3f}")

# ============================================================
# 4. DAILY TOP‑3 HIT EVALUATION (RANKING METRIC)
# ============================================================

daily_hits = []

for X_day, y_day_class in zip(X_test_sets, y_test_sets_class):

    if len(X_day) == 0:
        daily_hits.append(0)
        continue

    # 1. Predict log-shares → shares
    y_day_pred_log = reg_model.predict(X_day)
    y_day_pred_shares = np.expm1(y_day_pred_log)

    # Ranking dataframe
    day_df = pd.DataFrame({
        "pred_shares": y_day_pred_shares,
        "true_popular": y_day_class.values
    })

    # 2. Top‑3 predicted by regression
    top3_pred = day_df.sort_values("pred_shares", ascending=False).head(3)

    # 3. Top‑3 true popular
    top3_true = day_df.sort_values("true_popular", ascending=False).head(3)

    pred_idx = set(top3_pred.index)
    true_idx = set(top3_true.index)

    # 4. Count hits
    hits = len(pred_idx & true_idx)
    daily_hits.append(hits)

# 5. Summary
average_hits = np.mean(daily_hits)

print("\n=== DAILY TOP‑3 HIT RESULTS (REGRESSION MODEL) ===")
for day, hits in enumerate(daily_hits, start=1):
    print(f"Day {day}: {hits} / 3 correct")

print(f"\nAverage Top‑3 Hit Rate Across 100 Days: {average_hits:.4f}")


=== CLASSIFICATION REPORT (REGRESSION MODEL) ===
              precision    recall  f1-score   support

           0      0.919     1.000     0.958      5466
           1      0.000     0.000     0.000       481

    accuracy                          0.919      5947
   macro avg      0.460     0.500     0.479      5947
weighted avg      0.845     0.919     0.880      5947


=== CONFUSION MATRIX ===
[[5464    2]
 [ 481    0]]

Test ROC-AUC (regression-based): 0.728

=== DAILY TOP‑3 HIT RESULTS (REGRESSION MODEL) ===
Day 1: 0 / 3 correct
Day 2: 1 / 3 correct
Day 3: 0 / 3 correct
Day 4: 0 / 3 correct
Day 5: 0 / 3 correct
Day 6: 2 / 3 correct
Day 7: 0 / 3 correct
Day 8: 0 / 3 correct
Day 9: 0 / 3 correct
Day 10: 0 / 3 correct
Day 11: 1 / 3 correct
Day 12: 0 / 3 correct
Day 13: 0 / 3 correct
Day 14: 1 / 3 correct
Day 15: 1 / 3 correct
Day 16: 0 / 3 correct
Day 17: 0 / 3 correct
Day 18: 0 / 3 correct
Day 19: 1 / 3 correct
Day 20: 1 / 3 correct
Day 21: 0 / 3 correct
Day 22: 0 / 3 correct
Day