In [3]:
from utils import visualise_gridsearch, fetch_top_predictions, make_top_3

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold, TimeSeriesSplit, GridSearchCV

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
df = pd.read_csv('data/OnlineNewsPopularity.csv')
df = df.rename(columns=lambda x: x.strip())
y_raw = df['shares']

POPULARITY_SPLIT = 0.9

high_thresh = y_raw.quantile(POPULARITY_SPLIT)
y_class = (y_raw >= high_thresh).astype(int)

print("Class distribution:")
print(y_class.value_counts())

df_sorted = df.copy().sort_values('timedelta', ascending=False)

TRAIN_SPLIT = 0.85
train_size = int(len(df_sorted) * TRAIN_SPLIT)

train_df = df_sorted.iloc[:train_size]
test_df = df_sorted.iloc[train_size:]

X_train = train_df.drop(columns=['url', 'timedelta', 'shares'])
y_train_class = (train_df['shares'] >= high_thresh).astype(int)
y_train_reg_log = np.log1p(train_df['shares'])

X_test_full = test_df.drop(columns=['url', 'timedelta', 'shares'])
y_test_full_class = (test_df['shares'] >= high_thresh).astype(int)
y_test_full_reg_log = np.log1p(test_df['shares'])

test_splits = np.array_split(test_df, 100)

X_test_sets = []
y_test_sets_class = []
y_test_sets_reg_log = []

for ts in test_splits:
    X_test_sets.append(ts.drop(columns=['url', 'timedelta', 'shares']))
    y_test_sets_class.append((ts['shares'] >= high_thresh).astype(int))
    y_test_sets_reg_log.append(np.log1p(ts['shares']))

Class distribution:
shares
0    35615
1     4029
Name: count, dtype: int64


In [9]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    loss_function="Logloss",
    verbose=False
)

model.fit(X_train, y_train_class)

<catboost.core.CatBoostClassifier at 0x1e19f7d9590>

In [10]:
import numpy as np
import pandas as pd

daily_hits = []

for X_day, y_day_class in zip(X_test_sets, y_test_sets_class):

    # Skip empty days (safety)
    if len(X_day) == 0:
        daily_hits.append(0)
        continue

    # --- 1. Predict probabilities for that day ---
    y_proba_day = model.predict_proba(X_day)[:, 1]

    # Build dataframe for ranking
    day_df = pd.DataFrame({
        "proba": y_proba_day,
        "true_popular": y_day_class.values
    })

    # --- 2. Top‑3 predicted popular articles ---
    top3_pred = day_df.sort_values("proba", ascending=False).head(3)

    # --- 3. True top‑3 actual popular articles ---
    # true_popular is already 0/1 based on top‑10% threshold
    top3_true = day_df.sort_values("true_popular", ascending=False).head(3)

    pred_idx = set(top3_pred.index)
    true_idx = set(top3_true.index)

    # --- 4. Count hits ---
    hits = len(pred_idx & true_idx)
    daily_hits.append(hits)

# --- 5. Average hit rate across all 100 days ---
average_hits = np.mean(daily_hits)

print("\n=== DAILY TOP‑3 HIT RESULTS ===")
for day, hits in enumerate(daily_hits, start=1):
    print(f"Day {day}: {hits} / 3 correct")

print(f"\nAverage Top‑3 Hit Rate Across 100 Days: {average_hits:.4f}")


=== DAILY TOP‑3 HIT RESULTS ===
Day 1: 0 / 3 correct
Day 2: 1 / 3 correct
Day 3: 1 / 3 correct
Day 4: 0 / 3 correct
Day 5: 0 / 3 correct
Day 6: 2 / 3 correct
Day 7: 0 / 3 correct
Day 8: 0 / 3 correct
Day 9: 0 / 3 correct
Day 10: 0 / 3 correct
Day 11: 1 / 3 correct
Day 12: 0 / 3 correct
Day 13: 0 / 3 correct
Day 14: 1 / 3 correct
Day 15: 0 / 3 correct
Day 16: 0 / 3 correct
Day 17: 0 / 3 correct
Day 18: 0 / 3 correct
Day 19: 0 / 3 correct
Day 20: 1 / 3 correct
Day 21: 1 / 3 correct
Day 22: 0 / 3 correct
Day 23: 0 / 3 correct
Day 24: 1 / 3 correct
Day 25: 0 / 3 correct
Day 26: 0 / 3 correct
Day 27: 0 / 3 correct
Day 28: 0 / 3 correct
Day 29: 0 / 3 correct
Day 30: 1 / 3 correct
Day 31: 1 / 3 correct
Day 32: 0 / 3 correct
Day 33: 1 / 3 correct
Day 34: 0 / 3 correct
Day 35: 0 / 3 correct
Day 36: 0 / 3 correct
Day 37: 0 / 3 correct
Day 38: 1 / 3 correct
Day 39: 0 / 3 correct
Day 40: 0 / 3 correct
Day 41: 0 / 3 correct
Day 42: 1 / 3 correct
Day 43: 1 / 3 correct
Day 44: 1 / 3 correct
Day 45: 