In [None]:
import time
from pathlib import Path

import category_encoders as ce
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.calibration import calibration_curve
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    brier_score_loss,
    confusion_matrix,
    log_loss,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, MaxAbsScaler, OneHotEncoder

In [None]:
# ---------- Paths ----------
# Raw Avazu files, addressed relative to the notebook's working directory.
DATA_DIR = Path("../data/raw/avazu")
TRAIN_FILE = DATA_DIR.joinpath("train.gz")
TEST_FILE = DATA_DIR.joinpath("test.gz")

# ---------- Core Column Definitions ----------
ID_COL, TARGET_COL, DATETIME_COL = "id", "click", "hour"
DATETIME_FORMAT = "%y%m%d%H"  # two-digit year, month, day, hour

# ---------- Data Reading & Sampling Controls ----------
SAMPLE_FRAC = 0.1  # Share of rows kept after reading, for faster iteration
RANDOM_STATE = 42  # Shared seed for sampling, splitting and the classifier

# ---------- Data Type Casting ----------
# Dtypes are fixed at read time instead of being inferred, to keep memory usage down.
# Key order is preserved because ALL_COLS is derived from it.
# NOTE(review): "id" is parsed as a signed 64-bit integer — confirm every identifier
# in the file fits that range.
DTYPE_COLS = {
    "id": np.int64,
    "click": np.int8,
    "hour": np.int64,
    "C1": np.int32,
    "banner_pos": np.int8,
    # String identifiers are stored as pandas categoricals.
    **dict.fromkeys(
        (
            "site_id",
            "site_domain",
            "site_category",
            "app_id",
            "app_domain",
            "app_category",
            "device_id",
            "device_ip",
            "device_model",
        ),
        "category",
    ),
    "device_type": np.int8,
    "device_conn_type": np.int8,
    # C14 ... C21 are all read as 32-bit integers.
    **{f"C{n}": np.int32 for n in range(14, 22)},
}
ALL_COLS = [*DTYPE_COLS]

# ---------- Feature Engineering & Modeling Hyperparameters ----------
TEST_SIZE = 0.2
N_BINS_NUM = 100
HASH_N_FEATURES = 1 << 18  # 262,144 hashed columns
RARE_MIN_FREQUENCY = 2000
LR_PARAMS = dict(
    solver="saga",
    class_weight="balanced",  # Reweights classes; chosen because the target is imbalanced
    max_iter=5000,
    random_state=RANDOM_STATE,
)

# ---------- Feature Grouping for Preprocessing Pipelines ----------
HIGH_CARD_COLS = ["device_id", "device_ip", "device_model", "site_domain", "site_id", "app_id"]
LOW_MED_CARD_COLS = [
    "site_category",
    "app_category",
    "app_domain",
    "banner_pos",
    "device_type",
    "device_conn_type",
    # "hod" and "dow" do not exist in the raw file; add_time_features creates them.
    "hod",
    "dow",
]
# C17 is deliberately absent from this list (C14-C16, C18-C21 only).
C_COLS = [f"C{n}" for n in (14, 15, 16, 18, 19, 20, 21)]

In [None]:
def log(msg: str) -> None:
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime("%H:%M:%S")
    print("[{}] {}".format(stamp, msg))

In [None]:
def read_csv_infer(path: Path, usecols: list[str] | None = None) -> pd.DataFrame:
    """Read a CSV file into a DataFrame using the notebook-wide column dtypes.

    Args:
        path: Location of the CSV file. Compression is left for pandas to infer
            (``compression="infer"``).
        usecols: Names of the columns to load. ``None`` (the default) loads every
            column in the file.

    Returns:
        The loaded frame, with columns cast according to ``DTYPE_COLS``.
    """
    return pd.read_csv(path, usecols=usecols, dtype=DTYPE_COLS, compression="infer")


# Load the training file with explicit dtypes, then keep a random SAMPLE_FRAC share
# of its rows. The sample is drawn after the read, so the whole file is parsed first;
# RANDOM_STATE makes the draw repeatable.
log("Reading Avazu data...")

df = read_csv_infer(TRAIN_FILE, usecols=ALL_COLS)
log(f"Train read: shape={df.shape}")

# Disabled: load the test file with every column except the target.
# df_test = read_csv_infer(TEST_FILE, usecols=[c for c in ALL_COLS if c != TARGET_COL])
# log(f"Test read: shape={df_test.shape}")

# Disabled: `downcast_numeric` is not defined in any visible cell — TODO confirm it
# exists before re-enabling these lines.
# df = downcast_numeric(df)
# df_test = downcast_numeric(df_test)

# Rebinds `df` to the sampled rows; the full frame is no longer referenced afterwards.
df = df.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
log(f"Sampled df to fraction={SAMPLE_FRAC}: shape={df.shape}")

# Disabled: matching sample of the test file (the log line reports df_test's shape).
# df_test = df_test.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
# log(f"Sampled df_test to fraction={SAMPLE_FRAC}: shape={df_test.shape}")

In [None]:
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with hour-of-day (``hod``) and day-of-week (``dow``) columns.

    The ``DATETIME_COL`` values are rendered as text, left-padded with zeros to eight
    characters, and parsed with ``DATETIME_FORMAT`` as UTC timestamps. Values that do
    not parse are coerced to ``NaT``, so their derived features come out missing
    instead of raising.
    """
    stamp_text = df[DATETIME_COL].astype(str).str.zfill(8)
    stamps = pd.to_datetime(stamp_text, format=DATETIME_FORMAT, errors="coerce", utc=True)
    derived = {"hod": stamps.dt.hour, "dow": stamps.dt.day_of_week}
    return df.assign(**derived)


log("Creating time-based features (hod, dow)...")

# Rebinds `df`; re-running this cell just recomputes the same two columns.
df = add_time_features(df)

log("Time-based features created.")

In [None]:
# Columns kept out of the feature matrix: the row identifier, the target itself, the
# raw timestamp (replaced by hod/dow) and two C columns that are not used as features.
COLS_TO_DROP = [ID_COL, TARGET_COL, DATETIME_COL, "C1", "C17"]

# Target vector first, then the feature matrix with the excluded columns removed.
y = df[TARGET_COL]
X = df.drop(columns=COLS_TO_DROP)

In [None]:
def join_tokens_rowwise(X: pd.DataFrame) -> np.ndarray:
    """Turn each row into one space-separated string of ``column=value`` tokens.

    Args:
        X: Tabular input; anything ``pd.DataFrame`` accepts. Every value is
            converted to its string form before being joined.

    Returns:
        A 1-D array of strings with one entry per row, e.g. ``"a=x b=1"``. When
        ``X`` has no columns, every entry is the empty string.
    """
    frame = pd.DataFrame(X)

    # With no columns there is nothing to join; return one empty document per row
    # instead of failing on the first-column lookup below.
    if frame.shape[1] == 0:
        return np.full(len(frame), "", dtype=str)

    frame = frame.astype("string")
    token_columns = [
        np.char.add(f"{name}=", frame[name].to_numpy(dtype=str)) for name in frame.columns
    ]

    # Concatenate column by column, left to right, with a single space between tokens.
    joined = token_columns[0]
    for tokens in token_columns[1:]:
        joined = np.char.add(np.char.add(joined, " "), tokens)

    return joined


# Hashed branch: join_tokens_rowwise turns every row into one "col=value col=value ..."
# string, and HashingVectorizer maps each token to one of HASH_N_FEATURES columns.
high_card_pipeline = Pipeline(
    steps=[
        ("to_tokens", FunctionTransformer(join_tokens_rowwise, validate=False)),
        (
            "hash",
            HashingVectorizer(
                n_features=HASH_N_FEATURES,
                alternate_sign=False,
                # Keep token case untouched.
                lowercase=False,
                # A token is any run of non-space characters, i.e. one "col=value" pair.
                token_pattern=r"[^ ]+",
            ),
        ),
    ]
)
# One-hot branch: categories seen fewer than RARE_MIN_FREQUENCY times are grouped as
# infrequent, and unseen categories at transform time fall into that group if it exists.
low_med_card_pipeline = Pipeline(
    steps=[
        (
            "one_hot_encoder",
            OneHotEncoder(
                handle_unknown="infrequent_if_exist",
                min_frequency=RARE_MIN_FREQUENCY,
            ),
        ),
    ]
)
# Quantile binning followed by weight-of-evidence encoding for the C columns.
# NOTE(review): this pipeline is built but not used — its ColumnTransformer entry
# below is commented out and C_COLS are routed through the hashed branch instead.
c_pipeline = Pipeline(
    steps=[
        (
            "kbins",
            KBinsDiscretizer(
                n_bins=N_BINS_NUM,
                strategy="quantile",
                encode="ordinal",
                quantile_method="averaged_inverted_cdf",
            ).set_output(transform="pandas"),
        ),
        ("woe", ce.WOEEncoder(cols=C_COLS, random_state=RANDOM_STATE)),
    ]
)

# Routes column groups to their branches; any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("high_card_cols", high_card_pipeline, HIGH_CARD_COLS + C_COLS),
        # NOTE(review): "low_med_car_cols" looks like a typo for "low_med_card_cols";
        # renaming it would change the parameter prefix of this step.
        ("low_med_car_cols", low_med_card_pipeline, LOW_MED_CARD_COLS),
        # ("c_cols", c_pipeline, C_COLS),
    ],
    remainder="drop",
)

In [None]:
# Hold out TEST_SIZE of the rows, stratified on the target so both parts keep the
# same class balance; RANDOM_STATE makes the split repeatable.
# NOTE(review): rows are assigned at random rather than by time — confirm that is
# the intended evaluation setup for this hour-stamped data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
log(f"X train: {X_train.shape=}")
log(f"y train: {y_train.shape=}")

In [None]:
# End-to-end model: feature preprocessing, max-abs scaling, then logistic regression
# configured from LR_PARAMS. Fitting the pipeline fits every step on the training
# rows only.
logreg = LogisticRegression(**LR_PARAMS)
model = Pipeline(
    steps=[
        ("features", preprocessor),
        # Scales each feature by its largest absolute value.
        ("scale", MaxAbsScaler()),
        ("clf", logreg),
    ]
)

log("Starting model training...")

model.fit(X_train, y_train)

log("Model training complete.")

In [None]:
log("Generating predictions on the test set...")

# Second predict_proba column is the score for the positive class; labels use a
# fixed 0.5 cut-off.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# Ranking and probability metrics use the scores; accuracy uses the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

for _metric_label, _metric_value in (
    ("ROC-AUC", roc_auc),
    ("PR-AUC", pr_auc),
    ("LogLoss", logloss),
    ("Accuracy", acc),
):
    log(f"{_metric_label}: {_metric_value:.6f}")

In [None]:
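# Disabled hyperparameter search, kept for reference. Re-enabling it requires
# importing GridSearchCV (and TimeSeriesSplit for the time-ordered variant) from
# sklearn.model_selection; the import cell currently brings in train_test_split only.
# # tscv = TimeSeriesSplit(n_splits=5)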
# param_grid = {
#     "clf__C": np.logspace(-3, 2, 6),
#     "clf__max_iter": [1000, 2000],
# }

# grid = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring="roc_auc",
#     cv=3,
#     refit=True,
#     verbose=2,
# )
# grid.fit(X, y)

# log(f"Best params: {grid.best_params_}")
# log(f"Best CV ROC-AUC: {round(grid.best_score_, 6)}")

# best_model = grid.best_estimator_

In [None]:
# ROC curve of the held-out scores, with the diagonal drawn as the chance baseline.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig_roc_avazu = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={roc_auc:.4f})"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Chance", line={"dash": "dash"}),
    ]
)
fig_roc_avazu.update_layout(
    title="ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    # Horizontal legend anchored just above the top-right corner of the plot area.
    legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1},
)
fig_roc_avazu.show()

In [None]:
# Precision-recall curve of the held-out scores; the legend carries average precision.
prec, rec, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)

fig_pr_avazu = go.Figure(
    data=[go.Scatter(x=rec, y=prec, mode="lines", name=f"PR (AP={ap:.4f})")]
)
fig_pr_avazu.update_layout(
    title="Precision-Recall Curve",
    xaxis_title="Recall",
    yaxis_title="Precision",
    # Horizontal legend anchored just above the top-right corner of the plot area.
    legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1},
)
fig_pr_avazu.show()

In [None]:
# Confusion matrix for the 0.5-threshold labels in `y_pred`: rows are actual classes,
# columns are predicted classes, both in the order [0, 1].
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])

fig_cm_avazu = go.Figure(
    data=go.Heatmap(
        z=cm_df.values,
        # Pass the frame's labels as axis categories; without x/y the heatmap falls
        # back to numeric positions and the labels built above are never shown.
        x=cm_df.columns.tolist(),
        y=cm_df.index.tolist(),
        text=cm_df.values,
        texttemplate="%{text}",
        hovertemplate="Row: %{y}<br>Col: %{x}<br>Count: %{z}<extra></extra>",
    )
)
fig_cm_avazu.update_layout(
    title="Confusion Matrix (Threshold = 0.5)",
    xaxis_title="Predicted",
    yaxis_title="Actual",
    # Draw the first row ("Actual 0") at the top so the plot reads like the matrix.
    yaxis_autorange="reversed",
)
fig_cm_avazu.show()

In [None]:
# Reliability diagram over 10 equal-population (quantile) bins of the held-out scores,
# plus the Brier score shown in the title.
# NOTE(review): LR_PARAMS trains with class_weight="balanced", which is expected to
# shift predicted probabilities away from the raw click rate — read this curve with
# that in mind.
prob_true, prob_pred = calibration_curve(
    y_test, y_pred_proba, n_bins=10, strategy="quantile"
)
brier = float(brier_score_loss(y_test, y_pred_proba))

fig_cal_avazu = go.Figure()
fig_cal_avazu.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode="lines",
        name="Perfectly Calibrated",
        line=dict(dash="dash"),
    )
)
fig_cal_avazu.add_trace(
    go.Scatter(x=prob_pred, y=prob_true, mode="lines+markers", name="Model")
)
fig_cal_avazu.update_layout(
    # `brier` is always a float here, so the title needs no None fallback.
    title=f"Calibration Curve — Brier: {brier:.5f}",
    xaxis_title="Predicted Probability",
    yaxis_title="Observed Frequency",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig_cal_avazu.show()