...

...

In [33]:
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.express as px
import polars as pl
from catboost import CatBoostClassifier, Pool
from loguru import logger
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    log_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [34]:
# Loguru record layout: timestamp | level | call site — message (rendered in bold).
fmt = (
    "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level} | {name}:{function}:{line} — "
    "<b>{message}</b>"
)

# A single colourised stderr sink with extended tracebacks enabled.
stderr_handler = dict(
    sink=sys.stderr,
    format=fmt,
    colorize=True,
    backtrace=True,
    diagnose=True,
)
config = dict(handlers=[stderr_handler])

# Kept as the cell's last expression so the handler ids are still displayed.
logger.configure(**config)

[3]

In [35]:
@dataclass
class Config:
    """Static configuration for the Avazu click-through-rate notebook.

    Groups the dataset location, sampling/splitting settings, and the column
    schema used when reading the raw CSV. The notebook accesses it through the
    module-level ``CFG`` instance created right after the class.
    """

    # Paths...
    train_path: Path = Path("../data/raw/avazu/train.gz")

    # Data handling...
    sample_frac: float = 0.1  # fraction of rows kept after reading
    random_state: int = 42  # shared seed for splits and models
    test_size: float = 0.2  # 20%

    # Dataset config...
    id_col: str = "id"
    hour_col: str = "hour"
    hour_col_format: str = "%y%m%d%H%M"
    target_col: str = "click"
    # Per-column polars dtypes passed to the CSV reader. Built through a
    # factory because dataclasses reject mutable defaults such as dicts.
    dtype: dict[str, Any] = field(
        default_factory=lambda: {
            "id": pl.UInt64,
            "click": pl.Int8,
            "hour": pl.Utf8,
            "C1": pl.Categorical,
            "banner_pos": pl.Categorical,
            "site_id": pl.Categorical,
            "site_domain": pl.Categorical,
            "site_category": pl.Categorical,
            "app_id": pl.Categorical,
            "app_domain": pl.Categorical,
            "app_category": pl.Categorical,
            "device_id": pl.Categorical,
            "device_ip": pl.Categorical,
            "device_model": pl.Categorical,
            "device_type": pl.Categorical,
            "device_conn_type": pl.Categorical,
            "C14": pl.Categorical,
            "C15": pl.Categorical,
            "C16": pl.Categorical,
            "C17": pl.Categorical,
            "C18": pl.Categorical,
            "C19": pl.Categorical,
            "C20": pl.Categorical,
            "C21": pl.Categorical,
        }
    )

    # Columns removed to build the feature matrix. The names are resolved from
    # the class-level defaults above when the class body executes, so passing a
    # different `id_col`/`hour_col`/`target_col` to the constructor does NOT
    # update this tuple automatically.
    cols_to_drop: tuple[str, ...] = (
        id_col,
        hour_col,
        target_col,
    )
    # Feature columns handed to the models as categorical.
    cat_cols: tuple[str, ...] = (
        "C1",
        "banner_pos",
        "site_id",
        "site_domain",
        "site_category",
        "app_id",
        "app_domain",
        "app_category",
        "device_id",
        "device_ip",
        "device_model",
        "device_type",
        "device_conn_type",
        "C14",
        "C15",
        "C16",
        "C17",
        "C18",
        "C19",
        "C20",
        "C21",
    )


# The class has its own name so that the instance no longer shadows it.
CFG = Config()

...

In [36]:
# Load the raw Avazu training file with the configured schema, then keep a
# random subsample so the rest of the notebook runs in reasonable time.
logger.info("Reading the Avazu dataset...")

X_out = pl.read_csv(CFG.train_path, schema_overrides=CFG.dtype, low_memory=True)
logger.info(f"The train dataset has been read; {X_out.shape=}")

# Seed the sampler: without it every run draws a different subset, so the
# downstream splits, models, and metrics cannot be reproduced.
X_out = X_out.sample(fraction=CFG.sample_frac, seed=CFG.random_state)
logger.info(f"Sampled train dataset; {X_out.shape=}")

2025-11-09 00:43:00.888 | INFO | __main__:<module>:1 — [1mReading the Avazu dataset...[0m
2025-11-09 00:43:21.156 | INFO | __main__:<module>:4 — [1mThe train dataset has been read; X_out.shape=(40428967, 24)[0m
2025-11-09 00:43:21.932 | INFO | __main__:<module>:7 — [1mSampled train dataset; X_out.shape=(4042896, 24)[0m


In [37]:
# Parse the `hour` column into a datetime, then separate features from target.
logger.info("Starting feature engineering steps...")

# Only parse while the column is still the raw string: re-running this cell
# after a successful parse would otherwise try to parse a datetime column.
if X_out.schema[CFG.hour_col] == pl.Utf8:
    X_out = X_out.with_columns(
        # "00" is appended so the value matches `CFG.hour_col_format`, which
        # ends with a minutes field (%M).
        pl.concat_str([pl.col(CFG.hour_col), pl.lit("00")])
        .str.to_datetime(CFG.hour_col_format)
        .alias(CFG.hour_col)
    )
    logger.info("Parsed the `hour` column.")

# NOTE(review): if this block is re-enabled, check the weekend test — polars
# documents `dt.weekday()` as ISO numbering (Monday=1 ... Sunday=7), in which
# case `[5, 6]` selects Friday/Saturday rather than Saturday/Sunday. Verify.
# X_out = X_out.with_columns(
#     pl.col(CFG.hour_col).dt.hour().alias("hod"),
#     pl.col(CFG.hour_col).dt.weekday().alias("dow"),
#     pl.col(CFG.hour_col).dt.weekday().is_in([5, 6]).alias("is_weekend"),
# )
# logger.info(f"Engineered the following columns: `hod`, `dow`, and `is_weekend`.")

display(X_out.head(5))

# Features: everything except the id, the timestamp, and the target itself.
X = X_out.drop(CFG.cols_to_drop)
y = X_out[CFG.target_col]
logger.info("Dropped the following cols: `id`, `hour`, and `click`.")

logger.info(
    f"Splitted the dataset to the features and target subsets: {X.shape=}; {y.shape=}"
)

display(X.head(5))

2025-11-09 00:43:21.950 | INFO | __main__:<module>:1 — [1mStarting feature engineering steps...[0m
2025-11-09 00:43:22.090 | INFO | __main__:<module>:8 — [1mParsed the `hour` column.[0m


id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
u64,i8,datetime[μs],cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat
4535416936930372659,0,2014-10-25 16:00:00,"""1005""","""1""","""e5c60a05""","""7256c623""","""f028772b""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""983efeee""","""d4897fef""","""1""","""0""","""19950""","""320""","""50""","""1800""","""3""","""167""","""100075""","""23"""
4035621528444649187,1,2014-10-30 10:00:00,"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""9c13b419""","""2347f47a""","""f95efa07""","""a99f214a""","""146b12a7""","""1c6a881d""","""1""","""2""","""23161""","""320""","""50""","""2667""","""0""","""47""","""-1""","""221"""
11545037803692786669,0,2014-10-29 22:00:00,"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""6fc85e22""","""7801e8d9""","""0f2161f8""","""43150a70""","""26b3bf6d""","""1c6a881d""","""1""","""2""","""23723""","""320""","""50""","""2716""","""3""","""47""","""-1""","""23"""
10773078208574731142,0,2014-10-21 05:00:00,"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""8b89048f""","""2347f47a""","""cef3e649""","""d9190d96""","""977045e8""","""7abbbd5c""","""1""","""3""","""21664""","""320""","""50""","""2492""","""3""","""35""","""-1""","""43"""
15417551281966082909,0,2014-10-28 05:00:00,"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""a5184c22""","""b8d325c3""","""0f2161f8""","""a99f214a""","""8d849783""","""3d64834e""","""1""","""2""","""22810""","""320""","""50""","""2647""","""2""","""35""","""100148""","""23"""


2025-11-09 00:43:22.104 | INFO | __main__:<module>:21 — [1mDropped the following cols: `id`, `hour`, and `click`.[0m
2025-11-09 00:43:22.105 | INFO | __main__:<module>:23 — [1mSplitted the dataset to the features and target subsets: X.shape=(4042896, 21); y.shape=(4042896,)[0m


C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat
"""1005""","""1""","""e5c60a05""","""7256c623""","""f028772b""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""983efeee""","""d4897fef""","""1""","""0""","""19950""","""320""","""50""","""1800""","""3""","""167""","""100075""","""23"""
"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""9c13b419""","""2347f47a""","""f95efa07""","""a99f214a""","""146b12a7""","""1c6a881d""","""1""","""2""","""23161""","""320""","""50""","""2667""","""0""","""47""","""-1""","""221"""
"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""6fc85e22""","""7801e8d9""","""0f2161f8""","""43150a70""","""26b3bf6d""","""1c6a881d""","""1""","""2""","""23723""","""320""","""50""","""2716""","""3""","""47""","""-1""","""23"""
"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""8b89048f""","""2347f47a""","""cef3e649""","""d9190d96""","""977045e8""","""7abbbd5c""","""1""","""3""","""21664""","""320""","""50""","""2492""","""3""","""35""","""-1""","""43"""
"""1005""","""0""","""85f751fd""","""c4e18dd6""","""50e219e0""","""a5184c22""","""b8d325c3""","""0f2161f8""","""a99f214a""","""8d849783""","""3d64834e""","""1""","""2""","""22810""","""320""","""50""","""2647""","""2""","""35""","""100148""","""23"""


In [38]:
# Convert the features to pandas and carve out stratified test and validation
# subsets. The conversion is guarded so the cell can be re-run: a pandas frame
# has no `.to_pandas()`, so an unconditional call fails the second time.
if isinstance(X, pl.DataFrame):
    X = X.to_pandas()

# NOTE(review): `CFG.test_size` (0.2) is not used here; both splits are
# hard-coded to 10%. Confirm which value is intended before wiring it in.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=CFG.random_state, stratify=y
)
# The validation subset is taken from the remaining training rows.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=CFG.random_state, stratify=y_train
)

logger.info(f"The train subset: {X_train.shape=}; {y_train.shape=}")
logger.info(f"The test subset: {X_test.shape=}; {y_test.shape=}")
logger.info(f"The validation subset: {X_val.shape=}; {y_val.shape=}")

display(X_train.head(5))

2025-11-09 00:43:26.140 | INFO | __main__:<module>:10 — [1mThe train subset: X_train.shape=(3638606, 21); y_train.shape=(3638606,)[0m
2025-11-09 00:43:26.140 | INFO | __main__:<module>:11 — [1mThe test subset: X_test.shape=(404290, 21); y_test.shape=(404290,)[0m
2025-11-09 00:43:26.140 | INFO | __main__:<module>:12 — [1mThe validation subset: X_val.shape=(363861, 21); y_val.shape=(363861,)[0m


Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
2125159,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,cf3181f1,...,1,0,4687,320,50,423,2,39,100148,32
3458718,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,46c738c1,...,1,2,17653,300,250,1994,2,39,-1,33
3769434,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,f12979c4,...,1,0,19950,320,50,1800,3,167,100077,23
3523625,1002,0,203b00f1,c4e18dd6,50e219e0,ecad2386,7801e8d9,07d7df22,0f7c61dc,a0bf6778,...,0,0,23438,320,50,2684,2,1327,100004,52
136587,1005,0,727e79e2,40b37f4e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,0a42a9dd,...,1,0,19216,320,50,863,3,169,100105,61


In [39]:
# Hyper-parameters of the LightGBM model whose leaves are later used as features.
gbdt_params = dict(
    objective="binary",
    metric="binary_logloss",
    num_leaves=64,
    n_estimators=100,
    learning_rate=0.05,
    max_depth=-1,
    verbose=1,
    n_jobs=-1,
    seed=CFG.random_state,
)
model_gbdt_transformer = lgb.LGBMClassifier(**gbdt_params)

# Fit on the training subset only; validation and test rows stay unseen.
model_gbdt_transformer.fit(X_train_sub, y_train_sub, categorical_feature=CFG.cat_cols)

[LightGBM] [Info] Number of positive: 555758, number of negative: 2718987
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2480
[LightGBM] [Info] Number of data points in the train set: 3274745, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169710 -> initscore=-1.587682
[LightGBM] [Info] Start training from score -1.587682


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.05
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [40]:
logger.info("Extracting leaf indices...")

# With `pred_leaf=True` the model returns leaf indices instead of probabilities;
# the same call is applied to the train, validation, and test subsets in order.
leaf_indices_train, leaf_indices_val, leaf_indices_test = (
    model_gbdt_transformer.predict(subset, pred_leaf=True)
    for subset in (X_train_sub, X_val, X_test)
)

2025-11-09 00:43:33.400 | INFO | __main__:<module>:1 — [1mExtracting leaf indices...[0m


In [41]:
logger.info("Fitting `OneHotEncoder` on leaf indices...")

# The encoder learns its categories from the training leaves only.
ohe_leaf_encoder = OneHotEncoder(handle_unknown="infrequent_if_exist").fit(
    leaf_indices_train
)

logger.info("Transforming indices to sparse OHE features...")

# Encode every subset with the already-fitted encoder.
X_train_lr, X_val_lr, X_test_lr = (
    ohe_leaf_encoder.transform(leaves)
    for leaves in (leaf_indices_train, leaf_indices_val, leaf_indices_test)
)

2025-11-09 00:43:38.908 | INFO | __main__:<module>:1 — [1mFitting `OneHotEncoder` on leaf indices...[0m
2025-11-09 00:43:47.087 | INFO | __main__:<module>:6 — [1mTransforming indices to sparse OHE features...[0m


In [42]:
logger.info("Training Logistic Regression on sparse features...")

lr_params = dict(
    solver="saga",  # Good solver for sparse data
    # C=0.1,  # Regularization strength
    max_iter=1000,  # Increase iterations for convergence
    random_state=CFG.random_state,
    n_jobs=-1,
    verbose=1,
)
model_lr = LogisticRegression(**lr_params)

# Trained on the one-hot leaf features of the training subset.
model_lr.fit(X_train_lr, y_train_sub)

2025-11-09 00:44:14.464 | INFO | __main__:<module>:1 — [1mTraining Logistic Regression on sparse features...[0m
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


Epoch 1, change: 1
Epoch 2, change: 0.19365686
Epoch 3, change: 0.10637384
Epoch 4, change: 0.06656936
Epoch 5, change: 0.056324805
Epoch 6, change: 0.052480282
Epoch 7, change: 0.046870906
Epoch 8, change: 0.044819
Epoch 9, change: 0.037085923
Epoch 10, change: 0.036360351
Epoch 11, change: 0.034353289
Epoch 12, change: 0.030821163
Epoch 13, change: 0.028073964
Epoch 14, change: 0.024012573
Epoch 15, change: 0.021732348
Epoch 16, change: 0.020585837
Epoch 17, change: 0.018744349
Epoch 18, change: 0.017678009
Epoch 19, change: 0.015415278
Epoch 20, change: 0.014587541
Epoch 21, change: 0.014583583
Epoch 22, change: 0.013866732
Epoch 23, change: 0.013532839
Epoch 24, change: 0.013146668
Epoch 25, change: 0.01249492
Epoch 26, change: 0.012268241
Epoch 27, change: 0.011929332
Epoch 28, change: 0.011485933
Epoch 29, change: 0.011185922
Epoch 30, change: 0.010835119
Epoch 31, change: 0.010538619
Epoch 32, change: 0.01004716
Epoch 33, change: 0.0097247697
Epoch 34, change: 0.0095439505
Epoch

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,1000


In [43]:
def _as_pool(features, labels):
    """Wrap a feature frame and its labels in a CatBoost `Pool` with the configured categorical columns."""
    return Pool(features, labels.to_numpy(), cat_features=CFG.cat_cols)


cb_params = dict(
    iterations=1000,
    learning_rate=0.05,
    eval_metric="Logloss",
    od_type="Iter",
    # od_wait=50,  # Early stopping
    random_seed=CFG.random_state,
    verbose=1,
    allow_writing_files=False,
)
model_cb = CatBoostClassifier(**cb_params)

# Pools carry the data together with the categorical feature declaration.
train_pool = _as_pool(X_train_sub, y_train_sub)
val_pool = _as_pool(X_val, y_val)
test_pool = _as_pool(X_test, y_test)

# The validation pool is the evaluation set monitored during training.
model_cb.fit(train_pool, eval_set=val_pool)

0:	learn: 0.6583588	test: 0.6582228	best: 0.6582228 (0)	total: 2.13s	remaining: 35m 29s
1:	learn: 0.6284953	test: 0.6282682	best: 0.6282682 (1)	total: 4.64s	remaining: 38m 37s
2:	learn: 0.6023776	test: 0.6021336	best: 0.6021336 (2)	total: 6.86s	remaining: 37m 59s
3:	learn: 0.5796801	test: 0.5793694	best: 0.5793694 (3)	total: 8.89s	remaining: 36m 54s
4:	learn: 0.5598568	test: 0.5594717	best: 0.5594717 (4)	total: 11.3s	remaining: 37m 37s
5:	learn: 0.5414596	test: 0.5409183	best: 0.5409183 (5)	total: 12.7s	remaining: 34m 56s
6:	learn: 0.5259309	test: 0.5253421	best: 0.5253421 (6)	total: 14.6s	remaining: 34m 33s
7:	learn: 0.5124834	test: 0.5118990	best: 0.5118990 (7)	total: 15.9s	remaining: 32m 46s
8:	learn: 0.4999337	test: 0.4993585	best: 0.4993585 (8)	total: 18.2s	remaining: 33m 28s
9:	learn: 0.4898687	test: 0.4892770	best: 0.4892770 (9)	total: 21.8s	remaining: 36m 3s
10:	learn: 0.4803700	test: 0.4797053	best: 0.4797053 (10)	total: 23.7s	remaining: 35m 30s
11:	learn: 0.4725504	test: 0.47

<catboost.core.CatBoostClassifier at 0x481412bf0>

In [44]:
logger.info("Getting logits from validation set...")

# CatBoost: raw scores (logits) are requested via prediction_type="RawFormulaVal".
logits_cb_val = model_cb.predict(
    val_pool,
    prediction_type="RawFormulaVal",
)

# Logistic Regression: decision_function() returns the logits directly.
logits_gbdt_lr_val = model_lr.decision_function(X_val_lr)

2025-11-09 01:46:19.506 | INFO | __main__:<module>:1 — [1mGetting logits from validation set...[0m


In [45]:
def sigmoid(x):
    """Map logits to probabilities with a numerically stable logistic function.

    Computes ``1 / (1 + exp(-x))`` as ``exp(-log(1 + exp(-x)))`` using
    ``np.logaddexp``, so large-magnitude logits do not overflow ``exp`` (the
    naive form emits overflow warnings for very negative inputs).

    Args:
        x: Scalar or array-like of logits.

    Returns:
        Probabilities in ``[0, 1]`` with the same shape as ``x``.
    """
    x = np.asarray(x)
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably.
    return np.exp(-np.logaddexp(0.0, -x))


def _val_logloss(weight):
    """Validation log-loss of the logit blend that gives `weight` to GBDT-LR and the rest to CatBoost."""
    mixed_logits = (weight * logits_gbdt_lr_val) + ((1 - weight) * logits_cb_val)
    return log_loss(y_val, sigmoid(mixed_logits))


search_space = np.linspace(0, 1, 101)  # Search w from 0.0 to 1.0

# Start from an even blend; the first strictly better candidate replaces it.
best_w, best_logloss = 0.5, float("inf")
for candidate_w in search_space:
    candidate_logloss = _val_logloss(candidate_w)
    if candidate_logloss < best_logloss:
        best_w, best_logloss = candidate_w, candidate_logloss

logger.info(f"Optimal weight `w` found: {best_w:.2f}")
logger.info(f"Best Validation LogLoss: {best_logloss:.5f}")

2025-11-09 01:46:24.761 | INFO | __main__:<module>:20 — [1mOptimal weight `w` found: 0.04[0m
2025-11-09 01:46:24.762 | INFO | __main__:<module>:21 — [1mBest Validation LogLoss: 0.38668[0m


In [46]:
def _test_scores(probs):
    """Return `(log-loss, ROC AUC)` of `probs` against the test labels."""
    return log_loss(y_test, probs), roc_auc_score(y_test, probs)


logger.info("Getting logits from TEST set...")

# Raw scores of both models on the held-out test subset.
logits_gbdt_lr_test = model_lr.decision_function(X_test_lr)
logits_cb_test = model_cb.predict(test_pool, prediction_type="RawFormulaVal")

# Logits -> probabilities.
probs_gbdt_lr_test = sigmoid(logits_gbdt_lr_test)
probs_cb_test = sigmoid(logits_cb_test)

# Per-model test metrics.
logloss_gbdt_lr, auc_gbdt_lr = _test_scores(probs_gbdt_lr_test)
logloss_cb, auc_cb = _test_scores(probs_cb_test)

2025-11-09 01:46:24.770 | INFO | __main__:<module>:1 — [1mGetting logits from TEST set...[0m


In [None]:
# Blend the test logits with the weight selected on the validation subset.
cb_weight = 1 - best_w
blended_logits_test = (best_w * logits_gbdt_lr_test) + (cb_weight * logits_cb_test)
blended_probs_test = sigmoid(blended_logits_test)

# Log-loss first, then ROC AUC, both against the test labels.
logloss_blended, auc_blended = (
    metric(y_test, blended_probs_test) for metric in (log_loss, roc_auc_score)
)

In [56]:
logger.info("--- Final Results (on Test Set) ---")

# One (name, log-loss, AUC) row per model, pivoted into columns for display.
result_rows = [
    ("Model 1: GBDT-LR", logloss_gbdt_lr, auc_gbdt_lr),
    ("Model 2: CatBoost", logloss_cb, auc_cb),
    ("FINAL: Blended Model", logloss_blended, auc_blended),
]
model_names, logloss_values, auc_values = zip(*result_rows)

results = {
    "Model": list(model_names),
    "LogLoss": list(logloss_values),
    "AUC": list(auc_values),
}
display(pl.DataFrame(results))

2025-11-09 13:32:14.103 | INFO | __main__:<module>:1 — [1m--- Final Results (on Test Set) ---[0m


Model,LogLoss,AUC
str,f64,f64
"""Model 1: GBDT-LR""",0.397566,0.745821
"""Model 2: CatBoost""",0.386993,0.767139
"""FINAL: Blended Model""",0.386987,0.767154


In [51]:
# Overlayed histograms of the blended test probabilities, one per actual class.
logger.info("Plotting Prediction Distribution...")

dist_df = pd.DataFrame(
    {
        "Probability": blended_probs_test,
        # Labels are cast to strings so they match the string keys of
        # `color_discrete_map` below; with integer labels the "0"/"1" keys
        # would not be found and the requested colours would not be applied.
        "Actual Class": np.asarray(y_test).astype(str),
    }
)

dist_fig = px.histogram(
    dist_df,
    x="Probability",
    color="Actual Class",
    color_discrete_map={
        "0": "blue",
        "1": "red",
    },
    histnorm="density",
    barmode="overlay",
    title="Distribution of Predicted Probabilities",
    marginal="box",
)
# Semi-transparent bars keep both overlaid classes visible.
dist_fig.update_traces(opacity=0.7)
dist_fig.show()

2025-11-09 02:36:20.037 | INFO | __main__:<module>:1 — [1mPlotting Prediction Distribution...[0m
