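# Stage 0 LightGBM Baseline

Builds a downsampled training set, trains a baseline LightGBM model on it, and appends the resulting validation metrics to the metrics log via `append_metric`.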

In [1]:
import sys
from pathlib import Path
# Project root = parent of the current working directory, resolved to an
# absolute path. It is prepended to sys.path (once) so that the `src.*`
# imports in the following cell can be resolved.
# NOTE(review): this relies on the kernel's working directory being one level
# below the project root — confirm when launching from elsewhere.
ROOT = Path('..').resolve()
if sys.path.count(str(ROOT)) == 0:
    # Insert at the front so the project's modules take precedence.
    sys.path.insert(0, str(ROOT))
ROOT

PosixPath('/Competition/toss-ad-click-prediction')

In [2]:
from src.data.downsample import DownsampleConfig, create_downsampled_dataset
from src.train.lightgbm_runner import LightGBMConfig, train_lightgbm
from src.train.log_utils import append_metric, MetricRecord, default_run_id


## Generate / Load Downsampled Dataset

In [3]:
# Configure the downsampling step and materialise the downsampled dataset.
# Both paths are relative to the working directory, not to ROOT.
cfg = DownsampleConfig(
    # Input / output locations.
    raw_path='../data/train.parquet',
    output_path='../data/processed/train_downsample_1_2.parquet',
    # Sampling behaviour.
    # NOTE(review): presumably the number of negatives kept per positive
    # (matching the "1_2" in the output file name) — confirm in
    # src.data.downsample.
    negative_multiplier=2.0,
    shuffle=True,
    seed=42,  # fixed seed; presumably makes sampling/shuffling repeatable
)

# The helper hands back the path of the downsampled parquet file, which the
# training cell consumes; it is shown as the cell output.
downsample_path = create_downsampled_dataset(cfg)
downsample_path

PosixPath('../data/processed/train_downsample_1_2.parquet')

## Train LightGBM

In [6]:
# Train the baseline LightGBM model on the downsampled dataset produced above.
lgb_cfg = LightGBMConfig(
    enable_wandb=True,  # NOTE(review): presumably turns on Weights & Biases logging — confirm
    random_state=42,
    # The config receives the dataset location as a plain string.
    train_path=str(downsample_path),
)

# `result` carries the run's metrics (displayed here) and its run_id, which
# the metric-logging cell reads.
result = train_lightgbm(lgb_cfg)
result.metrics

Training until validation scores don't improve for 50 rounds
[50]	valid_0's binary_logloss: 0.556469
[100]	valid_0's binary_logloss: 0.5523
[150]	valid_0's binary_logloss: 0.5513
[200]	valid_0's binary_logloss: 0.550798
[250]	valid_0's binary_logloss: 0.550563
[300]	valid_0's binary_logloss: 0.550552
Early stopping, best iteration is:
[297]	valid_0's binary_logloss: 0.550533


0,1
validation/ap,▁
validation/best_iteration,▁
validation/competition_score,▁
validation/wll,▁

0,1
validation/ap,0.60386
validation/best_iteration,297.0
validation/competition_score,0.47814
validation/wll,0.64757


{'ap': 0.6038613762857505,
 'wll': 0.6475747211292291,
 'competition_score': 0.4781433275782607,
 'best_iteration': 297}

## Log Metrics

In [7]:
# Record this run's validation metrics through append_metric, keyed by the
# run id that the training call produced.
# NOTE(review): re-running this cell presumably appends a duplicate record —
# confirm whether append_metric de-duplicates by run_id.
run_id = result.run_id
append_metric(MetricRecord(
    run_id=run_id,
    stage='stage0',
    dataset='validation',
    metrics=result.metrics,
    notes='stage0 baseline (notebook)',
))
run_id

'20250919_172520_lgbm_baseline'