In [103]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, RobustScaler

In [None]:
train = pd.read_csv(r"\playground-series-s6e2\train.csv")
train = train.set_index('id')
le = LabelEncoder()
train['Heart Disease'] = le.fit_transform(train['Heart Disease'])
y = train['Heart Disease']
train = train.drop(columns='Heart Disease')

test = pd.read_csv(r"\playground-series-s6e2\test.csv")
test = test.set_index('id')

# Basic check

In [105]:
y.value_counts(normalize=True)

Heart Disease
0    0.55166
1    0.44834
Name: proportion, dtype: float64

In [106]:
import EDA
eda = EDA.BasicEDA(train)
print(eda.check_missing_values())
print(eda.check_duplicate())
print(eda.check_unique_count())

No Missing value
No duplicate
                    Feature  Unique  Sample_size  Cardinality_Ratio
4               Cholesterol     150       630000            0.02381
7                    Max HR      93       630000            0.01476
3                        BP      66       630000            0.01048
9             ST depression      66       630000            0.01048
0                       Age      42       630000            0.00667
11  Number of vessels fluro       4       630000            0.00063
2           Chest pain type       4       630000            0.00063
10              Slope of ST       3       630000            0.00048
6               EKG results       3       630000            0.00048
12                 Thallium       3       630000            0.00048
1                       Sex       2       630000            0.00032
8           Exercise angina       2       630000            0.00032
5              FBS over 120       2       630000            0.00032


In [107]:
num_fea = ['Age','BP','Cholesterol','Max HR','ST depression']
cate_fea = list(set(train.columns).difference(num_fea))
eda.skewness_kurtosis(num_fea)

Unnamed: 0,feature,count,mean,std,min,25%,50%,75%,max,range,iqr,skewness,kurtosis
3,Max HR,630000.0,152.816763,19.112927,71.0,142.0,157.0,166.0,202.0,131.0,24.0,-0.754556,0.203852
0,Age,630000.0,54.136706,8.256301,29.0,48.0,54.0,60.0,77.0,48.0,12.0,-0.164011,-0.570157
2,Cholesterol,630000.0,245.011814,33.681581,126.0,223.0,243.0,269.0,564.0,438.0,46.0,0.27315,0.068237
1,BP,630000.0,130.497433,14.975802,94.0,120.0,130.0,140.0,200.0,106.0,20.0,0.629283,0.933183
4,ST depression,630000.0,0.716028,0.948472,0.0,0.0,0.1,1.4,6.2,6.2,1.4,1.328429,1.374855


In [108]:
train['ST_dpe_is_zero'] = (train['ST depression'] == 0).astype(int)
train['ST depression'] = np.log(train['ST depression']+1)
train['cholesterol_high'] = (train['Cholesterol'] > train['Cholesterol'].quantile(.9)).astype(int)

In [109]:
cate_fea

['Number of vessels fluro',
 'Slope of ST',
 'EKG results',
 'Thallium',
 'Exercise angina',
 'Sex',
 'FBS over 120',
 'Chest pain type']

In [110]:
eda.Categorical_Dis(cate_fea)

Unnamed: 0,feature,gini,entropy
6,FBS over 120,0.147179,0.402134
4,Exercise angina,0.3976,0.84676
5,Sex,0.407778,0.862526
2,EKG results,0.501924,1.019351
3,Thallium,0.497098,1.07986
1,Slope of ST,0.510557,1.122396
0,Number of vessels fluro,0.461556,1.265748
7,Chest pain type,0.612719,1.58177


In [111]:
train['hard1'] = train['Age'] * train['FBS over 120']
train['hard2'] = train['Cholesterol'] * train['FBS over 120']

# Train

In [112]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
oof = np.zeros(len(train))

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0)
    }
    
    fold_aucs = []
    for train_idx, valid_idx in kf.split(train, y):
        X_train, X_valid = train.iloc[train_idx], train.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid,reference=train_data)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,    
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),  # 改用 callback 形式
                lgb.log_evaluation(period=0)  # period=0 相当于 verbose_eval=False
            ]
        )
        preds = model.predict(X_valid)
        fold_aucs.append(roc_auc_score(y_valid, preds))
    
    return np.mean(fold_aucs)

study = optuna.create_study(direction='maximize', study_name='lgb_baseline')
study.optimize(objective, n_trials=20)

[32m[I 2026-02-01 22:53:03,343][0m A new study created in memory with name: lgb_baseline[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[191]	valid_0's auc: 0.955587
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[171]	valid_0's auc: 0.954571
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[178]	valid_0's auc: 0.955371
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[189]	valid_0's auc: 0.955024
Training until validation scores don't improve for 50 rounds


[32m[I 2026-02-01 22:53:17,230][0m Trial 0 finished with value: 0.9552739040374327 and parameters: {'learning_rate': 0.11230404499759662, 'num_leaves': 93, 'max_depth': 9, 'min_child_samples': 62, 'subsample': 0.8631393922830213, 'colsample_bytree': 0.5317337830989044, 'reg_alpha': 7.055293715366454, 'reg_lambda': 4.495551116317049}. Best is trial 0 with value: 0.9552739040374327.[0m


Early stopping, best iteration is:
[182]	valid_0's auc: 0.955817
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[747]	valid_0's auc: 0.955741
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[550]	valid_0's auc: 0.954729
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[598]	valid_0's auc: 0.955525
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[653]	valid_0's auc: 0.955117
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[706]	valid_0's auc: 0.955955


[32m[I 2026-02-01 22:54:02,136][0m Trial 1 finished with value: 0.9554133483075018 and parameters: {'learning_rate': 0.04626986965181246, 'num_leaves': 222, 'max_depth': 6, 'min_child_samples': 78, 'subsample': 0.5157531931463704, 'colsample_bytree': 0.5549777666485125, 'reg_alpha': 6.721730229564979, 'reg_lambda': 1.1998701191116412}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[397]	valid_0's auc: 0.955664
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[381]	valid_0's auc: 0.954602
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[365]	valid_0's auc: 0.955441
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[395]	valid_0's auc: 0.955079
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[365]	valid_0's auc: 0.955863


[32m[I 2026-02-01 22:54:23,136][0m Trial 2 finished with value: 0.9553298176724565 and parameters: {'learning_rate': 0.10042179937657413, 'num_leaves': 38, 'max_depth': 5, 'min_child_samples': 27, 'subsample': 0.9708763425439481, 'colsample_bytree': 0.7595705877030299, 'reg_alpha': 5.940123423892149, 'reg_lambda': 8.12633959377614}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955572
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.954588
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955331
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.954973
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955733


[32m[I 2026-02-01 22:55:17,122][0m Trial 3 finished with value: 0.9552393383369667 and parameters: {'learning_rate': 0.018252674968507392, 'num_leaves': 91, 'max_depth': 6, 'min_child_samples': 14, 'subsample': 0.7973369400938939, 'colsample_bytree': 0.6706963012665423, 'reg_alpha': 6.800218199616035, 'reg_lambda': 7.4440323896427865}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[491]	valid_0's auc: 0.955671
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[478]	valid_0's auc: 0.954644
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[523]	valid_0's auc: 0.955454
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[518]	valid_0's auc: 0.955046
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[494]	valid_0's auc: 0.955886


[32m[I 2026-02-01 22:55:50,391][0m Trial 4 finished with value: 0.955339910176788 and parameters: {'learning_rate': 0.05070863051722441, 'num_leaves': 172, 'max_depth': 7, 'min_child_samples': 64, 'subsample': 0.5115691933952926, 'colsample_bytree': 0.603174541592842, 'reg_alpha': 6.921329362476964, 'reg_lambda': 8.446592576295362}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955236
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.954373
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955148
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.95464
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955518


[32m[I 2026-02-01 22:57:15,924][0m Trial 5 finished with value: 0.9549828406755478 and parameters: {'learning_rate': 0.011559040390608342, 'num_leaves': 244, 'max_depth': 9, 'min_child_samples': 45, 'subsample': 0.9006755245975171, 'colsample_bytree': 0.7221398405143475, 'reg_alpha': 0.8459720270296478, 'reg_lambda': 8.194470972955555}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[281]	valid_0's auc: 0.955738
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[258]	valid_0's auc: 0.954666
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[293]	valid_0's auc: 0.955485
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[333]	valid_0's auc: 0.955076
Training until validation scores don't improve for 50 rounds


[32m[I 2026-02-01 22:57:32,749][0m Trial 6 finished with value: 0.9553764509955469 and parameters: {'learning_rate': 0.18217850527808138, 'num_leaves': 96, 'max_depth': 4, 'min_child_samples': 87, 'subsample': 0.9561531230790299, 'colsample_bytree': 0.5518180933102952, 'reg_alpha': 0.5720028070978567, 'reg_lambda': 9.155816774504213}. Best is trial 1 with value: 0.9554133483075018.[0m


Early stopping, best iteration is:
[288]	valid_0's auc: 0.955917
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[292]	valid_0's auc: 0.955295
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[250]	valid_0's auc: 0.954317
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[299]	valid_0's auc: 0.955103
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[278]	valid_0's auc: 0.954626
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[286]	valid_0's auc: 0.95545


[32m[I 2026-02-01 22:57:57,925][0m Trial 7 finished with value: 0.9549583035014425 and parameters: {'learning_rate': 0.06111194353623702, 'num_leaves': 214, 'max_depth': 9, 'min_child_samples': 34, 'subsample': 0.8117811428069651, 'colsample_bytree': 0.9012054020058058, 'reg_alpha': 6.910004906962498, 'reg_lambda': 4.071188770314232}. Best is trial 1 with value: 0.9554133483075018.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[690]	valid_0's auc: 0.955747
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[633]	valid_0's auc: 0.954729
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[593]	valid_0's auc: 0.955519
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[654]	valid_0's auc: 0.955134
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[643]	valid_0's auc: 0.955942


[32m[I 2026-02-01 22:58:25,893][0m Trial 8 finished with value: 0.9554140920401595 and parameters: {'learning_rate': 0.09008514589467877, 'num_leaves': 237, 'max_depth': 4, 'min_child_samples': 76, 'subsample': 0.6508701769677392, 'colsample_bytree': 0.8927290398267991, 'reg_alpha': 7.434549616388004, 'reg_lambda': 5.051976806369137}. Best is trial 8 with value: 0.9554140920401595.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[125]	valid_0's auc: 0.955349
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[122]	valid_0's auc: 0.954428
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[105]	valid_0's auc: 0.955186
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.954746
Training until validation scores don't improve for 50 rounds


[32m[I 2026-02-01 22:58:36,472][0m Trial 9 finished with value: 0.9550509401085282 and parameters: {'learning_rate': 0.18046112443582346, 'num_leaves': 232, 'max_depth': 7, 'min_child_samples': 38, 'subsample': 0.6395373943020852, 'colsample_bytree': 0.8114887267211263, 'reg_alpha': 2.438924347163709, 'reg_lambda': 9.241184382868092}. Best is trial 8 with value: 0.9554140920401595.[0m


Early stopping, best iteration is:
[137]	valid_0's auc: 0.955547
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[526]	valid_0's auc: 0.955219
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[552]	valid_0's auc: 0.954247
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[613]	valid_0's auc: 0.955087
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[558]	valid_0's auc: 0.954544
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[530]	valid_0's auc: 0.955378


[32m[I 2026-02-01 22:59:26,742][0m Trial 10 finished with value: 0.9548948937323122 and parameters: {'learning_rate': 0.02872192540937266, 'num_leaves': 174, 'max_depth': 12, 'min_child_samples': 99, 'subsample': 0.6710058407423202, 'colsample_bytree': 0.9905758130731832, 'reg_alpha': 9.909096846581246, 'reg_lambda': 1.574463610424412}. Best is trial 8 with value: 0.9554140920401595.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.9555
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.954535
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955247
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.95489
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.955658


[32m[I 2026-02-01 23:00:02,048][0m Trial 11 finished with value: 0.9551660618168535 and parameters: {'learning_rate': 0.034201039755344095, 'num_leaves': 191, 'max_depth': 3, 'min_child_samples': 77, 'subsample': 0.5082759805613958, 'colsample_bytree': 0.857987647377397, 'reg_alpha': 9.319266755065804, 'reg_lambda': 0.05425195681983097}. Best is trial 8 with value: 0.9554140920401595.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[966]	valid_0's auc: 0.955798
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[955]	valid_0's auc: 0.954782
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[990]	valid_0's auc: 0.955547
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955217
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[958]	valid_0's auc: 0.955992


[32m[I 2026-02-01 23:00:36,855][0m Trial 12 finished with value: 0.9554671302116828 and parameters: {'learning_rate': 0.07933476625964035, 'num_leaves': 248, 'max_depth': 3, 'min_child_samples': 79, 'subsample': 0.6415240448792937, 'colsample_bytree': 0.9726025388112427, 'reg_alpha': 4.259401181069312, 'reg_lambda': 2.57387320523945}. Best is trial 12 with value: 0.9554671302116828.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[975]	valid_0's auc: 0.955815
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[901]	valid_0's auc: 0.95479
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.955544
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[996]	valid_0's auc: 0.955248
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[908]	valid_0's auc: 0.956016


[32m[I 2026-02-01 23:01:10,966][0m Trial 13 finished with value: 0.9554825560138003 and parameters: {'learning_rate': 0.08543930562413159, 'num_leaves': 145, 'max_depth': 3, 'min_child_samples': 60, 'subsample': 0.6558357034494361, 'colsample_bytree': 0.9781025654623433, 'reg_alpha': 3.926247204931002, 'reg_lambda': 6.31004717625125}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955812
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.954733
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.955547
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.9552
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.955994


[32m[I 2026-02-01 23:01:45,744][0m Trial 14 finished with value: 0.9554574528636051 and parameters: {'learning_rate': 0.0700641994899063, 'num_leaves': 142, 'max_depth': 3, 'min_child_samples': 53, 'subsample': 0.7183307343893043, 'colsample_bytree': 0.9654216231076316, 'reg_alpha': 3.8762354646457196, 'reg_lambda': 6.397859032996144}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[602]	valid_0's auc: 0.955857
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[565]	valid_0's auc: 0.954689
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[695]	valid_0's auc: 0.955507
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[889]	valid_0's auc: 0.955196
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[592]	valid_0's auc: 0.955967


[32m[I 2026-02-01 23:02:11,064][0m Trial 15 finished with value: 0.9554431615936576 and parameters: {'learning_rate': 0.13165628060284482, 'num_leaves': 16, 'max_depth': 3, 'min_child_samples': 60, 'subsample': 0.6096386159908945, 'colsample_bytree': 0.9380883990152188, 'reg_alpha': 4.202860157165032, 'reg_lambda': 3.004899838450955}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[494]	valid_0's auc: 0.95564
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[479]	valid_0's auc: 0.954606
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[398]	valid_0's auc: 0.955395
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[534]	valid_0's auc: 0.954994
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[508]	valid_0's auc: 0.955788


[32m[I 2026-02-01 23:02:34,270][0m Trial 16 finished with value: 0.9552846459701729 and parameters: {'learning_rate': 0.07845321820462771, 'num_leaves': 143, 'max_depth': 5, 'min_child_samples': 100, 'subsample': 0.5750474110110568, 'colsample_bytree': 0.9970867065806717, 'reg_alpha': 2.3928360239083517, 'reg_lambda': 6.465583023609714}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[448]	valid_0's auc: 0.955136
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[416]	valid_0's auc: 0.954273
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[418]	valid_0's auc: 0.955033
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[464]	valid_0's auc: 0.954547
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[457]	valid_0's auc: 0.95536


[32m[I 2026-02-01 23:03:19,470][0m Trial 17 finished with value: 0.9548697125272746 and parameters: {'learning_rate': 0.030761392710064752, 'num_leaves': 256, 'max_depth': 12, 'min_child_samples': 71, 'subsample': 0.7370192751682683, 'colsample_bytree': 0.8016407193087124, 'reg_alpha': 4.913408578475066, 'reg_lambda': 2.177267607652761}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[388]	valid_0's auc: 0.95566
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[320]	valid_0's auc: 0.954632
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[337]	valid_0's auc: 0.955408
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[385]	valid_0's auc: 0.955082
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[393]	valid_0's auc: 0.955876


[32m[I 2026-02-01 23:03:37,011][0m Trial 18 finished with value: 0.9553316882663893 and parameters: {'learning_rate': 0.1505190886607639, 'num_leaves': 119, 'max_depth': 4, 'min_child_samples': 87, 'subsample': 0.6929969004130206, 'colsample_bytree': 0.9172314030325278, 'reg_alpha': 2.7724723316581326, 'reg_lambda': 5.4831424673760605}. Best is trial 13 with value: 0.9554825560138003.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[606]	valid_0's auc: 0.95565
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[494]	valid_0's auc: 0.954595
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[607]	valid_0's auc: 0.955443
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[603]	valid_0's auc: 0.955007
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[760]	valid_0's auc: 0.955831


[32m[I 2026-02-01 23:04:07,379][0m Trial 19 finished with value: 0.9553051934755136 and parameters: {'learning_rate': 0.06020512210485548, 'num_leaves': 199, 'max_depth': 5, 'min_child_samples': 52, 'subsample': 0.5765625096703267, 'colsample_bytree': 0.8561126512163446, 'reg_alpha': 3.3410985903729458, 'reg_lambda': 3.020046076312271}. Best is trial 13 with value: 0.9554825560138003.[0m


In [114]:
train_data = lgb.Dataset(train, label=y)
model = lgb.train(
            study.best_params,
            train_data,
            num_boost_round=1000,    
            callbacks=[
                lgb.log_evaluation(period=0)  
            ]
        )

# Check distribution

In [6]:
import CheckData
check = CheckData.ShiftData(train, test)

In [62]:
check.check_num_shift(num_fea)

Unnamed: 0,feature,wasserstein_dist,ks_pvalue
4,ST depression,0.003037,0.16406
1,BP,0.067123,0.256862
0,Age,0.029071,0.339312
3,Max HR,0.059884,0.618227
2,Cholesterol,0.069201,0.854634


In [8]:
check.check_cate_shift(cate_fea)

Unnamed: 0,feature,chi2_pvalue,kl_divergence,js_divergence,n_unique_train,n_unique_test
6,FBS over 120,0.920982,2.829684e-08,7.075078e-09,2,2
3,Thallium,0.904966,5.292115e-07,1.322044e-07,3,3
4,Exercise angina,0.365955,2.172984e-06,5.434371e-07,2,2
2,EKG results,0.313603,6.123446e-06,1.532237e-06,3,3
5,Sex,0.12868,6.134793e-06,1.532842e-06,2,2
7,Chest pain type,0.289258,9.938104e-06,2.483804e-06,4,4
0,Number of vessels fluro,0.199467,1.227232e-05,3.070785e-06,4,4
1,Slope of ST,0.020893,2.04443e-05,5.113684e-06,3,3


# Fit

In [115]:
test['ST_dpe_is_zero'] = (test['ST depression'] == 0).astype(int)
test['ST depression'] = np.log(test['ST depression']+1)
test['cholesterol_high'] = (test['Cholesterol'] > test['Cholesterol'].quantile(.9)).astype(int)
test['hard1'] = test['Age'] * test['FBS over 120']
test['hard2'] = test['Cholesterol'] * test['FBS over 120']

In [116]:
preds = model.predict(test)

In [117]:
test['Heart Disease'] = (preds >0.5).astype(int)

In [118]:
test = test.reset_index()

In [None]:
test[['id','Heart Disease']].set_index('id').to_csv(r'\playground-series-s6e2\submission.csv')