### Import

In [1]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)



Numpy version: 1.26.4
Scikit-learn version: 1.5.1
Pandas version: 1.5.3
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [3]:
train = pd.read_csv('../../train.csv').drop(columns=['ID'])
test = pd.read_csv('../../test.csv').drop(columns=['ID'])

In [4]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [5]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [6]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)


In [7]:
# 결측값을 채울 칼럼 목록
columns_to_fill = [
    '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부',
    '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수',
    '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수',
    '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수',
    '대리모 여부'
]

# 시술 유형이 'DI'인 경우에만 결측값을 0으로 채우기
X.loc[X['시술 유형'] == 'DI', columns_to_fill] = X.loc[X['시술 유형'] == 'DI', columns_to_fill].fillna(0)

# 1. '난자 출처'의 결측값을 '본인 제공'으로 채우기
X['난자 출처'].replace('알 수 없음','본인 제공', inplace=True)

# 2. '난자 기증자 나이' 결측값을 시술 당시 나이로 채우기
X.loc[X['난자 기증자 나이'] == '알 수 없음', '난자 기증자 나이'] = X['시술 당시 나이']

# 3. '파트너 정자와 혼합된 난자 수'의 결측값을 0으로 채우기
X['파트너 정자와 혼합된 난자 수'].fillna(0, inplace=True)

# 4. '기증자 정자와 혼합된 난자 수'의 결측값을 1로 채우기
X['기증자 정자와 혼합된 난자 수'].fillna(1, inplace=True)

# 5. '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부'의 결측값을 0으로 채우기
for column in ['동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']:
    X[column].fillna(0, inplace=True)

    # 변경 내용 확인
print(X[['난자 출처', '난자 기증자 나이', '파트너 정자와 혼합된 난자 수', 
            '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']].head())


   난자 출처 난자 기증자 나이  파트너 정자와 혼합된 난자 수  기증자 정자와 혼합된 난자 수 동결 배아 사용 여부  \
0  본인 제공   만18-34세               5.0               0.0         0.0   
1  본인 제공   만45-50세               1.0               0.0         0.0   
2  본인 제공   만18-34세               7.0               0.0         0.0   
3  본인 제공   만35-37세               4.0               0.0         0.0   
4  본인 제공   만18-34세               6.0               0.0         0.0   

  신선 배아 사용 여부 기증 배아 사용 여부  
0         1.0         0.0  
1         1.0         0.0  
2         1.0         0.0  
3         1.0         0.0  
4         1.0         0.0  


In [8]:
# 시술 유형이 'DI'인 경우에만 결측값을 0으로 채우기
test.loc[test['시술 유형'] == 'DI', columns_to_fill] = test.loc[test['시술 유형'] == 'DI', columns_to_fill].fillna(0)

# 1. '난자 출처'의 결측값을 '본인 제공'으로 채우기
test['난자 출처'].replace('알 수 없음','본인 제공', inplace=True)

# 2. '난자 기증자 나이' 결측값을 시술 당시 나이로 채우기
test.loc[test['난자 기증자 나이'] == '알 수 없음', '난자 기증자 나이'] = test['시술 당시 나이']

# 3. '파트너 정자와 혼합된 난자 수'의 결측값을 0으로 채우기
test['파트너 정자와 혼합된 난자 수'].fillna(0, inplace=True)

# 4. '기증자 정자와 혼합된 난자 수'의 결측값을 1로 채우기
test['기증자 정자와 혼합된 난자 수'].fillna(1, inplace=True)

# 5. '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부'의 결측값을 0으로 채우기
for column in ['동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']:
    test[column].fillna(0, inplace=True)

In [9]:
# 각 열의 결측값 개수 확인
missing_values_count = X.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)

결측값이 있는 열과 개수:
임신 시도 또는 마지막 임신 경과 연수    246981
난자 채취 경과일                 57488
난자 해동 경과일                254915
난자 혼합 경과일                 53735
배아 이식 경과일                 43566
배아 해동 경과일                215982
dtype: int64


In [10]:

X['난자 채취 경과일'].fillna(1, inplace=True)
X['난자 해동 경과일'].fillna(0, inplace=True)
X['난자 혼합 경과일'].fillna(0, inplace=True)
X['배아 이식 경과일'].fillna(0, inplace=True)
X['배아 해동 경과일'].fillna(0, inplace=True)

test['난자 채취 경과일'].fillna(1, inplace=True)
test['난자 해동 경과일'].fillna(0, inplace=True)
test['난자 혼합 경과일'].fillna(0, inplace=True)
test['배아 이식 경과일'].fillna(0, inplace=True)
test['배아 해동 경과일'].fillna(0, inplace=True)

In [11]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('알 수 없음').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ '알 수 없음' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ '알 수 없음' 값을 포함하는 컬럼들:
['시술 당시 나이', '배란 유도 유형', '난자 기증자 나이', '정자 기증자 나이']


In [12]:
# 변환 함수 정의 (isinstance 없이 단순화)
def categorize_reason(value):
    if value == "배아 저장용":  # "배아 저장용" 단독일 경우 1
        return 1
    elif "현재 시술용" in value:  # "현재 시술용"이 포함된 경우 1
        return 1
    else:  # 그 외 모든 값은 0
        return 0

X["배아 생성 주요 이유"] = X["배아 생성 주요 이유"].replace("nan", "현재 시술용")
test["배아 생성 주요 이유"] = test["배아 생성 주요 이유"].replace("nan", "현재 시술용")

# "배아 생성 주요 이유" 칼럼 변환 적용
X["배아 생성 주요 이유"] = X["배아 생성 주요 이유"].apply(categorize_reason)
test["배아 생성 주요 이유"] = test["배아 생성 주요 이유"].apply(categorize_reason)



In [13]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [14]:
columns_to_drop = [
        "임신 시도 또는 마지막 임신 경과 연수",
        "PGD 시술 여부",
        "PGS 시술 여부",
        "남성 주 불임 원인",
        "남성 부 불임 원인",
        "불임 원인 - 정자 농도",
        "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태",
        '정자 기증자 나이',
        '배란 유도 유형'
]
X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)  

In [15]:
# 병합할 칼럼들
columns_to_merge = [
    '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일',
    '배아 이식 경과일', '배아 해동 경과일'
]

# 새로운 칼럼 생성: 각 경과일의 합
X_train_encoded['총 경과일'] = X_train_encoded[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X_train_encoded = X_train_encoded.drop(columns=columns_to_merge)

# 새로운 칼럼 생성: 각 경과일의 합
X_test_encoded['총 경과일'] = X_test_encoded[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X_test_encoded = X_test_encoded.drop(columns=columns_to_merge)

In [16]:
# 각 열의 결측값 개수 확인
missing_values_count = X_train_encoded.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)


결측값이 있는 열과 개수:
Series([], dtype: int64)


In [17]:
X['배아 생성 주요 이유'].value_counts()

1    253108
0      3243
Name: 배아 생성 주요 이유, dtype: int64

In [18]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256351 entries, 0 to 256350
Data columns (total 52 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   시술 시기 코드          256351 non-null  float64
 1   시술 당시 나이          256351 non-null  float64
 2   시술 유형             256351 non-null  float64
 3   특정 시술 유형          256351 non-null  float64
 4   배란 자극 여부          256351 non-null  float64
 5   단일 배아 이식 여부       256351 non-null  float64
 6   착상 전 유전 검사 사용 여부  256351 non-null  float64
 7   착상 전 유전 진단 사용 여부  256351 non-null  float64
 8   여성 주 불임 원인        256351 non-null  float64
 9   여성 부 불임 원인        256351 non-null  float64
 10  부부 주 불임 원인        256351 non-null  float64
 11  부부 부 불임 원인        256351 non-null  float64
 12  불명확 불임 원인         256351 non-null  float64
 13  불임 원인 - 난관 질환     256351 non-null  float64
 14  불임 원인 - 남성 요인     256351 non-null  float64
 15  불임 원인 - 배란 장애     256351 non-null  float64
 16  불임 원인 - 여성 요인     25

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



# 데이터 정규화
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train_encoded)
scaled_test_x = scaler.transform(X_test_encoded)

# PCA 적용
pca = PCA(n_components=0.95)  # 설명 분산의 93%를 유지하도록 설정
X_train_encoded = pca.fit_transform(scaled_features)
X_test_encoded = pca.transform(scaled_test_x)

# PCA 적용 후 데이터셋의 형태 확인
pca_features_shape = X_train_encoded.shape

# 설명된 분산 비율
explained_variance = pca.explained_variance_ratio_

pca_features_shape, explained_variance

((256351, 29),
 array([0.16753015, 0.11126319, 0.07962371, 0.05178603, 0.04250971,
        0.03946191, 0.03723161, 0.03422201, 0.03335946, 0.03292284,
        0.02821389, 0.02750599, 0.02500275, 0.02422428, 0.0222452 ,
        0.0206592 , 0.01960565, 0.01914876, 0.01862165, 0.01843826,
        0.01761127, 0.01645639, 0.01397385, 0.01081516, 0.01027633,
        0.00865718, 0.00772819, 0.0070885 , 0.00667548]))

### Train

In [20]:
model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

**앙상블 및 Optuna**

In [21]:
import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [22]:
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [23]:
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0)
    }
    model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)

In [24]:
def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)


In [25]:
# Optuna 스터디 실행
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(optimize_xgboost, n_trials=30)
best_xgb_params = xgb_study.best_params

print("Optimizing LightGBM...")
lgbm_study = optuna.create_study(direction="maximize")
lgbm_study.optimize(optimize_lightgbm, n_trials=30)
best_lgbm_params = lgbm_study.best_params

print("Optimizing RandomForest...")
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(optimize_random_forest, n_trials=30)
best_rf_params = rf_study.best_params

[I 2025-02-12 22:41:06,437] A new study created in memory with name: no-name-2b906558-605c-4e68-81ca-c07a6c554d63


Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-12 22:41:19,961] Trial 0 finished with value: 0.704841468490957 and parameters: {'n_estimators': 417, 'max_depth': 9, 'learning_rate': 0.2562445903388598, 'subsample': 0.604353931875624, 'colsample_bytree': 0.7216244735358961, 'gamma': 4.3196070715073915}. Best is trial 0 with value: 0.704841468490957.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-12 22:41:27,349] Trial 1 finished with value: 0.7241920091281189 and parameters: {'n_estimators': 245, 'max_depth': 4, 'learning_rate': 0.27710724407811904, 'subsam

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from s

[I 2025-02-12 22:49:49,994] Trial 0 finished with value: 0.7217835367899899 and parameters: {'n_estimators': 425, 'max_depth': 12, 'learning_rate': 0.140427165399226, 'num_leaves': 49, 'subsample': 0.601190463068341}. Best is trial 0 with value: 0.7217835367899899.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009902 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:49:54,866] Trial 1 finished with value: 0.7303921990423973 and parameters: {'n_estimators': 195, 'max_depth': 6, 'learning_rate': 0.0777612809655261, 'num_leaves': 22, 'subsample': 0.7591906424174907}. Best is trial 1 with value: 0.7303921990423973.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:04,825] Trial 2 finished with value: 0.7191261271731881 and parameters: {'n_estimators': 469, 'max_depth': 7, 'learning_rate': 0.1704979456355226, 'num_leaves': 47, 'subsample': 0.7042340354666694}. Best is trial 1 with value: 0.7303921990423973.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:09,774] Trial 3 finished with value: 0.7297784498579469 and parameters: {'n_estimators': 271, 'max_depth': 3, 'learning_rate': 0.1997198862968912, 'num_leaves': 39, 'subsample': 0.7938609967603337}. Best is trial 1 with value: 0.7303921990423973.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:14,741] Trial 4 finished with value: 0.7243645015073591 and parameters: {'n_estimators': 225, 'max_depth': 12, 'learning_rate': 0.2064451386639399, 'num_leaves': 31, 'subsample': 0.9051232654860932}. Best is trial 1 with value: 0.7303921990423973.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:21,804] Trial 5 finished with value: 0.7307207935471567 and parameters: {'n_estimators': 291, 'max_depth': 5, 'learning_rate': 0.05282341078677656, 'num_leaves': 45, 'subsample': 0.8019509732556491}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:29,152] Trial 6 finished with value: 0.7304840838817823 and parameters: {'n_estimators': 256, 'max_depth': 10, 'learning_rate': 0.04297558822524023, 'num_leaves': 32, 'subsample': 0.7792446007243656}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:37,191] Trial 7 finished with value: 0.7146125028740757 and parameters: {'n_estimators': 462, 'max_depth': 7, 'learning_rate': 0.2863500546090958, 'num_leaves': 25, 'subsample': 0.95806621764152}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:46,456] Trial 8 finished with value: 0.7070779772544542 and parameters: {'n_estimators': 460, 'max_depth': 7, 'learning_rate': 0.29356521241240646, 'num_leaves': 45, 'subsample': 0.6766454052475648}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:52,073] Trial 9 finished with value: 0.7275292784410328 and parameters: {'n_estimators': 335, 'max_depth': 9, 'learning_rate': 0.19058428173188233, 'num_leaves': 15, 'subsample': 0.6193583545594182}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:50:55,057] Trial 10 finished with value: 0.7119262138951268 and parameters: {'n_estimators': 102, 'max_depth': 4, 'learning_rate': 0.01964092541518015, 'num_leaves': 40, 'subsample': 0.8739594129688709}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:06,392] Trial 11 finished with value: 0.7274442536500739 and parameters: {'n_estimators': 358, 'max_depth': 10, 'learning_rate': 0.011500726321984914, 'num_leaves': 33, 'subsample': 0.8334498240267025}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:12,552] Trial 12 finished with value: 0.7306613676745002 and parameters: {'n_estimators': 274, 'max_depth': 5, 'learning_rate': 0.07940111083515092, 'num_leaves': 38, 'subsample': 0.751975691686112}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:17,014] Trial 13 finished with value: 0.7305702335878845 and parameters: {'n_estimators': 182, 'max_depth': 5, 'learning_rate': 0.0906752510391889, 'num_leaves': 40, 'subsample': 0.7193622075285488}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:24,528] Trial 14 finished with value: 0.7290182638517719 and parameters: {'n_estimators': 358, 'max_depth': 5, 'learning_rate': 0.11275267082493598, 'num_leaves': 37, 'subsample': 0.8720913287889382}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:29,593] Trial 15 finished with value: 0.7281353418576388 and parameters: {'n_estimators': 304, 'max_depth': 3, 'learning_rate': 0.0599319621814047, 'num_leaves': 45, 'subsample': 0.8302249717576256}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:33,147] Trial 16 finished with value: 0.7303938779964501 and parameters: {'n_estimators': 139, 'max_depth': 5, 'learning_rate': 0.11926922407146612, 'num_leaves': 25, 'subsample': 0.726925794835608}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:38,898] Trial 17 finished with value: 0.729825778400253 and parameters: {'n_estimators': 298, 'max_depth': 4, 'learning_rate': 0.055058322581551696, 'num_leaves': 50, 'subsample': 0.6591165703261194}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:47,202] Trial 18 finished with value: 0.7275009691818329 and parameters: {'n_estimators': 410, 'max_depth': 8, 'learning_rate': 0.10212905294566996, 'num_leaves': 37, 'subsample': 0.7581783058933731}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009744 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:52,821] Trial 19 finished with value: 0.7273364705062563 and parameters: {'n_estimators': 229, 'max_depth': 6, 'learning_rate': 0.14491616676602823, 'num_leaves': 43, 'subsample': 0.9995082434321667}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:51:59,740] Trial 20 finished with value: 0.724747433202687 and parameters: {'n_estimators': 403, 'max_depth': 4, 'learning_rate': 0.2539111985178641, 'num_leaves': 15, 'subsample': 0.8335884832346413}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:03,925] Trial 21 finished with value: 0.7303200990941213 and parameters: {'n_estimators': 165, 'max_depth': 5, 'learning_rate': 0.08219104773520625, 'num_leaves': 42, 'subsample': 0.7237038399599627}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:08,784] Trial 22 finished with value: 0.7304530274480429 and parameters: {'n_estimators': 194, 'max_depth': 6, 'learning_rate': 0.0908062313267, 'num_leaves': 35, 'subsample': 0.6750981806036861}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:15,327] Trial 23 finished with value: 0.7305057711332663 and parameters: {'n_estimators': 286, 'max_depth': 5, 'learning_rate': 0.04174952370858799, 'num_leaves': 40, 'subsample': 0.7381388875530959}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:20,074] Trial 24 finished with value: 0.7302465371425526 and parameters: {'n_estimators': 244, 'max_depth': 4, 'learning_rate': 0.12618078401203958, 'num_leaves': 43, 'subsample': 0.7851706175102341}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:27,090] Trial 25 finished with value: 0.7305788166026929 and parameters: {'n_estimators': 325, 'max_depth': 6, 'learning_rate': 0.0700011821095705, 'num_leaves': 28, 'subsample': 0.7019841286296284}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:33,959] Trial 26 finished with value: 0.7304248543433041 and parameters: {'n_estimators': 319, 'max_depth': 8, 'learning_rate': 0.06618833083589039, 'num_leaves': 28, 'subsample': 0.6488151932819672}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:42,029] Trial 27 finished with value: 0.7301604228303714 and parameters: {'n_estimators': 370, 'max_depth': 6, 'learning_rate': 0.03503915947941806, 'num_leaves': 21, 'subsample': 0.6981099574532429}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:47,548] Trial 28 finished with value: 0.7241128084171654 and parameters: {'n_estimators': 326, 'max_depth': 3, 'learning_rate': 0.029020367735533015, 'num_leaves': 27, 'subsample': 0.8216238406274845}. Best is trial 5 with value: 0.7307207935471567.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-12 22:52:52,055] Trial 29 finished with value: 0.7291693184402218 and parameters: {'n_estimators': 267, 'max_depth': 6, 'learning_rate': 0.16482184040983644, 'num_leaves': 11, 'subsample': 0.6315908948175457}. Best is trial 5 with value: 0.7307207935471567.
[I 2025-02-12 22:52:52,056] A new study created in memory with name: no-name-c7c7144f-9342-4ce6-9ea9-56fa05dfccd5


Optimizing RandomForest...


[I 2025-02-12 22:58:20,157] Trial 0 finished with value: 0.6938538892004525 and parameters: {'n_estimators': 253, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.6938538892004525.
[I 2025-02-12 23:06:28,046] Trial 1 finished with value: 0.719009416789052 and parameters: {'n_estimators': 178, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.719009416789052.
[I 2025-02-12 23:17:35,828] Trial 2 finished with value: 0.7223265769995271 and parameters: {'n_estimators': 217, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.7223265769995271.
[I 2025-02-12 23:31:54,187] Trial 3 finished with value: 0.7247606539678765 and parameters: {'n_estimators': 254, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.7247606539678765.
[I 2025-02-12 23:45:03,083] Trial 4 finished with value: 0.7191344332466432 and parameters: {'n_es

In [26]:
# 최적화된 모델 생성
xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

In [27]:
# XGBoost 모델의 파라미터 확인
print("XGBoost Best Parameters:")
print(xgb_model.get_params())

# LightGBM 모델의 파라미터 확인
print("\nLightGBM Best Parameters:")
print(lgbm_model.get_params())

# RandomForest 모델의 파라미터 확인
print("\nRandomForest Best Parameters:")
print(rf_model.get_params())

XGBoost Best Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.803187142177975, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 1.747988745807904, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.02995526858218256, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 7, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 417, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.703232557283515, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': F

In [28]:
# Soft Voting 앙상블
ensemble_model = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgbm", lgbm_model),
        ("rf", rf_model)
    ],
    voting="soft"
)

In [29]:
# 전체 데이터로 학습
ensemble_model.fit(X_train_encoded, y)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


### Predict

In [30]:
pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [31]:
sample_submission = pd.read_csv('../../sample_submission.csv')
sample_submission['probability'] = pred_proba

In [32]:
sample_submission.to_csv('./Result/baseline_submit.csv', index=False)

In [33]:
pred_proba

array([0.00678237, 0.01148497, 0.16466486, ..., 0.43074943, 0.31741161,
       0.02870452])

In [34]:
unique, counts = np.unique(sample_submission['probability'], return_counts=True)
print(unique, counts)

[3.01266463e-04 3.12348210e-04 3.18056581e-04 ... 7.16311330e-01
 7.23030638e-01 7.38325415e-01] [1 1 1 ... 1 1 1]
