### Import

In [1]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)



Numpy version: 1.26.4
Scikit-learn version: 1.5.1
Pandas version: 1.5.3
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [3]:
import sys
import os
import pandas as pd

# 현재 작업 디렉토리 경로를 가져와 shared codes 폴더의 위치를 sys.path에 추가합니다.
# sys.path에 추가된 경로에 있는 py 폴더는 임포트할 수 있다.
current_dir = os.getcwd()
shared_codes_dir = os.path.join(current_dir, '../shared codes')
sys.path.append(shared_codes_dir)


# cover_nan 모듈을 임포트
from cover_nan import missing_value_removal_function

# 원본 train 데이터 로드
train = pd.read_csv("../shared codes/data/train.csv")
test = pd.read_csv("../shared codes/data/test.csv")

# missing_value_removal_function 사용
train = missing_value_removal_function(train)
test = missing_value_removal_function(test)

✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부']
✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부']


In [4]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [5]:
# 각 열의 결측값 개수 확인
missing_values_count = X.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)

결측값이 있는 열과 개수:
난자 채취 경과일     57488
난자 해동 경과일    254915
난자 혼합 경과일     53735
배아 이식 경과일     43566
배아 해동 경과일    215982
dtype: int64


In [6]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('알 수 없음').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ '알 수 없음' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ '알 수 없음' 값을 포함하는 컬럼들:
['시술 당시 나이', '배란 유도 유형', '난자 기증자 나이', '정자 기증자 나이']


In [7]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('nan').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ 'nan' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ 'nan' 값을 포함하는 컬럼들:
['난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일']


In [8]:
# NaN 값이 존재하는 컬럼 찾기
unknown_columns = X.columns[X.isna().any()].tolist()

unknown_columns

['난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일']

In [9]:
# Categorical(범주형) 칼럼 찾기
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 결과 출력
print("📌 Categorical(범주형) 칼럼 리스트:")
print(categorical_columns)


📌 Categorical(범주형) 칼럼 리스트:
['ID', '시술 시기 코드', '시술 당시 나이', '시술 유형', '특정 시술 유형', '배란 유도 유형', '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이']


In [10]:
X['난자 채취 경과일'].fillna(1, inplace=True)
X['난자 해동 경과일'].fillna(0, inplace=True)
X['난자 혼합 경과일'].fillna(0, inplace=True)
X['배아 이식 경과일'].fillna(0, inplace=True)
X['배아 해동 경과일'].fillna(0, inplace=True)

test['난자 채취 경과일'].fillna(1, inplace=True)
test['난자 해동 경과일'].fillna(0, inplace=True)
test['난자 혼합 경과일'].fillna(0, inplace=True)
test['배아 이식 경과일'].fillna(0, inplace=True)
test['배아 해동 경과일'].fillna(0, inplace=True)

In [11]:
# 병합할 칼럼들
columns_to_merge = [
    '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일',
    '배아 이식 경과일', '배아 해동 경과일'
]

# 새로운 칼럼 생성: 각 경과일의 합
X['총 경과일'] = X[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X = X.drop(columns=columns_to_merge)

# 새로운 칼럼 생성: 각 경과일의 합
test['총 경과일'] = test[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
test = test.drop(columns=columns_to_merge)

In [12]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [13]:
columns_to_drop = [
        "남성 주 불임 원인",
        "남성 부 불임 원인",
        "불임 원인 - 정자 농도",
        "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태",
        '정자 기증자 나이',
        '배란 유도 유형'
]
X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)  

In [14]:
# 각 열의 결측값 개수 확인
missing_values_count = X_train_encoded.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)


결측값이 있는 열과 개수:
Series([], dtype: int64)


In [15]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256351 entries, 0 to 256350
Data columns (total 51 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                256351 non-null  float64
 1   시술 시기 코드          256351 non-null  float64
 2   시술 당시 나이          256351 non-null  float64
 3   시술 유형             256351 non-null  float64
 4   특정 시술 유형          256351 non-null  float64
 5   배란 자극 여부          256351 non-null  int64  
 6   단일 배아 이식 여부       256351 non-null  float64
 7   여성 주 불임 원인        256351 non-null  int64  
 8   여성 부 불임 원인        256351 non-null  int64  
 9   부부 주 불임 원인        256351 non-null  int64  
 10  부부 부 불임 원인        256351 non-null  int64  
 11  불명확 불임 원인         256351 non-null  int64  
 12  불임 원인 - 난관 질환     256351 non-null  int64  
 13  불임 원인 - 남성 요인     256351 non-null  int64  
 14  불임 원인 - 배란 장애     256351 non-null  int64  
 15  불임 원인 - 여성 요인     256351 non-null  int64  
 16  불임 원인 - 자궁경부 문제   25

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# 데이터 정규화 (X_train_encoded & X_test_encoded)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

# DataFrame 변환 (Feature 이름 유지)
feature_names = [f"Feature_{i}" for i in range(X_train_scaled.shape[1])]
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# 상관 행렬 계산
correlation_matrix_train = X_train_scaled_df.corr()

# 다중 공선성이 높은 칼럼 찾기 (절대 상관 계수가 0.8 이상)
threshold = 0.8
high_corr_features = set()

for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            high_corr_features.add(feature_names[j])  # 공선성이 높은 컬럼 추가

# 다중 공선성이 높은 컬럼 제거
X_train_encoded = X_train_scaled_df.drop(columns=high_corr_features, errors='ignore')
X_test_encoded = X_test_scaled_df.drop(columns=high_corr_features, errors='ignore')


In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# 데이터 정규화
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train_encoded)
scaled_test_x = scaler.transform(X_test_encoded)

# PCA 적용
pca = PCA(n_components=0.95)  # 설명 분산의 93%를 유지하도록 설정
X_train_encoded = pca.fit_transform(scaled_features)
X_test_encoded = pca.transform(scaled_test_x)

# PCA 적용 후 데이터셋의 형태 확인
pca_features_shape = X_train_encoded.shape

# 설명된 분산 비율
explained_variance = pca.explained_variance_ratio_

pca_features_shape, explained_variance

((256351, 29),
 array([0.13912339, 0.07263677, 0.06484474, 0.05356398, 0.05056996,
        0.04451159, 0.04211973, 0.03818675, 0.03517667, 0.03034786,
        0.03028364, 0.02715763, 0.02629677, 0.02587397, 0.02563106,
        0.02556748, 0.02513928, 0.02432389, 0.02389067, 0.02073711,
        0.01993778, 0.01885015, 0.01762415, 0.01356929, 0.01322534,
        0.01149947, 0.01111797, 0.00967765, 0.0090568 ]))

In [18]:
X_train_encoded

array([[ 1.89229537, -0.20389939,  1.265592  , ..., -0.50557297,
        -0.33434394,  0.68355468],
       [-1.40557028, -0.78639786, -0.7818652 , ...,  0.67707958,
         0.29338869, -0.35475044],
       [ 0.2819716 , -0.60328262, -0.22471846, ..., -0.24739822,
         0.23488192, -0.15893267],
       ...,
       [ 1.45566366, -0.05327141,  1.70952632, ...,  0.17211282,
        -0.4730378 ,  0.65839254],
       [-0.07398048, -0.44900873,  1.45441123, ...,  0.02243967,
        -0.55430079,  0.61867901],
       [-3.41431049, -2.10046582,  0.10252429, ...,  0.1057419 ,
        -0.41453668,  0.03227329]])

### Train

In [19]:
model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

**앙상블 및 Optuna**

In [20]:
import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [21]:
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [22]:
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0)
    }
    model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)

In [23]:
def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)


In [24]:
# Optuna 스터디 실행
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(optimize_xgboost, n_trials=30)
best_xgb_params = xgb_study.best_params

print("Optimizing LightGBM...")
lgbm_study = optuna.create_study(direction="maximize")
lgbm_study.optimize(optimize_lightgbm, n_trials=30)
best_lgbm_params = lgbm_study.best_params

print("Optimizing RandomForest...")
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(optimize_random_forest, n_trials=30)
best_rf_params = rf_study.best_params

[I 2025-02-13 23:58:06,682] A new study created in memory with name: no-name-537105c1-7deb-4f92-ae80-e17214bc55e0


Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 23:58:26,468] Trial 0 finished with value: 0.727129318269065 and parameters: {'n_estimators': 260, 'max_depth': 12, 'learning_rate': 0.03717428311625397, 'subsample': 0.8067697393999076, 'colsample_bytree': 0.6382772712264706, 'gamma': 3.9801166597234854}. Best is trial 0 with value: 0.727129318269065.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 23:58:33,443] Trial 1 finished with value: 0.7270145661574466 and parameters: {'n_estimators': 354, 'max_depth': 4, 'learning_rate': 0.24257626505010557, 'sub

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from s

[I 2025-02-14 00:05:18,029] Trial 0 finished with value: 0.7077164867535595 and parameters: {'n_estimators': 431, 'max_depth': 6, 'learning_rate': 0.22853021685814584, 'num_leaves': 44, 'subsample': 0.97820287178573}. Best is trial 0 with value: 0.7077164867535595.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:05:26,585] Trial 1 finished with value: 0.719997626757217 and parameters: {'n_estimators': 417, 'max_depth': 12, 'learning_rate': 0.1437095385363812, 'num_leaves': 41, 'subsample': 0.9479599248865584}. Best is trial 1 with value: 0.719997626757217.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:05:35,193] Trial 2 finished with value: 0.727011292723273 and parameters: {'n_estimators': 361, 'max_depth': 11, 'learning_rate': 0.0799583489376126, 'num_leaves': 50, 'subsample': 0.8170418600542038}. Best is trial 2 with value: 0.727011292723273.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:05:43,508] Trial 3 finished with value: 0.7138130263197567 and parameters: {'n_estimators': 448, 'max_depth': 10, 'learning_rate': 0.2025473913326492, 'num_leaves': 33, 'subsample': 0.843895097463163}. Best is trial 2 with value: 0.727011292723273.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:05:50,679] Trial 4 finished with value: 0.7176679537168721 and parameters: {'n_estimators': 368, 'max_depth': 11, 'learning_rate': 0.187542117051921, 'num_leaves': 32, 'subsample': 0.6539422961753188}. Best is trial 2 with value: 0.727011292723273.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:05:58,555] Trial 5 finished with value: 0.7312106884200749 and parameters: {'n_estimators': 275, 'max_depth': 11, 'learning_rate': 0.03828951785133659, 'num_leaves': 33, 'subsample': 0.674831164438163}. Best is trial 5 with value: 0.7312106884200749.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:04,530] Trial 6 finished with value: 0.7273043374994322 and parameters: {'n_estimators': 361, 'max_depth': 3, 'learning_rate': 0.26721694205189167, 'num_leaves': 33, 'subsample': 0.7677867522705875}. Best is trial 5 with value: 0.7312106884200749.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:12,809] Trial 7 finished with value: 0.719471681378843 and parameters: {'n_estimators': 386, 'max_depth': 5, 'learning_rate': 0.18445166687317577, 'num_leaves': 38, 'subsample': 0.6834236347292366}. Best is trial 5 with value: 0.7312106884200749.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:17,119] Trial 8 finished with value: 0.7304873825992152 and parameters: {'n_estimators': 135, 'max_depth': 10, 'learning_rate': 0.09084769273730474, 'num_leaves': 38, 'subsample': 0.9532671233981335}. Best is trial 5 with value: 0.7312106884200749.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:23,609] Trial 9 finished with value: 0.7312666521313671 and parameters: {'n_estimators': 182, 'max_depth': 7, 'learning_rate': 0.03162243934964985, 'num_leaves': 43, 'subsample': 0.848671181273871}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009666 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:28,544] Trial 10 finished with value: 0.7291806368585818 and parameters: {'n_estimators': 158, 'max_depth': 8, 'learning_rate': 0.0369464317051323, 'num_leaves': 17, 'subsample': 0.8950892899328706}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:35,973] Trial 11 finished with value: 0.7251875835254344 and parameters: {'n_estimators': 236, 'max_depth': 8, 'learning_rate': 0.011394268629785653, 'num_leaves': 20, 'subsample': 0.7457178767624703}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:41,907] Trial 12 finished with value: 0.7309416099905188 and parameters: {'n_estimators': 249, 'max_depth': 6, 'learning_rate': 0.06982800049349427, 'num_leaves': 24, 'subsample': 0.6176896241303315}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:46,749] Trial 13 finished with value: 0.7290484877620511 and parameters: {'n_estimators': 206, 'max_depth': 8, 'learning_rate': 0.11693875081001329, 'num_leaves': 26, 'subsample': 0.7068872953873092}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:53,445] Trial 14 finished with value: 0.7295423548925458 and parameters: {'n_estimators': 293, 'max_depth': 9, 'learning_rate': 0.0334767762509762, 'num_leaves': 10, 'subsample': 0.8736006454148215}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:06:57,643] Trial 15 finished with value: 0.7304068615713261 and parameters: {'n_estimators': 108, 'max_depth': 6, 'learning_rate': 0.0461595155094991, 'num_leaves': 50, 'subsample': 0.7503033896827006}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:03,630] Trial 16 finished with value: 0.7295377593172695 and parameters: {'n_estimators': 301, 'max_depth': 4, 'learning_rate': 0.12086434770312149, 'num_leaves': 44, 'subsample': 0.8018986130444324}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:10,361] Trial 17 finished with value: 0.7255565886653075 and parameters: {'n_estimators': 191, 'max_depth': 7, 'learning_rate': 0.011426822706443605, 'num_leaves': 27, 'subsample': 0.9003791812339571}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:21,172] Trial 18 finished with value: 0.7288253544814042 and parameters: {'n_estimators': 500, 'max_depth': 12, 'learning_rate': 0.059615995798624005, 'num_leaves': 38, 'subsample': 0.6320001088422211}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:27,706] Trial 19 finished with value: 0.7056256226351099 and parameters: {'n_estimators': 305, 'max_depth': 9, 'learning_rate': 0.2941249481946557, 'num_leaves': 45, 'subsample': 0.6901247268624374}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:33,660] Trial 20 finished with value: 0.7291669360846756 and parameters: {'n_estimators': 256, 'max_depth': 7, 'learning_rate': 0.09827554246171333, 'num_leaves': 30, 'subsample': 0.8479310928246511}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:39,522] Trial 21 finished with value: 0.7307920105334258 and parameters: {'n_estimators': 251, 'max_depth': 6, 'learning_rate': 0.07497506880558147, 'num_leaves': 23, 'subsample': 0.6101709913648592}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:44,487] Trial 22 finished with value: 0.7307971760063054 and parameters: {'n_estimators': 197, 'max_depth': 5, 'learning_rate': 0.05742877682232535, 'num_leaves': 18, 'subsample': 0.6006714572848897}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:51,312] Trial 23 finished with value: 0.7291211779197899 and parameters: {'n_estimators': 281, 'max_depth': 5, 'learning_rate': 0.02604940799143511, 'num_leaves': 13, 'subsample': 0.6449269330324628}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:07:58,552] Trial 24 finished with value: 0.7305926123388762 and parameters: {'n_estimators': 323, 'max_depth': 7, 'learning_rate': 0.06577336832902644, 'num_leaves': 27, 'subsample': 0.7052480459581926}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:08:03,541] Trial 25 finished with value: 0.7301594537033613 and parameters: {'n_estimators': 222, 'max_depth': 9, 'learning_rate': 0.11447587500420366, 'num_leaves': 23, 'subsample': 0.7660529673377245}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:08:07,617] Trial 26 finished with value: 0.7303716933479513 and parameters: {'n_estimators': 176, 'max_depth': 4, 'learning_rate': 0.13607490777370876, 'num_leaves': 36, 'subsample': 0.6559727113787615}. Best is trial 9 with value: 0.7312666521313671.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:08:15,751] Trial 27 finished with value: 0.7313966269993596 and parameters: {'n_estimators': 331, 'max_depth': 6, 'learning_rate': 0.045645106034219264, 'num_leaves': 29, 'subsample': 0.7359255822427301}. Best is trial 27 with value: 0.7313966269993596.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:08:24,087] Trial 28 finished with value: 0.7312349811546665 and parameters: {'n_estimators': 328, 'max_depth': 7, 'learning_rate': 0.045453527810958495, 'num_leaves': 35, 'subsample': 0.7147265407993119}. Best is trial 27 with value: 0.7313966269993596.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258352 -> initscore=-1.054554
[LightGBM] [Info] Start training from score -1.054554
[LightGB

[I 2025-02-14 00:08:31,997] Trial 29 finished with value: 0.7266122668418433 and parameters: {'n_estimators': 326, 'max_depth': 7, 'learning_rate': 0.09234766643780723, 'num_leaves': 45, 'subsample': 0.7279543813309295}. Best is trial 27 with value: 0.7313966269993596.
[I 2025-02-14 00:08:31,998] A new study created in memory with name: no-name-e19dc09a-30f3-484f-9568-6596d9bcbb30


Optimizing RandomForest...


[I 2025-02-14 00:13:32,182] Trial 0 finished with value: 0.7288440372072362 and parameters: {'n_estimators': 71, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7288440372072362.
[I 2025-02-14 00:16:09,824] Trial 1 finished with value: 0.702618565910244 and parameters: {'n_estimators': 120, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7288440372072362.
[I 2025-02-14 00:19:26,910] Trial 2 finished with value: 0.7259988270644556 and parameters: {'n_estimators': 64, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7288440372072362.
[I 2025-02-14 00:27:31,856] Trial 3 finished with value: 0.7290222014048643 and parameters: {'n_estimators': 132, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.7290222014048643.
[I 2025-02-14 00:32:07,456] Trial 4 finished with value: 0.7285773437221006 and parameters: {'

In [25]:
# 최적화된 모델 생성
xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

In [26]:
# XGBoost 모델의 파라미터 확인
print("XGBoost Best Parameters:")
print(xgb_model.get_params())

# LightGBM 모델의 파라미터 확인
print("\nLightGBM Best Parameters:")
print(lgbm_model.get_params())

# RandomForest 모델의 파라미터 확인
print("\nRandomForest Best Parameters:")
print(rf_model.get_params())

XGBoost Best Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.7361110605303595, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 1.8240562673488567, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.0187215871394234, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 6, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 434, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.652427898291898, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': 

In [27]:
# Soft Voting 앙상블
ensemble_model = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgbm", lgbm_model),
        ("rf", rf_model)
    ],
    voting="soft"
)

In [28]:
# 전체 데이터로 학습
ensemble_model.fit(X_train_encoded, y)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


### Predict

In [29]:
pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [30]:
sample_submission = pd.read_csv('../../sample_submission.csv')
sample_submission['probability'] = pred_proba

In [31]:
sample_submission.to_csv('./Result/baseline_submit.csv', index=False)

In [32]:
pred_proba

array([0.00800224, 0.05053782, 0.14635449, ..., 0.37234659, 0.35229498,
       0.01099245])

In [33]:
unique, counts = np.unique(sample_submission['probability'], return_counts=True)
print(unique, counts)

[3.34645691e-04 3.50046011e-04 3.59361925e-04 ... 6.51468245e-01
 6.54272923e-01 6.59107177e-01] [1 1 1 ... 1 1 1]
