### Import

In [5]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)

Numpy version: 1.26.4
Scikit-learn version: 1.5.1
Pandas version: 1.5.3
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


In [6]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [7]:
import sys
import os
import pandas as pd

# 현재 작업 디렉토리 경로를 가져와 shared codes 폴더의 위치를 sys.path에 추가합니다.
# sys.path에 추가된 경로에 있는 py 폴더는 임포트할 수 있다.
current_dir = os.getcwd()
shared_codes_dir = os.path.join(current_dir, '../shared codes')
sys.path.append(shared_codes_dir)


# cover_nan 모듈을 임포트
from cover_nan import missing_value_removal_function

# 원본 train 데이터 로드
train = pd.read_csv("../shared codes/data/train.csv")
test = pd.read_csv("../shared codes/data/test.csv")

# missing_value_removal_function 사용
train = missing_value_removal_function(train)
test = missing_value_removal_function(test)

✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!
✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!


### Data Pre-processing

In [8]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

In [9]:
# 각 열의 결측값 개수 확인
missing_values_count = X.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)

결측값이 있는 열과 개수:
Series([], dtype: int64)


In [10]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('알 수 없음').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ '알 수 없음' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ '알 수 없음' 값을 포함하는 컬럼들:
['시술 당시 나이', '배란 유도 유형', '난자 기증자 나이', '정자 기증자 나이']


In [11]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('nan').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ 'nan' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ 'nan' 값을 포함하는 컬럼들:
[]


In [12]:
# NaN 값이 존재하는 컬럼 찾기
unknown_columns = X.columns[X.isna().any()].tolist()

unknown_columns

[]

In [13]:
# Categorical(범주형) 칼럼 찾기
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 결과 출력
print("📌 Categorical(범주형) 칼럼 리스트:")
print(categorical_columns)


📌 Categorical(범주형) 칼럼 리스트:
['ID', '시술 시기 코드', '시술 당시 나이', '시술 유형', '특정 시술 유형', '배란 유도 유형', '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이']


In [14]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [15]:
columns_to_drop = [
        "남성 주 불임 원인",
        "남성 부 불임 원인",
        "불임 원인 - 정자 농도",
        "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태",
        '정자 기증자 나이',
        '배란 유도 유형'
]
X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)  

In [16]:
# 각 열의 결측값 개수 확인
missing_values_count = X_train_encoded.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)


결측값이 있는 열과 개수:
Series([], dtype: int64)


In [17]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256351 entries, 0 to 256350
Data columns (total 53 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                256351 non-null  float64
 1   시술 시기 코드          256351 non-null  float64
 2   시술 당시 나이          256351 non-null  float64
 3   시술 유형             256351 non-null  float64
 4   특정 시술 유형          256351 non-null  float64
 5   배란 자극 여부          256351 non-null  int64  
 6   단일 배아 이식 여부       256351 non-null  float64
 7   여성 주 불임 원인        256351 non-null  int64  
 8   여성 부 불임 원인        256351 non-null  int64  
 9   부부 주 불임 원인        256351 non-null  int64  
 10  부부 부 불임 원인        256351 non-null  int64  
 11  불명확 불임 원인         256351 non-null  int64  
 12  불임 원인 - 난관 질환     256351 non-null  int64  
 13  불임 원인 - 남성 요인     256351 non-null  int64  
 14  불임 원인 - 배란 장애     256351 non-null  int64  
 15  불임 원인 - 여성 요인     256351 non-null  int64  
 16  불임 원인 - 자궁경부 문제   25

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# 데이터 정규화 (X_train_encoded & X_test_encoded)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

# DataFrame 변환 (Feature 이름 유지)
feature_names = [f"Feature_{i}" for i in range(X_train_scaled.shape[1])]
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# 상관 행렬 계산
correlation_matrix_train = X_train_scaled_df.corr()

# 다중 공선성이 높은 칼럼 찾기 (절대 상관 계수가 0.8 이상)
threshold = 0.8
high_corr_features = set()

for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            high_corr_features.add(feature_names[j])  # 공선성이 높은 컬럼 추가

# 다중 공선성이 높은 컬럼 제거
X_train_encoded = X_train_scaled_df.drop(columns=high_corr_features, errors='ignore')
X_test_encoded = X_test_scaled_df.drop(columns=high_corr_features, errors='ignore')


In [19]:
X_train_encoded

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_10,...,Feature_39,Feature_42,Feature_43,Feature_44,Feature_46,Feature_48,Feature_49,Feature_50,Feature_51,Feature_52
0,-1.732044,1.518507,-0.935575,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
1,-1.732031,1.020975,2.655919,0.158613,-0.962962,-1.836376,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
2,-1.732017,0.025910,-0.935575,0.158613,0.672585,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.773571
3,-1.732004,-0.471622,-0.217277,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
4,-1.731990,0.025910,-0.935575,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256346,1.731990,1.020975,-0.935575,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,1.142081
256347,1.732004,1.020975,0.501022,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
256348,1.732017,0.025910,-0.217277,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-0.135020
256349,1.732031,1.518507,0.501022,0.158613,-0.962962,0.544551,-0.543057,-0.178038,-0.112199,-0.094036,...,-0.083423,-0.220261,0.256018,0.344047,-0.430784,-0.098393,-0.0641,0.0,-0.042922,-1.412122


### Train

In [20]:
from imblearn.over_sampling import SMOTE

# SMOTE 적용
smote = SMOTE(random_state=42)
X_train_encoded, y = smote.fit_resample(X_train_encoded, y)

In [21]:
model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

**앙상블 및 Optuna**

In [22]:
import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [23]:
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [24]:
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0)
    }
    model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)

In [25]:
def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)


In [26]:
# Optuna 스터디 실행
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(optimize_xgboost, n_trials=30)
best_xgb_params = xgb_study.best_params

print("Optimizing LightGBM...")
lgbm_study = optuna.create_study(direction="maximize")
lgbm_study.optimize(optimize_lightgbm, n_trials=30)
best_lgbm_params = lgbm_study.best_params

print("Optimizing RandomForest...")
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(optimize_random_forest, n_trials=30)
best_rf_params = rf_study.best_params

[I 2025-02-13 21:09:03,652] A new study created in memory with name: no-name-859623c9-fda8-45e9-83fa-2ae6e9a5f554


Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 21:09:16,169] Trial 0 finished with value: 0.3492611886153188 and parameters: {'n_estimators': 377, 'max_depth': 3, 'learning_rate': 0.03988666579457875, 'subsample': 0.6445002323810115, 'colsample_bytree': 0.8146878943913239, 'gamma': 2.082974422660176}. Best is trial 0 with value: 0.3492611886153188.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 21:09:33,833] Trial 1 finished with value: 0.4929809977486325 and parameters: {'n_estimators': 405, 'max_depth': 11, 'learning_rate': 0.12866730454719094, 'su

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: p

[I 2025-02-13 21:18:57,304] Trial 0 finished with value: 0.4757748720286905 and parameters: {'n_estimators': 277, 'max_depth': 12, 'learning_rate': 0.16606825698750302, 'num_leaves': 48, 'subsample': 0.8735894117755366}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:06,711] Trial 1 finished with value: 0.42022984028737387 and parameters: {'n_estimators': 330, 'max_depth': 12, 'learning_rate': 0.02572567090929532, 'num_leaves': 19, 'subsample': 0.6343015917187423}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:14,002] Trial 2 finished with value: 0.46935352848035794 and parameters: {'n_estimators': 404, 'max_depth': 3, 'learning_rate': 0.20982006629643193, 'num_leaves': 31, 'subsample': 0.7103002908111126}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:21,069] Trial 3 finished with value: 0.4382826088206736 and parameters: {'n_estimators': 285, 'max_depth': 9, 'learning_rate': 0.10200539461766416, 'num_leaves': 15, 'subsample': 0.610253260756964}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:26,458] Trial 4 finished with value: 0.4474585386558765 and parameters: {'n_estimators': 221, 'max_depth': 4, 'learning_rate': 0.1907875029835253, 'num_leaves': 45, 'subsample': 0.9617691879837919}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:37,616] Trial 5 finished with value: 0.4570567977913932 and parameters: {'n_estimators': 484, 'max_depth': 12, 'learning_rate': 0.09548233178291377, 'num_leaves': 27, 'subsample': 0.9745911850784378}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:42,081] Trial 6 finished with value: 0.4564730301955239 and parameters: {'n_estimators': 147, 'max_depth': 7, 'learning_rate': 0.22565551553437785, 'num_leaves': 13, 'subsample': 0.8286938258264713}. Best is trial 0 with value: 0.4757748720286905.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:49,989] Trial 7 finished with value: 0.5094442501578568 and parameters: {'n_estimators': 293, 'max_depth': 9, 'learning_rate': 0.20816480937918913, 'num_leaves': 39, 'subsample': 0.7709303940966716}. Best is trial 7 with value: 0.5094442501578568.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:19:55,030] Trial 8 finished with value: 0.40044895037101824 and parameters: {'n_estimators': 223, 'max_depth': 3, 'learning_rate': 0.14184239993196257, 'num_leaves': 13, 'subsample': 0.8400881417873665}. Best is trial 7 with value: 0.5094442501578568.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:03,180] Trial 9 finished with value: 0.5503081096977586 and parameters: {'n_estimators': 388, 'max_depth': 5, 'learning_rate': 0.2795107404684135, 'num_leaves': 18, 'subsample': 0.6749655491221866}. Best is trial 9 with value: 0.5503081096977586.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:12,032] Trial 10 finished with value: 0.5378427554214673 and parameters: {'n_estimators': 409, 'max_depth': 6, 'learning_rate': 0.28330700442037754, 'num_leaves': 24, 'subsample': 0.7117180474264347}. Best is trial 9 with value: 0.5503081096977586.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:20,953] Trial 11 finished with value: 0.5366024610288216 and parameters: {'n_estimators': 412, 'max_depth': 6, 'learning_rate': 0.291337244130512, 'num_leaves': 23, 'subsample': 0.700200209914692}. Best is trial 9 with value: 0.5503081096977586.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:30,006] Trial 12 finished with value: 0.5575635073357658 and parameters: {'n_estimators': 395, 'max_depth': 5, 'learning_rate': 0.29199664361205524, 'num_leaves': 32, 'subsample': 0.7007644402587073}. Best is trial 12 with value: 0.5575635073357658.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:41,057] Trial 13 finished with value: 0.5629568145774124 and parameters: {'n_estimators': 498, 'max_depth': 5, 'learning_rate': 0.25482842106367054, 'num_leaves': 36, 'subsample': 0.6721956635678511}. Best is trial 13 with value: 0.5629568145774124.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:20:51,971] Trial 14 finished with value: 0.5630975712216542 and parameters: {'n_estimators': 496, 'max_depth': 5, 'learning_rate': 0.25217865837911674, 'num_leaves': 36, 'subsample': 0.7711289070138857}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:03,337] Trial 15 finished with value: 0.479326389732336 and parameters: {'n_estimators': 492, 'max_depth': 9, 'learning_rate': 0.2432617503917261, 'num_leaves': 38, 'subsample': 0.7720419783247867}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:14,009] Trial 16 finished with value: 0.5450500327505082 and parameters: {'n_estimators': 459, 'max_depth': 7, 'learning_rate': 0.2563170735511683, 'num_leaves': 37, 'subsample': 0.7618560557542822}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009951 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:24,120] Trial 17 finished with value: 0.5171658895473775 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.16086117449923826, 'num_leaves': 43, 'subsample': 0.8928503914309037}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:31,351] Trial 18 finished with value: 0.504179805229769 and parameters: {'n_estimators': 344, 'max_depth': 4, 'learning_rate': 0.24540256348538206, 'num_leaves': 34, 'subsample': 0.6554108839843066}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:43,857] Trial 19 finished with value: 0.5182845684035989 and parameters: {'n_estimators': 498, 'max_depth': 6, 'learning_rate': 0.13150034470831637, 'num_leaves': 42, 'subsample': 0.7463034480471714}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:21:56,313] Trial 20 finished with value: 0.4208269241565176 and parameters: {'n_estimators': 437, 'max_depth': 8, 'learning_rate': 0.03768291794121577, 'num_leaves': 28, 'subsample': 0.8105708173991628}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:22:04,982] Trial 21 finished with value: 0.5469534333195509 and parameters: {'n_estimators': 371, 'max_depth': 5, 'learning_rate': 0.29840775000569725, 'num_leaves': 34, 'subsample': 0.7312146529682104}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initscore=0.000007
[LightGBM] [Info] Start training from score 0.000007

[I 2025-02-13 21:22:14,265] Trial 22 finished with value: 0.5373752817543471 and parameters: {'n_estimators': 451, 'max_depth': 4, 'learning_rate': 0.2630885032696771, 'num_leaves': 32, 'subsample': 0.678339273848109}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:22:24,650] Trial 23 finished with value: 0.5619564583317734 and parameters: {'n_estimators': 466, 'max_depth': 5, 'learning_rate': 0.23277406408998202, 'num_leaves': 37, 'subsample': 0.6066646729160989}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:22:33,261] Trial 24 finished with value: 0.4924577813095564 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.18960556415392904, 'num_leaves': 50, 'subsample': 0.6124193577544526}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:22:44,527] Trial 25 finished with value: 0.5600979567584878 and parameters: {'n_estimators': 464, 'max_depth': 6, 'learning_rate': 0.2309396551275238, 'num_leaves': 40, 'subsample': 0.6432753212753469}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:22:54,390] Trial 26 finished with value: 0.5287225618827536 and parameters: {'n_estimators': 429, 'max_depth': 8, 'learning_rate': 0.2669579350778757, 'num_leaves': 36, 'subsample': 0.6652121312924953}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:23:01,633] Trial 27 finished with value: 0.498690337288165 and parameters: {'n_estimators': 347, 'max_depth': 4, 'learning_rate': 0.21892286899900523, 'num_leaves': 45, 'subsample': 0.6103702598720148}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:23:06,115] Trial 28 finished with value: 0.44154570450624525 and parameters: {'n_estimators': 117, 'max_depth': 7, 'learning_rate': 0.18822624499891993, 'num_leaves': 28, 'subsample': 0.7935876518360362}. Best is trial 14 with value: 0.5630975712216542.


[LightGBM] [Info] Number of positive: 152098, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6234
[LightGBM] [Info] Number of data points in the train set: 304196, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 152099, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6220
[LightGBM] [Info] Number of data points in the train set: 304197, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initsco

[I 2025-02-13 21:23:18,040] Trial 29 finished with value: 0.5106007491587918 and parameters: {'n_estimators': 470, 'max_depth': 10, 'learning_rate': 0.23933678835655472, 'num_leaves': 47, 'subsample': 0.884815780224057}. Best is trial 14 with value: 0.5630975712216542.
[I 2025-02-13 21:23:18,041] A new study created in memory with name: no-name-09c744f3-88b9-47e6-a314-265aaffaa5b7


Optimizing RandomForest...


[I 2025-02-13 21:26:50,702] Trial 0 finished with value: 0.4481783513101284 and parameters: {'n_estimators': 199, 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.4481783513101284.
[I 2025-02-13 21:29:27,553] Trial 1 finished with value: 0.47355139890319853 and parameters: {'n_estimators': 170, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.47355139890319853.
[I 2025-02-13 21:32:23,245] Trial 2 finished with value: 0.48509346931233166 and parameters: {'n_estimators': 256, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.48509346931233166.
[I 2025-02-13 21:34:09,347] Trial 3 finished with value: 0.48723846543379956 and parameters: {'n_estimators': 156, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.48723846543379956.
[I 2025-02-13 21:35:35,645] Trial 4 finished with value: 0.5162595258170976 and parameter

In [27]:
# 최적화된 모델 생성
xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

In [28]:
# XGBoost 모델의 파라미터 확인
print("XGBoost Best Parameters:")
print(xgb_model.get_params())

# LightGBM 모델의 파라미터 확인
print("\nLightGBM Best Parameters:")
print(lgbm_model.get_params())

# RandomForest 모델의 파라미터 확인
print("\nRandomForest Best Parameters:")
print(rf_model.get_params())

XGBoost Best Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.7388933894835854, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 0.14580601261269263, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.2955812427886471, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 9, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 491, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.6948425192973119, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder'

In [29]:
# Soft Voting 앙상블
ensemble_model = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgbm", lgbm_model),
        ("rf", rf_model)
    ],
    voting="soft"
)

In [30]:
# 전체 데이터로 학습
ensemble_model.fit(X_train_encoded, y)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 190123, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5862
[LightGBM] [Info] Number of data points in the train set: 380246, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


### Predict

In [31]:
pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [32]:
sample_submission = pd.read_csv('../../sample_submission.csv')
sample_submission['probability'] = pred_proba

In [33]:
sample_submission.to_csv('./Result/baseline_submit.csv', index=False)

In [34]:
pred_proba

array([0.11746949, 0.10448746, 0.18770913, ..., 0.32780166, 0.37119434,
       0.11895023])

In [35]:
unique, counts = np.unique(sample_submission['probability'], return_counts=True)
print(unique, counts)

[0.05594023 0.05675022 0.05694426 ... 0.86500426 0.87276575 0.87335867] [1 1 1 ... 1 1 1]
