### Import

In [34]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)

Numpy version: 1.26.4
Scikit-learn version: 1.4.2
Pandas version: 2.2.2
Matplotlib version: 3.8.4
Seaborn version: 0.13.2


In [35]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [36]:
import sys
import os
import pandas as pd

# 현재 작업 디렉토리 경로를 가져와 shared codes 폴더의 위치를 sys.path에 추가합니다.
# sys.path에 추가된 경로에 있는 py 폴더는 임포트할 수 있다.
current_dir = os.getcwd()
shared_codes_dir = os.path.join(current_dir, '../shared codes')
sys.path.append(shared_codes_dir)


# cover_nan 모듈을 임포트
from cover_nan import missing_value_removal_function

# 원본 train 데이터 로드
train = pd.read_csv("../shared codes/data/train.csv").drop(columns=['ID'])
test = pd.read_csv("../shared codes/data/test.csv").drop(columns=['ID'])

# missing_value_removal_function 사용
train = missing_value_removal_function(train)
test = missing_value_removal_function(test)

KeyboardInterrupt: 

In [None]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [None]:
columns_fill_zero = [
    # 'PGD 시술 여부', 'PGS 시술 여부',
    '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부'
]
X[columns_fill_zero] = X[columns_fill_zero].fillna(0)
test[columns_fill_zero] = test[columns_fill_zero].fillna(0)

In [None]:

X['난자 채취 경과일'].fillna(1, inplace=True)
X['난자 해동 경과일'].fillna(0, inplace=True)
X['난자 혼합 경과일'].fillna(0, inplace=True)
X['배아 이식 경과일'].fillna(0, inplace=True)
X['배아 해동 경과일'].fillna(0, inplace=True)

test['난자 채취 경과일'].fillna(1, inplace=True)
test['난자 해동 경과일'].fillna(0, inplace=True)
test['난자 혼합 경과일'].fillna(0, inplace=True)
test['배아 이식 경과일'].fillna(0, inplace=True)
test['배아 해동 경과일'].fillna(0, inplace=True)

# 병합할 칼럼들
columns_to_merge = [
    '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일',
    '배아 이식 경과일', '배아 해동 경과일'
]

# 새로운 칼럼 생성: 각 경과일의 합
X['총 경과일'] = X[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X = X.drop(columns=columns_to_merge)

# 새로운 칼럼 생성: 각 경과일의 합
test['총 경과일'] = test[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
test = test.drop(columns=columns_to_merge)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['난자 채취 경과일'].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['난자 해동 경과일'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves 

In [None]:
# 각 열의 결측값 개수 확인
missing_values_count = X.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)

결측값이 있는 열과 개수:
Series([], dtype: int64)


In [None]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('알 수 없음').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ '알 수 없음' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ '알 수 없음' 값을 포함하는 컬럼들:
['시술 당시 나이', '배란 유도 유형', '난자 기증자 나이', '정자 기증자 나이']


In [None]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('nan').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ 'nan' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ 'nan' 값을 포함하는 컬럼들:
[]


In [None]:
# NaN 값이 존재하는 컬럼 찾기
unknown_columns = X.columns[X.isna().any()].tolist()

unknown_columns

[]

In [None]:
# Categorical(범주형) 칼럼 찾기
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

# 결과 출력
print("📌 Categorical(범주형) 칼럼 리스트:")
print(categorical_columns)


📌 Categorical(범주형) 칼럼 리스트:
['시술 시기 코드', '시술 당시 나이', '시술 유형', '특정 시술 유형', '배란 유도 유형', '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이']


In [None]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [None]:
columns_to_drop = [
        "남성 주 불임 원인",
        "남성 부 불임 원인",
        "불임 원인 - 정자 농도",
        "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태",
        '정자 기증자 나이',
        '배란 유도 유형'
]
X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)  

In [None]:
# 각 열의 결측값 개수 확인
missing_values_count = X_train_encoded.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)


결측값이 있는 열과 개수:
Series([], dtype: int64)


In [None]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256351 entries, 0 to 256350
Data columns (total 52 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   시술 시기 코드          256351 non-null  float64
 1   시술 당시 나이          256351 non-null  float64
 2   시술 유형             256351 non-null  float64
 3   특정 시술 유형          256351 non-null  float64
 4   배란 자극 여부          256351 non-null  int64  
 5   단일 배아 이식 여부       256351 non-null  float64
 6   착상 전 유전 검사 사용 여부  256351 non-null  float64
 7   착상 전 유전 진단 사용 여부  256351 non-null  float64
 8   여성 주 불임 원인        256351 non-null  int64  
 9   여성 부 불임 원인        256351 non-null  int64  
 10  부부 주 불임 원인        256351 non-null  int64  
 11  부부 부 불임 원인        256351 non-null  int64  
 12  불명확 불임 원인         256351 non-null  int64  
 13  불임 원인 - 난관 질환     256351 non-null  int64  
 14  불임 원인 - 남성 요인     256351 non-null  int64  
 15  불임 원인 - 배란 장애     256351 non-null  int64  
 16  불임 원인 - 여성 요인     25

In [None]:

X_train_encoded['배아 생성 주요 이유'].value_counts()

배아 생성 주요 이유
1    253108
0      3243
Name: count, dtype: int64

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 데이터 정규화 (X_train_encoded & X_test_encoded)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

# DataFrame 변환 (Feature 이름 유지)
feature_names = X_train_encoded.columns
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# 상관 행렬 계산
correlation_matrix_train = X_train_scaled_df.corr()

# 다중 공선성이 높은 칼럼 찾기 (절대 상관 계수가 0.9 이상)
threshold = 0.9
high_corr_features = set()

for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            high_corr_features.add(feature_names[j])  # 공선성이 높은 컬럼 추가

print(f"📌 상관 계수 기준으로 제거할 후보 변수들: {high_corr_features}")

# 📌 VIF 계산 함수
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

# VIF 계산
vif_df = calculate_vif(X_train_scaled_df)

# VIF 기준(예: 10) 초과하는 변수 제거
high_vif_features = vif_df[vif_df["VIF"] > 10]["Feature"].tolist()
print(f"📌 VIF 기준으로 제거할 후보 변수들: {high_vif_features}")

# 공선성이 높은 변수 최종 제거 리스트 (상관 계수 + VIF 결합)
final_features_to_remove = set(high_corr_features) | set(high_vif_features)
print(f"📌 최종 제거할 변수들: {final_features_to_remove}")

# 다중 공선성이 높은 칼럼 제거
X_train_encoded = X_train_scaled_df.drop(columns=final_features_to_remove, errors='ignore')
X_test_encoded = X_test_scaled_df.drop(columns=final_features_to_remove, errors='ignore')

print(f"✅ 최종 남은 변수 개수: {X_train_encoded.shape[1]}")


📌 상관 계수 기준으로 제거할 후보 변수들: {'신선 배아 사용 여부', 'IVF 시술 횟수', 'IVF 임신 횟수', 'IVF 출산 횟수', '미세주입에서 생성된 배아 수', '파트너 정자와 혼합된 난자 수'}


  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


📌 VIF 기준으로 제거할 후보 변수들: ['시술 유형', '여성 주 불임 원인', '부부 주 불임 원인', '총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부']
📌 최종 제거할 변수들: {'신선 배아 사용 여부', 'IVF 출산 횟수', '여성 주 불임 원인', '총 시술 횟수', '시술 유형', 'DI 출산 횟수', '총 생성 배아 수', '혼합된 난자 수', '부부 주 불임 원인', '총 출산 횟수', 'IVF 시술 횟수', '동결 배아 사용 여부', 'IVF 임신 횟수', '미세주입에서 생성된 배아 수', 'DI 시술 횟수', 'DI 임신 횟수', '파트너 정자와 혼합된 난자 수', '미세주입된 난자 수', '총 임신 횟수'}
✅ 최종 남은 변수 개수: 52


In [None]:
X_train_encoded

Unnamed: 0,시술 시기 코드,시술 당시 나이,특정 시술 유형,배란 자극 여부,단일 배아 이식 여부,착상 전 유전 검사 사용 여부,착상 전 유전 진단 사용 여부,여성 부 불임 원인,부부 부 불임 원인,불명확 불임 원인,...,해동 난자 수,수집된 신선 난자 수,저장된 신선 난자 수,기증자 정자와 혼합된 난자 수,난자 출처,정자 출처,난자 기증자 나이,기증 배아 사용 여부,대리모 여부,총 경과일
0,1.518507,-0.935575,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-0.184224,-0.083423,-0.220261,0.256018,0.344047,-1.173581,-0.098393,-0.0641,0.033850
1,1.020975,2.655919,-0.962962,-1.836376,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-1.000848,-0.083423,-0.220261,0.256018,0.344047,1.803789,-0.098393,-0.0641,-1.586105
2,0.025910,-0.935575,0.672585,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-0.048121,-0.083423,-0.220261,0.256018,0.344047,-1.173581,-0.098393,-0.0641,-0.506135
3,-0.471622,-0.217277,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-0.456432,-0.083423,-0.220261,0.256018,0.344047,0.480513,-0.098393,-0.0641,-1.586105
4,0.025910,-0.935575,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-0.184224,-0.083423,-0.220261,0.256018,0.344047,-1.173581,-0.098393,-0.0641,0.033850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256346,1.020975,-0.935575,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,1.728684,...,-0.066583,0.360191,-0.083423,-0.220261,0.256018,0.344047,-1.173581,-0.098393,-0.0641,1.113821
256347,1.020975,0.501022,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,-0.456432,-0.083423,-0.220261,0.256018,0.344047,0.811332,-0.098393,-0.0641,0.033850
256348,0.025910,-0.217277,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,-0.578475,...,-0.066583,0.224087,-0.083423,-0.220261,0.256018,0.344047,0.480513,-0.098393,-0.0641,0.033850
256349,1.518507,0.501022,-0.962962,0.544551,-0.543057,-0.103519,-0.11236,-0.112199,-0.094036,1.728684,...,-0.066583,-0.728640,-0.083423,-0.220261,0.256018,0.344047,0.811332,-0.098393,-0.0641,-1.046120


### Train

In [None]:
model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

**앙상블 및 Optuna**

In [None]:
import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [None]:
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [None]:
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0)
    }
    model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)

In [None]:
def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
    return np.mean(scores)


In [None]:
# Optuna 스터디 실행
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(optimize_xgboost, n_trials=30)
best_xgb_params = xgb_study.best_params

print("Optimizing LightGBM...")
lgbm_study = optuna.create_study(direction="maximize")
lgbm_study.optimize(optimize_lightgbm, n_trials=30)
best_lgbm_params = lgbm_study.best_params

print("Optimizing RandomForest...")
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(optimize_random_forest, n_trials=30)
best_rf_params = rf_study.best_params

[I 2025-02-16 22:33:34,099] A new study created in memory with name: no-name-51db5fbd-5320-4d0f-8fac-c4f9b41e9f43


Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-16 22:33:38,498] Trial 0 finished with value: 0.7340837002314029 and parameters: {'n_estimators': 283, 'max_depth': 9, 'learning_rate': 0.18680693288094835, 'subsample': 0.6303907352649968, 'colsample_bytree': 0.9035942048740728, 'gamma': 3.8301109691770643}. Best is trial 0 with value: 0.7340837002314029.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-16 22:33:42,501] Trial 1 finished with value: 0.7340578774487525 and parameters: {'n_estimators': 291, 'max_depth': 8, 'learning_rate': 0.26062514635325235, 'su

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005570 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used featur

[I 2025-02-16 22:36:39,492] Trial 0 finished with value: 0.7343432320967155 and parameters: {'n_estimators': 190, 'max_depth': 8, 'learning_rate': 0.01812039018286032, 'num_leaves': 30, 'subsample': 0.7636237486535218}. Best is trial 0 with value: 0.7343432320967155.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:36:42,787] Trial 1 finished with value: 0.7329508066382451 and parameters: {'n_estimators': 271, 'max_depth': 8, 'learning_rate': 0.21718632920672357, 'num_leaves': 19, 'subsample': 0.7331793966365048}. Best is trial 0 with value: 0.7343432320967155.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:36:45,682] Trial 2 finished with value: 0.731890411499142 and parameters: {'n_estimators': 159, 'max_depth': 11, 'learning_rate': 0.18759080156484048, 'num_leaves': 44, 'subsample': 0.6938730619934}. Best is trial 0 with value: 0.7343432320967155.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:36:49,370] Trial 3 finished with value: 0.7350620508593166 and parameters: {'n_estimators': 312, 'max_depth': 11, 'learning_rate': 0.1074220097992988, 'num_leaves': 12, 'subsample': 0.9382481999292298}. Best is trial 3 with value: 0.7350620508593166.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:36:51,942] Trial 4 finished with value: 0.7351313330232794 and parameters: {'n_estimators': 160, 'max_depth': 11, 'learning_rate': 0.09228066944697007, 'num_leaves': 10, 'subsample': 0.6279974862148351}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:36:54,986] Trial 5 finished with value: 0.7344267589025677 and parameters: {'n_estimators': 244, 'max_depth': 9, 'learning_rate': 0.1953758905302927, 'num_leaves': 11, 'subsample': 0.883378862691973}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:01,259] Trial 6 finished with value: 0.7331484886120929 and parameters: {'n_estimators': 495, 'max_depth': 12, 'learning_rate': 0.07629602624537651, 'num_leaves': 32, 'subsample': 0.9320515182104463}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004954 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:05,099] Trial 7 finished with value: 0.7349920667373413 and parameters: {'n_estimators': 224, 'max_depth': 7, 'learning_rate': 0.06672817982295463, 'num_leaves': 36, 'subsample': 0.8732826039038892}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:09,163] Trial 8 finished with value: 0.7334663001865509 and parameters: {'n_estimators': 352, 'max_depth': 12, 'learning_rate': 0.13786123412800774, 'num_leaves': 19, 'subsample': 0.8034980446646672}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:12,262] Trial 9 finished with value: 0.7340039271449287 and parameters: {'n_estimators': 183, 'max_depth': 12, 'learning_rate': 0.1268975723541606, 'num_leaves': 33, 'subsample': 0.7215528607319508}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:14,301] Trial 10 finished with value: 0.7350280635192152 and parameters: {'n_estimators': 114, 'max_depth': 3, 'learning_rate': 0.2800450217825473, 'num_leaves': 50, 'subsample': 0.6230854832944968}. Best is trial 4 with value: 0.7351313330232794.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:18,579] Trial 11 finished with value: 0.735224937128958 and parameters: {'n_estimators': 385, 'max_depth': 10, 'learning_rate': 0.08600689016807848, 'num_leaves': 10, 'subsample': 0.9824121378161383}. Best is trial 11 with value: 0.735224937128958.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004867 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:24,313] Trial 12 finished with value: 0.735221751574875 and parameters: {'n_estimators': 402, 'max_depth': 10, 'learning_rate': 0.023101886994935233, 'num_leaves': 19, 'subsample': 0.607793295019505}. Best is trial 11 with value: 0.735224937128958.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:30,797] Trial 13 finished with value: 0.7345167454206603 and parameters: {'n_estimators': 417, 'max_depth': 6, 'learning_rate': 0.011774978993663823, 'num_leaves': 20, 'subsample': 0.8262923566220683}. Best is trial 11 with value: 0.735224937128958.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:36,686] Trial 14 finished with value: 0.7351113331519772 and parameters: {'n_estimators': 409, 'max_depth': 9, 'learning_rate': 0.042046206873135726, 'num_leaves': 25, 'subsample': 0.974253169339839}. Best is trial 11 with value: 0.735224937128958.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:41,937] Trial 15 finished with value: 0.735325858698204 and parameters: {'n_estimators': 399, 'max_depth': 10, 'learning_rate': 0.05102181402132013, 'num_leaves': 16, 'subsample': 0.9988320587921031}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:47,659] Trial 16 finished with value: 0.7351535794073067 and parameters: {'n_estimators': 487, 'max_depth': 5, 'learning_rate': 0.054637723835967295, 'num_leaves': 15, 'subsample': 0.989605753210734}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:52,062] Trial 17 finished with value: 0.7316205142233155 and parameters: {'n_estimators': 366, 'max_depth': 10, 'learning_rate': 0.16864649521179206, 'num_leaves': 25, 'subsample': 0.9235659316219693}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:37:56,916] Trial 18 finished with value: 0.7341885073990331 and parameters: {'n_estimators': 454, 'max_depth': 9, 'learning_rate': 0.10983725378910483, 'num_leaves': 15, 'subsample': 0.8775714092939717}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:00,772] Trial 19 finished with value: 0.7305458767014441 and parameters: {'n_estimators': 322, 'max_depth': 10, 'learning_rate': 0.2445658618393205, 'num_leaves': 24, 'subsample': 0.9997344998475458}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:05,921] Trial 20 finished with value: 0.7303992600659389 and parameters: {'n_estimators': 367, 'max_depth': 7, 'learning_rate': 0.1543080945599984, 'num_leaves': 40, 'subsample': 0.9582965732361446}. Best is trial 15 with value: 0.735325858698204.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:10,954] Trial 21 finished with value: 0.7353422844407314 and parameters: {'n_estimators': 406, 'max_depth': 10, 'learning_rate': 0.041225232291948556, 'num_leaves': 15, 'subsample': 0.6637715577700649}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:16,245] Trial 22 finished with value: 0.735255343865514 and parameters: {'n_estimators': 445, 'max_depth': 10, 'learning_rate': 0.04740865383183875, 'num_leaves': 15, 'subsample': 0.6859622475920141}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:21,636] Trial 23 finished with value: 0.7353176113154212 and parameters: {'n_estimators': 448, 'max_depth': 9, 'learning_rate': 0.0436691333027741, 'num_leaves': 15, 'subsample': 0.680598714050596}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:27,464] Trial 24 finished with value: 0.735154398201708 and parameters: {'n_estimators': 445, 'max_depth': 9, 'learning_rate': 0.03992425724044032, 'num_leaves': 22, 'subsample': 0.6624275572828876}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:32,674] Trial 25 finished with value: 0.7347956041549917 and parameters: {'n_estimators': 466, 'max_depth': 8, 'learning_rate': 0.06965068293377016, 'num_leaves': 15, 'subsample': 0.6587964491970317}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:37,932] Trial 26 finished with value: 0.7352022583851248 and parameters: {'n_estimators': 338, 'max_depth': 11, 'learning_rate': 0.031940931434624606, 'num_leaves': 29, 'subsample': 0.7630037151165942}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:42,805] Trial 27 finished with value: 0.734218409776344 and parameters: {'n_estimators': 425, 'max_depth': 6, 'learning_rate': 0.1038848343547752, 'num_leaves': 17, 'subsample': 0.7454792129850953}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:47,035] Trial 28 finished with value: 0.7350837119151954 and parameters: {'n_estimators': 273, 'max_depth': 9, 'learning_rate': 0.05301922087425711, 'num_leaves': 27, 'subsample': 0.8363539971843583}. Best is trial 21 with value: 0.7353422844407314.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 31
[LightGBM] [Info

[I 2025-02-16 22:38:52,388] Trial 29 finished with value: 0.7344942356726714 and parameters: {'n_estimators': 385, 'max_depth': 8, 'learning_rate': 0.016194661889882528, 'num_leaves': 13, 'subsample': 0.7808343924978168}. Best is trial 21 with value: 0.7353422844407314.
[I 2025-02-16 22:38:52,389] A new study created in memory with name: no-name-f7429d9a-c2db-4f28-899d-9f7fa9432ff4


Optimizing RandomForest...


[I 2025-02-16 22:40:35,758] Trial 0 finished with value: 0.7327288979777066 and parameters: {'n_estimators': 241, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7327288979777066.
[I 2025-02-16 22:41:14,395] Trial 1 finished with value: 0.7248345783744777 and parameters: {'n_estimators': 156, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7327288979777066.
[I 2025-02-16 22:41:48,491] Trial 2 finished with value: 0.722071890858957 and parameters: {'n_estimators': 161, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7327288979777066.
[I 2025-02-16 22:42:34,789] Trial 3 finished with value: 0.7287759426396456 and parameters: {'n_estimators': 146, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7327288979777066.
[I 2025-02-16 22:43:40,325] Trial 4 finished with value: 0.733252572571027 and parameters: {'n

In [None]:
# 최적화된 모델 생성
xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

In [None]:
# XGBoost 모델의 파라미터 확인
print("XGBoost Best Parameters:")
print(xgb_model.get_params())

# LightGBM 모델의 파라미터 확인
print("\nLightGBM Best Parameters:")
print(lgbm_model.get_params())

# RandomForest 모델의 파라미터 확인
print("\nRandomForest Best Parameters:")
print(rf_model.get_params())

XGBoost Best Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.6015480730177228, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 3.8713817890813846, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.04429291094984883, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 9, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 410, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8432019183270041, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder'

In [None]:
# Soft Voting 앙상블
ensemble_model = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgbm", lgbm_model),
        ("rf", rf_model)
    ],
    voting="soft"
)

In [None]:
# 전체 데이터로 학습
ensemble_model.fit(X_train_encoded, y)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


### Predict

In [None]:
pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [None]:
sample_submission = pd.read_csv('../../sample_submission.csv')
sample_submission['probability'] = pred_proba

In [None]:
sample_submission.to_csv('./Result/baseline_submit.csv', index=False)

In [None]:
pred_proba

array([0.00419399, 0.00949558, 0.14818891, ..., 0.46271164, 0.17253709,
       0.00435354])

In [None]:
unique, counts = np.unique(sample_submission['probability'], return_counts=True)
print(unique, counts)

[3.15686441e-04 3.18624443e-04 3.20559992e-04 ... 6.37033961e-01
 6.38688690e-01 6.55065360e-01] [1 1 1 ... 1 1 1]
