### Import

In [47]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)

Numpy version: 1.26.4
Scikit-learn version: 1.5.1
Pandas version: 1.5.3
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


In [48]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [49]:
train = pd.read_csv('../../train.csv').drop(columns=['ID'])
test = pd.read_csv('../../test.csv').drop(columns=['ID'])

In [50]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [51]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [52]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)


In [53]:
# 결측값을 채울 칼럼 목록
columns_to_fill = [
    '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부',
    '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수',
    '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수',
    '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수',
    '대리모 여부'
]

# 시술 유형이 'DI'인 경우에만 결측값을 0으로 채우기
X.loc[X['시술 유형'] == 'DI', columns_to_fill] = X.loc[X['시술 유형'] == 'DI', columns_to_fill].fillna(0)

# 1. '난자 출처'의 결측값을 '본인 제공'으로 채우기
X['난자 출처'].replace('알 수 없음','본인 제공', inplace=True)

# 2. '난자 기증자 나이' 결측값을 시술 당시 나이로 채우기
X.loc[X['난자 기증자 나이'] == '알 수 없음', '난자 기증자 나이'] = X['시술 당시 나이']

# 3. '파트너 정자와 혼합된 난자 수'의 결측값을 0으로 채우기
X['파트너 정자와 혼합된 난자 수'].fillna(0, inplace=True)

# 4. '기증자 정자와 혼합된 난자 수'의 결측값을 1로 채우기
X['기증자 정자와 혼합된 난자 수'].fillna(1, inplace=True)

# 5. '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부'의 결측값을 0으로 채우기
for column in ['동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']:
    X[column].fillna(0, inplace=True)

    # 변경 내용 확인
print(X[['난자 출처', '난자 기증자 나이', '파트너 정자와 혼합된 난자 수', 
            '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']].head())


   난자 출처 난자 기증자 나이  파트너 정자와 혼합된 난자 수  기증자 정자와 혼합된 난자 수 동결 배아 사용 여부  \
0  본인 제공   만18-34세               5.0               0.0         0.0   
1  본인 제공   만45-50세               1.0               0.0         0.0   
2  본인 제공   만18-34세               7.0               0.0         0.0   
3  본인 제공   만35-37세               4.0               0.0         0.0   
4  본인 제공   만18-34세               6.0               0.0         0.0   

  신선 배아 사용 여부 기증 배아 사용 여부  
0         1.0         0.0  
1         1.0         0.0  
2         1.0         0.0  
3         1.0         0.0  
4         1.0         0.0  


In [54]:
# 시술 유형이 'DI'인 경우에만 결측값을 0으로 채우기
test.loc[test['시술 유형'] == 'DI', columns_to_fill] = test.loc[test['시술 유형'] == 'DI', columns_to_fill].fillna(0)

# 1. '난자 출처'의 결측값을 '본인 제공'으로 채우기
test['난자 출처'].replace('알 수 없음','본인 제공', inplace=True)

# 2. '난자 기증자 나이' 결측값을 시술 당시 나이로 채우기
test.loc[test['난자 기증자 나이'] == '알 수 없음', '난자 기증자 나이'] = test['시술 당시 나이']

# 3. '파트너 정자와 혼합된 난자 수'의 결측값을 0으로 채우기
test['파트너 정자와 혼합된 난자 수'].fillna(0, inplace=True)

# 4. '기증자 정자와 혼합된 난자 수'의 결측값을 1로 채우기
test['기증자 정자와 혼합된 난자 수'].fillna(1, inplace=True)

# 5. '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부'의 결측값을 0으로 채우기
for column in ['동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']:
    test[column].fillna(0, inplace=True)

In [55]:
# 각 열의 결측값 개수 확인
missing_values_count = X.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)

결측값이 있는 열과 개수:
임신 시도 또는 마지막 임신 경과 연수    246981
난자 채취 경과일                 57488
난자 해동 경과일                254915
난자 혼합 경과일                 53735
배아 이식 경과일                 43566
배아 해동 경과일                215982
dtype: int64


In [56]:

X['난자 채취 경과일'].fillna(1, inplace=True)
X['난자 해동 경과일'].fillna(0, inplace=True)
X['난자 혼합 경과일'].fillna(0, inplace=True)
X['배아 이식 경과일'].fillna(0, inplace=True)
X['배아 해동 경과일'].fillna(0, inplace=True)

test['난자 채취 경과일'].fillna(1, inplace=True)
test['난자 해동 경과일'].fillna(0, inplace=True)
test['난자 혼합 경과일'].fillna(0, inplace=True)
test['배아 이식 경과일'].fillna(0, inplace=True)
test['배아 해동 경과일'].fillna(0, inplace=True)

In [57]:
# '알 수 없음'이 포함된 열 찾기
unknown_columns = []

for col in X.columns:
    if X[col].astype(str).str.contains('알 수 없음').any():
        unknown_columns.append(col)

# 결과 출력
print("⚠️ '알 수 없음' 값을 포함하는 컬럼들:")
print(unknown_columns)


⚠️ '알 수 없음' 값을 포함하는 컬럼들:
['시술 당시 나이', '배란 유도 유형', '난자 기증자 나이', '정자 기증자 나이']


In [58]:
# 변환 함수 정의 (isinstance 없이 단순화)
def categorize_reason(value):
    if value == "배아 저장용":  # "배아 저장용" 단독일 경우 1
        return 1
    elif "현재 시술용" in value:  # "현재 시술용"이 포함된 경우 1
        return 1
    else:  # 그 외 모든 값은 0
        return 0

X["배아 생성 주요 이유"] = X["배아 생성 주요 이유"].replace("nan", "현재 시술용")
test["배아 생성 주요 이유"] = test["배아 생성 주요 이유"].replace("nan", "현재 시술용")

# "배아 생성 주요 이유" 칼럼 변환 적용
X["배아 생성 주요 이유"] = X["배아 생성 주요 이유"].apply(categorize_reason)
test["배아 생성 주요 이유"] = test["배아 생성 주요 이유"].apply(categorize_reason)



In [59]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [60]:
columns_to_drop = [
        "임신 시도 또는 마지막 임신 경과 연수",
        "PGD 시술 여부",
        "PGS 시술 여부",
        "남성 주 불임 원인",
        "남성 부 불임 원인",
        "불임 원인 - 정자 농도",
        "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태",
        '정자 기증자 나이',
        '배란 유도 유형'
]
X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)  

In [61]:
# 병합할 칼럼들
columns_to_merge = [
    '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일',
    '배아 이식 경과일', '배아 해동 경과일'
]

# 새로운 칼럼 생성: 각 경과일의 합
X_train_encoded['총 경과일'] = X_train_encoded[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X_train_encoded = X_train_encoded.drop(columns=columns_to_merge)

# 새로운 칼럼 생성: 각 경과일의 합
X_test_encoded['총 경과일'] = X_test_encoded[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
X_test_encoded = X_test_encoded.drop(columns=columns_to_merge)

In [62]:
# 각 열의 결측값 개수 확인
missing_values_count = X_train_encoded.isnull().sum()

# 결측값이 있는 열만 필터링
missing_columns = missing_values_count[missing_values_count > 0]

print("결측값이 있는 열과 개수:")
print(missing_columns)


결측값이 있는 열과 개수:
Series([], dtype: int64)


In [63]:
X['배아 생성 주요 이유'].value_counts()

1    253108
0      3243
Name: 배아 생성 주요 이유, dtype: int64

In [64]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256351 entries, 0 to 256350
Data columns (total 52 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   시술 시기 코드          256351 non-null  float64
 1   시술 당시 나이          256351 non-null  float64
 2   시술 유형             256351 non-null  float64
 3   특정 시술 유형          256351 non-null  float64
 4   배란 자극 여부          256351 non-null  float64
 5   단일 배아 이식 여부       256351 non-null  float64
 6   착상 전 유전 검사 사용 여부  256351 non-null  float64
 7   착상 전 유전 진단 사용 여부  256351 non-null  float64
 8   여성 주 불임 원인        256351 non-null  float64
 9   여성 부 불임 원인        256351 non-null  float64
 10  부부 주 불임 원인        256351 non-null  float64
 11  부부 부 불임 원인        256351 non-null  float64
 12  불명확 불임 원인         256351 non-null  float64
 13  불임 원인 - 난관 질환     256351 non-null  float64
 14  불임 원인 - 남성 요인     256351 non-null  float64
 15  불임 원인 - 배란 장애     256351 non-null  float64
 16  불임 원인 - 여성 요인     25

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# 데이터 정규화 (X_train_encoded & X_test_encoded)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

# DataFrame 변환 (Feature 이름 유지)
feature_names = [f"Feature_{i}" for i in range(X_train_scaled.shape[1])]
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# 상관 행렬 계산
correlation_matrix_train = X_train_scaled_df.corr()

# 다중 공선성이 높은 칼럼 찾기 (절대 상관 계수가 0.8 이상)
threshold = 0.8
high_corr_features = set()

for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        if abs(correlation_matrix_train.iloc[i, j]) > threshold:
            high_corr_features.add(feature_names[j])  # 공선성이 높은 컬럼 추가

# 다중 공선성이 높은 컬럼 제거
X_train_encoded = X_train_scaled_df.drop(columns=high_corr_features, errors='ignore')
X_test_encoded = X_test_scaled_df.drop(columns=high_corr_features, errors='ignore')


In [66]:
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.preprocessing import StandardScaler

# # 데이터 정규화 (X_train_encoded & X_test_encoded)
# scaler = StandardScaler()
# X_train_encoded = scaler.fit_transform(X_train_encoded)
# X_test_encoded = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

# # 공분산 행렬 계산 (훈련 & 테스트 데이터셋)
# X_train_encoded = np.cov(X_train_encoded.T)
# X_test_encoded = np.cov(X_test_encoded.T)

# # 공분산 행렬 시각화
# fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# # 훈련 데이터 공분산 행렬
# axes[0].imshow(X_train_encoded, cmap="coolwarm", aspect="auto")
# axes[0].set_title("훈련 데이터 공분산 행렬")
# axes[0].set_xlabel("Feature Index")
# axes[0].set_ylabel("Feature Index")

# # 테스트 데이터 공분산 행렬
# axes[1].imshow(X_test_encoded, cmap="coolwarm", aspect="auto")
# axes[1].set_title("테스트 데이터 공분산 행렬")
# axes[1].set_xlabel("Feature Index")
# axes[1].set_ylabel("Feature Index")

# plt.tight_layout()
# plt.show()

# # 공분산 행렬 크기 출력
# X_train_encoded.shape, X_test_encoded.shape


In [67]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA



# # 데이터 정규화
# scaler = StandardScaler()
# scaled_features = scaler.fit_transform(X_train_encoded)
# scaled_test_x = scaler.transform(X_test_encoded)

# # PCA 적용
# pca = PCA(n_components=0.95)  # 설명 분산의 93%를 유지하도록 설정
# X_train_encoded = pca.fit_transform(scaled_features)
# X_test_encoded = pca.transform(scaled_test_x)

# # PCA 적용 후 데이터셋의 형태 확인
# pca_features_shape = X_train_encoded.shape

# # 설명된 분산 비율
# explained_variance = pca.explained_variance_ratio_

# pca_features_shape, explained_variance

### Train

In [68]:
model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

**앙상블 및 Optuna**

In [69]:
import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [70]:
def optimize_xgboost(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5)
    }
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [71]:
def optimize_lightgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0)
    }
    model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)

In [72]:
def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
    }
    model = RandomForestClassifier(**params, random_state=42)
    scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
    return np.mean(scores)


In [73]:
# Optuna 스터디 실행
print("Optimizing XGBoost...")
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(optimize_xgboost, n_trials=30)
best_xgb_params = xgb_study.best_params

print("Optimizing LightGBM...")
lgbm_study = optuna.create_study(direction="maximize")
lgbm_study.optimize(optimize_lightgbm, n_trials=30)
best_lgbm_params = lgbm_study.best_params

print("Optimizing RandomForest...")
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(optimize_random_forest, n_trials=30)
best_rf_params = rf_study.best_params

[I 2025-02-13 13:49:50,433] A new study created in memory with name: no-name-5cd640e0-8f49-4f45-828f-eac2ec946399


Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 13:49:54,982] Trial 0 finished with value: 0.7389381085510989 and parameters: {'n_estimators': 192, 'max_depth': 7, 'learning_rate': 0.08535976031865274, 'subsample': 0.8138615463284129, 'colsample_bytree': 0.6810946986733892, 'gamma': 4.571425433890813}. Best is trial 0 with value: 0.7389381085510989.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-13 13:50:03,869] Trial 1 finished with value: 0.7382718407834885 and parameters: {'n_estimators': 429, 'max_depth': 3, 'learning_rate': 0.03420023273420503, 'sub

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used featur

[I 2025-02-13 13:52:54,437] Trial 0 finished with value: 0.7382649950342842 and parameters: {'n_estimators': 125, 'max_depth': 8, 'learning_rate': 0.0459632597078306, 'num_leaves': 25, 'subsample': 0.7563795418030833}. Best is trial 0 with value: 0.7382649950342842.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:52:57,440] Trial 1 finished with value: 0.7385792112955339 and parameters: {'n_estimators': 107, 'max_depth': 10, 'learning_rate': 0.05793086965353473, 'num_leaves': 37, 'subsample': 0.8604319151146287}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:01,436] Trial 2 finished with value: 0.7328888768044224 and parameters: {'n_estimators': 220, 'max_depth': 8, 'learning_rate': 0.21737047527090045, 'num_leaves': 48, 'subsample': 0.9841211703917765}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:04,740] Trial 3 finished with value: 0.7355976445683374 and parameters: {'n_estimators': 189, 'max_depth': 5, 'learning_rate': 0.27415522440810663, 'num_leaves': 43, 'subsample': 0.8514369676550795}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:08,223] Trial 4 finished with value: 0.7384408303577656 and parameters: {'n_estimators': 148, 'max_depth': 11, 'learning_rate': 0.061824031127327296, 'num_leaves': 37, 'subsample': 0.9085437978292872}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:13,268] Trial 5 finished with value: 0.7349085322158704 and parameters: {'n_estimators': 464, 'max_depth': 9, 'learning_rate': 0.224914128134302, 'num_leaves': 14, 'subsample': 0.9861172563672795}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:18,758] Trial 6 finished with value: 0.7327605472843672 and parameters: {'n_estimators': 380, 'max_depth': 11, 'learning_rate': 0.21865678470950106, 'num_leaves': 37, 'subsample': 0.7380642946977831}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:22,138] Trial 7 finished with value: 0.7377821765343482 and parameters: {'n_estimators': 198, 'max_depth': 7, 'learning_rate': 0.15980967459530218, 'num_leaves': 21, 'subsample': 0.795239497082389}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:25,526] Trial 8 finished with value: 0.7382371325814095 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.13294141061227063, 'num_leaves': 17, 'subsample': 0.6020114849461631}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:28,244] Trial 9 finished with value: 0.7381258767595432 and parameters: {'n_estimators': 124, 'max_depth': 8, 'learning_rate': 0.14499360135475065, 'num_leaves': 22, 'subsample': 0.7262457201040043}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:32,208] Trial 10 finished with value: 0.7384575854499639 and parameters: {'n_estimators': 301, 'max_depth': 3, 'learning_rate': 0.08895767543544923, 'num_leaves': 32, 'subsample': 0.8765751601157656}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:36,353] Trial 11 finished with value: 0.7330995144135171 and parameters: {'n_estimators': 305, 'max_depth': 3, 'learning_rate': 0.01534389553859658, 'num_leaves': 32, 'subsample': 0.8828231920429296}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:40,263] Trial 12 finished with value: 0.7385170612661406 and parameters: {'n_estimators': 296, 'max_depth': 3, 'learning_rate': 0.08513842392552964, 'num_leaves': 31, 'subsample': 0.8389447220337991}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:46,199] Trial 13 finished with value: 0.7361533820943856 and parameters: {'n_estimators': 386, 'max_depth': 12, 'learning_rate': 0.09577916038538187, 'num_leaves': 40, 'subsample': 0.813980887597914}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:50,855] Trial 14 finished with value: 0.7356065712624937 and parameters: {'n_estimators': 253, 'max_depth': 5, 'learning_rate': 0.013638478098411205, 'num_leaves': 26, 'subsample': 0.9285712435910702}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:53:56,768] Trial 15 finished with value: 0.7354630752830564 and parameters: {'n_estimators': 358, 'max_depth': 10, 'learning_rate': 0.10027144330846872, 'num_leaves': 50, 'subsample': 0.6809037765998158}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:03,267] Trial 16 finished with value: 0.7381908335508859 and parameters: {'n_estimators': 424, 'max_depth': 6, 'learning_rate': 0.05901299394555212, 'num_leaves': 30, 'subsample': 0.8105908998807935}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:06,811] Trial 17 finished with value: 0.7382667973158223 and parameters: {'n_estimators': 261, 'max_depth': 4, 'learning_rate': 0.17779617492622637, 'num_leaves': 10, 'subsample': 0.9360203310507238}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:12,103] Trial 18 finished with value: 0.7364627324202239 and parameters: {'n_estimators': 338, 'max_depth': 6, 'learning_rate': 0.12019995301732489, 'num_leaves': 36, 'subsample': 0.8442800531415346}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:20,017] Trial 19 finished with value: 0.737533716530181 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.04347754000613727, 'num_leaves': 45, 'subsample': 0.6656240938875203}. Best is trial 1 with value: 0.7385792112955339.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:23,323] Trial 20 finished with value: 0.7386186800739305 and parameters: {'n_estimators': 160, 'max_depth': 12, 'learning_rate': 0.07443864333865192, 'num_leaves': 28, 'subsample': 0.78259871009307}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:26,224] Trial 21 finished with value: 0.7386173761441036 and parameters: {'n_estimators': 101, 'max_depth': 12, 'learning_rate': 0.07750102803308047, 'num_leaves': 29, 'subsample': 0.7748799179026403}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:29,010] Trial 22 finished with value: 0.7384961182164433 and parameters: {'n_estimators': 100, 'max_depth': 12, 'learning_rate': 0.07160506405481185, 'num_leaves': 27, 'subsample': 0.7737336178438882}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:32,884] Trial 23 finished with value: 0.7385250117966825 and parameters: {'n_estimators': 161, 'max_depth': 11, 'learning_rate': 0.03653516860377481, 'num_leaves': 35, 'subsample': 0.7120370695912056}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007325 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:35,599] Trial 24 finished with value: 0.7381424146278774 and parameters: {'n_estimators': 103, 'max_depth': 12, 'learning_rate': 0.11605190003297378, 'num_leaves': 41, 'subsample': 0.7852246019468422}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:38,773] Trial 25 finished with value: 0.7370332204957426 and parameters: {'n_estimators': 166, 'max_depth': 10, 'learning_rate': 0.1739475342162359, 'num_leaves': 28, 'subsample': 0.6949426629577201}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:42,340] Trial 26 finished with value: 0.7370137062063504 and parameters: {'n_estimators': 149, 'max_depth': 11, 'learning_rate': 0.02769715900130444, 'num_leaves': 20, 'subsample': 0.649923144486848}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:46,491] Trial 27 finished with value: 0.7386035224783363 and parameters: {'n_estimators': 238, 'max_depth': 12, 'learning_rate': 0.07082338236913567, 'num_leaves': 23, 'subsample': 0.7494133992733186}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:50,751] Trial 28 finished with value: 0.738371905953038 and parameters: {'n_estimators': 254, 'max_depth': 12, 'learning_rate': 0.07878538818670823, 'num_leaves': 24, 'subsample': 0.7462072033457685}. Best is trial 20 with value: 0.7386186800739305.


[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52983, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 37
[LightGBM] [Info

[I 2025-02-13 13:54:54,630] Trial 29 finished with value: 0.738327952855227 and parameters: {'n_estimators': 234, 'max_depth': 9, 'learning_rate': 0.10689893449685803, 'num_leaves': 23, 'subsample': 0.7742649154700965}. Best is trial 20 with value: 0.7386186800739305.
[I 2025-02-13 13:54:54,631] A new study created in memory with name: no-name-832605ac-d947-48dc-b383-ee7d47155bf1


Optimizing RandomForest...


[I 2025-02-13 13:56:59,523] Trial 0 finished with value: 0.7321750717217841 and parameters: {'n_estimators': 286, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7321750717217841.
[I 2025-02-13 13:58:08,835] Trial 1 finished with value: 0.7347406457925217 and parameters: {'n_estimators': 131, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.7347406457925217.
[I 2025-02-13 13:59:00,167] Trial 2 finished with value: 0.7338471379649286 and parameters: {'n_estimators': 103, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.7347406457925217.
[I 2025-02-13 14:00:30,270] Trial 3 finished with value: 0.7347829317925774 and parameters: {'n_estimators': 169, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.7347829317925774.
[I 2025-02-13 14:02:08,777] Trial 4 finished with value: 0.7262916629827297 and parameters: 

In [74]:
# 최적화된 모델 생성
xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

In [75]:
# XGBoost 모델의 파라미터 확인
print("XGBoost Best Parameters:")
print(xgb_model.get_params())

# LightGBM 모델의 파라미터 확인
print("\nLightGBM Best Parameters:")
print(lgbm_model.get_params())

# RandomForest 모델의 파라미터 확인
print("\nRandomForest Best Parameters:")
print(rf_model.get_params())

XGBoost Best Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.7432369132751219, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 0.4146122328389401, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.09769974246133367, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 321, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.730184359922344, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder':

In [76]:
# Soft Voting 앙상블
ensemble_model = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgbm", lgbm_model),
        ("rf", rf_model)
    ],
    voting="soft"
)

In [77]:
# 전체 데이터로 학습
ensemble_model.fit(X_train_encoded, y)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


### Predict

In [78]:
pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [79]:
sample_submission = pd.read_csv('../../sample_submission.csv')
sample_submission['probability'] = pred_proba

In [80]:
sample_submission.to_csv('./Result/baseline_submit.csv', index=False)

In [81]:
pred_proba

array([0.00282509, 0.00279431, 0.14851737, ..., 0.44079754, 0.2308924 ,
       0.00294522])

In [82]:
unique, counts = np.unique(sample_submission['probability'], return_counts=True)
print(unique, counts)

[1.30601812e-04 1.39078754e-04 1.45125560e-04 ... 6.85666520e-01
 6.93220731e-01 6.96865893e-01] [1 1 1 ... 1 1 1]
