### Import

In [25]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib
import seaborn as sns

# 각 라이브러리 버전 출력
print("Numpy version:", np.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)

Numpy version: 2.0.1
Scikit-learn version: 1.6.1
Pandas version: 2.2.3
Matplotlib version: 3.10.0
Seaborn version: 0.13.2


In [26]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [38]:
import sys
import os
import pandas as pd

# 현재 작업 디렉토리 경로를 가져와 shared codes 폴더의 위치를 sys.path에 추가합니다.
# sys.path에 추가된 경로에 있는 py 폴더는 임포트할 수 있다.
current_dir = os.getcwd()
shared_codes_dir = os.path.join(current_dir, '../shared codes')
sys.path.append(shared_codes_dir)


# cover_nan 모듈을 임포트
from cover_nan_0215_dahun import missing_value_removal_function

# 원본 train 데이터 로드
train = pd.read_csv("../shared codes/data/train.csv")
test = pd.read_csv("../shared codes/data/test.csv")

# missing_value_removal_function 사용
train_young, train_middle, train_old, train_unknown = missing_value_removal_function(train)
test_young, test_middle, test_old, test_unknown = missing_value_removal_function(test)

✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!
✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!


In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import optuna
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

def data_preprocessing(train, test):
    index_train = train['idx']
    X = train.drop(['임신 성공 여부', 'idx'], axis=1)
    y = train['임신 성공 여부']

    # index_test = test['idx']
    index_test = test['idx'].copy()  # Ensure a copy is made to prevent any shared memory issues
    test = test.drop('idx', axis=1)

    #Data Pre-processing
    # Categorical(범주형) 칼럼 찾기
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    X_train_encoded = X.copy()
    X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

    X_test_encoded = test.copy()
    X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

    columns_to_drop = [
            "남성 주 불임 원인",
            "남성 부 불임 원인",
            "불임 원인 - 정자 농도",
            "불임 원인 - 정자 면역학적 요인",
            "불임 원인 - 정자 운동성",
            "불임 원인 - 정자 형태",
            '배란 유도 유형'
    ]
    X_train_encoded = X_train_encoded.drop(columns = columns_to_drop)    
    X_test_encoded = X_test_encoded.drop(columns = columns_to_drop)
    

    # 데이터 정규화 (X_train_encoded & X_test_encoded)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_encoded)
    X_test_scaled = scaler.transform(X_test_encoded)  # 동일한 스케일 적용

    # DataFrame 변환 (Feature 이름 유지)
    feature_names = [f"Feature_{i}" for i in range(X_train_scaled.shape[1])]
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

    # 상관 행렬 계산
    correlation_matrix_train = X_train_scaled_df.corr()

    # 다중 공선성이 높은 칼럼 찾기 (절대 상관 계수가 0.8 이상)
    threshold = 0.8
    high_corr_features = set()

    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            if abs(correlation_matrix_train.iloc[i, j]) > threshold:
                high_corr_features.add(feature_names[j])  # 공선성이 높은 컬럼 추가

    # 다중 공선성이 높은 컬럼 제거
    X_train_encoded = X_train_scaled_df.drop(columns=high_corr_features, errors='ignore')
    X_test_encoded = X_test_scaled_df.drop(columns=high_corr_features, errors='ignore')


    X_train_encoded['idx'] = index_train.reset_index(drop=True)
    X_test_encoded['idx'] = index_test.reset_index(drop=True)


    return X_train_encoded, X_test_encoded, y


In [29]:
def optimize_models_and_get_best_params(X_train_encoded, y):

    def optimize_xgboost(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5)
        }
        model = XGBClassifier(**params, random_state=42, eval_metric="logloss")
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
        return np.mean(scores)


    def optimize_lightgbm(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 10, 50),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0)
        }
        model = LGBMClassifier(**params, random_state=42)
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
        return np.mean(scores)

    def optimize_random_forest(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 3, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4)
        }
        model = RandomForestClassifier(**params, random_state=42)
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
        return np.mean(scores)

    # Optuna 스터디 실행
    print("Optimizing XGBoost...")
    xgb_study = optuna.create_study(direction="maximize")
    xgb_study.optimize(optimize_xgboost, n_trials=1)
    best_xgb_params = xgb_study.best_params

    print("Optimizing LightGBM...")
    lgbm_study = optuna.create_study(direction="maximize")
    lgbm_study.optimize(optimize_lightgbm, n_trials=1)
    best_lgbm_params = lgbm_study.best_params

    print("Optimizing RandomForest...")
    rf_study = optuna.create_study(direction="maximize")
    rf_study.optimize(optimize_random_forest, n_trials=1)
    best_rf_params = rf_study.best_params

    params = [best_xgb_params, best_lgbm_params, best_rf_params]
    
    return params


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def optimize_models_and_get_best_params_v2(X_train_encoded, y):

    def optimize_logistic_regression(trial):
        params = {
            "C": trial.suggest_float("C", 0.01, 10.0, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
            "max_iter": trial.suggest_int("max_iter", 100, 1000),
        }
        model = LogisticRegression(**params, random_state=42)
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
        return np.mean(scores)

    def optimize_random_forest(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"])
        }
        model = RandomForestClassifier(**params, random_state=42)
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")  
        return np.mean(scores)

    def optimize_xgboost(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5)
        }
        model = XGBClassifier(**params, random_state=42, eval_metric="logloss")
        scores = cross_val_score(model, X_train_encoded, y, cv=5, scoring="roc_auc")
        return np.mean(scores)

    # Optuna 스터디 실행
    print("Optimizing Logistic Regression...")
    logreg_study = optuna.create_study(direction="maximize")
    logreg_study.optimize(optimize_logistic_regression, n_trials=50)
    best_logreg_params = logreg_study.best_params

    print("Optimizing Random Forest...")
    rf_study = optuna.create_study(direction="maximize")
    rf_study.optimize(optimize_random_forest, n_trials=10)
    best_rf_params = rf_study.best_params

    print("Optimizing XGBoost...")
    xgb_study = optuna.create_study(direction="maximize")
    xgb_study.optimize(optimize_xgboost, n_trials=50)
    best_xgb_params = xgb_study.best_params

    params = [best_logreg_params, best_rf_params, best_xgb_params]
    
    return params

In [30]:
def train_ensemble_model(X_train_encoded, y, X_test_encoded, params):

    
    # best params
    best_xgb_params, best_lgbm_params, best_rf_params = params

    # 최적화된 모델 생성
    xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    lgbm_model = LGBMClassifier(**best_lgbm_params, random_state=42)
    rf_model = RandomForestClassifier(**best_rf_params, random_state=42)

    # Soft Voting 앙상블
    ensemble_model = VotingClassifier(
        estimators=[
            ("xgb", xgb_model),
            ("lgbm", lgbm_model),
            ("rf", rf_model)
        ],
        voting="soft"
    )

    # 전체 데이터로 학습
    ensemble_model.fit(X_train_encoded, y)

    pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

    submission = pd.DataFrame({
       'probability': pred_proba,
       'idx': X_test_encoded['idx']
   })

    return submission

In [None]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

def train_ensemble_model_v2(X_train_encoded, y, X_test_encoded, params):

    # best params
    best_logreg_params, best_rf_params, best_xgb_params = params

    # 최적화된 모델 생성
    logreg_model = LogisticRegression(**best_logreg_params, random_state=42)
    rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
    xgb_model = XGBClassifier(**best_xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")

    # Soft Voting 앙상블
    ensemble_model = VotingClassifier(
        estimators=[
            ("logreg", logreg_model),
            ("rf", rf_model),
            ("xgb", xgb_model)
        ],
        voting="soft"
    )

    # 전체 데이터로 학습
    ensemble_model.fit(X_train_encoded, y)

    # 예측 확률
    pred_proba = ensemble_model.predict_proba(X_test_encoded)[:, 1]

    # 제출용 데이터프레임 생성
    submission = pd.DataFrame({
       'probability': pred_proba,
       'idx': X_test_encoded['idx']
    })

    return submission

In [None]:
X_train_encoded_young, X_test_encoded_young, y_young = data_preprocessing(train_young, test_young)
X_train_encoded_middle, X_test_encoded_middle, y_middle = data_preprocessing(train_middle, test_middle)
X_train_encoded_old, X_test_encoded_old, y_old = data_preprocessing(train_old, test_old)

params_young = optimize_models_and_get_best_params(X_train_encoded_young, y_young)
params_middle = optimize_models_and_get_best_params(X_train_encoded_middle, y_middle)
params_old = optimize_models_and_get_best_params(X_train_encoded_old, y_old)

submission_young = train_ensemble_model(X_train_encoded_young, y_young, X_test_encoded_young, params_young)
submission_middle = train_ensemble_model(X_train_encoded_middle, y_middle, X_test_encoded_middle, params_middle)
submission_old = train_ensemble_model(X_train_encoded_old, y_old, X_test_encoded_old, params_old)


In [41]:
def this_func_is_all_you_need(train, test):
    # missing_value_removal_function 사용
    train_young, train_middle, train_old, train_unknown = missing_value_removal_function(train)
    test_young, test_middle, test_old, test_unknown = missing_value_removal_function(test)

    X_train_encoded_young, X_test_encoded_young, y_young = data_preprocessing(train_young, test_young)
    X_train_encoded_middle, X_test_encoded_middle, y_middle = data_preprocessing(train_middle, test_middle)
    X_train_encoded_old, X_test_encoded_old, y_old = data_preprocessing(train_old, test_old)

    params_young = optimize_models_and_get_best_params(X_train_encoded_young, y_young)
    params_middle = optimize_models_and_get_best_params(X_train_encoded_middle, y_middle)
    params_old = optimize_models_and_get_best_params(X_train_encoded_old, y_old)

    submission_young = train_ensemble_model(X_train_encoded_young, y_young, X_test_encoded_young, params_young)
    submission_middle = train_ensemble_model(X_train_encoded_middle, y_middle, X_test_encoded_middle, params_middle)
    submission_old = train_ensemble_model(X_train_encoded_old, y_old, X_test_encoded_old, params_old)

    # test_unknown의 데이터 개수만큼 0으로 채워진 submission_unknown 생성
    submission_unknown = pd.DataFrame({
        'probability': [0] * len(test_unknown),
        'idx': test_unknown['idx']
    })

    submission = pd.concat([submission_young, submission_middle, submission_old, submission_unknown])

    submission = submission.sort_values('idx').reset_index(drop=True)
    submission = submission.drop(columns=['idx'])

    submission['ID'] = [f"TEST_{i:05d}" for i in range(len(submission))]

    submission = submission[['ID', 'probability']]

    submission.to_csv("submission.csv", index=False)

    return 

In [42]:
# 원본 train 데이터 로드
train = pd.read_csv("../shared codes/data/train.csv")
test = pd.read_csv("../shared codes/data/test.csv")

this_func_is_all_you_need(train, test)

✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!
✅ '대리모 여부' 결측값을 최빈값 (0.0) 으로 대체 완료!
✅ 컬럼 삭제 완료: ['PGD 시술 여부', 'PGS 시술 여부', '난자 해동 경과일', '배아 해동 경과일']
✅ '난자 채취 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '난자 혼합 경과일' 결측값을 중앙값 (0.0) 으로 대체 완료!
✅ '배아 이식 경과일' 결측값을 중앙값 (3.0) 으로 대체 완료!
index_test: 1            1
6            6
8            8
11          11
12          12
         ...  
90061    90061
90062    90062
90063    90063
90064    90064
90066    90066
Name: idx, Length: 40092, dtype: int64
X_test_encoded['idx'][0:30]: 0      1
1      6
2      8
3     11
4     12
5     14
6     15
7     16
8     18
9     21
10    22
11    24
12    29
13    32
14    35
15    37
16    38
17    42
18    45
19    46
20    48
21    49
22    51
23    53
24    56
25    58
26    69
27    73
28    80
29    84
Name: idx, dtype: int64
Length of test: 40092
Len

[I 2025-02-16 17:15:20,189] A new study created in memory with name: no-name-70a87843-b9f0-43df-9f6a-9d9a04415b09


index_test: 2            2
3            3
5            5
10          10
23          23
         ...  
90031    90031
90032    90032
90051    90051
90057    90057
90065    90065
Name: idx, Length: 17008, dtype: int64
X_test_encoded['idx'][0:30]: 0       2
1       3
2       5
3      10
4      23
5      28
6      36
7      52
8      54
9      57
10     64
11     65
12     75
13     76
14     77
15     78
16     79
17     92
18     94
19     97
20    101
21    112
22    119
23    120
24    126
25    127
26    136
27    140
28    151
29    154
Name: idx, dtype: int64
Length of test: 17008
Length of X_test_encoded before assigning index: 17008
Length of index_test: 17008
Optimizing Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimizing Random Forest...


[I 2025-02-16 17:25:16,199] Trial 0 finished with value: 0.6920624912622817 and parameters: {'n_estimators': 164, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6920624912622817.
[W 2025-02-16 17:25:16,233] Trial 1 failed with parameters: {'n_estimators': 243, 'max_depth': 4, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'auto'} because of the following error: ValueError('\nAll the 5 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n5 fits failed with the following error:\nTraceback (most recent call last):\n  File "/opt/anaconda3/envs/lg_aimers/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score\n    estimator.fit(X_train, y_train, **fit_params)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/lg_aimers/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/lg_aimers/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/envs/lg_aimers/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/envs/lg_aimers/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.


In [36]:
# test_unknown의 데이터 개수만큼 0으로 채워진 submission_unknown 생성
submission_unknown = pd.DataFrame({
       'probability': [0] * len(test_unknown),
       'idx': test_unknown['idx']
   })


submission = pd.concat([submission_young, submission_middle, submission_old, submission_unknown])

submission = submission.sort_values('idx').reset_index(drop=True)
submission = submission.drop(columns=['idx'])

home_dir = '/Users/downy/Documents/2025 LG aimers/DKU-LG-Capstone-6'
sample_submission = pd.read_csv(home_dir + '/shared codes/data/sample_submission.csv')

submission['ID'] = sample_submission['ID']


submission = submission[['ID', 'probability']]

submission.to_csv("submission.csv", index=False)



submission:     probability  idx
0      0.023750    0
1      0.017482    1
2      0.125027    2
3      0.092320    3
4      0.299258    4
5      0.098695    5
6      0.447826    6
7      0.304938    7
8      0.362012    8
9      0.010316    9
10     0.055482   10
11     0.428713   11
12     0.403876   12
13     0.461165   13
14     0.275473   14
15     0.008903   15
16     0.192963   16
17     0.371699   17
18     0.214438   18
19     0.607154   19
20     0.268116   20
21     0.388507   21
22     0.243889   22
23     0.133326   23
24     0.370955   24
25     0.012334   25
26     0.080959   26
27     0.493752   27
28     0.115279   28
29     0.192797   29
