## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [82]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### Data Load

In [83]:
# Load the train/test CSV pair for IVF first, then the pair for DI.
IVF_train, IVF_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/IVF_train_dataset_20.csv',
        '../data/IVF_test_dataset_20.csv',
    )
)

DI_train, DI_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/DI_train_dataset_20.csv',
        '../data/DI_test_dataset_20.csv',
    )
)

In [84]:
# Separate target and features for each training table. The features are
# every column except the target ('임신_성공_여부') and the 'ID' column.
IVF_y = IVF_train['임신_성공_여부']
IVF_X = IVF_train.drop(columns=['임신_성공_여부', 'ID'])

DI_y = DI_train['임신_성공_여부']
DI_X = DI_train.drop(columns=['임신_성공_여부', 'ID'])

### 인코딩 

In [85]:
# IVF columns that are cast to str and ordinal-encoded further down.
# The English glosses are literal translations of the column names.
IVF_categorical_columns = [
    "시술_시기_코드",  # procedure timing code
    "시술_당시_나이",  # age at time of procedure
    "임신_시도_또는_마지막_임신_경과_연수",  # years since pregnancy attempt / last pregnancy
    "배란_유도_유형",  # ovulation induction type
    "배아_생성_주요_이유",  # main reason for embryo creation
    "난자_출처",  # egg source
    "정자_출처",  # sperm source
    "난자_기증자_나이",  # egg donor age
    "정자_기증자_나이",  # sperm donor age
    "변환된_특정_시술_유형",  # converted specific procedure type
    "채취_해동_차이",  # retrieval-thaw difference
    "해동_혼합_차이",  # thaw-mix difference
    "혼합_이식_차이",  # mix-transfer difference
    "이식_해동_차이",  # transfer-thaw difference
]

In [86]:
# DI columns that are cast to str and ordinal-encoded further down.
# The English glosses are literal translations of the column names.
DI_categorical_columns = [
    "시술_시기_코드",  # procedure timing code
    "시술_당시_나이",  # age at time of procedure
    "임신_시도_또는_마지막_임신_경과_연수",  # years since pregnancy attempt / last pregnancy
    "정자_기증자_나이",  # sperm donor age
    "변환된_특정_시술_유형",  # converted specific procedure type
]

In [87]:
# Cast every categorical column to str, in place, on all four frames
# (train features first, then the test tables).
for _frame, _columns in (
    (IVF_X, IVF_categorical_columns),
    (DI_X, DI_categorical_columns),
    (IVF_test, IVF_categorical_columns),
    (DI_test, DI_categorical_columns),
):
    _frame[_columns] = _frame[_columns].astype(str)

# Ordinal-encode the categorical columns. Each encoder is fitted on the
# training features only; a category not seen during fitting is encoded
# as -1 when the matching test table is transformed.
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])

DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

In [88]:
# Hold out 20% of each training set with a fixed seed.
# NOTE(review): none of these eight split variables is referenced again in
# the visible part of this file — confirm whether they are still needed.
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = train_test_split(
    IVF_X,
    IVF_y,
    test_size=0.2,
    random_state=42,
)
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(
    DI_X,
    DI_y,
    test_size=0.2,
    random_state=42,
)

## Modeling

hold-out(test) 분할까지 포함한 전체 학습 데이터를 사용해서 최종 모델을 학습

In [90]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# LightGBM hyper-parameters, one set per dataset. Both dicts are unpacked
# into lgb.LGBMClassifier(**params) further down.
# NOTE(review): `subsample` is set but `subsample_freq` is not; per the
# LightGBM docs bagging only takes effect when the bagging frequency is
# non-zero — confirm whether row subsampling was meant to be active.
IVF_model_params = dict(
    n_estimators=4471,
    num_leaves=13,
    max_depth=279,
    learning_rate=0.007075124517450591,
    min_child_samples=26,
    subsample=0.29772991936701476,
    colsample_bytree=0.8913054521763838,
    reg_alpha=0.0004860363321690653,
    reg_lambda=311.08056657247363,
    min_split_gain=0.18214905183450955,
    random_state=42,
    boosting_type='gbdt',
    verbose=-1,
)

DI_model_params = dict(
    n_estimators=1816,
    num_leaves=3926,
    max_depth=259,
    learning_rate=0.00238377640011148,
    min_child_samples=1,
    subsample=0.7610056627240331,
    colsample_bytree=0.6655579164853634,
    reg_alpha=0.00025227758337188327,
    reg_lambda=76.744107215122684,
    min_split_gain=0.007773520329665474,
    random_state=42,
    boosting_type='gbdt',
    verbose=-1,
)

# Fit the final LightGBM classifiers on the complete training features and
# predict hard class labels for the matching test table (ID column removed).
IVF_model_final = lgb.LGBMClassifier(**IVF_model_params).fit(IVF_X, IVF_y)
IVF_test_pred = IVF_model_final.predict(IVF_test.drop(columns='ID'))

DI_model_final = lgb.LGBMClassifier(**DI_model_params).fit(DI_X, DI_y)
DI_test_pred = DI_model_final.predict(DI_test.drop(columns='ID'))

# Shuffled 5-fold splitter (seed 42) shared by both datasets when collecting
# the rows that the LightGBM models misclassify.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Accumulators: one DataFrame of misclassified rows is appended per fold.
IVF_wrong_predictions, DI_wrong_predictions = [], []

# Collect misclassified rows via K-fold cross-validation
def collect_wrong_predictions(X, y, wrong_predictions, model_params):
    """Append, per fold, the validation rows of X that LightGBM misclassifies.

    Folds come from the module-level ``kf`` splitter. For every fold a fresh
    ``lgb.LGBMClassifier(**model_params)`` is fitted on the training part and
    used to predict the validation part.

    Args:
        X: feature table; indexed positionally with ``.iloc``.
        y: target values aligned with X; indexed positionally with ``.iloc``.
        wrong_predictions: list that is mutated in place; one slice of X
            (the misclassified validation rows) is appended per fold.
        model_params: keyword arguments for ``lgb.LGBMClassifier``.

    Returns:
        None. Results are delivered through ``wrong_predictions``.
    """
    for fit_rows, holdout_rows in kf.split(X):
        fold_model = lgb.LGBMClassifier(**model_params)
        fold_model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        holdout_pred = fold_model.predict(X.iloc[holdout_rows])
        # Boolean mask over the validation rows: True where the label is wrong.
        missed = holdout_pred != y.iloc[holdout_rows].values

        # Map the mask back to positional indices of X before slicing.
        wrong_predictions.append(X.iloc[holdout_rows[missed]])

# Fill both accumulators with the cross-validated misclassified rows.
collect_wrong_predictions(
    X=IVF_X,
    y=IVF_y,
    wrong_predictions=IVF_wrong_predictions,
    model_params=IVF_model_params,
)
collect_wrong_predictions(
    X=DI_X,
    y=DI_y,
    wrong_predictions=DI_wrong_predictions,
    model_params=DI_model_params,
)

# Autoencoder definition
class Autoencoder(nn.Module):
    """Fully connected autoencoder.

    Layer widths: input_dim -> 32 -> 16 -> 8 -> 16 -> 32 -> input_dim.
    A ReLU follows every hidden layer; the 8-unit bottleneck and the final
    output layer are left linear.
    """

    def __init__(self, input_dim):
        """Build the encoder and decoder stacks for ``input_dim`` features."""
        super().__init__()

        # Encoder layers are created before decoder layers so that parameter
        # initialisation draws from the RNG in encoder-then-decoder order.
        encoder_layers = [
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as x)."""
        return self.decoder(self.encoder(x))

# Train the IVF autoencoder on the misclassified IVF rows.
# NOTE(review): the rows are fed as raw feature values — no scaling step is
# visible in this file; confirm that is intended.
IVF_input_dim = IVF_wrong_predictions[0].shape[1]  # feature-column count
IVF_autoencoder = Autoencoder(IVF_input_dim)
IVF_criterion = nn.MSELoss()  # reconstruction loss
IVF_optimizer = optim.Adam(IVF_autoencoder.parameters(), lr=0.01)

# Stack the per-fold frames into one float32 tensor and batch it.
IVF_data_tensor = torch.tensor(
    pd.concat(IVF_wrong_predictions).values,
    dtype=torch.float32,
)
IVF_dataloader = DataLoader(
    TensorDataset(IVF_data_tensor),
    batch_size=32,
    shuffle=True,
)

# 50 passes over the data. Each batch is a 1-tuple whose only element is the
# feature tensor, which serves as both the input and the target.
for epoch in range(50):
    for batch in IVF_dataloader:
        output = IVF_autoencoder(batch[0])
        loss = IVF_criterion(output, batch[0])
        IVF_optimizer.zero_grad()
        loss.backward()
        IVF_optimizer.step()

# Train the DI autoencoder on the misclassified DI rows.
# NOTE(review): the rows are fed as raw feature values — no scaling step is
# visible in this file; confirm that is intended.
DI_input_dim = DI_wrong_predictions[0].shape[1]  # feature-column count
DI_autoencoder = Autoencoder(DI_input_dim)
DI_criterion = nn.MSELoss()  # reconstruction loss
DI_optimizer = optim.Adam(DI_autoencoder.parameters(), lr=0.01)

# Stack the per-fold frames into one float32 tensor and batch it.
DI_data_tensor = torch.tensor(
    pd.concat(DI_wrong_predictions).values,
    dtype=torch.float32,
)
DI_dataloader = DataLoader(
    TensorDataset(DI_data_tensor),
    batch_size=32,
    shuffle=True,
)

# 50 passes over the data. Each batch is a 1-tuple whose only element is the
# feature tensor, which serves as both the input and the target.
for epoch in range(50):
    for batch in DI_dataloader:
        output = DI_autoencoder(batch[0])
        loss = DI_criterion(output, batch[0])
        DI_optimizer.zero_grad()
        loss.backward()
        DI_optimizer.step()

제출 파일(code22_auto.csv) 생성 완료!


In [141]:
# IVF: reconstruct the test features (ID column removed) with the trained
# autoencoder and score every row by its mean squared reconstruction error.
IVF_test_tensor = torch.tensor(IVF_test.drop(columns=['ID']).values, dtype=torch.float32)
IVF_predicted_output = IVF_autoencoder(IVF_test_tensor).detach().numpy()
IVF_reconstruction_error = (
    (IVF_test.drop(columns=['ID']).values - IVF_predicted_output) ** 2
).mean(axis=1)
# The threshold is the 95th percentile of the test-set errors themselves.
IVF_threshold = np.percentile(IVF_reconstruction_error, 95)

# DI: same procedure with the DI autoencoder.
DI_test_tensor = torch.tensor(DI_test.drop(columns=['ID']).values, dtype=torch.float32)
DI_predicted_output = DI_autoencoder(DI_test_tensor).detach().numpy()
DI_reconstruction_error = (
    (DI_test.drop(columns=['ID']).values - DI_predicted_output) ** 2
).mean(axis=1)
DI_threshold = np.percentile(DI_reconstruction_error, 95)

# Adjust the predicted positive-class probabilities of the rows whose
# reconstruction error exceeds the threshold: a probability below 0.5 is
# shifted up by 0.5, any other probability p is replaced by 1 - p.
# NOTE(review): the autoencoders are fitted on *misclassified* rows, yet the
# rows adjusted here are the ones they reconstruct worst — confirm that the
# direction of this comparison is intended.

# IVF
IVF_test_prob = IVF_model_final.predict_proba(IVF_test.drop(columns='ID'))[:, 1]
IVF_flagged = IVF_reconstruction_error > IVF_threshold
IVF_flagged_prob = IVF_test_prob[IVF_flagged]  # copy of the flagged entries
IVF_test_prob[IVF_flagged] = np.where(
    IVF_flagged_prob < 0.5,
    IVF_flagged_prob + 0.5,
    1 - IVF_flagged_prob,
)

# DI
DI_test_prob = DI_model_final.predict_proba(DI_test.drop(columns='ID'))[:, 1]
DI_flagged = DI_reconstruction_error > DI_threshold
DI_flagged_prob = DI_test_prob[DI_flagged]  # copy of the flagged entries
DI_test_prob[DI_flagged] = np.where(
    DI_flagged_prob < 0.5,
    DI_flagged_prob + 0.5,
    1 - DI_flagged_prob,
)

# Build the final submission: one (ID, probability) table per dataset,
# stacked IVF-first, then written without the index column.
IVF_submission = IVF_test[['ID']].assign(probability=IVF_test_prob)
DI_submission = DI_test[['ID']].assign(probability=DI_test_prob)

submission = pd.concat([IVF_submission, DI_submission])
submission.to_csv('../submission/code22_auto.csv', index=False)

print("제출 파일(code22_auto.csv) 생성 완료!")

제출 파일(code22_auto.csv) 생성 완료!


In [142]:
import pandas as pd

# Read the two submission files to compare.
df1 = pd.read_csv('../submission/code22_auto.csv')
df2 = pd.read_csv('../submission/code22_submit.csv')

# Join them on ID; the suffixes mark which file each probability came from.
merged_df = df1.merge(df2, on='ID', suffixes=('_df1', '_df2'))

# True where both files carry exactly the same probability for an ID.
comparison = merged_df['probability_df1'].eq(merged_df['probability_df2'])

# Keep only the rows whose probabilities differ.
differences = merged_df.loc[~comparison]

# Report how many rows differ.
print(len(differences))

4504


In [143]:
# Preview the first ten differing rows.
differences.iloc[:10]

Unnamed: 0,ID,probability_df1,probability_df2
38,TEST_00039,0.501018,0.001018
50,TEST_00052,0.625715,0.125715
130,TEST_00135,0.501148,0.001148
178,TEST_00184,0.631045,0.131045
200,TEST_00206,0.501132,0.001132
230,TEST_00237,0.501186,0.001186
253,TEST_00260,0.501192,0.001192
263,TEST_00270,0.661021,0.161021
280,TEST_00288,0.501175,0.001175
285,TEST_00293,0.501136,0.001136


데이콘 PUBLIC 0.6957687046

.