In [320]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from datetime import datetime
import time
import math

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [321]:
train = pd.read_csv('../../train.csv')
test = pd.read_csv('../../test.csv')
submission = pd.read_csv('../../sample_submission.csv')

In [None]:
# 결측값을 채울 칼럼 목록
columns_to_fill = [
    '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부',
    '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수',
    '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수',
    '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수',
    '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부'
]

# 시술 유형이 'DI'인 경우에만 결측값을 0으로 채우기
train.loc[train['시술 유형'] == 'DI', columns_to_fill] = train.loc[train['시술 유형'] == 'DI', columns_to_fill].fillna(0)

# 1. '난자 출처'의 결측값을 '본인 제공'으로 채우기
train['난자 출처'].replace('알 수 없음','본인 제공', inplace=True)

# 2. '난자 기증자 나이' 결측값을 시술 당시 나이로 채우기
train.loc[train['난자 기증자 나이'] == '알 수 없음', '난자 기증자 나이'] = train['시술 당시 나이']

# 3. '파트너 정자와 혼합된 난자 수'의 결측값을 0으로 채우기
train['파트너 정자와 혼합된 난자 수'].fillna(0, inplace=True)

# 4. '기증자 정자와 혼합된 난자 수'의 결측값을 1로 채우기
train['기증자 정자와 혼합된 난자 수'].fillna(1, inplace=True)

train['난자 기증자 나이'].fillna(train['시술 당시 나이'], inplace=True)

# 5. '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부'의 결측값을 0으로 채우기
for column in ['동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']:
    train[column].fillna(0, inplace=True)

    # 변경 내용 확인
print(train[['난자 출처', '난자 기증자 나이', '파트너 정자와 혼합된 난자 수', 
            '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부']].head())


   난자 출처 난자 기증자 나이  파트너 정자와 혼합된 난자 수  기증자 정자와 혼합된 난자 수  동결 배아 사용 여부  \
0  본인 제공   만18-34세               5.0               0.0          0.0   
1  본인 제공   만45-50세               1.0               0.0          0.0   
2  본인 제공   만18-34세               7.0               0.0          0.0   
3  본인 제공   만35-37세               4.0               0.0          0.0   
4  본인 제공   만18-34세               6.0               0.0          0.0   

   신선 배아 사용 여부  기증 배아 사용 여부  
0          1.0          0.0  
1          1.0          0.0  
2          1.0          0.0  
3          1.0          0.0  
4          1.0          0.0  


In [324]:
# 결측값을 임시로 'Unknown'으로 채우기 (KNN 적용 전 범주형 데이터 처리)
train = train.fillna('Unknown')

# 범주형 열 식별 및 타입 최적화
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()

# 범주형 데이터에 대해 Label Encoding 수행
label_encoders = {}
for col in categorical_columns:
    train[col] = train[col].astype(str)  # 모든 데이터를 문자열로 변환
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

In [325]:

# 범주형 열 식별 및 타입 최적화
categorical_columns = test.select_dtypes(include=['object']).columns.tolist()

# 범주형 데이터에 대해 Label Encoding 수행
label_encoders = {}
for col in categorical_columns:
    test[col] = test[col].astype(str)  # 모든 데이터를 문자열로 변환
    le = LabelEncoder()
    test[col] = le.fit_transform(test[col])
    label_encoders[col] = le

In [326]:
# 메모리를 절약하기 위해 데이터 타입 변경
for col in train.columns:
    if train[col].dtype == 'int64':
        train[col] = train[col].astype('int32')
    elif train[col].dtype == 'float64':
        train[col] = train[col].astype('float32')

# KNN Imputer 적용 (수치형 데이터 결측값 처리)
knn_imputer = KNNImputer(n_neighbors=3)
train = pd.DataFrame(knn_imputer.fit_transform(train), columns=train.columns)


In [327]:
all_data = pd.concat([train, test], sort=False)
all_data = all_data.drop(columns = 'ID')

In [328]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346418 entries, 0 to 90066
Data columns (total 68 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   시술 시기 코드               346418 non-null  float64
 1   시술 당시 나이               346418 non-null  float64
 2   임신 시도 또는 마지막 임신 경과 연수  259648 non-null  float64
 3   시술 유형                  346418 non-null  float64
 4   특정 시술 유형               346418 non-null  float64
 5   배란 자극 여부               346418 non-null  float64
 6   배란 유도 유형               346418 non-null  float64
 7   단일 배아 이식 여부            344242 non-null  float64
 8   착상 전 유전 검사 사용 여부       257284 non-null  float64
 9   착상 전 유전 진단 사용 여부       344242 non-null  float64
 10  남성 주 불임 원인             346418 non-null  float64
 11  남성 부 불임 원인             346418 non-null  float64
 12  여성 주 불임 원인             346418 non-null  float64
 13  여성 부 불임 원인             346418 non-null  float64
 14  부부 주 불임 원인             346418 non-nul

In [329]:
col = ['임신 시도 또는 마지막 임신 경과 연수','착상 전 유전 검사 사용 여부','PGD 시술 여부','PGS 시술 여부']
all_data = all_data.drop(columns=col)


In [330]:
# 병합할 칼럼들
columns_to_merge = [
    '난자 채취 경과일', '난자 해동 경과일', '난자 혼합 경과일',
    '배아 이식 경과일', '배아 해동 경과일'
]

# 새로운 칼럼 생성: 각 경과일의 합
all_data['총 경과일'] = all_data[columns_to_merge].sum(axis=1)

# 기존 칼럼 제거
all_data = all_data.drop(columns=columns_to_merge)


In [331]:
# 임신 성공 여부 칼럼 지정 (예: 'success_column' 이름 수정 필요)
target_column = '임신 성공 여부'

# 임신 성공 여부 칼럼 제외한 나머지 칼럼들 선택
columns_to_impute = [col for col in all_data.columns if col != target_column]

# 각 칼럼의 최빈값으로 결측값 채우기
for col in columns_to_impute:
    most_frequent_value = all_data[col].mode()[0]  # 최빈값 추출
    all_data[col].fillna(most_frequent_value, inplace=True)  # 결측값 대체

In [332]:
# 각 열의 결측값 개수 확인
missing_values_count = all_data.isnull().sum()

# 결측값이 있는 열만 필터링
missing_values_count = missing_values_count[missing_values_count > 0]

print(missing_values_count)


임신 성공 여부    90067
dtype: int64


In [333]:
all_data

Unnamed: 0,시술 시기 코드,시술 당시 나이,시술 유형,특정 시술 유형,배란 자극 여부,배란 유도 유형,단일 배아 이식 여부,착상 전 유전 진단 사용 여부,남성 주 불임 원인,남성 부 불임 원인,...,난자 출처,정자 출처,난자 기증자 나이,정자 기증자 나이,동결 배아 사용 여부,신선 배아 사용 여부,기증 배아 사용 여부,대리모 여부,임신 성공 여부,총 경과일
0,6.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,13.0
1,5.0,5.0,1.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,1.0,3.0,9.0,6.0,0.0,1.0,0.0,0.0,0.0,18.0
2,3.0,0.0,1.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,12.0
3,2.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,5.0,6.0,0.0,1.0,0.0,0.0,0.0,18.0
4,3.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90062,1.0,0.0,1.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,3.0,4.0,6.0,0.0,1.0,0.0,0.0,,0.0
90063,5.0,4.0,1.0,19.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,3.0,2.0,6.0,1.0,0.0,0.0,0.0,,0.0
90064,3.0,0.0,1.0,11.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,3.0,1.0,6.0,0.0,1.0,0.0,0.0,,5.0
90065,0.0,4.0,1.0,19.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,4.0,1.0,0.0,0.0,0.0,,4.0


In [None]:
# from sklearn.ensemble import RandomForestClassifier

# # 데이터 준비
# x = all_data[~pd.isnull(all_data['임신 성공 여부'])]
# X = x.drop(['임신 성공 여부'], axis=1)
# y = x['임신 성공 여부']


# # 랜덤 포레스트 모델 학습
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X, y)

# # 중요도 기준으로 피처 선택
# feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
# selected_features_rfe = feature_importances[feature_importances > 0.01].index

# # '임신 성공 여부' 칼럼 추가
# selected_features_rfe = list(selected_features_rfe)  # 리스트로 변환
# selected_features_rfe.append('임신 성공 여부')       # '임신 성공 여부' 추가

# print("Selected important features (including target):", selected_features_rfe)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

x = all_data[~pd.isnull(all_data['임신 성공 여부'])]
X = x.drop(['임신 성공 여부'], axis=1)
y = x['임신 성공 여부']

# 로지스틱 회귀 모델과 RFE 사용
model = LogisticRegression(max_iter=1000)
selector = RFE(model, n_features_to_select=30)  # 원하는 피처 수로 설정
selector.fit(X, y)

# 선택된 피처 출력
selected_features_rfe = X.columns[selector.support_]
print("Features selected by RFE:", selected_features_rfe)
# 선택된 피처 출력 및 '임신 성공 여부' 추가
selected_features_rfe = list(X.columns[selector.support_])
selected_features_rfe.append('임신 성공 여부')  # 기존 y 값 추가

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# # 데이터 준비
# x = all_data[~pd.isnull(all_data['임신 성공 여부'])]
# X = x.drop(['임신 성공 여부'], axis=1)
# y = x['임신 성공 여부']

# # 데이터 준비
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # 후진 제거법 구현
# selected_features = list(X.columns)
# best_score = 0

# # 초기 모델 학습
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train[selected_features], y_train)
# y_pred = model.predict(X_test[selected_features])
# best_score = accuracy_score(y_test, y_pred)

# # 반복적으로 피처 제거
# for feature in X.columns:
#     features_to_test = [f for f in selected_features if f != feature]
#     model.fit(X_train[features_to_test], y_train)
#     y_pred = model.predict(X_test[features_to_test])
#     score = accuracy_score(y_test, y_pred)

#     # 모델 성능이 악화되지 않으면 해당 피처 제거
#     if score >= best_score:
#         selected_features.remove(feature)
#         best_score = score

# print("Features selected by Backward Elimination:", selected_features)


Features selected by Backward Elimination: ['특정 시술 유형', '배란 유도 유형', '단일 배아 이식 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 여성 요인', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 면역학적 요인', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '배아 생성 주요 이유', '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '이식된 배아 수', '저장된 배아 수', '해동 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부']


In [None]:
# selected_features = list(selected_features)
# selected_features.append('임신 성공 여부')

In [339]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# # 데이터 준비
# x = all_data[~pd.isnull(all_data['임신 성공 여부'])]
# X = x.drop(['임신 성공 여부'], axis=1)
# y = x['임신 성공 여부']

# # 데이터 분할
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # 전진 선택법 구현
# selected_features = []
# remaining_features = list(X.columns)
# best_score = 0

# while remaining_features:
#     scores = {}
#     for feature in remaining_features:
#         features_to_test = selected_features + [feature]
#         model = LogisticRegression(max_iter=1000)
#         model.fit(X_train[features_to_test], y_train)
#         y_pred = model.predict(X_test[features_to_test])
#         scores[feature] = accuracy_score(y_test, y_pred)

#     # 가장 성능이 좋은 피처 선택
#     best_feature = max(scores, key=scores.get)
#     if scores[best_feature] > best_score:
#         selected_features.append(best_feature)
#         remaining_features.remove(best_feature)
#         best_score = scores[best_feature]
#     else:
#         break

# print("Features selected by Forward Selection:", selected_features)


In [None]:
# # 선택된 피처만 남기기
all_data = all_data[selected_features_rfe]

In [341]:
all_data['임신 성공 여부'].value_counts()

0.0    190123
1.0     66228
Name: 임신 성공 여부, dtype: int64

In [342]:
all_data

Unnamed: 0,특정 시술 유형,배란 유도 유형,단일 배아 이식 여부,착상 전 유전 진단 사용 여부,남성 주 불임 원인,남성 부 불임 원인,여성 주 불임 원인,여성 부 불임 원인,부부 주 불임 원인,부부 부 불임 원인,...,파트너 정자와 혼합된 난자 수,난자 출처,정자 출처,난자 기증자 나이,정자 기증자 나이,동결 배아 사용 여부,신선 배아 사용 여부,기증 배아 사용 여부,대리모 여부,임신 성공 여부
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0
1,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,9.0,6.0,0.0,1.0,0.0,0.0,0.0
2,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,3.0,5.0,6.0,0.0,1.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,1.0,3.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90062,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,1.0,3.0,4.0,6.0,0.0,1.0,0.0,0.0,
90063,19.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,2.0,6.0,1.0,0.0,0.0,0.0,
90064,11.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.0,0.0,3.0,1.0,6.0,0.0,1.0,0.0,0.0,
90065,19.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,4.0,4.0,1.0,0.0,0.0,0.0,


In [343]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

train_X = all_data[~pd.isnull(all_data['임신 성공 여부'])]

test_x = all_data[pd.isnull(all_data['임신 성공 여부'])]
test_x = test_x.drop(columns=['임신 성공 여부'])

# 타겟 변수를 제외한 피처들만 사용
features = train_X.drop(columns=['임신 성공 여부'])

# 데이터 정규화
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_test_x = scaler.transform(test_x)

# PCA 적용
pca = PCA(n_components=0.95)  # 설명 분산의 93%를 유지하도록 설정
pca_features = pca.fit_transform(scaled_features)
pca_features_test = pca.transform(scaled_test_x)

# PCA 적용 후 데이터셋의 형태 확인
pca_features_shape = pca_features.shape

# 설명된 분산 비율
explained_variance = pca.explained_variance_ratio_

pca_features_shape, explained_variance

((256351, 30),
 array([0.14889436, 0.0805341 , 0.06234663, 0.05245558, 0.05199453,
        0.04215983, 0.03864843, 0.03486533, 0.03270335, 0.03216755,
        0.02966712, 0.02743274, 0.02573106, 0.02308381, 0.02254547,
        0.02222673, 0.02218753, 0.02177503, 0.0212452 , 0.02117128,
        0.02000876, 0.01921897, 0.01819703, 0.01648422, 0.01305544,
        0.01284842, 0.01160233, 0.01073104, 0.01031732, 0.01022606]))

In [344]:
target = train_X['임신 성공 여부']

# 데이터 분리 (학습용 80%, 검증용 20%)
X_train, X_val, y_train, y_val = train_test_split(pca_features, target, test_size=0.2, random_state=42)


In [345]:
# SMOTE 적용
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# 결과 확인
print("After SMOTE:")
print(y_train_resampled.value_counts())

After SMOTE:
0.0    151978
1.0    151978
Name: 임신 성공 여부, dtype: int64


In [346]:
target.value_counts()

0.0    190123
1.0     66228
Name: 임신 성공 여부, dtype: int64

In [347]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

# Random Forest 모델 생성
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# K-Fold 교차 검증 설정
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 교차 검증을 통해 ROC AUC 점수 계산
roc_auc_scores = cross_val_score(rf, X_train_resampled, y_train_resampled, cv=kf, scoring='roc_auc', n_jobs=-1)

# 평균 ROC AUC 점수 계산 및 출력
mean_roc_auc = np.mean(roc_auc_scores)

print("ROC AUC scores from each fold:", roc_auc_scores)
print("Mean ROC AUC:", mean_roc_auc)


ROC AUC scores from each fold: [0.82249678 0.82292541 0.82213246 0.8215474  0.82246967]
Mean ROC AUC: 0.8223143449576298


In [348]:
# 모델 최종 학습 및 테스트 예측
rf.fit(X_train_resampled, y_train_resampled)

test_predictions = rf.predict(pca_features_test)

In [349]:
# 결과를 DataFrame으로 저장
result = pd.DataFrame({
    'ID': submission['ID'].values,  # Set ID를 원래대로 유지
    'probability': test_predictions
})
result.to_csv("../../submission.csv", index=False)

In [350]:
result

Unnamed: 0,ID,probability
0,TEST_00000,0.0
1,TEST_00001,0.0
2,TEST_00002,0.0
3,TEST_00003,0.0
4,TEST_00004,0.0
...,...,...
90062,TEST_90062,0.0
90063,TEST_90063,0.0
90064,TEST_90064,1.0
90065,TEST_90065,1.0


In [351]:
unique, counts = np.unique(result['probability'], return_counts=True)
print(unique, counts)

[0. 1.] [64963 25104]
