<a href="https://colab.research.google.com/github/Byeon-MJ/Dacon_SNP_Repo/blob/main/SNP_Ensemble_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
from sklearn import preprocessing

In [None]:
# Mount Google Drive at /content/gdrive; the dataset paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

## Seed 고정

In [None]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed handed to seed_everything() right after this cell.
    SEED: int = 42

In [None]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    seed_text = str(seed)
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so this likely affects child processes only — confirm.
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)
seed_everything(CFG.SEED) # Fix the random seeds using the configured value

## DataSet Load

In [None]:
# Read the three competition CSV files from the mounted Drive folder.
# `train` and `test` are split into features/target below; `info` is loaded
# here but is not referenced again in the visible cells.
train = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/train.csv')
test = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/test.csv')
info = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/snp_info.csv')

In [None]:
def get_x_y(df):
    """Split a competition frame into features and, when present, the target.

    Args:
        df: DataFrame containing an ``id`` column and optionally a ``class`` column.

    Returns:
        ``(features, target)`` when ``df`` has a ``class`` column, where
        ``features`` is ``df`` without ``id``/``class`` and ``target`` is the
        ``class`` column; otherwise only ``features`` (``df`` without ``id``).
    """
    # Unlabelled frame (e.g. the test set): there is no target to return.
    if 'class' not in df.columns:
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    target = df['class']
    return features, target

In [None]:
# `train` has a `class` column, so get_x_y returns a (features, target) pair;
# for `test` only the feature frame is returned.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

## Label Encoding

In [None]:
# Two separate encoders: `class_le` for the target labels and `snp_le`, which
# is shared by every SNP column.
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
# Column names SNP_01 ... SNP_15 (index zero-padded to two digits).
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [None]:
# Pool the values of every SNP column of the training features into one flat
# list (column after column) so a single encoder can be fitted on all of them.
train_data = [value for col in snp_col for value in train_x[col].values]

In [None]:
# Encode the target from the raw `train` frame instead of from `train_y`
# itself. Overwriting `train_y` with its own encoding made this cell
# non-idempotent: a second run re-fitted `class_le` on integers, after which
# `class_le.inverse_transform` no longer returned the original class labels.
train_y = class_le.fit_transform(train['class'])
# Fit the shared SNP encoder on the pooled values of all SNP columns.
snp_le.fit(train_data)

In [None]:
# Replace each SNP column with its integer encoding. The values are read from
# the raw `train`/`test` frames (same rows as `train_x`/`test_x`, which were
# derived from them by dropping columns), so re-running this cell does not feed
# already-encoded integers back into `snp_le.transform`, which would raise on
# labels the encoder has never seen.
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train[col])
        test_x[col] = snp_le.transform(test[col])

## Data Split

In [None]:
# Hold out 20% of the training rows for validation. The split is seeded
# explicitly so re-running this cell reproduces the same partition, and
# stratified on the target so both parts keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.2, random_state=CFG.SEED, stratify=train_y)

## 개별 ML 모델 생성

In [None]:
# Base learners, each with 100 estimators. Every model gets an explicit
# random_state so its result does not depend on how much of the global NumPy
# random stream earlier cells have consumed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=CFG.SEED)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=CFG.SEED)
xgb_clf = XGBClassifier(n_estimators=100, random_state=CFG.SEED)
lgb_clf = LGBMClassifier(n_estimators=100, random_state=CFG.SEED)

# Meta-model fitted on the base learners' predictions.
lr_final = LogisticRegression()

## 개별 모델 학습

In [None]:
# Train every base learner on the same training split, in a fixed order.
for base_clf in (rf_clf, ada_clf, xgb_clf, lgb_clf):
    base_clf.fit(X_train, y_train)

## 예측 데이터 세트 생성, 개별 모델 정확도 측정

In [None]:
# Predict the validation split with each base learner.
rf_pred, ada_pred, xgb_pred, lgb_pred = (
    clf.predict(X_val) for clf in (rf_clf, ada_clf, xgb_clf, lgb_clf)
)

# Print the validation accuracy of each model, one line per model.
for message, y_hat in (
    ('랜덤 포레스트 정확도: {0:.4f}', rf_pred),
    ('에이다부스트 정확도: {0:.4f}', ada_pred),
    ('XGBoost 정확도: {0:.4f}', xgb_pred),
    ('LightGBM 정확도: {0:.4f}', lgb_pred),
):
    print(message.format(accuracy_score(y_val, y_hat)))

In [None]:
# Stack the four prediction vectors as rows: one row per base model.
pred = np.stack((rf_pred, ada_pred, xgb_pred, lgb_pred), axis=0)
print(pred.shape)

In [None]:
# Build the (n_samples, n_models) meta-feature matrix directly from the base
# predictions. Transposing `pred` in place made this cell non-idempotent:
# running it a second time flipped the matrix back to (n_models, n_samples).
pred = np.column_stack((rf_pred, ada_pred, xgb_pred, lgb_pred))
print(pred.shape)

In [None]:
# Out-of-sample estimate for the meta-model: cross-validate it on the
# validation-set meta-features. cross_val_score fits clones, so `lr_final`
# itself is left untouched by this step.
cv_scores = cross_val_score(lr_final, pred, y_val, cv=3, scoring='accuracy')
print('최종 메타 모델의 교차 검증 정확도: {0:.4f}'.format(cv_scores.mean()))

# Fit the meta-model on all validation-set meta-features for later use.
lr_final.fit(pred, y_val)
final = lr_final.predict(pred)

# This score is computed on the very rows the meta-model was just fitted on,
# so it is an in-sample figure; use the cross-validated score above to judge
# generalisation.
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_val, final)))

# Submission

In [None]:
# Predict the test set with each base learner. This rebinds the *_pred names
# that previously held the validation predictions.
rf_pred, ada_pred, xgb_pred, lgb_pred = (
    clf.predict(test_x) for clf in (rf_clf, ada_clf, xgb_clf, lgb_clf)
)

# One row per test sample, one column per base model.
pred = np.array([rf_pred, ada_pred, xgb_pred, lgb_pred]).T

# Final labels from the fitted meta-model.
test_pred = lr_final.predict(pred)

In [None]:
# Load the sample submission file; its `class` column is overwritten below.
submit = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/sample_submission.csv')

In [None]:
# Map the encoded predictions back to the original class labels.
submit['class'] = class_le.inverse_transform(test_pred)

In [None]:
# Write the submission to Drive without the DataFrame index column.
submit.to_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/submit.csv', index=False)

## 최종 메타 모델이 사용할 학습 및 테스트용 데이터 생성 함수

In [None]:
def _majority_vote(fold_preds):
    """Return the most frequent value of each row of a 2-D array (ties -> smallest value)."""
    voted = np.empty(fold_preds.shape[0], dtype=fold_preds.dtype)
    for row_idx, row in enumerate(fold_preds):
        values, counts = np.unique(row, return_counts=True)
        voted[row_idx] = values[np.argmax(counts)]
    return voted


def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds,
                               test_agg='vote'):
    """Create the meta-model train/test features for one base model via K-fold.

    For every fold the model is re-fitted on the training part of the fold,
    predicts the held-out part (out-of-fold prediction) and predicts the whole
    test set. The per-fold test predictions are then aggregated into a single
    column.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            re-fitted in place once per fold.
        X_train_n: Training features; must support ``.iloc`` row selection
            and ``.shape``.
        y_train_n: Training targets, one per training row. Indexed by
            position, so a pandas Series with any index is accepted.
        X_test_n: Test features passed to ``model.predict`` in every fold.
        n_folds: Number of K-fold splits.
        test_agg: How the per-fold test predictions are combined.
            ``'vote'`` (default) takes the most frequent predicted value per
            row, which keeps the test features on the same scale as the
            hard-label out-of-fold features. ``'mean'`` takes the arithmetic
            mean per row; averaging class labels is not meaningful for more
            than two classes, so use it only for numeric predictions.

    Returns:
        Tuple ``(train_fold_pred, test_pred_agg)`` of arrays with shapes
        ``(n_train, 1)`` and ``(n_test, 1)``.

    Raises:
        ValueError: If ``test_agg`` is neither ``'vote'`` nor ``'mean'``.
    """
    if test_agg not in ('vote', 'mean'):
        raise ValueError("test_agg must be 'vote' or 'mean', got %r" % (test_agg,))

    # Unshuffled K-fold: folds are contiguous row ranges.
    kf = KFold(n_splits=n_folds, shuffle=False)

    # Positional view of the targets, so the fold indices (which are positions)
    # work even when y_train_n is a Series with a non-default index.
    y_values = np.asarray(y_train_n)

    # Buffers for the out-of-fold predictions and the per-fold test predictions.
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, ' model 시작 ')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        # Select this fold's training rows and held-out rows.
        print('\t 폴드 세트: ', folder_counter, ' 시작 ')
        X_tr = X_train_n.iloc[train_index]
        y_tr = y_values[train_index]
        X_te = X_train_n.iloc[valid_index]

        # Fit on the fold's training part only.
        model.fit(X_tr, y_tr)
        # Store the out-of-fold predictions at the held-out row positions.
        train_fold_pred[valid_index, :] = np.asarray(model.predict(X_te)).reshape(-1, 1)
        # Store this fold's predictions for the full test set in its own column.
        test_pred[:, folder_counter] = model.predict(X_test_n)

    # Collapse the per-fold test predictions into one column.
    if test_agg == 'mean':
        test_pred_agg = np.mean(test_pred, axis=1).reshape(-1, 1)
    else:
        test_pred_agg = _majority_vote(test_pred).reshape(-1, 1)

    # train_fold_pred feeds the meta-model's training; test_pred_agg its inference.
    return train_fold_pred, test_pred_agg

In [None]:
# Fresh, unfitted base learners for the K-fold stacking run. Each gets an
# explicit random_state so its result does not depend on how much of the
# global NumPy random stream earlier cells have consumed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=CFG.SEED)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=CFG.SEED)
xgb_clf = XGBClassifier(n_estimators=100, random_state=CFG.SEED)
lgb_clf = LGBMClassifier(n_estimators=100, random_state=CFG.SEED)

# Fresh meta-model for the stacking features.
lr_final = LogisticRegression()

In [None]:
# Run the 7-fold stacking helper once per base learner, in a fixed order, and
# unpack each (train features, test features) pair into its own names.
(rf_train, rf_test), (ada_train, ada_test), (xgb_train, xgb_test), (lgb_train, lgb_test) = [
    get_stacking_base_datasets(base_clf, train_x, train_y, test_x, 7)
    for base_clf in (rf_clf, ada_clf, xgb_clf, lgb_clf)
]

In [None]:
# Place the per-model feature columns side by side: one column per base learner.
Stack_final_X_train = np.hstack((rf_train, ada_train, xgb_train, lgb_train))
Stack_final_X_test = np.hstack((rf_test, ada_test, xgb_test, lgb_test))

In [None]:
# Fit the meta-model on the stacked training features, then predict the
# stacked test features with the fitted model.
stack_final = lr_final.fit(Stack_final_X_train, train_y).predict(Stack_final_X_test)

## Submission

In [None]:
# Predict the test labels with the stacking meta-model. This is the same call
# that produced `stack_final` in the previous cell.
test_pred = lr_final.predict(Stack_final_X_test)

In [None]:
# Reload the sample submission file; its `class` column is overwritten below.
submit = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/sample_submission.csv')

In [None]:
# Map the encoded stacking predictions back to the original class labels.
submit['class'] = class_le.inverse_transform(test_pred)

In [None]:
# NOTE(review): this writes to the same submit.csv path as the earlier
# submission cell, so the stacking result replaces that file — confirm intended.
submit.to_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/submit.csv', index=False)