<a href="https://colab.research.google.com/github/Byeon-MJ/Dacon_SNP_Repo/blob/main/Dacon_SNP_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module Import

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

## Fix Random Seed

In [None]:
class CFG:
    SEED = 42

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정

## Dataset Load

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
train = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/train.csv')
test = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/test.csv')
info = pd.read_csv('/content/gdrive/MyDrive/Project/Dacon_SNP/dataset/snp_info.csv')

In [None]:
train.head()

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,0,0,0,2,G G,G G,C C,A A,C C,...,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


In [None]:
train.describe(include='all')

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
count,262,262.0,262.0,262.0,262.0,262,262,262,262,262,...,262,262,262,262,262,262,262,262,262,262
unique,262,,,,,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
top,TRAIN_000,,,,,G G,G G,A A,A A,A A,...,A A,G A,A A,G G,A G,A A,G G,A A,A A,B
freq,1,,,,,141,108,122,120,94,...,163,96,182,151,96,136,115,185,107,114
mean,,0.0,0.0,0.0,1.736641,,,,,,...,,,,,,,,,,
std,,0.0,0.0,0.0,0.441298,,,,,,...,,,,,,,,,,
min,,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,
25%,,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,
50%,,0.0,0.0,0.0,2.0,,,,,,...,,,,,,,,,,
75%,,0.0,0.0,0.0,2.0,,,,,,...,,,,,,,,,,


In [None]:
test.describe(include='all')

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
count,175,175.0,175.0,175.0,175.0,175,175,175,175,175,175,175,175,175,175,175,175,175,175,175
unique,175,,,,,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
top,TEST_000,,,,,G G,G G,A A,G A,C C,A G,A A,A A,A A,G G,A G,A A,G G,A A,G A
freq,1,,,,,84,81,82,76,62,80,101,64,120,97,68,93,76,120,68
mean,,0.0,0.0,0.0,1.708571,,,,,,,,,,,,,,,
std,,0.0,0.0,0.0,0.455724,,,,,,,,,,,,,,,
min,,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,
25%,,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,
50%,,0.0,0.0,0.0,2.0,,,,,,,,,,,,,,,
75%,,0.0,0.0,0.0,2.0,,,,,,,,,,,,,,,


In [None]:
info

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


### Unused Columns Drop

In [None]:
train.drop(['father', 'mother', 'gender'], axis=1, inplace=True)
test.drop(['father', 'mother', 'gender'], axis=1, inplace=True)

In [None]:
train.head()

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


In [None]:
info.head()

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512


In [None]:
info.describe()

Unnamed: 0,chrom,cm,pos
count,15.0,15.0,15.0
mean,6.8,67.321736,53765240.0
std,1.971222,23.633314,23110740.0
min,2.0,1.78774,814291.0
25%,6.0,63.0822,43818220.0
50%,6.0,68.2892,59692850.0
75%,8.5,80.6877,65773960.0
max,10.0,97.1731,92485680.0


## Data Pre-processing

## Set Index

In [None]:
def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x

In [None]:
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [None]:
train_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A


In [None]:
train_y.head()

0    B
1    C
2    B
3    A
4    C
Name: class, dtype: object

In [None]:
test_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,2,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,2,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
3,2,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
4,1,A A,G G,A A,G G,A A,G G,G G,A A,G G,A G,G G,G A,G G,A A,G G


### Label Encoding

In [None]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [None]:
train_data = []
for col in snp_col:
    train_data += list(train_x[col].values)

In [None]:
train_y = class_le.fit_transform(train_y)
snp_le.fit(train_data)

LabelEncoder()

In [None]:
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

In [None]:
train_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,5,1,0,4,2,0,0,5,0,5,1,0,0,0,0
1,2,1,1,2,0,0,1,0,4,0,1,0,4,5,0,0
2,2,5,5,0,4,3,5,0,4,4,1,0,0,0,0,0
3,1,0,5,0,4,0,5,5,0,5,1,5,5,5,0,5
4,2,5,5,3,0,3,0,0,0,0,5,0,0,1,0,4


## Modeling

### Simple LightGBM

In [None]:
lgb_clf = LGBMClassifier(random_state=42)

In [None]:
lgb_clf.fit(train_x, train_y)

LGBMClassifier(random_state=42)

In [None]:
test_pred_simple = lgb_clf.predict(test_x)

### LightGBM + GridSearchCV

In [None]:
parameter = {
    'learning_rate' : [0.01, 0.1, 0.3, 0.5, 0.7],
    'max_depth' : [5, 7, 10, 30, 50],
    'subsample' : [0.5, 0.6, 0.7, 0.8, 1],
    'n_estimators' : [100, 200, 300, 500, 1000]
}

In [None]:
lgb_clf = LGBMClassifier(random_state=42)

In [None]:
gs_model = GridSearchCV(
    estimator = lgb_clf, param_grid = parameter, scoring='f1_macro', cv=5
)

In [None]:
gs_model.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=42),
             param_grid={'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7],
                         'max_depth': [5, 7, 10, 30, 50],
                         'n_estimators': [100, 200, 300, 500, 1000],
                         'subsample': [0.5, 0.6, 0.7, 0.8, 1]},
             scoring='f1_macro')

In [None]:
test_pred_gs = gs_model.predict(test_x)

### LightGBM + HyperOpt

In [None]:
lgb_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                    'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                    'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1),
                    'subsample' : hp.quniform('subsample', 0.5, 1, 0.1),
                    'n_estimators' : hp.quniform('n_estimators', 100, 1000, 50)
                    }