<a href="https://colab.research.google.com/github/Byeon-MJ/Dacon_SNP_Repo/blob/main/Dacon_SNP_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

## Fix Random Seed

In [2]:
class CFG:
    SEED = 42

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정

## Dataset Load

In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
train = pd.read_csv('https://raw.githubusercontent.com/Byeon-MJ/Dacon_SNP_Repo/main/dacon_SNP_dataset/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Byeon-MJ/Dacon_SNP_Repo/main/dacon_SNP_dataset/test.csv')
info = pd.read_csv('https://raw.githubusercontent.com/Byeon-MJ/Dacon_SNP_Repo/main/dacon_SNP_dataset/snp_info.csv')

In [6]:
train.head()

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,0,0,0,2,G G,G G,C C,A A,C C,...,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


In [7]:
train.describe(include='all')

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
count,262,262.0,262.0,262.0,262.0,262,262,262,262,262,...,262,262,262,262,262,262,262,262,262,262
unique,262,,,,,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
top,TRAIN_000,,,,,G G,G G,A A,A A,A A,...,A A,G A,A A,G G,A G,A A,G G,A A,A A,B
freq,1,,,,,141,108,122,120,94,...,163,96,182,151,96,136,115,185,107,114
mean,,0.0,0.0,0.0,1.736641,,,,,,...,,,,,,,,,,
std,,0.0,0.0,0.0,0.441298,,,,,,...,,,,,,,,,,
min,,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,
25%,,0.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,,
50%,,0.0,0.0,0.0,2.0,,,,,,...,,,,,,,,,,
75%,,0.0,0.0,0.0,2.0,,,,,,...,,,,,,,,,,


In [8]:
test.describe(include='all')

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
count,175,175.0,175.0,175.0,175.0,175,175,175,175,175,175,175,175,175,175,175,175,175,175,175
unique,175,,,,,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
top,TEST_000,,,,,G G,G G,A A,G A,C C,A G,A A,A A,A A,G G,A G,A A,G G,A A,G A
freq,1,,,,,84,81,82,76,62,80,101,64,120,97,68,93,76,120,68
mean,,0.0,0.0,0.0,1.708571,,,,,,,,,,,,,,,
std,,0.0,0.0,0.0,0.455724,,,,,,,,,,,,,,,
min,,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,
25%,,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,
50%,,0.0,0.0,0.0,2.0,,,,,,,,,,,,,,,
75%,,0.0,0.0,0.0,2.0,,,,,,,,,,,,,,,


In [9]:
info

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


### Unused Columns Drop

In [10]:
train.drop(['father', 'mother', 'gender'], axis=1, inplace=True)
test.drop(['father', 'mother', 'gender'], axis=1, inplace=True)

In [11]:
train.head()

Unnamed: 0,id,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


In [12]:
info.head()

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512


In [13]:
info.describe()

Unnamed: 0,chrom,cm,pos
count,15.0,15.0,15.0
mean,6.8,67.321736,53765240.0
std,1.971222,23.633314,23110740.0
min,2.0,1.78774,814291.0
25%,6.0,63.0822,43818220.0
50%,6.0,68.2892,59692850.0
75%,8.5,80.6877,65773960.0
max,10.0,97.1731,92485680.0


## Data Pre-processing

## Set Index

In [14]:
def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x

In [15]:
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [16]:
train_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A


In [17]:
train_y.head()

0    B
1    C
2    B
3    A
4    C
Name: class, dtype: object

In [18]:
test_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,2,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,2,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
3,2,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
4,1,A A,G G,A A,G G,A A,G G,G G,A A,G G,A G,G G,G A,G G,A A,G G


### Label Encoding

In [19]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [20]:
train_data = []
for col in snp_col:
    train_data += list(train_x[col].values)

In [21]:
train_y = class_le.fit_transform(train_y)
snp_le.fit(train_data)

LabelEncoder()

In [22]:
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

In [23]:
train_x.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,5,1,0,4,2,0,0,5,0,5,1,0,0,0,0
1,2,1,1,2,0,0,1,0,4,0,1,0,4,5,0,0
2,2,5,5,0,4,3,5,0,4,4,1,0,0,0,0,0
3,1,0,5,0,4,0,5,5,0,5,1,5,5,5,0,5
4,2,5,5,3,0,3,0,0,0,0,5,0,0,1,0,4


## Modeling

### Simple LightGBM

In [24]:
# lgb_clf = LGBMClassifier(random_state=42)

In [25]:
# lgb_clf.fit(train_x, train_y)

LGBMClassifier(random_state=42)

In [26]:
# test_pred_simple = lgb_clf.predict(test_x)

### LightGBM + GridSearchCV

In [27]:
# parameter = {
#     'learning_rate' : [0.01, 0.1, 0.3, 0.5, 0.7],
#     'max_depth' : [5, 7, 10, 30, 50],
#     'subsample' : [0.5, 0.6, 0.7, 0.8, 1],
#     'n_estimators' : [100, 200, 300, 500, 1000]
# }

In [28]:
# lgb_clf = LGBMClassifier(random_state=42)

In [29]:
# gs_model = GridSearchCV(
#     estimator = lgb_clf, param_grid = parameter, scoring='f1_macro', cv=5
# )

In [30]:
# gs_model.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=42),
             param_grid={'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7],
                         'max_depth': [5, 7, 10, 30, 50],
                         'n_estimators': [100, 200, 300, 500, 1000],
                         'subsample': [0.5, 0.6, 0.7, 0.8, 1]},
             scoring='f1_macro')

In [31]:
# test_pred_gs = gs_model.predict(test_x)

### LightGBM + HyperOpt

In [36]:
from hyperopt import hp
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK, fmin, tpe, Trials

In [50]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [51]:
lgb_search_space = {'n_estimators' : hp.quniform('n_estimators', 100, 1000, 50),
                    'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                    'max_depth':hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                    'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1),
                    'subsample' : hp.quniform('subsample', 0.5, 1, 0.1),
                    'num_leaves' : hp.quniform('num_leaves', 10, 50, 5)
                    # 'lambda_l1' : 
                    # 'lambda_l2' :
                    }

In [63]:
def objective_func(search_space):
    lgb_clf = LGBMClassifier(n_estimators=int(search_space['n_estimators']),
                             learning_rate = search_space['learning_rate'],
                             max_depth=int(search_space['max_depth']),
                             min_child_weight=int(search_space['min_child_weight']),
                             colsample_bytree=search_space['colsample_bytree'],
                             subsample = search_space['subsample'],
                             num_leaves = int(search_space['num_leaves']),
                             eval_metric='logloss')
    accuracy = cross_val_score(lgb_clf, X_train, y_train, scoring='f1_macro', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))

    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}

In [64]:
trial_val = Trials()
best = fmin(fn=objective_func, space=lgb_search_space,
            algo=tpe.suggest, max_evals=50, trials=trial_val, rstate=np.random.seed(42))
print('best:', best)

100%|██████████| 50/50 [00:34<00:00,  1.44it/s, best loss: -0.9463268846586924]
best: {'colsample_bytree': 0.8196609504174855, 'learning_rate': 0.01320599503574755, 'max_depth': 6.0, 'min_child_weight': 2.0, 'n_estimators': 350.0, 'num_leaves': 30.0, 'subsample': 0.5}


In [65]:
lgb_clf = LGBMClassifier(colsample_bytree = best['colsample_bytree'],
                         learning_rate = best['learning_rate'],
                         max_depth = int(best['max_depth']),
                         min_child_weight = int(best['min_child_weight']),
                         n_estimators = int(best['n_estimators']),
                         num_leaves = int(best['num_leaves']),
                         subsample = best['subsample']
                         )

In [66]:
lgb_clf.fit(train_x, train_y, early_stopping_rounds=50, eval_metric='logloss', 
            eval_set=[(X_train, y_train), (X_val, y_val)], verbose=True)

[1]	valid_0's multi_logloss: 1.06136	valid_0's multi_logloss: 1.06136	valid_1's multi_logloss: 1.06125	valid_1's multi_logloss: 1.06125
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 1.04729	valid_0's multi_logloss: 1.04729	valid_1's multi_logloss: 1.04644	valid_1's multi_logloss: 1.04644
[3]	valid_0's multi_logloss: 1.03445	valid_0's multi_logloss: 1.03445	valid_1's multi_logloss: 1.03255	valid_1's multi_logloss: 1.03255
[4]	valid_0's multi_logloss: 1.02118	valid_0's multi_logloss: 1.02118	valid_1's multi_logloss: 1.01907	valid_1's multi_logloss: 1.01907
[5]	valid_0's multi_logloss: 1.00781	valid_0's multi_logloss: 1.00781	valid_1's multi_logloss: 1.00624	valid_1's multi_logloss: 1.00624
[6]	valid_0's multi_logloss: 0.995074	valid_0's multi_logloss: 0.995074	valid_1's multi_logloss: 0.992755	valid_1's multi_logloss: 0.992755
[7]	valid_0's multi_logloss: 0.982285	valid_0's multi_logloss: 0.982285	valid_1's multi_logloss: 0.97899	valid_1's mul

LGBMClassifier(colsample_bytree=0.8196609504174855,
               learning_rate=0.01320599503574755, max_depth=6,
               min_child_weight=2, n_estimators=350, num_leaves=30,
               subsample=0.5)

In [67]:
train_pred = lgb_clf.predict(X_train)
val_pred = lgb_clf.predict(X_val)

In [69]:
train_f1 = f1_score(y_train, train_pred, average='macro')
val_f1 = f1_score(y_val, val_pred, average='macro')

In [70]:
print(train_f1)
print(val_f1)

0.9730014025245443
1.0


In [71]:
test = lgb_clf.predict(test_x)

## Submission

In [75]:
submit = pd.read_csv('https://raw.githubusercontent.com/Byeon-MJ/Dacon_SNP_Repo/main/dacon_SNP_dataset/sample_submission.csv')

In [76]:
submit['class'] = class_le.inverse_transform(test)

In [77]:
submit.to_csv('submit.csv', index=False)