# 라이브러리 로드

In [4]:
import numpy as np
import pandas as pd
from kaggler.data_io import load_data

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Hyperparameter tunning

아래의 항목에 대해서 Hyperparameter tunning을 한다.

여기서는 GridSearch를 통해서 tunning 예시를 보여줌.

- logistic regression

- decision tree

- random forest

- extra trees

In [8]:
# j1 feature 사용 예시
train_file = '../build/feature/j1.trn.h5'

X, y = load_data(train_file)

## logistic regression

In [14]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lr_param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2'],
    'C': np.linspace(-4, 4, 20)}

lr = LogisticRegression()

lr_gscv = GridSearchCV(lr, param_grid=lr_param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1)

lr_gscv.fit(X, y)

lr_best = lr_gscv.best_estimator_

Fitting 5 folds for each of 120 candidates, totalling 600 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.81481388 0.8271546  0.8271546  0.8271546
        nan        nan 0.8271546  0.82939552 0.82939552 0.82939552
        nan        nan 0.8294018  0.82938924 0.82938924 0.82939552
        nan        nan 0.82828448 0.8282782  0.8282782  0.8282782
        nan        nan 0.8294018  0.8282782  0.8282782  0.8282782
        nan        nan 0.8282782  0.8282782  0.8282782  0.8271608

In [18]:
lr_gscv.best_params_

{'C': 1.0526315789473681, 'penalty': 'l1', 'solver': 'liblinear'}

In [19]:
lr_gscv.best_score_

0.8294017952419811

## Decision Tree

In [20]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1, 10),
    'min_samples_split': np.arange(1, 10),
    'min_samples_leaf': np.arange(1, 5)}

dt = DecisionTreeClassifier()

dt_gscv = GridSearchCV(dt, param_grid=dt_param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1)

dt_gscv.fit(X, y)

dt_best = dt_gscv.best_estimator_

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


 0.78228611 0.78228611 0.78228611        nan 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
        nan 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611        nan 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
        nan 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611        nan 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
        nan 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611        nan 0.78228611 0.78228611
 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611 0.78228611
        nan 0.82603729 0.82603729 0.82491369 0.82491369 0.82603729
 0.82603729 0.82603729 0.82603729        nan 0.82603729 0.82603729
 0.82491369 0.82603729 0.82603729 0.82491369 0.82603729 0.82491369
        nan 0.82491369 0.82603729 0.82603729 0.82491369 0.8249

In [28]:
dt_gscv.best_params_

{'criterion': 'entropy',
 'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 9}

In [29]:
dt_gscv.best_score_

0.8339150084740444

## Random Forest

In [30]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators" :[100,300],
    "criterion": ["gini"]}

rf = RandomForestClassifier()

rf_gscv = GridSearchCV(rf, param_grid=rf_param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1)

rf_gscv.fit(X, y)

rf_best = rf_gscv.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [31]:
rf_gscv.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 100}

In [32]:
rf_gscv.best_score_

0.8439897056054233

## Extra trees

In [33]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

et_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators" :[100,300],
    "criterion": ["gini"]}

et = ExtraTreesClassifier()

et_gscv = GridSearchCV(et, param_grid=et_param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1)

et_gscv.fit(X, y)

et_best = et_gscv.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [34]:
et_gscv.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 100}

In [35]:
et_gscv.best_score_

0.8406377502981609