# 라이브러리 Import

멀티 클래시픽캐이션

In [1]:
import numpy as np
import pandas as pd
import warnings

# Ignore display of unnecessary warnings
def fxn():
    warnings.warn("deprecated", DeprecationWarning)
    
warnings.filterwarnings("ignore")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
    
# data preprocessing libs
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# sklearn classifiers to import
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# tensorflow classifier import
import tensorflow as tf
from tensorflow.estimator import DNNClassifier

# model building, predict, accuracy imports
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from IPython.display import display

<br><br>
예제를 위한 iris 데이터 로드

In [2]:
from sklearn.datasets import load_iris # scikit-learn의 샘플 데이터 로드를 위해 import
iris = load_iris() # sample data load

In [3]:
data = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

In [4]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [5]:
# Get data from csv file 
print('Dataset used: Iris Data set')
print('Number of instances in dataset:', len(data))
print('Number of attributes in dataset:', len(data.columns.values)-1)

Dataset used: Iris Data set
Number of instances in dataset: 150
Number of attributes in dataset: 4


<br>
범주형 데이터로 되어있는 Label (종속변수) encoding

In [6]:
# categorize output class labels to numeric values
le = LabelEncoder()
le.fit(data['target'])
data['target'] = le.transform(data['target'])

In [7]:
data['target']

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64

<br><br>
독립변수, 종속변수 (X, y) 로 분리<br>
Train / Test 데이터 split

In [8]:
# Remove any NAN rows from the dataset
data.dropna(inplace=True)

# separate feature data and target data
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [9]:
# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.18, random_state=42)

<br><br><br>

# 모든 machine learning model에 대한 parameter, Classifier 객체 선언

In [10]:
# Build parameters of all classifiers
random_forest_params = dict(n_estimators=[5, 10, 15, 20, 25], criterion=['gini', 'entropy'], 
                            max_features=[2, 3, 4, 'auto', 'log2', 'sqrt', None], bootstrap=[False, True]
                            )
decision_tree_params = dict(criterion=['gini', 'entropy'], splitter=['best', 'random'], min_samples_split=[2, 3, 4],
                            max_features=[2,3,'auto', 'log2', 'sqrt', None], class_weight=['balanced', None])

perceptron_params = dict(penalty=[None, 'l2', 'l1', 'elasticnet'], fit_intercept=[False, True], shuffle=[False, True],
                         class_weight=['balanced', None], alpha=[0.0001, 0.00025], max_iter=[30,50,90])

svm_params = dict(shrinking=[False, True], degree=[3,4], class_weight=['balanced', None])

neural_net_params = dict(activation=['identity', 'logistic', 'tanh', 'relu'], hidden_layer_sizes = [(20,15,10),(30,20,15,10),(16,8,4)], 
                         max_iter=[50,80,150], solver=['adam','lbfgs'], learning_rate=['constant', 'invscaling', 'adaptive'], shuffle=[True, False])

log_reg_params = dict(class_weight=['balanced', None], solver=['newton-cg', 'lbfgs', 'liblinear', 'sag'], fit_intercept=[True, False])

knn_params = dict(n_neighbors=[2, 3, 5, 10], weights=['uniform', 'distance'],
                  algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'], leaf_size=[5,10,15,20])

bagging_params = dict(n_estimators=[5, 12, 15, 20], bootstrap=[False, True])

ada_boost_params = dict(n_estimators=[50, 75, 100], algorithm=['SAMME', 'SAMME.R'])

guassiannb_params = dict()

gradient_boosting_params = dict(n_estimators=[15, 25, 50])


In [11]:
params = [
    random_forest_params, decision_tree_params, perceptron_params,
    svm_params, neural_net_params, log_reg_params, knn_params,
    bagging_params, ada_boost_params, guassiannb_params, gradient_boosting_params
]
# classifiers to test
classifiers = [
    RandomForestClassifier(), DecisionTreeClassifier(), Perceptron(),
    SVC(), MLPClassifier(), LogisticRegression(),
    KNeighborsClassifier(), BaggingClassifier(), AdaBoostClassifier(),
    GaussianNB(), GradientBoostingClassifier()
]

names = [
    'RandomForest', 'DecisionTree', 'Perceptron', 'SVM',
    'NeuralNetwork', 'LogisticRegression',
    'KNearestNeighbors', 'Bagging', 'AdaBoost', 'Naive-Bayes', 'GradientBoosting'
]


param,eter, classifier, names 모두 합치기

In [12]:
models = dict(zip(names, zip(classifiers, params)))

<br><br>
## Cross-validation을 통한 최적의 파라미터 찾기

In [32]:
#Finding best parameters using Gridsearch 
def parameter_tuning(models, X_train, X_test, y_train, y_test, num_folds):
    print(num_folds,'fold cross-validation is used')
    print()
    accuracies = []
    # dataframe to store intermediate results
    dataframes = []
    best_parameters = []
    best_predictions = []
    for name, clf_and_params in models.items():
        print('Computing GridSearch on {} '.format(name))
        clf, clf_params = clf_and_params
        grid_clf = GridSearchCV(estimator=clf, param_grid=clf_params, cv=num_folds)
        grid_clf = grid_clf.fit(X_train, y_train)
        dataframes.append((name, grid_clf.cv_results_))
        best_parameters.append((name, grid_clf.best_params_))
        predictions = grid_clf.predict(X_test)
        
        if hasattr(grid_clf, 'predict_proba'):
            predictions_proba = grid_clf.predict_proba(X_test)
            best_predictions.append(predictions_proba)
        else:
            best_predictions.append(predictions)
        
        accuracy = accuracy_score(y_test, predictions)
        cv_scores = cross_val_score(clf, X_train, y_train, cv=num_folds)
        accuracies.append((name, accuracy, np.mean(cv_scores)))
    return accuracies, dataframes, best_parameters, best_predictions

In [33]:
results, dataframes, best_parameters, best_predictions = parameter_tuning(models, X_train, X_test, y_train, y_test, num_folds=5)
print()
print('============================================================')
for classifier, acc, cv_acc in results:
    print('{}: Accuracy with Best Parameters = {}% || Mean Cross Validation Accuracy = {}%'.format(classifier, round(acc*100,4), round(cv_acc*100,4)))
print()

for name, bp in best_parameters:
    print('============================================================')
    print('{} classifier GridSearch Best Parameters'.format(name))
    display(bp)
print()
print()

5 fold cross-validation is used

Computing GridSearch on RandomForest 
Computing GridSearch on DecisionTree 
Computing GridSearch on Perceptron 
Computing GridSearch on SVM 
Computing GridSearch on NeuralNetwork 
Computing GridSearch on LogisticRegression 
Computing GridSearch on KNearestNeighbors 
Computing GridSearch on Bagging 
Computing GridSearch on AdaBoost 
Computing GridSearch on Naive-Bayes 
Computing GridSearch on GradientBoosting 

RandomForest: Accuracy with Best Parameters = 100.0% || Mean Cross Validation Accuracy = 94.3667%
DecisionTree: Accuracy with Best Parameters = 88.8889% || Mean Cross Validation Accuracy = 94.3333%
Perceptron: Accuracy with Best Parameters = 74.0741% || Mean Cross Validation Accuracy = 79.6667%
SVM: Accuracy with Best Parameters = 100.0% || Mean Cross Validation Accuracy = 95.1667%
NeuralNetwork: Accuracy with Best Parameters = 92.5926% || Mean Cross Validation Accuracy = 97.6%
LogisticRegression: Accuracy with Best Parameters = 100.0% || Mean Cro

{'bootstrap': True, 'criterion': 'gini', 'max_features': 2, 'n_estimators': 25}

DecisionTree classifier GridSearch Best Parameters


{'class_weight': None,
 'criterion': 'gini',
 'max_features': 2,
 'min_samples_split': 4,
 'splitter': 'random'}

Perceptron classifier GridSearch Best Parameters


{'alpha': 0.0001,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'max_iter': 30,
 'penalty': None,
 'shuffle': True}

SVM classifier GridSearch Best Parameters


{'class_weight': 'balanced', 'degree': 3, 'shrinking': False}

NeuralNetwork classifier GridSearch Best Parameters


{'activation': 'identity',
 'hidden_layer_sizes': (20, 15, 10),
 'learning_rate': 'constant',
 'max_iter': 150,
 'shuffle': False,
 'solver': 'adam'}

LogisticRegression classifier GridSearch Best Parameters


{'class_weight': 'balanced', 'fit_intercept': True, 'solver': 'sag'}

KNearestNeighbors classifier GridSearch Best Parameters


{'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 3, 'weights': 'uniform'}

Bagging classifier GridSearch Best Parameters


{'bootstrap': True, 'n_estimators': 12}

AdaBoost classifier GridSearch Best Parameters


{'algorithm': 'SAMME', 'n_estimators': 50}

Naive-Bayes classifier GridSearch Best Parameters


{}

GradientBoosting classifier GridSearch Best Parameters


{'n_estimators': 15}





In [36]:
results

[('RandomForest', 1.0, 0.9436666666666665),
 ('DecisionTree', 0.8888888888888888, 0.9433333333333334),
 ('Perceptron', 0.7407407407407407, 0.7966666666666666),
 ('SVM', 1.0, 0.9516666666666665),
 ('NeuralNetwork', 0.9259259259259259, 0.976),
 ('LogisticRegression', 1.0, 0.9676666666666666),
 ('KNearestNeighbors', 1.0, 0.9436666666666665),
 ('Bagging', 1.0, 0.9436666666666665),
 ('AdaBoost', 0.9629629629629629, 0.9353333333333333),
 ('Naive-Bayes', 1.0, 0.9433333333333334),
 ('GradientBoosting', 1.0, 0.9433333333333334)]

In [37]:
dataframes

[('RandomForest',
  {'mean_fit_time': array([0.00378947, 0.00598516, 0.00877037, 0.01150002, 0.01456223,
          0.00318794, 0.00619764, 0.00919795, 0.01144099, 0.01445746,
          0.00320549, 0.00619779, 0.00920911, 0.01179686, 0.01459236,
          0.00340071, 0.0059968 , 0.00879722, 0.01162333, 0.01479092,
          0.00299206, 0.00581532, 0.00860834, 0.01159062, 0.01413345,
          0.00302367, 0.00597959, 0.0087863 , 0.01121078, 0.01439052,
          0.00318151, 0.00600505, 0.00880346, 0.01159973, 0.01460433,
          0.00340295, 0.00601039, 0.00899343, 0.01158881, 0.01439977,
          0.00332465, 0.0059782 , 0.00897374, 0.01197371, 0.0149662 ,
          0.00320601, 0.006217  , 0.0095109 , 0.01199398, 0.0151217 ,
          0.00319891, 0.00618339, 0.00898228, 0.01176176, 0.01450706,
          0.00319133, 0.00598621, 0.00877619, 0.01176958, 0.01438074,
          0.00320992, 0.00621462, 0.00881577, 0.01179395, 0.01481318,
          0.00319672, 0.0059926 , 0.00900698, 0.0117964

In [38]:
best_parameters

[('RandomForest',
  {'bootstrap': True,
   'criterion': 'gini',
   'max_features': 2,
   'n_estimators': 25}),
 ('DecisionTree',
  {'class_weight': None,
   'criterion': 'gini',
   'max_features': 2,
   'min_samples_split': 4,
   'splitter': 'random'}),
 ('Perceptron',
  {'alpha': 0.0001,
   'class_weight': 'balanced',
   'fit_intercept': True,
   'max_iter': 30,
   'penalty': None,
   'shuffle': True}),
 ('SVM', {'class_weight': 'balanced', 'degree': 3, 'shrinking': False}),
 ('NeuralNetwork',
  {'activation': 'identity',
   'hidden_layer_sizes': (20, 15, 10),
   'learning_rate': 'constant',
   'max_iter': 150,
   'shuffle': False,
   'solver': 'adam'}),
 ('LogisticRegression',
  {'class_weight': 'balanced', 'fit_intercept': True, 'solver': 'sag'}),
 ('KNearestNeighbors',
  {'algorithm': 'auto',
   'leaf_size': 5,
   'n_neighbors': 3,
   'weights': 'uniform'}),
 ('Bagging', {'bootstrap': True, 'n_estimators': 12}),
 ('AdaBoost', {'algorithm': 'SAMME', 'n_estimators': 50}),
 ('Naive-Ba

In [39]:
best_predictions

[array([[0.  , 1.  , 0.  ],
        [0.96, 0.04, 0.  ],
        [0.  , 0.  , 1.  ],
        [0.  , 1.  , 0.  ],
        [0.  , 0.76, 0.24],
        [1.  , 0.  , 0.  ],
        [0.  , 1.  , 0.  ],
        [0.  , 0.  , 1.  ],
        [0.  , 0.84, 0.16],
        [0.  , 1.  , 0.  ],
        [0.  , 0.  , 1.  ],
        [1.  , 0.  , 0.  ],
        [0.96, 0.04, 0.  ],
        [1.  , 0.  , 0.  ],
        [1.  , 0.  , 0.  ],
        [0.  , 0.92, 0.08],
        [0.  , 0.  , 1.  ],
        [0.  , 0.96, 0.04],
        [0.  , 1.  , 0.  ],
        [0.  , 0.  , 1.  ],
        [1.  , 0.  , 0.  ],
        [0.  , 0.08, 0.92],
        [1.  , 0.  , 0.  ],
        [0.  , 0.  , 1.  ],
        [0.  , 0.  , 1.  ],
        [0.  , 0.  , 1.  ],
        [0.  , 0.  , 1.  ]]),
 array([[0. , 1. , 0. ],
        [1. , 0. , 0. ],
        [0. , 0. , 1. ],
        [0. , 0.5, 0.5],
        [0. , 1. , 0. ],
        [1. , 0. , 0. ],
        [0. , 1. , 0. ],
        [0. , 0.5, 0.5],
        [0. , 0.5, 0.5],
        [0. , 1. 