In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp

%matplotlib inline

パラメータセット

In [90]:
model_parameters = {
    'n_estimators': [1000],
    'criterion': ['gini', 'entropy'],
    'random_state': [71],
    'min_samples_leaf': sp.stats.randint(3, 30),
    'min_samples_split': sp.stats.randint(3, 30),
    'max_depth': sp.stats.randint(3, 5),
    'class_weight': [None, 'balanced']
}
search_parameters = {
    'estimator': RandomForestClassifier(),
    'param_distributions': model_parameters,
    'scoring': 'accuracy',
    'cv': 3,
    'n_iter': 2,
    'n_jobs': -1,
    'random_state': 71,
    'verbose': 0
}

まずは普通に3つのモデルを用意する

In [12]:
data = load_iris()

In [13]:
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [23]:
_df1 = pd.DataFrame(data['data'] , columns=data['feature_names'])
_df2 = pd.DataFrame(data['target'], columns=['target'])
df = pd.concat([_df1, _df2], axis=1)

In [24]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [77]:
df = pd.concat([df, pd.get_dummies(df['target'], prefix='target_')], axis=1)

In [78]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target__0,target__1,target__2
0,5.1,3.5,1.4,0.2,0,1,0,0
1,4.9,3.0,1.4,0.2,0,1,0,0
2,4.7,3.2,1.3,0.2,0,1,0,0
3,4.6,3.1,1.5,0.2,0,1,0,0
4,5.0,3.6,1.4,0.2,0,1,0,0


In [91]:
X_train, X_test, y_train, y_test = train_test_split(df[['sepal length (cm)', 'sepal width (cm)']], df['target__0'], test_size=0.3, random_state=71)

In [92]:
rs0 = RandomizedSearchCV(**search_parameters)

In [93]:
rs.fit(X_train, y_train)

RandomizedSearchCV(cv=2, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [10], 'criterion': ['gini', 'entropy'], 'random_state': [71], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d485f60>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d45f0b8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d45f208>, 'class_weight': [None]},
          pre_dispatch='2*n_jobs', random_state=5

In [94]:
print(classification_report(y_train, rs.predict(X_train)))
print(classification_report(y_test, rs.predict(X_test)))

             precision    recall  f1-score   support

          0       0.84      0.97      0.90        69
          1       0.92      0.64      0.75        36

avg / total       0.87      0.86      0.85       105

             precision    recall  f1-score   support

          0       0.86      0.97      0.91        31
          1       0.90      0.64      0.75        14

avg / total       0.87      0.87      0.86        45



In [95]:
X_train, X_test, y_train, y_test = train_test_split(df[['sepal length (cm)', 'sepal width (cm)']], df['target__1'], test_size=0.3, random_state=71)
rs1 = RandomizedSearchCV(**search_parameters)
rs1.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=2, n_jobs=-1,
          param_distributions={'n_estimators': [1000], 'criterion': ['gini', 'entropy'], 'random_state': [71], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486470>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486780>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486630>, 'class_weight': [None, 'balanced']},
          pre_dispatch='2*n_jobs', r

In [96]:
print(classification_report(y_train, rs1.predict(X_train)))
print(classification_report(y_test, rs1.predict(X_test)))

             precision    recall  f1-score   support

          0       0.84      0.90      0.87        73
          1       0.73      0.59      0.66        32

avg / total       0.80      0.81      0.80       105

             precision    recall  f1-score   support

          0       0.74      0.93      0.82        27
          1       0.82      0.50      0.62        18

avg / total       0.77      0.76      0.74        45



In [97]:
X_train, X_test, y_train, y_test = train_test_split(df[['sepal length (cm)', 'sepal width (cm)']], df['target__2'], test_size=0.3, random_state=71)
rs2 = RandomizedSearchCV(**search_parameters)
rs2.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=2, n_jobs=-1,
          param_distributions={'n_estimators': [1000], 'criterion': ['gini', 'entropy'], 'random_state': [71], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486470>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486780>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f287d486630>, 'class_weight': [None, 'balanced']},
          pre_dispatch='2*n_jobs', r

In [98]:
print(classification_report(y_train, rs2.predict(X_train)))
print(classification_report(y_test, rs2.predict(X_test)))

             precision    recall  f1-score   support

          0       0.88      0.88      0.88        68
          1       0.78      0.78      0.78        37

avg / total       0.85      0.85      0.85       105

             precision    recall  f1-score   support

          0       0.93      0.78      0.85        32
          1       0.61      0.85      0.71        13

avg / total       0.83      0.80      0.81        45



# アンサンブルしてみる

アンサンブルのモデルをさらにアンサンブル...

1. yをlistで渡す
2. dummies
3. 複数モデルを学習するクラス

In [111]:
class Ensemble():
    def __init__(self, df, feature_names, target_name, parameters):
        self.models = []
        self.search_parameters = parameters
        
        self.feature_names = feature_names
        y_df = pd.get_dummies(df[target_name], prefix='y_')
        self.y_columns = y_df.columns
        _df = pd.concat([df, y_df], axis=1)
        self.df = _df
    
    def fit(self, test_size=0.3):
        for y in self.y_columns:
            X_train, X_test, y_train, y_test = train_test_split(self.df[self.feature_names],
                                                                self.df[y],
                                                                test_size=test_size,
                                                                random_state=71)    
            rs = RandomizedSearchCV(** self.search_parameters)
            rs.fit(X_train, y_train)
            self.models.append(rs)
            print('#'*30, y)
            print(classification_report(y_train, rs.predict(X_train)))
            print(classification_report(y_test, rs.predict(X_test)))

In [101]:
_df1 = pd.DataFrame(data['data'] , columns=data['feature_names'])
_df2 = pd.DataFrame(data['target'], columns=['target'])
df = pd.concat([_df1, _df2], axis=1)

In [102]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [112]:
e = Ensemble(df, feature_names=df.columns[:2].tolist(),
             target_name='target', parameters=search_parameters)

In [113]:
e.fit()

############################## y__0
             precision    recall  f1-score   support

          0       1.00      0.97      0.99        69
          1       0.95      1.00      0.97        36

avg / total       0.98      0.98      0.98       105

             precision    recall  f1-score   support

          0       1.00      0.87      0.93        31
          1       0.78      1.00      0.88        14

avg / total       0.93      0.91      0.91        45

############################## y__1
             precision    recall  f1-score   support

          0       0.84      0.90      0.87        73
          1       0.73      0.59      0.66        32

avg / total       0.80      0.81      0.80       105

             precision    recall  f1-score   support

          0       0.74      0.93      0.82        27
          1       0.82      0.50      0.62        18

avg / total       0.77      0.76      0.74        45

############################## y__2
             precision    recall

In [127]:
a = np.zeros([e.df[e.df.columns[:2]].shape[0], len(e.y_columns)])
for idx, m in enumerate(e.models):
    predicted = m.predict_proba(e.df[e.df.columns[:2]])
    a[:, idx] = predicted[:, 1]

In [132]:
print(classification_report(e.df['target'], np.argmax(a, axis=1)))

             precision    recall  f1-score   support

          0       0.89      1.00      0.94        50
          1       0.78      0.58      0.67        50
          2       0.72      0.82      0.77        50

avg / total       0.80      0.80      0.79       150



上がってはいるかな？

テスト

In [137]:
class Ensemble():
    def __init__(self, df, feature_names, target_name, parameters):
        self.models = []
        self.search_parameters = parameters
        
        self.feature_names = feature_names
        y_df = pd.get_dummies(df[target_name], prefix='y_')
        self.y_columns = y_df.columns
        _df = pd.concat([df, y_df], axis=1)
        self.df = _df
    
    def fit(self, test_size=0.3):
        for y in self.y_columns:
            X_train, X_test, y_train, y_test = train_test_split(self.df[self.feature_names],
                                                                self.df[y],
                                                                test_size=test_size,
                                                                random_state=71)    
            rs = RandomizedSearchCV(** self.search_parameters)
            rs.fit(X_train, y_train)
            self.models.append(rs)
            print('#'*30, y)
            print(classification_report(y_train, rs.predict(X_train)))
            print(classification_report(y_test, rs.predict(X_test)))
    
    def predict(self, X, y):
        a = np.zeros([X.shape[0], len(self.y_columns)])
        for idx, m in enumerate(self.models):
            predicted = m.predict_proba(X)
            a[:, idx] = predicted[:, 1]
        
        print(classification_report(y, np.argmax(a, axis=1)))

In [145]:
e = Ensemble(df, feature_names=df.columns.tolist()[:4],
             target_name='target', parameters=search_parameters)
e.fit()

############################## y__0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        69
          1       1.00      1.00      1.00        36

avg / total       1.00      1.00      1.00       105

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        31
          1       1.00      1.00      1.00        14

avg / total       1.00      1.00      1.00        45

############################## y__1
             precision    recall  f1-score   support

          0       0.99      1.00      0.99        73
          1       1.00      0.97      0.98        32

avg / total       0.99      0.99      0.99       105

             precision    recall  f1-score   support

          0       0.93      0.96      0.95        27
          1       0.94      0.89      0.91        18

avg / total       0.93      0.93      0.93        45

############################## y__2
             precision    recall

In [147]:
e.predict(e.df[e.df.columns[:4]], e.df['target'])

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.98      0.94      0.96        50
          2       0.94      0.98      0.96        50

avg / total       0.97      0.97      0.97       150

