## 투표 분류기

In [4]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [5]:
# Load the saved Titanic training arrays from disk. Going by the file names,
# X is the feature matrix and y the matching label vector.
X = np.load("data_ML/Ensemble_titanic_X_train.npy")
y = np.load("data_ML/Ensemble_titanic_y_train.npy")

In [6]:
# Display the first row of X as a quick sanity check of the loaded features.
X[0]

array([0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
       0.125     , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [7]:
# Display the first ten labels as a quick sanity check of the loaded targets.
y[:10]

array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

In [9]:
# Base learners combined by the voting ensemble below. Seeds are fixed where
# the estimator accepts one so repeated runs give the same models.
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1, max_depth=4)
clf3 = GaussianNB()

# Hard voting: the ensemble predicts the class chosen by the majority of the
# base learners. The decision tree is registered as "dt" so that the name
# matches the estimator type (and the "dt__..." grid keys used for the tuned
# ensemble); "rf" would read as "random forest".
eclf = VotingClassifier(
    estimators=[("lr", clf1), ("dt", clf2), ("gnb", clf3)],
    voting="hard",
)

In [10]:
from sklearn.model_selection import cross_val_score

# Average of the five per-fold scores for the voting ensemble.
np.mean(cross_val_score(eclf, X, y, cv=5))

np.float64(0.8222941661905668)

In [11]:
# Baseline: mean 5-fold CV score of clf1 on its own, for comparison with the
# ensemble.
cross_val_score(clf1, X, y, cv=5).mean()

np.float64(0.8290420872214816)

In [12]:
# Baseline: mean 5-fold CV score of clf2 on its own.
cross_val_score(clf2, X, y, cv=5).mean()

np.float64(0.8223068621849807)

In [13]:
# Baseline: mean 5-fold CV score of clf3 on its own.
cross_val_score(clf3, X, y, cv=5).mean()

np.float64(0.4600139655938551)

In [14]:
# Rebuild the hard-voting ensemble from clf1 and clf2 only and score it with
# 5-fold cross-validation. The second estimator is registered as "dt" rather
# than "rf": that label reads as "random forest", and the tuned ensemble later
# in the notebook already names its tree "dt".
eclf = VotingClassifier(
    estimators=[("lr", clf1), ("dt", clf2)],
    voting="hard",
)
cross_val_score(eclf, X, y, cv=5).mean()

np.float64(0.8301783787215135)

### 하이퍼 매개변수를 튜닝한 투표 분류기

In [17]:
# Fresh, untuned base learners; their hyper-parameters are supplied by the
# grid search through the "lr__" / "dt__" prefixed keys.
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)

# Hard-voting ensemble over the two learners, named so the grid can address
# each one.
eclf = VotingClassifier(
    estimators=[
        ("lr", clf1),
        ("dt", clf2),
    ],
    voting="hard",
)

In [18]:
# Candidate values for the logistic regression "C" parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Search space for the voting ensemble. Keys use the "<estimator name>__<param>"
# form, so "lr__" entries go to the logistic regression and "dt__" entries to
# the decision tree.
params = {
    # Logistic regression: solver and penalty are pinned, only C varies.
    "lr__solver": ["liblinear"],
    "lr__penalty": ["l2"],
    "lr__C": c_params,
    # Decision tree: split criterion, depth cap, and minimum leaf size.
    "dt__criterion": ["gini", "entropy"],
    "dt__max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
    "dt__min_samples_leaf": list(range(1, 10)),
}


In [19]:
from sklearn.model_selection import GridSearchCV

# Search the `params` grid for the voting ensemble with 5-fold
# cross-validation. fit() returns the fitted search object, so the call is
# chained straight onto the constructor.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

# Best mean cross-validated score found over the grid.
grid.best_score_

np.float64(0.8425569732749316)

In [None]:
# Display the parameter combination that produced the best score in the
# grid search.
grid.best_params_

## 배깅 

In [65]:
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score

# Reload the training arrays so this section can be run on its own.
X = np.load("data_ML/Ensemble_titanic_X_train.npy")
y = np.load("data_ML/Ensemble_titanic_y_train.npy")

# Bagging ensemble of 100 logistic regressions; both the base model and the
# ensemble use a fixed seed.
clf1 = LogisticRegression(random_state=1)
eclf = BaggingClassifier(estimator=clf1, n_estimators=100, random_state=1)

# Average of the five per-fold scores for the bagged model.
np.mean(cross_val_score(eclf, X, y, cv=5))


np.float64(0.8279184917158637)

In [66]:
# Search space for the bagging ensemble.
#
# "max_samples" is given as floats only: BaggingClassifier reads a float as a
# fraction of the training set but an int as an absolute sample count. A bare
# `1` would therefore train every base estimator on a single row, so the
# "use all samples" candidate must be written as 1.0.
params = {
    "n_estimators": [10, 20, 30, 40, 50, 55],
    "max_samples": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}


In [67]:
from sklearn.model_selection import GridSearchCV

# Search the `params` grid for the bagging ensemble with 5-fold
# cross-validation. fit() returns the fitted search object, so `grid` ends up
# bound to it.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

In [68]:
# Report the best CV score from the grid search and the parameters behind it.
print(f"Best score: {grid.best_score_}")
print(f"Best params: {grid.best_params_}")

Best score: 0.8279184917158636
Best params: {'max_samples': 0.9, 'n_estimators': 30}


In [69]:
# Out-of-bag evaluation.
# Switch on OOB scoring for the best estimator from the grid search and refit
# it on the full data. NOTE: `best_model` is the same object as
# `grid.best_estimator_`, so that estimator is reconfigured and refit in place.
best_model = grid.best_estimator_.set_params(oob_score=True)
best_model.fit(X, y)
print(f"OOB score: {best_model.oob_score_}")

OOB score: 0.8323959505061868


### 랜덤포레스트

In [70]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Reload the training arrays so this section can be run on its own.
X = np.load("data_ML/Ensemble_titanic_X_train.npy")
y = np.load("data_ML/Ensemble_titanic_y_train.npy")

# Random forest of 100 trees, each split considering 2 candidate features,
# trained with 7 parallel jobs. No seed is set, so scores vary between runs.
eclf = RandomForestClassifier(
    n_estimators=100,
    max_features=2,
    n_jobs=7,
)

# Average of the five per-fold scores for the forest.
np.mean(cross_val_score(eclf, X, y, cv=5))

np.float64(0.8020694470894434)

In [71]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: number of trees, and how many features
# each split may consider. The last max_features candidate is the length of
# the first row of X, i.e. every feature.
params = {
    "n_estimators": [10, 20, 30, 50, 100],
    "max_features": [*range(1, 8), 10, 15, 20, 25, len(X[0])],
}


In [72]:
# Search the `params` grid for the random forest with 5-fold
# cross-validation. fit() returns the fitted search object, so `grid` ends up
# bound to it.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

In [73]:
# Report the best CV score from the grid search and the parameters behind it.
print(f"Best score: {grid.best_score_}")
print(f"Best params: {grid.best_params_}")

Best score: 0.829073827207516
Best params: {'max_features': 27, 'n_estimators': 30}


In [74]:
# Check the out-of-bag score.
# Switch on OOB scoring for the best estimator from the grid search and refit
# it on the full data. NOTE: `best_model` is the same object as
# `grid.best_estimator_`, so that estimator is reconfigured and refit in place.
best_model = grid.best_estimator_.set_params(oob_score=True)
best_model.fit(X, y)

print(f"OOB score: {best_model.oob_score_}")

OOB score: 0.8188976377952756


### 에이다부스트 

In [75]:
import numpy as np
# Reload the saved Titanic training arrays so this section can be run on its
# own. Going by the file names, X is the feature matrix and y the labels.
X = np.load("data_ML/Ensemble_titanic_X_train.npy")
y = np.load("data_ML/Ensemble_titanic_y_train.npy")

In [77]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost with up to 500 boosting rounds over depth-2 decision trees.
eclf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=500,
)

In [78]:
from sklearn.model_selection import cross_val_score

# Average of the five per-fold scores for the current `eclf`.
np.mean(cross_val_score(eclf, X, y, cv=5))

np.float64(0.8279248397130706)

In [79]:
from sklearn.ensemble import RandomForestClassifier

# Comparison point: a 500-tree random forest with otherwise default settings,
# scored with the same 5-fold cross-validation.
eclf = RandomForestClassifier(n_estimators=500)
np.mean(cross_val_score(eclf, X, y, cv=5))

np.float64(0.7998222560782073)

In [81]:
# Rebind `eclf` to a fresh AdaBoost model (depth-2 trees, up to 500 rounds)
# to serve as the estimator for the grid search.
eclf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=500,
)

In [83]:
# Search space for AdaBoost. "estimator__" keys are forwarded to the wrapped
# decision tree; the remaining keys configure the booster itself.
params = {
    # Base tree settings.
    "estimator__criterion": ["gini", "entropy"],
    "estimator__max_features": [7, 8],
    "estimator__max_depth": [1, 2],
    # Booster settings: a narrow window of round counts and learning rates.
    "n_estimators": list(range(23, 28)),
    "learning_rate": [0.4, 0.45, 0.5, 0.55, 0.6],
}

In [86]:
from sklearn.model_selection import GridSearchCV

# Search the `params` grid for AdaBoost with 5-fold cross-validation, using
# 7 parallel jobs.
grid = GridSearchCV(eclf, params, cv=5, n_jobs=7)

# Left as the final expression so the notebook shows the fitted search object.
grid.fit(X, y)

In [87]:
# Report the best CV score from the grid search and the parameters behind it.
print(f"Best score: {grid.best_score_}")
print(f"Best params: {grid.best_params_}")

Best score: 0.8234304576905986
Best params: {'estimator__criterion': 'entropy', 'estimator__max_depth': 2, 'estimator__max_features': 7, 'learning_rate': 0.6, 'n_estimators': 27}


In [88]:
# Display the feature importances reported by the best estimator from the
# grid search.
grid.best_estimator_.feature_importances_

array([0.14451155, 0.04439731, 0.02929276, 0.09842972, 0.18971156,
       0.19660184, 0.04529486, 0.01176282, 0.00040079, 0.        ,
       0.        , 0.        , 0.        , 0.12517506, 0.0016739 ,
       0.0280196 , 0.02467424, 0.00359388, 0.02328443, 0.        ,
       0.02102152, 0.        , 0.0053706 , 0.00325227, 0.        ,
       0.00178791, 0.00174339])