In [36]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.feature_selection import SelectFromModel
import numpy as np



In [2]:
# Feature matrix of the scikit-learn breast-cancer dataset.
x = load_breast_cancer()["data"]
x

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [3]:
# Binary class labels (0/1) of the same breast-cancer dataset.
y = load_breast_cancer()["target"]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [4]:
# Hold out 20% of the samples for the final evaluation; the fixed seed makes
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=84,
)

In [5]:
# Hyper-parameter grids explored by the grid searches, one per classifier.
parameters_RFC = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
)
parameters_ABC = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.05, 0.01, 0.5],
)

In [6]:
# Base estimators handed to the grid searches.
# Both classifiers involve randomness during fitting; without a fixed
# random_state the selected hyper-parameters (and every metric derived from
# them) can change on each "Restart & Run All". The seed is the same value
# used for the train/test split.
RFC = RandomForestClassifier(random_state=84)
ABC = AdaBoostClassifier(random_state=84)

In [7]:
# Exhaustive cross-validated search over each grid, using GridSearchCV's
# default CV splitting and scoring.
grid1 = GridSearchCV(estimator=RFC, param_grid=parameters_RFC)
grid2 = GridSearchCV(estimator=ABC, param_grid=parameters_ABC)

In [8]:
# Standardise the features, then run the grid search on the scaled data.
# NOTE(review): the scaler sits outside the search, so it is fitted on the
# whole training set before the CV folds are cut. Putting the pipeline inside
# GridSearchCV would avoid that -- confirm whether it matters for this analysis.
clf1 = make_pipeline(
    StandardScaler(),
    grid1,
)
clf2 = make_pipeline(
    StandardScaler(),
    grid2,
)

In [9]:
# Scale the training split and run the random-forest grid search on it
# (this fits grid1 in place, so its best_params_ can be read afterwards).
clf1.fit(X=x_train, y=y_train)

In [10]:
# Show the random-forest hyper-parameter combination selected by the grid search.
grid1.best_params_

{'max_depth': 7, 'n_estimators': 100}

In [11]:
# Scale the training split and run the AdaBoost grid search on it
# (this fits grid2 in place, so its best_params_ can be read afterwards).
clf2.fit(X=x_train, y=y_train)

In [12]:
# Show the AdaBoost hyper-parameter combination selected by the grid search.
grid2.best_params_

{'learning_rate': 0.5, 'n_estimators': 200}

In [29]:
# Final models built from the hyper-parameters the grid searches selected.
# The values are read from best_params_ instead of being copied by hand from a
# previous run's printed output, so this cell stays correct if the search
# result changes on a re-run. The fixed seed (same value as the train/test
# split) makes the metrics reported below repeatable.
model1 = RandomForestClassifier(random_state=84, **grid1.best_params_)
model2 = AdaBoostClassifier(random_state=84, **grid2.best_params_)

In [30]:
# Train the random forest on the (unscaled) training split.
model1.fit(X=x_train, y=y_train)

In [31]:
# Score the random forest on the held-out test split.
model1_pred = model1.predict(x_test)

# Each entry pairs the exact output format with the metric that fills it.
for template, scorer in (
    ("Accuracy=%.2f", accuracy_score),
    ('Precision: %.2f', precision_score),
    ('Recall: %.2f', recall_score),
    ('F1: %.2f', f1_score),
):
    print(template % scorer(y_test, model1_pred))

Accuracy=0.95
Precision: 0.99
Recall: 0.93
F1: 0.96


In [32]:
# Train the AdaBoost model on the (unscaled) training split, all features.
model2.fit(X=x_train, y=y_train)

In [33]:
# Score the AdaBoost model on the held-out test split.
model2_pred = model2.predict(x_test)

# Each entry pairs the exact output format with the metric that fills it.
for template, scorer in (
    ("Accuracy=%.2f", accuracy_score),
    ('Precision: %.2f', precision_score),
    ('Recall: %.2f', recall_score),
    ('F1: %.2f', f1_score),
):
    print(template % scorer(y_test, model2_pred))

Accuracy=0.97
Precision: 0.99
Recall: 0.97
F1: 0.98


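Em acurácia (taxa de acerto geral), o segundo modelo (AdaBoost) é melhor: 0,97 contra 0,95. Os dois modelos têm a mesma precisão (0,99), ou seja, a mesma taxa de acerto na diferenciação entre verdadeiros positivos e falsos positivos. O segundo modelo tem recall maior (0,97 contra 0,93), isto é, acerta mais na identificação dos verdadeiros positivos.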

In [34]:
# Importance weight the fitted AdaBoost model assigns to each input feature.
adaboost_importances = model2.feature_importances_
print(adaboost_importances)

[0.    0.095 0.    0.02  0.025 0.06  0.03  0.035 0.015 0.    0.04  0.03
 0.02  0.095 0.01  0.035 0.045 0.01  0.005 0.075 0.01  0.035 0.05  0.025
 0.08  0.01  0.045 0.06  0.03  0.01 ]


In [37]:
# Keep the 10 features the AdaBoost model ranks as most important.
selector = SelectFromModel(
    estimator=model2,
    threshold=-np.inf,  # no importance cutoff: max_features alone decides the selection
    max_features=10,
)
selector.fit(x_train, y_train)

In [38]:
# Training split reduced to the columns kept by the selector.
x_new_train = selector.transform(X=x_train)

In [42]:
# Test split reduced to the same columns as the training split.
x_new_test = selector.transform(X=x_test)

In [39]:
# Retrain AdaBoost using only the selected features.
# NOTE(review): this refits model2 in place, so the model trained on all 30
# features is overwritten; a separately named estimator would keep both
# available and make the earlier evaluation cell safe to re-run.
model2.fit(X=x_new_train, y=y_train)

In [44]:
# Score the reduced-feature AdaBoost model on the reduced test split.
model2_pred = model2.predict(x_new_test)

# Each entry pairs the exact output format with the metric that fills it.
for template, scorer in (
    ("Accuracy=%.2f", accuracy_score),
    ('Precision: %.2f', precision_score),
    ('Recall: %.2f', recall_score),
    ('F1: %.2f', f1_score),
):
    print(template % scorer(y_test, model2_pred))

Accuracy=0.95
Precision: 0.96
Recall: 0.96
F1: 0.96


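Usando apenas as 10 características selecionadas, o modelo acerta menos no contexto geral (acurácia de 0,97 para 0,95) e identifica um pouco menos das instâncias verdadeiras positivas (recall de 0,97 para 0,96); a precisão também cai (de 0,99 para 0,96).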