In [1]:
from sklearn.datasets import make_moons,load_iris
from sklearn.ensemble import VotingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score,roc_auc_score,mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import numpy as np

In [2]:
# Build a noisy two-moons dataset, hold out 20% of it for testing, and fit a
# hard-voting ensemble of logistic regression, an SVM and a random forest.
# The data generator and the forest are seeded so that Restart & Run All
# reproduces the same dataset and the same scores.
x, y = make_moons(n_samples=1000, noise=0.5, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

lr = LogisticRegression()
svc = SVC()
rf = RandomForestClassifier(random_state=42)

# voting='hard': the ensemble predicts the class chosen by the majority of
# its members.
vot_clas = VotingClassifier(
    estimators=[('lr', lr), ('svc', svc), ('rf', rf)],
    voting='hard',
)
vot_clas.fit(x_train, y_train)

Оценим, что получилось при жёстком голосовании

In [3]:
# Fit each individual model and the hard-voting ensemble on the training
# split, then print the class name next to its accuracy on the test split.
for cl in (lr, svc, rf, vot_clas):
    cl.fit(x_train, y_train)
    y_pred = cl.predict(x_test)
    model_name = cl.__class__.__name__
    test_accuracy = accuracy_score(y_test, y_pred)
    print(model_name, test_accuracy)

LogisticRegression 0.825
SVC 0.845
RandomForestClassifier 0.805
VotingClassifier 0.84


Мягкое голосование

In [4]:
# Soft voting combines the members' predicted class probabilities, so the SVM
# is created with probability=True to expose predict_proba. That probability
# estimate involves an internal shuffle of the data, so random_state is fixed
# to keep the reported AUC values reproducible between runs.
svc_prob = SVC(probability=True, random_state=42)
vot_clas_soft = VotingClassifier(
    estimators=[('lr', lr), ('svc', svc_prob), ('rf', rf)],
    voting='soft',
)

# Score every model by ROC AUC, using the predicted probability of class 1.
for cl in (lr, svc_prob, rf, vot_clas_soft):
    cl.fit(x_train, y_train)
    y_pred = cl.predict_proba(x_test)[:, 1]
    print(cl.__class__.__name__, roc_auc_score(y_test, y_pred))

LogisticRegression 0.9068035426731079
SVC 0.9061996779388084
RandomForestClassifier 0.876358695652174
VotingClassifier 0.9068035426731079


Бэггинг и вставка (bagging и pasting)

In [37]:
# Bagging ensemble: 500 estimators (no base estimator is passed, so the
# library default is used), each trained on 100 samples drawn with
# replacement. oob_score=True additionally scores the ensemble on the
# samples each estimator did not see.
# random_state pins the bootstrap draws so both printed scores are
# reproducible.
bag_cl = BaggingClassifier(
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
bag_cl.fit(x_train, y_train)
pred = bag_cl.predict(x_test)

# Out-of-bag estimate first, then accuracy on the held-out test split.
print(bag_cl.oob_score_)
print(accuracy_score(y_test, pred))

0.81125
0.845


Сам случайный лес

In [38]:
# Random forest of 500 trees, each limited to 16 leaf nodes.
# random_state makes the fitted forest (and its score) reproducible.
rnd = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd.fit(x_train, y_train)
pred = rnd.predict(x_test)

# Report test accuracy so the predictions are actually used and the cell
# shows a result.
print(accuracy_score(y_test, pred))

Значимость признаков

In [40]:
# Fit a 500-tree random forest on the full iris dataset and print the
# importance the forest assigns to each feature.
iris = load_iris()
rnd_cl = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_cl.fit(iris['data'], iris['target'])

importances = rnd_cl.feature_importances_
for idx, name in enumerate(iris['feature_names']):
    score = importances[idx]
    print(f"Name {name} is {score}")

Name sepal length (cm) is 0.09695234159122376
Name sepal width (cm) is 0.02388616354960457
Name petal length (cm) is 0.4304659775814688
Name petal width (cm) is 0.4486955172777029


Бустинги

In [44]:
# AdaBoost with 500 rounds over depth-1 decision trees, using the SAMME
# algorithm and a learning rate of 0.5.
stump = DecisionTreeClassifier(max_depth=1)
ada = AdaBoostClassifier(stump, n_estimators=500, algorithm="SAMME", learning_rate=0.5)
ada.fit(x_train, y_train)



In [46]:
# A deliberately small gradient-boosting model: three depth-2 trees with a
# learning rate of 1.0.
grd_params = {"max_depth": 2, "n_estimators": 3, "learning_rate": 1.0}
grd = GradientBoostingClassifier(**grd_params)
grd.fit(x_train, y_train)

Раннее прекращение

In [5]:
# Early stopping for gradient boosting: train with a generous number of
# trees, find the ensemble size with the lowest validation error, then refit
# with exactly that many trees.
#
# The size is chosen on a validation split carved out of the training data.
# Choosing it on x_test / y_test would leak test information into model
# selection and make any later test score optimistic.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)

gr = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=120,
    random_state=42,
)
gr.fit(x_fit, y_fit)

# staged_predict yields the predictions after each successive tree, so
# errors[i] is the validation error of an ensemble with i + 1 trees.
# NOTE: for binary 0/1 labels the mean squared error of predicted labels is
# the misclassification rate.
errors = [
    mean_squared_error(y_val, y_val_pred)
    for y_val_pred in gr.staged_predict(x_val)
]
best_estimators = int(np.argmin(errors)) + 1  # argmin is 0-based

# Refit on the whole training split with the selected number of trees.
gr_best = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=best_estimators,
    random_state=42,
)
gr_best.fit(x_train, y_train)