In [1]:
from sklearn import datasets
from sklearn import ensemble
import pandas as pd
# Load the breast-cancer data set as pandas objects (features frame + target series)
# and keep only the two feature columns used by the models below.
from sklearn.model_selection import train_test_split

data_breast_cancer_X, data_breast_cancer_y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)
data_breast_cancer_X = data_breast_cancer_X[['mean texture', 'mean symmetry']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_breast_cancer_X,
    data_breast_cancer_y,
    test_size=0.2,
    random_state=42,
)

In [2]:
# Create the base estimators:
# 1. decision tree
# 2. logistic regression
# 3. k-nearest neighbours

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# The tree is seeded so that its fitted structure, and therefore the ensemble
# scores computed from it, are reproducible between runs (the other seeded
# models in this notebook use the same value).
tree_clf = DecisionTreeClassifier(random_state=42)
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()

# Two ensembles over the same three base estimators; they differ only in the
# `voting` strategy passed to VotingClassifier.
voting_clf_hard = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)],
    voting='hard'
)
voting_clf_soft = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)],
    voting='soft'
)

In [3]:
# Fit both voting ensembles on the training split.
voting_clf_hard.fit(X_train, y_train)
# Last expression of the cell: its return value is what the notebook displays.
voting_clf_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('tree', DecisionTreeClassifier()),
                             ('log', LogisticRegression()),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')

In [4]:
# Compute train/test accuracy for the hard- and soft-voting ensembles.
from sklearn.metrics import accuracy_score

# --- hard-voting ensemble ---
y_train_predict_hard = voting_clf_hard.predict(X_train)
y_test_predict_hard = voting_clf_hard.predict(X_test)
y_train_predict_hard_score = accuracy_score(y_train, y_train_predict_hard)
y_test_predict_hard_score = accuracy_score(y_test, y_test_predict_hard)
voting_clf_hard_scores = (
    y_train_predict_hard_score,
    y_test_predict_hard_score,
)

# --- soft-voting ensemble ---
y_train_predict_soft = voting_clf_soft.predict(X_train)
y_test_predict_soft = voting_clf_soft.predict(X_test)
y_train_predict_soft_score = accuracy_score(y_train, y_train_predict_soft)
y_test_predict_soft_score = accuracy_score(y_test, y_test_predict_soft)
voting_clf_soft_scores = (
    y_train_predict_soft_score,
    y_test_predict_soft_score,
)

# Each tuple is (train accuracy, test accuracy).
print(voting_clf_hard_scores, voting_clf_soft_scores)

(0.8351648351648352, 0.6929824561403509) (0.9648351648351648, 0.6666666666666666)


In [5]:
# Start with two independent empty lists: one for (train, test) accuracy pairs,
# one for the corresponding classifiers.
accuracy_scores, clfs = [], []

In [6]:
# Accuracy of each individual base estimator fitted inside the hard-voting ensemble.
# The result lists are (re)created here so that re-running this cell starts from
# a clean state instead of appending duplicate entries.
accuracy_scores = []
clfs = []
for estimator in voting_clf_hard.estimators_:
    accuracy_score_train = accuracy_score(y_train, estimator.predict(X_train))
    accuracy_score_test = accuracy_score(y_test, estimator.predict(X_test))
    print(accuracy_score_train, accuracy_score_test, estimator.__class__.__name__)
    # Store (train accuracy, test accuracy) alongside the estimator that produced it.
    accuracy_scores.append((accuracy_score_train, accuracy_score_test))
    clfs.append(estimator)

1.0 0.6140350877192983 DecisionTreeClassifier
0.7230769230769231 0.7017543859649122 LogisticRegression
0.7714285714285715 0.6403508771929824 KNeighborsClassifier


In [7]:
import pickle

# Number of per-base-estimator entries that come before the two ensemble entries.
n_base = len(voting_clf_hard.estimators_)

# Place the hard- and soft-voting results right after the base-estimator entries.
# Slice assignment (instead of append) behaves like append on the first run but
# replaces the tail on a re-run, so the pickled lists never accumulate duplicates.
accuracy_scores[n_base:] = [voting_clf_hard_scores, voting_clf_soft_scores]
print(accuracy_scores)

clfs[n_base:] = [voting_clf_hard, voting_clf_soft]

# Persist the (train, test) accuracy pairs.
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(accuracy_scores, f)

# Persist the classifiers in the same order as their accuracy pairs.
with open('vote.pkl', 'wb') as f:
    pickle.dump(clfs, f)


[(1.0, 0.6140350877192983), (0.7230769230769231, 0.7017543859649122), (0.7714285714285715, 0.6403508771929824), (0.8351648351648352, 0.6929824561403509), (0.9648351648351648, 0.6666666666666666)]


In [8]:
# Start with two independent empty lists: one for (train, test) accuracy pairs,
# one for the corresponding ensemble classifiers.
accuracy_scores_trees, clfs_trees = [], []

In [9]:
# 6. Bagging, bagging with 50% of the samples, pasting, pasting with 50% of the
#    samples, random forest, AdaBoost, gradient boosting.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# bootstrap=False turns BaggingClassifier into pasting (sampling without
# replacement); max_samples=.5 draws half of the training rows per estimator.
bgg_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, random_state=42)
bgg_clf_50 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=.5, random_state=42)
pst_clf = BaggingClassifier(DecisionTreeClassifier(), bootstrap=False, n_estimators=30, random_state=42)
pst_clf_50 = BaggingClassifier(DecisionTreeClassifier(), bootstrap=False, n_estimators=30, max_samples=.5, random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=30, random_state=42)
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)
grd_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)

# The result lists are (re)created here so that re-running this cell starts from
# a clean state instead of appending duplicate entries.
accuracy_scores_trees = []
clfs_trees = []

for i, estimator in enumerate([bgg_clf, bgg_clf_50, pst_clf, pst_clf_50, rnd_clf, ada_clf, grd_clf]):
    estimator.fit(X_train, y_train)
    acc_train = accuracy_score(y_train, estimator.predict(X_train))
    acc_test = accuracy_score(y_test, estimator.predict(X_test))
    # The index disambiguates the four BaggingClassifier rows in the printout.
    print(acc_train, acc_test, estimator.__class__.__name__,i)
    accuracy_scores_trees.append((acc_train, acc_test))
    clfs_trees.append(estimator)

0.9956043956043956 0.6754385964912281 BaggingClassifier 0
0.9296703296703297 0.6842105263157895 BaggingClassifier 1
1.0 0.6228070175438597 BaggingClassifier 2
0.9736263736263736 0.6491228070175439 BaggingClassifier 3
0.9956043956043956 0.6754385964912281 RandomForestClassifier 4
0.8 0.7368421052631579 AdaBoostClassifier 5
0.8373626373626374 0.7105263157894737 GradientBoostingClassifier 6


In [10]:
# Write the accuracy pairs and the classifiers to their pickle files,
# accuracies first, then the estimators.
for _path, _payload in (
    ('acc_bag.pkl', accuracy_scores_trees),
    ('bag.pkl', clfs_trees),
):
    with open(_path, 'wb') as f:
        pickle.dump(_payload, f)


In [11]:
# 7. Sampling: reload the data set, this time keeping every feature column
# (no column subset is applied here, unlike the first cell).
from sklearn.model_selection import train_test_split

data_breast_cancer_X, data_breast_cancer_y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# NOTE: this rebinds X_train / X_test / y_train / y_test to the full-feature
# split; models fitted in earlier cells were trained on the two-column split.
X_train, X_test, y_train, y_test = train_test_split(
    data_breast_cancer_X,
    data_breast_cancer_y,
    test_size=0.2,
    random_state=42,
)

# Last expression of the cell: displays the names of the feature columns.
data_breast_cancer_X.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [12]:

# Fit a 500-tree random forest on the training split and print the importance
# score of every feature, one "name score" pair per line.
df = pd.DataFrame()  # NOTE(review): not used in this cell - confirm whether a later cell relies on it
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)

# Pair each column name with its importance, in column order.
importance_pairs = list(zip(X_test.columns, rnd_clf.feature_importances_))
for name, score in importance_pairs:
    print(name, score)
    


mean radius 0.04706475687207926
mean texture 0.016359326862804082
mean perimeter 0.041768943622206234
mean area 0.040207440805062056
mean smoothness 0.007518062679254841
mean compactness 0.012101887153474402
mean concavity 0.051091160395859585
mean concave points 0.11533220743928123
mean symmetry 0.003570340625580548
mean fractal dimension 0.004723438579581869
radius error 0.016963074265710348
texture error 0.004404533130902704
perimeter error 0.010395285249577283
area error 0.029796888704994265
smoothness error 0.0034023295569631977
compactness error 0.005323849038562246
concavity error 0.007545928120234165
concave points error 0.0045157165250521575
symmetry error 0.004979756978372231
fractal dimension error 0.0061489633752481775
worst radius 0.08250625427779008
worst texture 0.02100639840135993
worst perimeter 0.11441049172818418
worst area 0.12244813320331961
worst smoothness 0.01310910823606547
worst compactness 0.015939294523398435
worst concavity 0.03843478553502979
worst concave