In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
X_train, X_test, y_train, y_test = train_test_split(data_breast_cancer['data'][['mean texture','mean symmetry']], data_breast_cancer['target'], test_size=0.2)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

In [4]:
tree_clf = DecisionTreeClassifier()

In [5]:
log_clf = LogisticRegression()

In [6]:
knn = KNeighborsClassifier()

In [7]:
voting_clf_hard = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn)], voting='hard')

In [8]:
voting_clf_soft = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn)], voting='soft')

In [9]:
lista = []
classifiers = []
for clf in [tree_clf, log_clf, knn, voting_clf_hard, voting_clf_soft]:
    clf.fit(X_train, y_train)
    lista.append((
        accuracy_score(y_train, clf.predict(X_train)), accuracy_score(y_test, clf.predict(X_test))
    ))
    classifiers.append(clf)

In [11]:
import pickle

In [12]:
file = open("acc_vote.pkl", 'wb')
pickle.dump(lista, file)

file = open("acc_vote.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[(1.0, 0.631578947368421), (0.7054945054945055, 0.6929824561403509), (0.7758241758241758, 0.6491228070175439), (0.8417582417582418, 0.6578947368421053), (0.9626373626373627, 0.6666666666666666)]


In [13]:
file = open("vote.pkl", 'wb')
pickle.dump(classifiers, file)

file = open("vote.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('tree', DecisionTreeClassifier()),
                             ('log', LogisticRegression()),
                             ('knn', KNeighborsClassifier())]), VotingClassifier(estimators=[('tree', DecisionTreeClassifier()),
                             ('log', LogisticRegression()),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')]


In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=1.0, bootstrap=True, random_state=42)

bag_clf_half = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=True, random_state=42)

pasting = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=1.0, bootstrap=False, random_state=42)

pasting_half = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=False, random_state=42)

random_forest = RandomForestClassifier(n_estimators=30)

ada_boost = AdaBoostClassifier(n_estimators=30)

gradient_boosting = GradientBoostingClassifier(n_estimators=30).fit(X_train, y_train)

In [16]:
lista = []
classifiers = []
for bag in [bag_clf, bag_clf_half, pasting, pasting_half, random_forest, ada_boost, gradient_boosting]:
    bag.fit(X_train, y_train)
    lista.append((
        accuracy_score(y_train, bag.predict(X_train)), accuracy_score(y_test, bag.predict(X_test))
    ))
    classifiers.append(bag)

In [17]:
file = open("acc_bag.pkl", 'wb')
pickle.dump(lista, file)

file = open("acc_bag.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[(0.9956043956043956, 0.6491228070175439), (0.9208791208791208, 0.6403508771929824), (1.0, 0.6052631578947368), (0.9626373626373627, 0.6754385964912281), (0.9912087912087912, 0.6491228070175439), (0.7912087912087912, 0.6754385964912281), (0.8197802197802198, 0.7280701754385965)]


In [18]:
file = open("bag.pkl", 'wb')
pickle.dump(classifiers, file)

file = open("bag.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30,
                  random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30, random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30, random_state=42), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30, random_state=42), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    data_breast_cancer['data'], data_breast_cancer['target'], test_size=0.2)

In [20]:
bag = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=30, max_samples=0.5,
    bootstrap_features=True, max_features=2)
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=30)

In [21]:
file = open("acc_fea.pkl", 'wb')
pickle.dump([accuracy_score(y_train, bag.predict(X_train)), accuracy_score(y_test, bag.predict(X_test))], file)

file = open("acc_fea.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[0.9978021978021978, 0.9736842105263158]


In [22]:
file = open("fea.pkl", 'wb')
pickle.dump([bag], file)

file = open("fea.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)

[BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=30)]


In [23]:
df = pd.DataFrame({
    "acc train":[],
    "acc test":[],
    "features":[]})


In [24]:
for i, e in enumerate(bag.estimators_):
    features = data_breast_cancer.feature_names[np.array(bag.estimators_features_[i])]
    df_row = pd.DataFrame({
        'acc train': [accuracy_score(y_train, e.predict(X_train[features]))],
        'acc test': [accuracy_score(y_test, e.predict(X_test[features]))],
        'features': [features]
    })
    df = pd.concat([df, df_row])

In [25]:
df.sort_values(by='acc train', ascending=False, inplace=True)
df.sort_values(by='acc test', ascending=False, inplace=True)
df

Unnamed: 0,acc train,acc test,features
0,0.936264,0.921053,"[worst area, mean concavity]"
0,0.942857,0.912281,"[mean perimeter, worst smoothness]"
0,0.927473,0.903509,"[worst radius, fractal dimension error]"
0,0.920879,0.903509,"[fractal dimension error, worst radius]"
0,0.896703,0.894737,"[mean area, mean smoothness]"
0,0.934066,0.885965,"[worst texture, worst concave points]"
0,0.942857,0.877193,"[worst fractal dimension, worst concave points]"
0,0.923077,0.877193,"[worst texture, mean radius]"
0,0.920879,0.877193,"[mean radius, mean concavity]"
0,0.92967,0.877193,"[worst perimeter, perimeter error]"


In [26]:
file = open("acc_fea_rank.pkl", 'wb')
pickle.dump(df, file)
    
file = open("acc_fea_rank.pkl",'rb')
a = pickle.load(file)
file.close()
print(a)


   acc train  acc test                                           features
0   0.936264  0.921053                       [worst area, mean concavity]
0   0.942857  0.912281                 [mean perimeter, worst smoothness]
0   0.927473  0.903509            [worst radius, fractal dimension error]
0   0.920879  0.903509            [fractal dimension error, worst radius]
0   0.896703  0.894737                       [mean area, mean smoothness]
0   0.934066  0.885965              [worst texture, worst concave points]
0   0.942857  0.877193    [worst fractal dimension, worst concave points]
0   0.923077  0.877193                       [worst texture, mean radius]
0   0.920879  0.877193                      [mean radius, mean concavity]
0   0.929670  0.877193                 [worst perimeter, perimeter error]
0   0.890110  0.868421                       [mean area, perimeter error]
0   0.927473  0.859649                     [radius error, mean perimeter]
0   0.914286  0.859649             [co