In [105]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd

# 3.1

In [77]:
# Load the breast-cancer dataset as pandas objects and keep two of its features.
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)
Xb = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
yb = data_breast_cancer['target']
# 80/20 split with a fixed seed so the same rows land in train/test on every
# Restart & Run All (the original split changed on each execution).
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, test_size=0.2, random_state=42)

# 3.2

In [78]:
# Three base learners, all with default hyper-parameters.
log_clf = LogisticRegression()
decision_tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

# Both ensembles vote over the same named learners; only the voting rule differs.
voters = [('lr', log_clf), ('dt', decision_tree_clf), ('knn', knn_clf)]
ensemble_hard = VotingClassifier(estimators=list(voters), voting='hard')
ensemble_soft = VotingClassifier(estimators=list(voters), voting='soft')

ensemble_hard.fit(Xb_train, yb_train)
ensemble_soft.fit(Xb_train, yb_train)

In [79]:
# Predict with both ensembles; only the soft-vote result is shown as cell output.
hard_vote_pred = ensemble_hard.predict(Xb_test)
soft_vote_pred = ensemble_soft.predict(Xb_test)
soft_vote_pred

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1])

# 3.3

In [80]:
# Fit every model on the training split and record [train accuracy, test accuracy].
voting_models = (decision_tree_clf, log_clf, knn_clf, ensemble_hard, ensemble_soft)

accuracy_list = []
for model in voting_models:
    model.fit(Xb_train, yb_train)
    train_acc = accuracy_score(yb_train, model.predict(Xb_train))
    test_acc = accuracy_score(yb_test, model.predict(Xb_test))
    accuracy_list.append([train_acc, test_acc])
accuracy_list

[[1.0, 0.5877192982456141],
 [0.7296703296703296, 0.6140350877192983],
 [0.789010989010989, 0.631578947368421],
 [0.8483516483516483, 0.6052631578947368],
 [0.9604395604395605, 0.5877192982456141]]

# 3.4

In [81]:
# Persist the accuracies and the fitted classifiers, in that order.
list_of_clf = [decision_tree_clf, log_clf, knn_clf, ensemble_hard, ensemble_soft]
for filename, payload in (('acc_vote.pkl', accuracy_list), ('vote.pkl', list_of_clf)):
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

# 3.5

In [82]:
n_trees = 30  # every ensemble in this cell uses the same number of estimators

# Bagging (sampling with replacement): full-size and half-size samples.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=n_trees
).fit(Xb_train, yb_train)
bag_50_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=n_trees, max_samples=0.5
).fit(Xb_train, yb_train)

# Pasting (bootstrap=False): full-size and half-size samples.
past_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=n_trees, bootstrap=False
).fit(Xb_train, yb_train)
past_50_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=n_trees, bootstrap=False, max_samples=0.5
).fit(Xb_train, yb_train)

# Random forest and the two boosting variants.
random_forest_clf = RandomForestClassifier(n_estimators=n_trees).fit(Xb_train, yb_train)
adaboost_clf = AdaBoostClassifier(
    DecisionTreeClassifier(), n_estimators=n_trees
).fit(Xb_train, yb_train)
gradient_boost_clf = GradientBoostingClassifier(n_estimators=n_trees).fit(Xb_train, yb_train)

# Trailing expression keeps the fitted gradient-boosting model as the cell output.
gradient_boost_clf


# 3.6

In [83]:
# Refit each ensemble and collect [train accuracy, test accuracy] per model.
bagging_models = (bag_clf, bag_50_clf, past_clf, past_50_clf,
                  random_forest_clf, adaboost_clf, gradient_boost_clf)
splits = ((Xb_train, yb_train), (Xb_test, yb_test))

bagging_acc = []
for model in bagging_models:
    model.fit(Xb_train, yb_train)
    bagging_acc.append([accuracy_score(y_part, model.predict(X_part))
                        for X_part, y_part in splits])
bagging_acc

[[0.9956043956043956, 0.6403508771929824],
 [0.9274725274725275, 0.6491228070175439],
 [1.0, 0.6140350877192983],
 [0.9714285714285714, 0.6578947368421053],
 [0.9978021978021978, 0.6666666666666666],
 [1.0, 0.6052631578947368],
 [0.843956043956044, 0.6929824561403509]]

In [84]:
# Persist the accuracies and the fitted ensembles, in that order.
list_of_clf = [bag_clf, bag_50_clf, past_clf, past_50_clf,
               random_forest_clf, adaboost_clf, gradient_boost_clf]
outputs = {'acc_bag.pkl': bagging_acc, 'bag.pkl': list_of_clf}
for filename, payload in outputs.items():
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

# 3.7

In [141]:
# Bagging where each estimator is trained on half of the samples (drawn with
# replacement) and on 2 features, using every column of the dataset.
tree = DecisionTreeClassifier()  # NOTE(review): created but never passed to the ensemble below
bagging = BaggingClassifier(n_estimators=30,
                            max_samples=0.5,
                            max_features=2,
                            bootstrap=True)
# Fixed seed so this all-features split is reproducible on Restart & Run All.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    data_breast_cancer.data, data_breast_cancer.target,
    test_size=0.2, random_state=42)
bagging.fit(X_train_1, y_train_1)

In [142]:
# Train / test accuracy of the feature-subsampling bagging ensemble.
train_accuracy = accuracy_score(y_train_1, bagging.predict(X_train_1))
test_accuracy = accuracy_score(y_test_1, bagging.predict(X_test_1))
bag_acc = [train_accuracy, test_accuracy]

# Save the accuracies first, then the fitted ensemble wrapped in a one-element list.
for filename, payload in (('acc_fea.pkl', bag_acc), ('fea.pkl', [bagging])):
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

print(bagging)
bag_acc

BaggingClassifier(max_features=2, max_samples=0.5, n_estimators=30)


[0.9934065934065934, 0.956140350877193]

# 3.9

In [145]:
# Score every sub-estimator of the bagging ensemble on the columns it was trained on.
estimators = bagging.estimators_
estimator_features = bagging.estimators_features_

train_scores = [
    accuracy_score(y_train_1, member.predict(X_train_1.iloc[:, cols]))
    for member, cols in zip(estimators, estimator_features)
]
test_scores = [
    accuracy_score(y_test_1, member.predict(X_test_1.iloc[:, cols]))
    for member, cols in zip(estimators, estimator_features)
]
# Despite the name, these are the column indices taken from estimators_features_.
features_names = list(estimator_features)

# Rank best-first by test accuracy, breaking ties on train accuracy.
df = pd.DataFrame(
    {'train': train_scores, 'test': test_scores, 'features': features_names}
).sort_values(by=['test', 'train'], ascending=False)
df



Unnamed: 0,train,test,features
14,0.949451,0.947368,"[20, 25]"
12,0.927473,0.938596,"[20, 8]"
26,0.947253,0.929825,"[20, 27]"
6,0.931868,0.921053,"[12, 22]"
15,0.953846,0.903509,"[22, 27]"
16,0.918681,0.903509,"[2, 28]"
18,0.923077,0.894737,"[6, 27]"
13,0.898901,0.894737,"[3, 13]"
11,0.887912,0.894737,"[17, 2]"
5,0.931868,0.885965,"[27, 13]"


In [144]:
# Persist the ranking table.
# NOTE(review): this cell's execution count (144) precedes the ranking cell's (145);
# re-run it after the ranking cell so the saved file matches the displayed table.
pd.to_pickle(df, 'acc_fea_rank.pkl')