### Libraries 

In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


### Read and defined the names of the columns 

In [30]:


col = ['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class']


url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
df = pd.read_csv(url, names=col, header=None)
df.head()
 
 

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


### Clean data on the Bare Nuclei

In [31]:
df['Bare Nuclei'].value_counts()
df['Bare Nuclei'].replace("?", np.NAN, inplace=True)
df = df.dropna()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bare Nuclei'].replace("?", np.NAN, inplace=True)


### Define the X and Y

X doesnt not include id nor class 
Y is to predict the class based on X's

In [32]:
df['Bare Nuclei'].value_counts()
 
X = df.drop(['id', 'Class'], axis=1)
X_col = X.columns
 
y = df['Class']

### Split the data

- The data is splited into 70% 30%
- Fit transform the data 
- Configurate and train the knn method


In [33]:
df1 = pd.DataFrame(X, columns=X_col)
X_train, X_test, y_train, y_test = train_test_split(df1, y,
                                                    train_size=0.7,
                                                    random_state=42)
df1.head()
 
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id', 'Class'], axis=1).values), columns=X_col).head()
 
 
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2, metric='minkowski')
knn.fit(X_train, y_train)

### Print the score 
- Create a function of print score 
- Print the result of the KNN trained before, only knowing its testing result 

In [34]:

def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train,
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train,
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train,
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train),
                                                      lb.transform(res))))
 
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
 
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test,
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test,
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test,
                                                                  res_test)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test),
                                                      lb.transform(res_test))))
 
############
 
 
print_score(knn, X_train, X_test, y_train, y_test, train=False)

Test Result:

accuracy score: 0.9489

Classification Report: 
               precision    recall  f1-score   support

           2       0.93      0.99      0.96        79
           4       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]

ROC AUC: 0.9419



### Grid Search 

In [35]:
knn.get_params()
 
params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                              params,
                              n_jobs=-1,
                              verbose=1,
                              cv=10)
grid_search_cv.fit(X_train, y_train)
grid_search_cv.best_estimator_
print_score(grid_search_cv, X_train, X_test, y_train, y_test, train=False)
 
grid_search_cv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Test Result:

accuracy score: 0.9489

Classification Report: 
               precision    recall  f1-score   support

           2       0.93      0.99      0.96        79
           4       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]

ROC AUC: 0.9419



{'n_neighbors': 5}

### Bagging Method

In [36]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
bag_clf = BaggingClassifier(estimator=rf_clf, n_estimators=20,
                            bootstrap=True, n_jobs=None,
                            random_state=42)
bag_clf.fit(X_train, y_train)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9963

Classification Report: 
               precision    recall  f1-score   support

           2       1.00      0.99      1.00       365
           4       0.99      1.00      0.99       181

    accuracy                           1.00       546
   macro avg       0.99      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546


Confusion Matrix: 
 [[363   2]
 [  0 181]]

ROC AUC: 0.9973

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0231

********************************

Test Result:

accuracy score: 0.9562

Classification Report: 
               precision    recall  f1-score   support

           2       0.94      0.99      0.96        79
           4       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137


Confusion Matrix: 
 [[78  1]
 [ 5 53]]

ROC AUC: 0.9506



### Final Results and Comparation 

In [37]:
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)
print_score(knn, X_train, X_test, y_train, y_test, train=False)

Test Result:

accuracy score: 0.9562

Classification Report: 
               precision    recall  f1-score   support

           2       0.94      0.99      0.96        79
           4       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137


Confusion Matrix: 
 [[78  1]
 [ 5 53]]

ROC AUC: 0.9506

Test Result:

accuracy score: 0.9489

Classification Report: 
               precision    recall  f1-score   support

           2       0.93      0.99      0.96        79
           4       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137


Confusion Matrix: 
 [[78  1]
 [ 6 52]]

ROC AUC: 0.9419



### Conclusions 

- The method of KNN simple and bagging gave the same results. 
- As for the results shows, the modelling have a maximun performance in testing of 95% for both, KNN and Bagging. Meaning that it is almost imposible to improve the modelling