###### Credit : superdatascience.com

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data

In [2]:
# Load the raw CSV from the notebook's working directory.
csv_path = 'Breast_Cancer.csv'
dataset = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check the column layout.
dataset.head(5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Preprocessing

##### Feature / Target Extraction, Train-Test Split and Scaling

In [5]:
# Select features by name rather than by position. The CSV carries an exported
# row index ('Unnamed: 0') ahead of the identifier column, so the positional
# slice 1:-1 kept 'Sample code number' (an ID, not a measurement) as a feature.
# errors = 'ignore' keeps this working if the CSV is re-exported without the
# index column.
non_feature_cols = ['Unnamed: 0', 'Sample code number', 'Class']
X = dataset.drop(columns = non_feature_cols, errors = 'ignore').values

# Target: the 'Class' column (last column of the file).
y = dataset['Class'].values

In [91]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [92]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply that
# same transform to both splits so no test-set information reaches the scaler.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## 1/ XGBoost

##### Encode y_train label

In [93]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the training labels and replace y_train with its encoded
# form. `le` is kept so predictions can be mapped back to the original labels.
# NOTE: y_train is overwritten here; y_test is left un-encoded.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

##### Model

In [94]:
from xgboost import XGBClassifier
# Gradient-boosted trees with the library's default hyper-parameters,
# trained on the scaled features and encoded labels.
classifier = XGBClassifier()
classifier.fit(X = X_train, y = y_train)

##### Prediction

In [95]:
# Predict encoded classes for the test rows, then map them back to the
# original labels so they can be compared with the un-encoded y_test.
y_pred = le.inverse_transform(classifier.predict(X_test))

##### Evaluation

In [96]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) followed by overall accuracy (cell output)
# for the XGBoost predictions on the held-out test set.
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = y_pred)

[[84  3]
 [ 1 49]]


0.9708029197080292

##### k-Fold Cross Validation

In [97]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the XGBoost classifier on the training split.
accuracies = cross_val_score(classifier, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.89 %
Standard Deviation : 2.17 %


## 2/ Logistic Regression

##### Model

In [40]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; seed fixed for repeatability.
lg = LogisticRegression(random_state = 0)
lg.fit(X = X_train, y = y_train)

##### Prediction

In [41]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
lg_pred = le.inverse_transform(lg.predict(X_test))

##### Evaluation

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for logistic regression.
cm = confusion_matrix(y_true = y_test, y_pred = lg_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = lg_pred)

[[84  3]
 [ 3 47]]


0.9562043795620438

##### k-Fold Cross Validation

In [43]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the logistic regression model on the training split.
accuracies = cross_val_score(lg, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.70 %
Standard Deviation : 1.97 %


## 3/ K Nearest Neighbors

##### Model

In [44]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier; Minkowski distance with p = 2 is Euclidean.
knn_params = dict(n_neighbors = 5, metric = 'minkowski', p = 2)
knn = KNeighborsClassifier(**knn_params)
knn.fit(X = X_train, y = y_train)

##### Prediction

In [45]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
knn_pred = le.inverse_transform(knn.predict(X_test))

##### Evaluation

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for k-nearest neighbours.
cm = confusion_matrix(y_true = y_test, y_pred = knn_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = knn_pred)

[[83  4]
 [ 2 48]]


0.9562043795620438

##### k-Fold Cross Validation

In [47]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the k-NN model on the training split.
accuracies = cross_val_score(knn, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.70 %
Standard Deviation : 1.79 %


## 4/ Support Vector Machine

##### Model

In [52]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; seed fixed for repeatability.
svm = SVC(
    kernel = 'linear',
    random_state = 0,
)
svm.fit(X = X_train, y = y_train)

##### Prediction

In [53]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
svm_pred = le.inverse_transform(svm.predict(X_test))

##### Evaluation

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for the linear SVM.
cm = confusion_matrix(y_true = y_test, y_pred = svm_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = svm_pred)

[[83  4]
 [ 2 48]]


0.9562043795620438

##### k-Fold Cross Validation

In [55]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the linear SVM on the training split.
accuracies = cross_val_score(svm, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 97.07 %
Standard Deviation : 2.19 %


## 5/ Kernel SVM

##### Model

In [56]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel; seed fixed for repeatability.
ksvm = SVC(
    kernel = 'rbf',
    random_state = 0,
)
ksvm.fit(X = X_train, y = y_train)

##### Prediction

In [57]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
ksvm_pred = le.inverse_transform(ksvm.predict(X_test))

##### Evaluation

In [58]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for the RBF-kernel SVM.
cm = confusion_matrix(y_true = y_test, y_pred = ksvm_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = ksvm_pred)

[[82  5]
 [ 1 49]]


0.9562043795620438

##### k-Fold Cross Validation

In [59]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the RBF-kernel SVM on the training split.
accuracies = cross_val_score(ksvm, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.89 %
Standard Deviation : 2.17 %


## 6/ Decision Tree Classification

##### Model

In [60]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree split on entropy; seed fixed for repeatability.
dtc = DecisionTreeClassifier(
    criterion = 'entropy',
    random_state = 0,
)
dtc.fit(X = X_train, y = y_train)

##### Prediction

In [61]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
dtc_pred = le.inverse_transform(dtc.predict(X_test))

##### Evaluation

In [62]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for the decision tree.
cm = confusion_matrix(y_true = y_test, y_pred = dtc_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = dtc_pred)

[[84  3]
 [ 3 47]]


0.9562043795620438

##### k-Fold Cross Validation

In [63]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree on the training split.
accuracies = cross_val_score(dtc, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 94.33 %
Standard Deviation : 2.65 %


## 7/ Random Forest

##### Model

In [64]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 entropy-based trees; seed fixed for repeatability.
rfc = RandomForestClassifier(
    n_estimators = 10,
    criterion = 'entropy',
    random_state = 0,
)
rfc.fit(X = X_train, y = y_train)

##### Prediction

In [65]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
rfc_pred = le.inverse_transform(rfc.predict(X_test))

##### Evaluation

In [66]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for the random forest.
cm = confusion_matrix(y_true = y_test, y_pred = rfc_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = rfc_pred)

[[83  4]
 [ 3 47]]


0.948905109489051

##### k-Fold Cross Validation

In [67]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest on the training split.
accuracies = cross_val_score(rfc, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.34 %
Standard Deviation : 2.16 %


## 8/ Naive Bayes

##### Model

In [68]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
nb = GaussianNB()
nb.fit(X = X_train, y = y_train)

##### Prediction

In [69]:
# y_train was label-encoded in the XGBoost section, so this model predicts
# encoded classes. Map them back to the original labels so the comparison
# against the un-encoded y_test is valid when the notebook runs top to bottom.
nb_pred = le.inverse_transform(nb.predict(X_test))

##### Evaluation

In [70]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (printed) and accuracy (cell output) for naive Bayes.
cm = confusion_matrix(y_true = y_test, y_pred = nb_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = nb_pred)

[[80  7]
 [ 0 50]]


0.948905109489051

##### k-Fold Cross Validation

In [71]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the naive Bayes model on the training split.
accuracies = cross_val_score(nb, X_train, y_train, cv = 10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100

print('Accuracy : {:.2f} %'.format(mean_pct))
print('Standard Deviation : {:.2f} %'.format(std_pct))

Accuracy : 96.52 %
Standard Deviation : 2.24 %
