In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv('Data.csv')
df = df.drop(columns=['Sample code number'])
df.head(20)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


## Data preprocessing

In [3]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(random_state=0)
log_clf.fit(X_train, y_train)

y_pred_log = log_clf.predict(X_test)

from sklearn.metrics import confusion_matrix

c_matrix = confusion_matrix(y_test, y_pred_log)
c_matrix



array([[103,   4],
       [  5,  59]])

# K-NN

In [5]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski', p=2)
knn_clf.fit(X_train, y_train)

y_pred_knn = knn_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_knn)
c_matrix

array([[103,   4],
       [  5,  59]])

## SVM

In [6]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', random_state=0)
svm_clf.fit(X_train, y_train)

y_pred_svm = svm_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_svm)
c_matrix

array([[102,   5],
       [  5,  59]])

## Kernel SVM

In [7]:
from sklearn.svm import SVC

ksvm_clf = SVC(kernel='rbf', random_state=0)
ksvm_clf.fit(X_train, y_train)

y_pred_ksvm = ksvm_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_ksvm)
c_matrix

array([[101,   6],
       [  3,  61]])

## Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

y_pred_gnb = gnb_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_gnb)
c_matrix

array([[100,   7],
       [  3,  61]])

## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_clf.fit(X_train, y_train)

y_pred_dt = dt_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_dt)
c_matrix

array([[104,   3],
       [  4,  60]])

## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)

c_matrix = confusion_matrix(y_test, y_pred_rf)
c_matrix

array([[104,   3],
       [  5,  59]])

## Accuracy scores

In [15]:
from sklearn.metrics import accuracy_score

acc_log = accuracy_score(y_test, y_pred_log)
print('Logistic Regression: {:.4f}%'.format(acc_log * 100))

print('-----------------------------')

acc_knn = accuracy_score(y_test, y_pred_knn)
print('K-NN: {:.4f}%'.format(acc_knn * 100))

print('-----------------------------')

acc_svm = accuracy_score(y_test, y_pred_svm)
print('SVM: {:.4f}%'.format(acc_svm * 100))

print('-----------------------------')

acc_ksvm = accuracy_score(y_test, y_pred_ksvm)
print('Kernel SVM: {:.4f}%'.format(acc_ksvm * 100))

print('-----------------------------')

acc_gnb = accuracy_score(y_test, y_pred_gnb)
print('Naive Bayes: {:.4f}%'.format(acc_gnb * 100))

print('-----------------------------')

acc_dt = accuracy_score(y_test, y_pred_dt)
print('Decision Tree: {:.4f}%'.format(acc_dt * 100))

print('-----------------------------')

acc_rf = accuracy_score(y_test, y_pred_rf)
print('Random Forest: {:.4f}%'.format(acc_rf * 100))

print('-----------------------------')

Logistic Regression: 94.7368%
-----------------------------
K-NN: 94.7368%
-----------------------------
SVM: 94.1520%
-----------------------------
Kernel SVM: 94.7368%
-----------------------------
Naive Bayes: 94.1520%
-----------------------------
Decision Tree: 95.9064%
-----------------------------
Random Forest: 95.3216%
-----------------------------


We can conclude that out of all our models, the Decision Tree classifier was the best fit for our data followed by Random Forest.

SVM and Naive Bayes performed the worst however, in general they all performed at between 94% and 96% which is a very good accuracy for all models.