## Importing the dataset

In [1]:
import pandas as pd
dataset = pd.read_csv("Data.csv")
dataset.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [2]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Check for missing values

In [3]:
dataset.isnull().sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

No missing values let's move ahead.

## Splitting the data 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Creating Different Classification Models

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score


log_reg = LogisticRegression(random_state=0)
knn = KNeighborsClassifier()
linear_svc = SVC(kernel='linear', random_state=0)
poly_svc = SVC(kernel='poly', random_state=0)
rbf_svc = SVC(kernel='rbf', random_state=0)
sigmoid_svc = SVC(kernel='sigmoid', random_state=0)
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(random_state=0)

## Training All the Models

In [7]:
classifiers = [log_reg, knn, linear_svc, poly_svc, rbf_svc, sigmoid_svc, nb, dt, rf]
best_acc_score = 0
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("\nModel: ", classifier)
    print("Confusion Matrix:\n", cm)
    new_acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score: ",new_acc_score)
    if new_acc_score > best_acc_score:
        best_acc_score = new_acc_score
        best_model = classifier
print("=="*50)
print("\nThe Best Model is: ", classifier)
print("With Accuracy Score: ",best_acc_score)


Model:  LogisticRegression(random_state=0)
Confusion Matrix:
 [[84  3]
 [ 3 47]]
Accuracy Score:  0.9562043795620438

Model:  KNeighborsClassifier()
Confusion Matrix:
 [[83  4]
 [ 2 48]]
Accuracy Score:  0.9562043795620438

Model:  SVC(kernel='linear', random_state=0)
Confusion Matrix:
 [[83  4]
 [ 2 48]]
Accuracy Score:  0.9562043795620438

Model:  SVC(kernel='poly', random_state=0)
Confusion Matrix:
 [[86  1]
 [ 7 43]]
Accuracy Score:  0.9416058394160584

Model:  SVC(random_state=0)
Confusion Matrix:
 [[82  5]
 [ 1 49]]
Accuracy Score:  0.9562043795620438

Model:  SVC(kernel='sigmoid', random_state=0)
Confusion Matrix:
 [[82  5]
 [ 2 48]]
Accuracy Score:  0.948905109489051

Model:  GaussianNB()
Confusion Matrix:
 [[80  7]
 [ 0 50]]
Accuracy Score:  0.948905109489051

Model:  DecisionTreeClassifier(random_state=0)
Confusion Matrix:
 [[80  7]
 [ 3 47]]
Accuracy Score:  0.927007299270073

Model:  RandomForestClassifier(random_state=0)
Confusion Matrix:
 [[84  3]
 [ 1 49]]
Accuracy Scor

## Cross validating all the Models

In [8]:
from sklearn.model_selection import cross_val_score
classifiers = [log_reg, knn, linear_svc, poly_svc, rbf_svc, sigmoid_svc, nb, dt, rf]

for classifier in classifiers:
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print(classifier)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    print("="*100, "\n")

LogisticRegression(random_state=0)
Accuracy: 96.70 %
Standard Deviation: 1.97 %

KNeighborsClassifier()
Accuracy: 96.70 %
Standard Deviation: 1.79 %

SVC(kernel='linear', random_state=0)
Accuracy: 97.07 %
Standard Deviation: 2.19 %

SVC(kernel='poly', random_state=0)
Accuracy: 95.59 %
Standard Deviation: 2.87 %

SVC(random_state=0)
Accuracy: 96.53 %
Standard Deviation: 1.91 %

SVC(kernel='sigmoid', random_state=0)
Accuracy: 96.52 %
Standard Deviation: 2.09 %

GaussianNB()
Accuracy: 96.16 %
Standard Deviation: 1.91 %

DecisionTreeClassifier(random_state=0)
Accuracy: 93.59 %
Standard Deviation: 2.21 %

RandomForestClassifier(random_state=0)
Accuracy: 96.52 %
Standard Deviation: 2.39 %



In [9]:
from sklearn.model_selection import cross_val_score
best_acc_score = 0
best_std = 0
for classifier in classifiers:
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    new_acc_score = accuracies.mean()*100
    new_std = accuracies.std()*100
    print(classifier)
    print("Accuracy: {:.2f} %".format(new_acc_score))
    print("Standard Deviation: {:.2f} %".format(new_std))
    print("="*100, "\n")
    
    if new_acc_score > best_acc_score:
        best_model = classifier
        best_acc_score = new_acc_score
        best_std = new_std

print("=="*50)
print("\nThe Best Model is: ", classifier)
print("Having Accuracy Score: ",best_acc_score)
print("With Standard Deviation: ", best_std)

LogisticRegression(random_state=0)
Accuracy: 96.70 %
Standard Deviation: 1.97 %

KNeighborsClassifier()
Accuracy: 96.70 %
Standard Deviation: 1.79 %

SVC(kernel='linear', random_state=0)
Accuracy: 97.07 %
Standard Deviation: 2.19 %

SVC(kernel='poly', random_state=0)
Accuracy: 95.59 %
Standard Deviation: 2.87 %

SVC(random_state=0)
Accuracy: 96.53 %
Standard Deviation: 1.91 %

SVC(kernel='sigmoid', random_state=0)
Accuracy: 96.52 %
Standard Deviation: 2.09 %

GaussianNB()
Accuracy: 96.16 %
Standard Deviation: 1.91 %

DecisionTreeClassifier(random_state=0)
Accuracy: 93.59 %
Standard Deviation: 2.21 %

RandomForestClassifier(random_state=0)
Accuracy: 96.52 %
Standard Deviation: 2.39 %


The Best Model is:  RandomForestClassifier(random_state=0)
Having Accuracy Score:  97.07070707070707
With Standard Deviation:  2.1943977876398093
