In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Exploration of the dataset
We first start by checking the presence of missing values

In [2]:
dataset = pd.read_csv('wbc.csv').drop('id', axis=1)
print(f'Shape of the dataset : {dataset.shape}.\nPresence of NaNs:\n{dataset.isna().sum()}')

Shape of the dataset : (569, 31).
Presence of NaNs:
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


We now check the descriptive statistics of the dataset

In [9]:
dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [3]:
X = dataset.drop('diagnosis', axis=1).values
y = dataset['diagnosis'].values
y = y=='M'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

# Feature Selection

In [4]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

ss = StandardScaler()
X_scale = ss.fit_transform(X)
selector = RFECV(LogisticRegression(max_iter=10000), step=1, cv=5)
selector.fit(X_scale, y)
print(selector.ranking_)
print(selector.support_)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]


# Compare models performance

## Logistic regression

In [5]:
from sklearn.linear_model import LogisticRegression

steps = [('scale', StandardScaler()), ('logreg', LogisticRegression())]
params = {'logreg__penalty': ['l1', 'l2'],
         'logreg__C': np.linspace(0.1, 10.0, 30),
         'logreg__solver': ['saga'],
         'logreg__max_iter': [10000]}
pipeline = Pipeline(steps)
grid_logreg = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_logreg.fit(X_train, y_train)

## Random Forest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': [500],
    'max_depth': [6],
    'max_features': [1]
        #'n_estimators': np.arange(100, 600, 100),
        #'max_depth': np.arange(2,16,2),
        #'max_features': [1.0, 'sqrt', 'log2',1],
    }

grid_rfc = GridSearchCV(RandomForestClassifier(random_state=99), param_grid = params, cv=5)

grid_rfc.fit(X_train, y_train)
print(grid_rfc.score(X_test, y_test), grid_rfc.best_params_, grid_rfc.best_score_)

0.956140350877193 {'max_depth': 6, 'max_features': 1, 'n_estimators': 500} 0.9472527472527472


## Support Vector Machine

In [28]:
from sklearn.svm import SVC

steps = [('scale', StandardScaler()), ('svc', SVC(random_state=99))]
params = {
        'svc__C': np.linspace(0.01, 1.0, 30),
        'svc__kernel': ['linear'],
        'svc__degree': np.arange(2,10,1)
}
pipeline = Pipeline(steps)

grid_svc = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_svc.fit(X_train, y_train)

print(grid_svc.score(X_test, y_test), grid_svc.best_params_)

0.9649122807017544 {'svc__C': 0.04413793103448276, 'svc__degree': 2, 'svc__kernel': 'linear'}


## K Neighbors Classifier

In [27]:
from sklearn.neighbors import KNeighborsClassifier

steps = [('scale', StandardScaler()), ('knn', KNeighborsClassifier())]
params = {
        'knn__n_neighbors': np.arange(1,15),
        'knn__weights': ['uniform', 'distance']
}
pipeline=Pipeline(steps)

grid_knn = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_knn.fit(X_train, y_train)

print(grid_knn.best_score_, grid_knn.best_params_, grid_knn.score(X_test, y_test))

0.9670329670329672 {'knn__n_neighbors': 9, 'knn__weights': 'uniform'} 0.956140350877193


## Naive Bayes

In [48]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

print(gnb.score(X_test, y_test))

0.9210526315789473


## Ensemble Learning

In [47]:
from sklearn.metrics import accuracy_score

y_pred = (np.where(grid_rfc.predict(X_test),1,0) + np.where(grid_svc.predict(X_test),1,0) + np.where(grid_knn.predict(X_test),1,0))//2
print(accuracy_score(y_test, y_pred))

0.9649122807017544
