In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Exploration of the dataset
We first start by checking the presence of missing values

In [None]:
dataset = pd.read_csv('wbc.csv').drop('id', axis=1)
print(f'Shape of the dataset : {dataset.shape}.\nPresence of NaNs:\n{dataset.isna().sum()}')

We now check the descriptive statistics of the dataset

In [None]:
dataset.describe()

## Conclusion
We learnt that there are no missing values in our features and that they are not on the same scale (normalization might be necessary for some models)<br>
We can therefore proceed and split the train and test set

In [3]:
X = dataset.drop('diagnosis', axis=1).values
y = dataset['diagnosis'].values
y = y=='M'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

# Feature Selection

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

ss = StandardScaler()
X_scale = ss.fit_transform(X)
selector = RFECV(LogisticRegression(max_iter=10000), step=1, cv=5)
selector.fit(X_scale, y)
print(selector.ranking_)
print(selector.support_)

## Conclusion
RFECV showe us that no feature should be removed from the dataset. 

# Compare models performance

## Logistic regression

In [52]:
from sklearn.linear_model import LogisticRegression

steps = [('scale', StandardScaler()), ('logreg', LogisticRegression())]
params = {'logreg__penalty': ['l1', 'l2'],
         'logreg__C': np.linspace(0.1, 10.0, 30),
         'logreg__solver': ['saga'],
         'logreg__max_iter': [10000]}
pipeline = Pipeline(steps)
grid_logreg = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_logreg.fit(X_train, y_train)

print(f'Score when tested on the test set : {grid_logreg.score(X_test, y_test)}.\nThis score is obtained with the following parameters : {grid_logreg.best_params_}')

Score when tested on the test set : 0.9649122807017544.
This score is obtained with the following parameters : {'logreg__C': 0.4413793103448276, 'logreg__max_iter': 10000, 'logreg__penalty': 'l2', 'logreg__solver': 'saga'}


## Random Forest Classifier

In [50]:
from sklearn.ensemble import RandomForestClassifier

params = {
        'n_estimators': np.arange(100, 600, 100),
        'max_depth': np.arange(2,10,2),
        'max_features': [1.0, 'sqrt', 'log2',1],
    }

grid_rfc = GridSearchCV(RandomForestClassifier(random_state=99), param_grid = params, cv=5)

grid_rfc.fit(X_train, y_train)

print(f'Score when tested on the test set : {grid_rfc.score(X_test, y_test)}.\nThis score is obtained with the following parameters : {grid_rfc.best_params_}')

Score when tested on the test set : 0.9385964912280702.
This score is obtained with the following parameters : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}


## Support Vector Machine

In [51]:
from sklearn.svm import SVC

steps = [('scale', StandardScaler()), ('svc', SVC(random_state=99))]
params = {
        'svc__C': np.linspace(0.01, 1.0, 30),
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__degree': np.arange(2,10,1)
}
pipeline = Pipeline(steps)

grid_svc = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_svc.fit(X_train, y_train)

print(f'Score when tested on the test set : {grid_svc.score(X_test, y_test)}.\nThis score is obtained with the following parameters : {grid_rfc.best_params_}')

Score when tested on the test set : 0.9649122807017544.
This score is obtained with the following parameters : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}


## K Neighbors Classifier

In [53]:
from sklearn.neighbors import KNeighborsClassifier

steps = [('scale', StandardScaler()), ('knn', KNeighborsClassifier())]
params = {
        'knn__n_neighbors': np.arange(1,15),
        'knn__weights': ['uniform', 'distance']
}
pipeline=Pipeline(steps)

grid_knn = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_knn.fit(X_train, y_train)

print(f'Score when tested on the test set : {grid_knn.score(X_test, y_test)}.\nThis score is obtained with the following parameters : {grid_knn.best_params_}')

Score when tested on the test set : 0.956140350877193.
This score is obtained with the following parameters : {'knn__n_neighbors': 9, 'knn__weights': 'uniform'}


## Naive Bayes

In [54]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

print(f'Score when tested on the test set : {gnb.score(X_test, y_test)}.')

Score when tested on the test set : 0.9210526315789473.


## Ensemble Learning

In [56]:
y_pred = (np.where(grid_rfc.predict(X_test),1,0) + np.where(grid_svc.predict(X_test),1,0) + np.where(grid_knn.predict(X_test),1,0))//2
print(f'Accuracy when using applying a majority vote to the 3 best models : {accuracy_score(y_test, y_pred)}\nWe see that this is the same accuracy as the best model')

Accuracy when using applying a majority vote to the 3 best models : 0.9649122807017544
We see that this is the same accuracy as the best model


## Conclusion
The best performing model seems to be the linear ones. Indeed, the logistic regression and the linear SVC performed with the same accuracy : 96.5%