# Wisconsin Breast Cancer - Import

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml, time, sys, os

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  
sns.set_style("darkgrid")

DATASET = "BC_Wisconsin"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = True


In [None]:
if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(d): os.makedirs(d)
  if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)


## Load Dataset

In [None]:
UCI = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/"

for filename in ["wdbc.data", "wdbc.names"]:
    source = f"{UCI}/{filename}"
    target = f"{ROOT}/orig/{filename}"
    if not os.path.isfile(target):
        print (f"Downloading remote file {filename}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename}")


In [None]:
names = ['id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 'concavity_mean',
         'concave_points_mean', 'symmetry_mean', 
         'fractal_dimension_mean', 'radius_se', 'texture_se', 
         'perimeter_se', 'area_se', 'smoothness_se', 
         'compactness_se', 'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 'perimeter_worst',
         'area_worst', 'smoothness_worst', 
         'compactness_worst', 'concavity_worst', 
         'concave_points_worst', 'symmetry_worst', 
         'fractal_dimension_worst'] 

if not os.path.isfile(f"{ROOT}/data/wdbc.data"):
    print("Reading original data ...")
    df = pd.read_csv(f"{ROOT}/orig/wdbc.data",header=None, names=names)

    print("Encoding target ...")
    df.diagnosis = df.diagnosis.map( {"M":0, "B":1} )

    print("Drop unique identifieer ...")
    df.drop(columns=["id_number"], inplace=True)

    print("Save to folder data/  ...")
    df.to_csv(f"{ROOT}/data/wdbc.data", index=False)

print("Load from folder data/ ...")
df = pd.read_csv(f"{ROOT}/data/wdbc.data")

print(df.shape)
df.head(10)

## Preprocessing

In [None]:
X = df.values[:,1:]
y = df.diagnosis.values
X.shape, y.shape

## Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Baseline Model

In [None]:
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier()
scores = cross_val_score(model, X, y, cv=10)
scores.mean(), scores.std()


## Grid Search

Import module

In [None]:
from sklearn.model_selection import GridSearchCV

Defining the parameter search space

In [None]:
parameter_space = {
    "criterion": ['gini', 'entropy'],
    'max_depth': range(1,20),
    'max_features': np.linspace(0.1, 0.9, 9),
    "n_estimators": range(2,21),
}

Defining the search

In [None]:
grid_search = GridSearchCV(model, parameter_space, n_jobs=-1)

Carring out  the search

In [None]:
start = time.time()
grid_search.fit(X, y)
end = time.time()

Reporting results

In [None]:
print("Fit Time:", end - start)
print("Best param:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

## Optuna

Import module

In [None]:
import optuna

Define objective function (to maximise/minimise) 

In [None]:
def objective(trial):

    # Parameter space
    parameter_space = {
        "criterion": trial.suggest_categorical('criterion', ['gini','entropy']),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "max_features": trial.suggest_float("max_features", 0.1, 0.9),
        "n_estimators": trial.suggest_int("n_estimators", 2, 10),
    }

    # Setup model using hyper-parameters values
    model = RandomForestClassifier(**parameter_space)

    # Scoring model
    score = cross_val_score(model, X, y, n_jobs=-1, cv=10)

    return score.mean()

Defining the search

In [None]:
study = optuna.create_study(direction="maximize")

Carring out  the search

In [None]:
start = time.time()
study.optimize(objective, n_trials=100)
end = time.time()

Reporting results

In [None]:
print("Fit Time:", end - start)
print("Best Param:", study.best_params)
print("Best score:", study.best_value)

## Hyperopt

In [78]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [79]:
parmeter_space = {
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.uniform('max_features', 0.1,0.9),
    'n_estimators': hp.choice('n_estimators', range(1,10)),
}

In [80]:
def objective(params):
    model = RandomForestClassifier(**params)
    return cross_val_score(model, X, y).mean()

In [81]:
best = 0
best_param = {}

def f(params):
    global best, best_param
    acc = objective(params)
    if acc > best:
        best = acc
        best_param = params
        print( 'new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}
trials = Trials()

In [82]:
start = time.time()
hyperopt_search = fmin(f, parmeter_space, algo=tpe.suggest, max_evals=300, trials=trials)
end = time.time()

new best:                                              
0.9209594783418724                                     
{'criterion': 'entropy', 'max_depth': 1, 'max_features': 0.8125342871681082, 'n_estimators': 5}
new best:                                                                         
0.9508306163639186                                                                
{'criterion': 'entropy', 'max_depth': 11, 'max_features': 0.2761617577615303, 'n_estimators': 9}
new best:                                                                         
0.9525384257102933                                                                
{'criterion': 'gini', 'max_depth': 15, 'max_features': 0.6760393398030533, 'n_estimators': 6}
new best:                                                                         
0.9596335972675052                                                                
{'criterion': 'gini', 'max_depth': 9, 'max_features': 0.7561581423414493, 'n_estimators': 9}
new best: 

In [83]:
print("Fit Time:", end - start)
print("Best Param:", best_param)
print("Best score:", best)

Fit Time: 31.647873878479004
Best Param: {'criterion': 'entropy', 'max_depth': 5, 'max_features': 0.23492833392247894, 'n_estimators': 9}
Best score: 0.9701288619779538
