# Workshop 6

Starter code for workshop 6. You should have seen most of it before, but make sure you understand what it is doing!

In [None]:
# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set the font sizes used for axis labels and for x/y tick labels in the
# figures drawn below
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To plot even prettier figures
import seaborn as sn

# General data handling (pure numerics are better in numpy)
import pandas as pd

In [None]:
# Load scikit-learn's bundled breast cancer dataset. The returned object is
# accessed below through its .data, .target and .feature_names attributes.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [None]:
# Pull the feature matrix and the label vector out of the loaded dataset
xarray, yarray = data.data, data.target
print(xarray.shape)
print(yarray.shape)

# Append the labels as the last column so that features and target stay
# aligned row-by-row through the later splits
fullarray = np.column_stack((xarray, yarray))
print(fullarray.shape)

### Viewing the data
One thing we haven't done yet with this dataset is actually look at some of the data.

In [None]:
## Let's be a bit more imaginative

# Scatter each of the first eight features against the class label to get a
# feel for how well a single feature separates the two classes.
# NOTE: the label column is plotted with whatever encoding fullarray holds
# when this cell runs; the inversion to malignant=1 is done in a later cell.
n_rows, n_cols = 2, 4

# Define a proper size of the plot and create one axis per feature
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 6))

# Check the relation between each variable and the target.
# Feature columns are 0-based while subplot indices are 1-based; iterating
# over the axes keeps the two apart so that feature 0 is not skipped.
for i, ax in enumerate(axes.flat):
    ax.plot(fullarray[:, i], fullarray[:, -1], 'k.')
    ax.set_title(data.feature_names[i])

# Label the shared quantity once per row and stop the titles from overlapping
for ax in axes[:, 0]:
    ax.set_ylabel('target')
fig.tight_layout()

In [None]:
# Remember to define properly the positive and negative class.
# Invert the labels (so that malignant=1) by deriving the target column from
# the untouched data.target instead of flipping fullarray's last column in
# place: an in-place `1 - column` flip toggles the labels on every execution,
# so re-running this cell would silently restore the original encoding.
fullarray[:, -1] = 1 - data.target
df = pd.DataFrame(fullarray, columns=list(data.feature_names) + ['target'])

In [None]:
# Check the dataframe: display it as the cell output to confirm the feature
# column names and the appended 'target' column
df

# Splitting into separate datasets


In [None]:
from sklearn.model_selection import train_test_split

# Split in train, validation and test. Check the parameter "stratify".
# Step 1: hold out 15% of all rows as the test set, stratified on the label
# column (the last column of fullarray).
bigtrain_set, test_set = train_test_split(
    fullarray,
    test_size=0.15,
    random_state=42,
    stratify=fullarray[:, -1],
)

# Step 2: carve the validation set out of the remaining 85%.
# 0.1765 * 0.85 is approximately 0.15, so validation is also about 15% of
# the full data, leaving roughly 70% for training.
train_set, val_set = train_test_split(
    bigtrain_set,
    test_size=0.1765,
    random_state=42,
    stratify=bigtrain_set[:, -1],
)

**Note the use of "stratify" in the calls above: it makes sure that each dataset has roughly the same proportions of the classes.**

In [None]:
# Separate every partition into its feature matrix (all columns except the
# last) and its label vector (the last column)
X_train, y_train = train_set[:, :-1], train_set[:, -1]
X_test, y_test = test_set[:, :-1], test_set[:, -1]
X_val, y_val = val_set[:, :-1], val_set[:, -1]

# Sanity-check the resulting shapes
print(f'Shapes are {[X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape]}')

In [None]:
# Fraction of samples labelled 1 in the train, test and validation partitions
# (with 0/1 labels the mean is exactly that fraction)
print(y_train.mean(), y_test.mean(), y_val.mean())

These are the proportions of the positive class in each dataset: because the classes are encoded as 0 and 1, the mean of the labels is simply the proportion of samples in the class represented by 1.

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing pipeline: median imputation followed by standard scaling.
# Steps run in the order listed and are addressable by their names.
preproc_pl = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# The tutor will demonstrate an SVM classifier, covering the following:
* build a pipeline
* AUC/feature
* ROC Curves
* Visualisation of decision boundary
* Validation
* Tuning with C Values
* More options
* Model Selection