In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [22]:
df= pd.read_csv("all_penguins_clean.csv", index_col=0)
df.dropna(inplace=True) # quick solve for NaN error in m.fit below
df.shape

(334, 14)

In [23]:
X = df[['Culmen Length (mm)', "Body Mass (g)"]]
y = df["Species"]

We want to :
* try different values for each hyperparameter
* train the model for each
* see which has the best validation score

In [24]:
for i in range (1,11):
    m = DecisionTreeClassifier(max_depth=i)
    mean_acc = cross_val_score(m, X, y, cv=5, scoring='accuracy').mean().round(3)
    print (f"{i:3} {mean_acc:8.3f}")

  1    0.740
  2    0.910
  3    0.907
  4    0.916
  5    0.916
  6    0.907
  7    0.904
  8    0.895
  9    0.901
 10    0.901


In [25]:
from sklearn.model_selection import GridSearchCV

In [27]:
#define hyperparameters to combine
hyperparams = {
    'max_depth'  : list(range(1,11)),
    'min_samples_split' : list(range(2,20,2))
}

# create an estimator
m =DecisionTreeClassifier()
g = GridSearchCV(m, hyperparams, cv=5)
g.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
       

In [28]:
g.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
g.best_params_

{'max_depth': 5, 'min_samples_split': 6}

In [30]:
g.cv_results_

{'mean_fit_time': array([0.0075706 , 0.00388417, 0.0039866 , 0.00539312, 0.0035738 ,
        0.0032752 , 0.00370827, 0.00421968, 0.00352459, 0.0034894 ,
        0.00323114, 0.00469003, 0.00370755, 0.00366635, 0.0036932 ,
        0.00339079, 0.0034214 , 0.00432773, 0.00613098, 0.00342288,
        0.00399942, 0.00355654, 0.00365105, 0.00400157, 0.0033514 ,
        0.00424933, 0.00378761, 0.00387349, 0.00510507, 0.01215496,
        0.00477715, 0.01264486, 0.00490341, 0.00378957, 0.01103301,
        0.00681958, 0.0061903 , 0.0055892 , 0.00444059, 0.00886555,
        0.00629506, 0.00532541, 0.00454235, 0.00526133, 0.00391765,
        0.00417242, 0.0055676 , 0.00504084, 0.00631065, 0.00426803,
        0.00760241, 0.00583439, 0.00574841, 0.00534177, 0.00391827,
        0.00435634, 0.00407739, 0.00392852, 0.00414381, 0.00598297,
        0.00572906, 0.00838847, 0.00425763, 0.00372863, 0.00631042,
        0.00515127, 0.00729394, 0.00389032, 0.00421491, 0.00405354,
        0.00467534, 0.00428352,

In [31]:
g.best_score_

0.9252374491180462

## Hyperparameter optimisation
### Column Transformer
take columns a, b, c, apply feature engineering M to them, then put the result back in the dataframe


In [43]:
from sklearn.compose import ColumnTransformer

trans = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Island', 'Sex']),
    ('my_binning', KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile'), ['Culmen Depth (mm)'] ),
    ('do_nothing', "passthrough", ['Culmen Length (mm)', 'Body Mass (g)'])
])

In [55]:
trans

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('onehot',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='ignore',
                                               sparse=False),
                                 ['Island', 'Sex']),
                                ('my_binning',
                                 KBinsDiscretizer(encode='onehot', n_bins=5,
                                                  strategy='quantile'),
                                 ['Culmen Depth (mm)']),
                                ('do_nothing', 'passthrough',
                                 ['Culmen Length (mm)', 'Body Mass (g)'])],
                  verbose=False)

In [44]:
trans.fit(df)
X = trans.transform(df)
X.shape

(334, 13)

In [48]:
y = df['Species']

In [45]:
X[0]

array([0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 3.91e+01,
       3.75e+03])

### Scaling
uses the output of the Column Transformer


In [49]:
scaler = MinMaxScaler()
scaler.fit(X)
Xsc = scaler.transform(X)

## Modelling Pipeline:
1. Apply column transformations (OneHot, Binning, Custom funcs)
2. Scale everything
3. Train a model


In [51]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [52]:
p = make_pipeline(trans, MinMaxScaler(), LogisticRegression())

In [54]:
p.fit(df,y) # only on the training data
p.score(df,y) # also on test data

0.9970059880239521

Using `pipeline` we only have to call `.fit` once

We can use `p` in `cross_val_score` 

In [None]:
#p.score(X_test, y_test) #<-- This is where the X_test is feature engineered to be the same as X_train