## Estimating the Cost with Cross-Validation

3 ways of estimating the cost:

- Domain Expert provides the cost
- Balance Ratio (we did this in previous notebook)
- Cross-validation: find cost as hyper-parameter

In this notebook, the cost is applied as a hyper parameter search with cross-validation.

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
# only a few observations to speed the computaton

data = pd.read_csv('../kdd2004.csv').sample(10000)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
42061,30.89,26.58,-0.95,-36.0,68.5,2601.9,-0.84,-0.32,-11.0,-68.5,...,2040.6,-0.55,3.29,8.0,-120.0,311.2,2.4,0.08,-0.06,-1
16662,57.14,20.83,-0.55,5.5,53.0,1261.9,1.1,-1.01,2.0,-52.5,...,1301.2,1.2,-0.9,-3.0,-22.0,18.0,0.9,0.09,-0.19,-1
5035,80.0,18.48,-0.8,17.0,-7.0,617.4,0.86,0.09,15.5,-61.5,...,369.8,0.03,-0.25,0.0,-15.0,9.6,0.32,0.03,-0.77,-1
131062,54.95,30.0,-0.83,-32.0,23.0,1664.5,-0.01,-0.2,0.0,-77.0,...,1273.7,1.13,1.8,3.0,-66.0,266.4,1.76,0.31,0.13,-1
141169,77.42,27.21,-0.16,-37.5,80.5,2684.8,0.51,0.46,11.5,-76.0,...,3462.7,-0.85,-0.12,-10.0,-51.0,392.4,1.12,0.35,0.18,-1


In [3]:
# imbalanced target

data.target.value_counts() / len(data)

-1    0.9925
 1    0.0075
Name: target, dtype: float64

In [4]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),  # drop the target
    data['target'],  # just the target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((7000, 74), (3000, 74))

In [5]:
# set up initial rf

rf = RandomForestClassifier(n_estimators=50,
                            random_state=39,
                            max_depth=2,
                            n_jobs=4,
                            class_weight=None)

In [6]:
# set up parameter search grid
# including class weight

param_grid = {
  'n_estimators': [10, 50, 100],
  'max_depth': [None, 2, 3],
  'class_weight': [None, {-1:1, 1:10}, {-1:1, 1:100}],
}

In [7]:
search = GridSearchCV(estimator=rf,
                      scoring='roc_auc',
                      param_grid=param_grid,
                      cv=2,
                     ).fit(X_train, y_train)

In [8]:
search.best_score_

0.9842762589928058

In [10]:
search.best_params_

{'class_weight': {-1: 1, 1: 100}, 'max_depth': 3, 'n_estimators': 100}

In [11]:
search.best_estimator_

RandomForestClassifier(class_weight={-1: 1, 1: 100}, max_depth=3, n_jobs=4,
                       random_state=39)

In [12]:
search.score(X_test, y_test)

0.9861781512605042