## This notebook runs training and saves two deployable model pipelines: k-nearest neighbors (KNN) and a random forest classifier (RFC)

In [1]:
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Directory holding the pickled train/test splits. Kept in a single constant so
# the notebook can be pointed at another checkout by editing one line.
DATA_DIR = '/home/nedderlander/datascience/burn notice/Data-Science/Data'

# The training split is required: the cross-validation cell further down calls
# .fit(X_train, y_train), which raises NameError on a fresh kernel if these two
# loads are left commented out.
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
X_train = pd.read_pickle(DATA_DIR + '/X_train_full.pkl')
y_train = pd.read_pickle(DATA_DIR + '/y_train_full.pkl')
X_test = pd.read_pickle(DATA_DIR + '/X_test_full.pkl')
# y_test is not used by any cell visible in this notebook, so it stays unloaded.
# y_test = pd.read_pickle(DATA_DIR + '/y_test_full.pkl')

In [2]:
# List the feature column names of the test split.
X_test.columns.tolist()

['latitude',
 'longitude',
 'brightness',
 'scan',
 'track',
 'acq_time',
 'satellite',
 'confidence',
 'bright_t31',
 'frp',
 'daynight',
 'type',
 'FIRE_YEAR',
 'MONTH',
 'WEEK',
 'DAY']

In [3]:
# Preview the first rows of the test split.
X_test.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_time,satellite,confidence,bright_t31,frp,daynight,type,FIRE_YEAR,MONTH,WEEK,DAY
1865983,19.4104,-155.2771,306.399994,1.1,1.1,830,Terra,68,284.0,12.1,N,2,2015,1,1,1
1865984,19.442499,-155.0047,324.100006,1.1,1.0,830,Terra,100,286.0,29.0,N,2,2015,1,1,1
1865985,19.4601,-154.992493,313.0,1.1,1.0,830,Terra,86,288.0,16.700001,N,2,2015,1,1,1
1865986,19.408701,-155.287598,309.799988,1.1,1.1,830,Terra,78,284.0,14.8,N,2,2015,1,1,1
1865987,41.633301,-87.136101,301.0,1.9,1.3,1717,Terra,33,270.700012,22.700001,D,2,2015,1,1,1


In [4]:
# Count how many rows carry each value of the 'type' column.
X_test['type'].value_counts()

0    163798
2      7400
3        60
1        50
Name: type, dtype: int64

In [None]:
!pip install category-encoders

In [2]:
# scikit-learn pipelines
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# feature processing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# pre-processing pipeline
# Columns to one-hot encode vs. columns to standardise with StandardScaler.
categorical_cols = ['satellite', 'daynight', 'type', 'FIRE_YEAR', 'MONTH']
numeric_cols = ['brightness', 'track', 'scan', 'acq_time', 'confidence', 'bright_t31', 'frp']

# `cols` is passed explicitly: with cols=None, category_encoders' OneHotEncoder
# only encodes string columns, so numeric-looking categoricals such as 'type',
# 'FIRE_YEAR' and 'MONTH' would pass through un-encoded.
# NOTE(review): presumably those three columns are stored as integers (the
# preview of X_test suggests so) -- confirm. If they are already strings this
# is equivalent to the default behaviour.
# Columns not named in either list are passed through unchanged.
column_trans = ColumnTransformer(
    [('onehot', ce.OneHotEncoder(cols=categorical_cols), categorical_cols),
     ('scale', StandardScaler(), numeric_cols)],
    remainder='passthrough')

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Seed for the stochastic estimator(s) built in this notebook.
random_state = 314

# One pipeline per model family, keyed by a short name. make_pipeline names each
# step after its lowercased class name, which is what the
# 'randomforestclassifier__' / 'kneighborsclassifier__' prefixes in the
# hyperparameter grids refer to.
pipelines = dict(
    rfc=make_pipeline(column_trans, RandomForestClassifier(random_state=random_state)),
    knn=make_pipeline(column_trans, KNeighborsClassifier()),
)

# Class-weight grid: class 0 keeps weight 1 while class 1 is up-weighted by
# increasing factors.
# NOTE(review): assumes the target is binary with labels 0 and 1 -- confirm against y_train.
class_weights = [{0: 1, 1: positive_weight} for positive_weight in (1, 2, 5, 10, 100, 1000)]

# Hyperparameter grid for the random forest pipeline. Keys are prefixed with the
# pipeline step name ('randomforestclassifier').
# 'sqrt' replaces 'auto': for a classifier the two select the same number of
# features, but 'auto' is deprecated and rejected by newer scikit-learn releases.
rfc_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['sqrt', 0.3, 0.6],
    'randomforestclassifier__class_weight': class_weights,
}

# Hyperparameter grid for the k-nearest-neighbours pipeline. Each keyword is the
# pipeline step name ('kneighborsclassifier') joined to the estimator parameter
# by a double underscore.
knn_hyperparameters = dict(
    kneighborsclassifier__n_neighbors=[3, 5, 10, 20],
    kneighborsclassifier__weights=['uniform', 'distance'],
    kneighborsclassifier__algorithm=['ball_tree', 'kd_tree'],
    kneighborsclassifier__leaf_size=[15, 30, 45, 75],
)

# Map each pipeline key to its hyperparameter grid; the keys must match those
# used in `pipelines`.
hyperparameter_grids = dict(
    rfc=rfc_hyperparameters,
    knn=knn_hyperparameters,
)

In [4]:
# Sanity check: every pipeline key should have a matching entry in
# hyperparameter_grids, and that entry should be a plain dict.
for key in pipelines:
    if key not in hyperparameter_grids:
        print(key, 'was not found')
        continue

    if type(hyperparameter_grids[key]) is dict:
        message = 'was found, and it is a dict'
    else:
        message = 'was found, and it is not a dict'
    print(key, message)

rfc was found, and it is a dict
knn was found, and it is a dict


In [5]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

# Cross-validation splitter shared by every search.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumably
# relies on X_train being sorted chronologically -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

# One randomized search per pipeline, keyed like `pipelines`.
models = {}
for key, pipeline in pipelines.items():
    models[key] = RandomizedSearchCV(
        pipeline,
        hyperparameter_grids[key],
        scoring='f1',
        cv=tscv,
        n_jobs=-1,
        verbose=10,
        # Seed the candidate sampling so a re-run draws the same hyperparameter
        # combinations; without it each run samples a different subset.
        random_state=random_state,
    )

models.keys()

dict_keys(['rfc', 'knn'])

In [6]:
# Run the cross-validated search for each model. X_train and y_train must
# already be present in the kernel namespace when this cell runs.
for key, search in models.items():
    search.fit(X_train, y_train)
    print(key, 'is trained')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 46.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 63.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 84.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 106.6min finished


rfc is trained
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 75.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 155.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 164.4min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 206.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 223.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 267.4min finished


knn is trained


In [8]:
# joblib is a standalone package. The sklearn.externals shim was deprecated and
# later removed from scikit-learn, so import joblib directly and fall back to
# the shim only on old installs where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# save models and test results
for key, item in models.items():

    # Persist the best pipeline found by the search (refit by the search itself).
    joblib_file = '{}_trained_full.pkl'.format(key)
    model = item.best_estimator_
    joblib.dump(model, joblib_file)

    # Persist the per-candidate CV results, best mean test score first.
    results = pd.DataFrame(item.cv_results_).sort_values('mean_test_score', ascending=False)
    joblib_file = '{}_training_results.pkl'.format(key)
    joblib.dump(results, joblib_file)
    

In [None]:
# below is code to check outputs

In [None]:
# Reload the saved random forest artifacts to examine them.
# File names match the '{}_trained_full.pkl' / '{}_training_results.pkl'
# patterns used when saving, with key 'rfc'.
rfc_model_path = 'rfc_trained_full.pkl'
rfc_results_path = 'rfc_training_results.pkl'

rfc = joblib.load(rfc_model_path)
rfc_results = joblib.load(rfc_results_path)

In [None]:
# Confirm what kind of object was reloaded from 'rfc_trained_full.pkl'.
type(rfc)

In [None]:
# Display the reloaded cross-validation results table (it was sorted by
# 'mean_test_score', descending, before being saved).
rfc_results