In [2]:
import numpy as np, pandas as pd
import warnings
import psutil, os
warnings.filterwarnings('ignore')

# X_train = pd.read_csv('X_train.csv')
# y_train = pd.read_csv('y_train.csv')
# X_test = pd.read_csv('X_test.csv')
# y_test = pd.read_csv('y_test.csv')

# X_train = pd.read_csv('X_train_small.csv')
# y_train = pd.read_csv('y_train_small.csv')
# X_test = pd.read_csv('X_test_small.csv')
# y_test = pd.read_csv('y_test_small.csv')

# Read the four `*_full.pkl` pickles (train/test features and targets) in one
# pass; the generator yields them in the same order as the names on the left.
X_train, y_train, X_test, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'X_train_full.pkl',
        'y_train_full.pkl',
        'X_test_full.pkl',
        'y_test_full.pkl',
    )
)

In [5]:
!pip install category-encoders

Collecting category-encoders
[?25l  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
[K    100% |████████████████████████████████| 92kB 5.7MB/s ta 0:00:011
Installing collected packages: category-encoders
Successfully installed category-encoders-2.0.0
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
# Now I'll set up pipelines

# scikit-learn pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# feature processing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# Pre-processing step: one-hot encode the three listed columns, standard-scale
# the seven listed columns, and pass every remaining column through unchanged.
column_trans = ColumnTransformer(
    transformers=[
        (
            'onehot',
            ce.OneHotEncoder(),
            ['satellite', 'daynight', 'type'],
        ),
        (
            'scale',
            StandardScaler(),
            [
                'brightness', 'track', 'scan', 'acq_time',
                'confidence', 'bright_t31', 'frp',
            ],
        ),
    ],
    remainder='passthrough',
)

# preprocess = make_pipeline(column_trans, FunctionTransformer(all_float_to_int))
# # , 
# #                            FunctionTransformer(downcast_all, "float"),
# #                           FunctionTransformer(downcast_all, "integer"),
# #                           FunctionTransformer(downcast_all, target_type = "unsigned", 
# #                            inital_type = "integer"))


In [None]:
# Tune an XGBoost classifier with a randomized search over a time-series split
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from xgboost import XGBClassifier

# Single seed shared by the classifier and the hyperparameter sampler.
random_state = 314

# Three ordered folds, so the cross-validation respects row order.
tscv = TimeSeriesSplit(n_splits=3)

# Pre-processing followed by the classifier; make_pipeline names the final
# step `xgbclassifier`, which is the prefix used in the grid keys below.
model = make_pipeline(column_trans, XGBClassifier(random_state=random_state))

# Hyperparameter grid for the gradient booster (3 * 3 * 2 = 18 combinations;
# the randomized search samples a subset of them).
xgb_hyperparameters = {
    'xgbclassifier__n_estimators': [100, 200],
    'xgbclassifier__learning_rate': [0.05, 0.1, 0.2],
    'xgbclassifier__max_depth': [1, 3, 5],
}

# The grid passed here must be the one defined above: the previous
# `rfc_hyperparameters` is not defined anywhere in this notebook and only
# resolved through leftover kernel state. `random_state` makes the sampled
# candidates reproducible across runs.
search = RandomizedSearchCV(estimator=model, cv=tscv, scoring='f1',
                            param_distributions=xgb_hyperparameters,
                            random_state=random_state,
                            n_jobs=-1, verbose=10)
search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  30 | elapsed: 18.2min remaining: 163.4min
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed: 30.0min remaining: 98.6min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed: 38.0min remaining: 65.6min


In [None]:
# Evaluate the fitted search on the test split. The search was configured with
# scoring='f1', so the value displayed here is an F1 score, not accuracy.
search.score(X_test, y_test)

In [None]:
# Tabulate the cross-validation results, best mean validation score first,
# and show the top rows.
results = (
    pd.DataFrame(search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
results.head()

In [None]:
from sklearn.metrics import roc_auc_score

# Keep only column 1 of the predicted probabilities — presumably the positive
# class; verify against the fitted classifier's `classes_` order.
y_pred_proba = search.predict_proba(X_test)[:,1]

# ROC AUC on the test split, computed from the probability scores above.
roc_auc_score(y_test, y_pred_proba)

In [None]:
# Save the tuned model to disk
import pickle

# NOTE(review): the file name still says "rfc" although the pipeline wraps an
# XGBClassifier; it is kept unchanged so anything that loads this path still works.
filename = 'rfc_big_fire_training.sav'

# `model` is only the unfitted template that RandomizedSearchCV clones for each
# candidate; the fitted pipeline with the best hyperparameters is
# `search.best_estimator_`. The `with` block closes the file handle once the
# dump has been written.
with open(filename, 'wb') as model_file:
    pickle.dump(search.best_estimator_, model_file)