In [1]:
import numpy as np, pandas as pd
import warnings
import psutil, os
warnings.filterwarnings('ignore')

# X_train = pd.read_csv('X_train.csv')
# y_train = pd.read_csv('y_train.csv')
# X_test = pd.read_csv('X_test.csv')
# y_test = pd.read_csv('y_test.csv')

# X_train = pd.read_csv('X_train_small.csv')
# y_train = pd.read_csv('y_train_small.csv')
# X_test = pd.read_csv('X_test_small.csv')
# y_test = pd.read_csv('y_test_small.csv')

X_train = pd.read_pickle('X_train_small.pkl')
y_train = pd.read_pickle('y_train_small.pkl')
X_test = pd.read_pickle('X_test_small.pkl')
y_test = pd.read_pickle('y_test_small.pkl')

In [26]:
# memory downcasting system

def float_to_int(ser):
    try:
        int_ser = ser.astype(int)
        if (ser == int_ser).all():
            return int_ser
        else:
            return ser
    except ValueError:
        return ser
    
def all_float_to_int(df):
    df_to_use = df.copy()
    transform_fn = float_to_int
    condition = lambda x: list(x
                    .select_dtypes(include=["float"])
                    .columns)    
    
    return multi_assign(df_to_use, transform_fn, condition)

def multi_assign(df, transform_fn, condition):
    df_to_use = df.copy()
    
    return (df_to_use
        .assign(
            **{col: transform_fn(df_to_use[col])
               for col in condition(df_to_use)})
           )

def downcast_all(df, target_type, inital_type=None):
    #Gotta specify floats, unsigned, or integer
    #If integer, gotta be 'integer', not 'int'
    #Unsigned should look for Ints
    if inital_type is None:
        inital_type = target_type
    
    df_to_use = df.copy()
    
    transform_fn = lambda x: pd.to_numeric(x, 
                                downcast=target_type)
    
    condition = lambda x: list(x
                    .select_dtypes(include=[inital_type])
                    .columns) 
    
    return multi_assign(df_to_use, transform_fn, condition)


In [28]:
# Now I'll set up pipelines

# scikit-learn pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# feature processing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# pre-processing pipeline
column_trans = ColumnTransformer(
    [('onehot', ce.OneHotEncoder(), ['satellite', 'daynight', 'type']),
    ('scale', StandardScaler(), ['brightness', 'track', 'scan', 'acq_time', 'confidence', 'bright_t31', 'frp'])],
    remainder='passthrough')

preprocess = make_pipeline(column_trans, FunctionTransformer(all_float_to_int))
# , 
#                            FunctionTransformer(downcast_all, "float"),
#                           FunctionTransformer(downcast_all, "integer"),
#                           FunctionTransformer(downcast_all, target_type = "unsigned", 
#                            inital_type = "integer"))


In [29]:
preprocess.fit(X_train)

Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('onehot', OneHotEncoder(cols=None, drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, use_cat_...  inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=None))])

In [31]:
preprocess.transform(X_train)

AttributeError: 'numpy.ndarray' object has no attribute 'assign'

In [None]:
df = (df
     .pipe(all_float_to_int)
     .pipe(downcast_all, "float")
     .pipe(downcast_all, "integer")
     .pipe(downcast_all,  
           target_type = "unsigned", 
           inital_type = "integer")
)

In [3]:
import psutil, os
psutil.Process(os.getpid()).memory_info()

pmem(rss=162553856, vms=887197696, shared=48627712, text=2342912, lib=0, data=183377920, dirty=0)

In [3]:
X_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186598 entries, 9 to 1865979
Data columns (total 16 columns):
latitude      186598 non-null float32
longitude     186598 non-null float32
brightness    186598 non-null float32
scan          186598 non-null float32
track         186598 non-null float32
acq_time      186598 non-null uint16
satellite     186598 non-null object
confidence    186598 non-null uint8
bright_t31    186598 non-null float32
frp           186598 non-null float32
daynight      186598 non-null object
type          186598 non-null uint8
FIRE_YEAR     186598 non-null uint16
MONTH         186598 non-null uint8
WEEK          186598 non-null uint8
DAY           186598 non-null uint8
dtypes: float32(7), object(2), uint16(2), uint8(5)
memory usage: 30.7 MB


In [4]:
# try to tune RFC with timeseries split
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

random_state = 314

tscv = TimeSeriesSplit(n_splits=3)

model =  make_pipeline(preprocess, RandomForestClassifier(random_state = random_state ))

rfc_hyperparameters = { 
    'randomforestclassifier__n_estimators' : [200, 400, 600, 800, 1000, 1200, 1600],
    'randomforestclassifier__max_features' : ['auto', 'log2', None],
    'randomforestclassifier__min_samples_leaf' : [2, 5, 10],
    'randomforestclassifier__min_samples_split' : [2,5,10]
}

search = RandomizedSearchCV(estimator=model, cv=tscv, scoring='f1',
                           param_distributions=rfc_hyperparameters, n_jobs=-1, verbose=10)
search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 41.3min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 52.2min finished


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('onehot', OneHotEncoder(cols=None, drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, use_cat_...s='warn', n_jobs=None,
            oob_score=False, random_state=314, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'randomforestclassifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1600], 'randomforestclassifier__max_features': ['auto', 'log2', None], 'randomforestclassifier__min_samples_leaf': [2, 5, 10], 'randomforestclassifier__min_samples_split': [2, 5, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return

In [5]:
search.score(X_test, y_test)

0.5052005943536403

In [7]:
results = pd.DataFrame(search.cv_results_).sort_values('mean_test_score', ascending=False)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__n_estimators,param_randomforestclassifier__min_samples_split,param_randomforestclassifier__min_samples_leaf,param_randomforestclassifier__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
9,569.288943,267.906683,5.439396,0.292203,1000,2,2,,"{'randomforestclassifier__n_estimators': 1000,...",0.512267,0.519709,0.506214,0.51273,0.005519,1,0.985557,0.988712,0.98997,0.988079,0.001856
6,417.513494,201.089227,3.986167,0.210407,800,10,10,,"{'randomforestclassifier__n_estimators': 800, ...",0.530888,0.518278,0.486427,0.511864,0.018709,2,0.84921,0.861309,0.852102,0.854207,0.005159
8,867.328239,413.786421,8.288466,0.281597,1600,5,5,,"{'randomforestclassifier__n_estimators': 1600,...",0.51922,0.519806,0.495765,0.511597,0.011198,3,0.912888,0.924729,0.922157,0.919925,0.005085
2,662.852054,318.806489,6.305778,0.332851,1200,5,5,,"{'randomforestclassifier__n_estimators': 1200,...",0.519245,0.519934,0.495479,0.511553,0.01137,4,0.912888,0.924588,0.922098,0.919858,0.005032
0,332.309383,159.214951,3.132338,0.169734,600,5,5,,"{'randomforestclassifier__n_estimators': 600, ...",0.520937,0.519043,0.493804,0.511261,0.012368,5,0.912181,0.924174,0.922094,0.919483,0.005233


In [23]:
from sklearn.metrics import roc_auc_score

y_pred_proba = search.predict_proba(X_test)[:,1]

roc_auc_score(y_test, y_pred_proba)

0.726967294616319

In [None]:
# pretty tight ROC_AUC

In [11]:
for result in results.params:
    print(result)

{'randomforestclassifier__n_estimators': 1000, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__max_features': None}
{'randomforestclassifier__n_estimators': 800, 'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__min_samples_leaf': 10, 'randomforestclassifier__max_features': None}
{'randomforestclassifier__n_estimators': 1600, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__max_features': None}
{'randomforestclassifier__n_estimators': 1200, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__max_features': None}
{'randomforestclassifier__n_estimators': 600, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__max_features': None}
{'randomforestclassifier__n_estimators': 1200, 'randomforest

In [18]:
# try to speed up search by doing pipeline first

X_train_fit = column_trans.fit_transform(X_train)
X_test_fit = column_trans.transform(X_test)


tscv = TimeSeriesSplit(n_splits=3)

model =  RandomForestClassifier(random_state = random_state)

rfc_hyperparameters = { 
    'n_estimators' : [200, 400, 600, 800, 1000, 1200, 1600],
    'max_features' : ['auto', 'log2', None],
    'min_samples_leaf' : [2, 5, 10],
    'min_samples_split' : [2,5,10]
}

search = RandomizedSearchCV(estimator=model, cv=tscv, scoring='f1', n_jobs=-1,
                           param_distributions=rfc_hyperparameters, verbose=10)

search.fit(X_train_fit, y_train.values.ravel())

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  1.7min remaining:   11.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.6min finished


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
          error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=314, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1600], 'max_features': ['auto', 'log2', None], 'min_samples_leaf': [2, 5, 10], 'min_samples_split': [2, 5, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=10)

In [19]:
search.score(X_test_fit, y_test)

0.5188284518828452

In [21]:
X_test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1713 entries, 0 to 1712
Data columns (total 16 columns):
latitude      1713 non-null float64
longitude     1713 non-null float64
brightness    1713 non-null float64
scan          1713 non-null float64
track         1713 non-null float64
acq_time      1713 non-null int64
satellite     1713 non-null object
confidence    1713 non-null int64
bright_t31    1713 non-null float64
frp           1713 non-null float64
daynight      1713 non-null object
type          1713 non-null int64
FIRE_YEAR     1713 non-null int64
MONTH         1713 non-null int64
WEEK          1713 non-null int64
DAY           1713 non-null int64
dtypes: float64(7), int64(7), object(2)
memory usage: 400.7 KB


In [22]:
X_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18659 entries, 0 to 18658
Data columns (total 16 columns):
latitude      18659 non-null float64
longitude     18659 non-null float64
brightness    18659 non-null float64
scan          18659 non-null float64
track         18659 non-null float64
acq_time      18659 non-null int64
satellite     18659 non-null object
confidence    18659 non-null int64
bright_t31    18659 non-null float64
frp           18659 non-null float64
daynight      18659 non-null object
type          18659 non-null int64
FIRE_YEAR     18659 non-null int64
MONTH         18659 non-null int64
WEEK          18659 non-null int64
DAY           18659 non-null int64
dtypes: float64(7), int64(7), object(2)
memory usage: 4.3 MB
