# Import Libraries

In [1]:
import time

import pandas as pd
from imblearn.over_sampling import SMOTE
from joblib import dump, load
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Single seed passed to the sampler and classifiers defined below.
random_state = 1

# Import Data

In [2]:
# Read the gzip-compressed, pre-cleaned dataset and preview its first three rows.
df = pd.read_csv(
    '../data/processed/cleaned_dataframe.gz',
    compression='gzip',
)
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


# Train:Test Split

In [3]:
# Separate the feature matrix from the 'Class' target column.
X = df.drop(columns=['Class'])
y = df['Class']

# Hold out 30% of the rows for testing and stratify on y so that both
# partitions keep the same class proportions. The seed comes from the shared
# `random_state` constant instead of a second hard-coded literal, so changing
# the seed in one place changes it everywhere.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)

print("No. of samples in each training set:\t{}".format(X_train.shape[0]))
print("No. of samples in each test set:\t{}".format(X_test.shape[0]))

No. of samples in each training set:	199364
No. of samples in each test set:	85443


# Scaling

In [4]:
# Scaler instance used as the 'scl' step of every pipeline built below.
scaler = RobustScaler()

# SMOTE

In [5]:
# NOTE(review): this sampler is created but not used by any pipeline in the
# cells shown here.
smote = SMOTE(random_state = random_state)

# Construct Pipelines

## Logistic Regression

In [48]:
# Logistic-regression estimator: class 1 is given twice the weight of class 0
# and no intercept term is fitted.
clf_lr = LogisticRegression(
    random_state=random_state,
    class_weight={0: 1, 1: 2},
    fit_intercept=False,
    solver='liblinear',
)

# Scale first ('scl'), then classify ('clf'); these step names are what the
# `clf__...` keys of the parameter grids refer to.
pipe_lr = Pipeline(steps=[
    ('scl', scaler),
    ('clf', clf_lr),
])

## Random Forest

In [38]:
# Random-forest estimator with 30 trees; class 1 is given twice the weight of
# class 0.
clf_rf = RandomForestClassifier(
    random_state=random_state,
    class_weight={0: 1, 1: 2},
    n_estimators=30,
)

# Same two-step layout as the logistic-regression pipeline: scale, then classify.
pipe_rf = Pipeline(steps=[
    ('scl', scaler),
    ('clf', clf_rf),
])


## Support Vector Machine

In [39]:
# pipe_svm = Pipeline([('scl', StandardScaler()),
# 			('clf', svm.SVC(random_state=42))])

# Grid Search Parameters

In [40]:
# Candidate values reused by several random-forest entries of the parameter grid.
param_range = [1, 2, 5, 10]

In [49]:
# Logistic Regression: only the regularisation penalty is searched.
grid_params_lr = [
    {'clf__penalty': ['l1', 'l2']},
]

# Random Forest: min_samples_split uses param_range without its first entry.
grid_params_rf = [
    {
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_leaf': param_range,
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:],
    },
]

# Construct Grid Searches

In [51]:
jobs = -1

# Settings common to both searches: F1 scoring, 10-fold cross-validation and
# the shared worker count.
_shared_search_kwargs = dict(scoring='f1', cv=10, n_jobs=jobs)

# Logistic Regression
gs_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=grid_params_lr,
    **_shared_search_kwargs,
)

# Random Forest
gs_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=grid_params_rf,
    **_shared_search_kwargs,
)

In [52]:
# Grid-search objects to iterate through; the position of each entry is the
# key used to look up its display name in grid_dict.
grids = [gs_lr, gs_rf]

# Map each position in `grids` to a human-readable classifier name.
grid_dict = dict(enumerate(['Logistic Regression', 'Random Forest']))

In [62]:
# Fit every grid search, report its scores, and remember the search whose tuned
# model achieves the highest F1 on the held-out test set.
print('Performing model optimizations...')
best_f1 = 0.0
best_clf = 0
best_gs = None  # no search selected yet

for idx, gs in enumerate(grids):
    print('\nEstimator: {}'.format(grid_dict[idx]))
    # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: {}'.format(gs.best_params_))
    # Cross-validated F1 of the best parameter combination on the training data
    print('Best training F1 score: {}'.format(round(gs.best_score_, 3)))
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data f1 of model with best params (computed once, reused below)
    test_f1 = f1_score(y_test, y_pred)
    print('Test set F1 score for best params: {}'.format(round(test_f1, 3)))
    # Track best (highest test f1) model. The comparison uses the test F1 on
    # both sides; comparing the cross-validation score against a stored test
    # score mixed two different metrics and could pick the wrong model.
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_gs = gs
        best_clf = idx

print('\nClassifier with best test set F1 score: {} ({})'.format(grid_dict[best_clf],
                                                                 best_f1))

Performing model optimizations...

Estimator: Logistic Regression
Best params: {'clf__penalty': 'l1'}
Best training F1 score: 0.421
Test set F1 score for best params: 0.487

Estimator: Random Forest


KeyboardInterrupt: 

## Done individually

In [14]:
def model_fit(classifier_name,
              classifier,
              grid_params,
              X_train,
              y_train,
              X_test,
              y_test,
              scaler,
              n_jobs):
    '''Grid-search a scaler + classifier pipeline and return its test F1 score.

    The pipeline is tuned with 3-fold cross-validation scored on F1, the best
    parameter combination is used to predict X_test, and the fitted
    GridSearchCV object is saved with joblib to
    'best_<classifier_name>_pipeline.joblib' (relative path).

    Parameters
    ----------
    classifier_name : str
        Label used in the printed report and in the output file name.
    classifier : estimator
        Classifier placed in the 'clf' step of the pipeline.
    grid_params : dict or list of dict
        Parameter grid; keys address pipeline steps (e.g. 'clf__penalty').
    X_train, y_train
        Training features and labels used for the cross-validated search.
    X_test, y_test
        Held-out features and labels used for the reported test score.
    scaler : transformer
        Placed in the 'scl' step, ahead of the classifier.
    n_jobs : int
        Forwarded to GridSearchCV's n_jobs.

    Returns
    -------
    The F1 score of the best pipeline's predictions on X_test.
    '''

    pipe = Pipeline([('scl', scaler),
                     ('clf', classifier)])

    gs = GridSearchCV(estimator=pipe,
                      param_grid=grid_params,
                      scoring='f1',
                      cv=3,
                      n_jobs=n_jobs,
                      verbose=2)

    print('\nEstimator: {}\n'.format(classifier_name))

    # Fit model, timing the whole search
    start_time = time.time()
    gs.fit(X_train, y_train)
    end_time = time.time()

    # Cross-validated F1 of the best parameter combination on the training data
    print('\tBest Training F1 score: {}'.format(round(gs.best_score_, 3)))

    # Predict on test data with best params; score once and reuse the value
    # for both the report and the return value.
    y_pred = gs.predict(X_test)
    test_f1_score = f1_score(y_test, y_pred)
    print('\tBest Test F1 score: {}'.format(round(test_f1_score, 3)))

    # Best params
    print('\n\tBest params: {}\n'.format(gs.best_params_))
    # Time to fit model
    print('(\tRuntime {} seconds)'.format(round(end_time - start_time, 2)))

    # Save the fitted grid search and report the file actually written
    # (the message previously printed the classifier name, not the path).
    output_path = 'best_{}_pipeline.joblib'.format(classifier_name)
    dump(gs, output_path)
    print('Best GridSearch Pipeline saved to file "{}"'.format(output_path))

    return test_f1_score

In [15]:
# Worker count handed to model_fit, which forwards it to GridSearchCV.
n_jobs = -1
# Rebinds param_range (previously [1, 2, 5, 10]) to a shorter list for the
# per-model searches below.
param_range = [1, 5, 10]

### Logistic Regression

In [16]:
classifier_name = 'Logistic Regression'

# Same logistic-regression settings as the earlier pipeline: class 1 weighted
# twice as heavily as class 0, no intercept, liblinear solver.
classifier_lr = LogisticRegression(
    random_state=random_state,
    class_weight={0: 1, 1: 2},
    fit_intercept=False,
    solver='liblinear',
)

# Only the regularisation penalty is searched.
grid_params_lr = [
    {'clf__penalty': ['l1', 'l2']},
]

In [17]:
# Tune, evaluate and persist the logistic-regression pipeline; keep its test F1.
f1_score_lr = model_fit(
    classifier_name=classifier_name,
    classifier=classifier_lr,
    grid_params=grid_params_lr,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=scaler,
    n_jobs=n_jobs,
)


Estimator: Logistic Regression

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    6.3s remaining:    6.3s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   28.3s finished


	Best Training F1 score: 0.416
	Best Test F1 score: 0.487

	Best params: {'clf__penalty': 'l1'}

(	Runtime 53.27 seconds)
Best GridSearch Pipeline saved to file "Logistic Regression"


### Random Forest

In [9]:
classifier_name = 'Random Forest'

# 30-tree random forest; class 1 weighted twice as heavily as class 0.
classifier_rf = RandomForestClassifier(
    random_state=random_state,
    class_weight={0: 1, 1: 2},
    n_estimators=30,
)

# min_samples_split uses param_range without its first entry.
grid_params_rf = [
    {
        'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_leaf': param_range,
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:],
    },
]

In [10]:
# Tune, evaluate and persist the random-forest pipeline; keep its test F1.
f1_score_rf = model_fit(
    classifier_name=classifier_name,
    classifier=classifier_rf,
    grid_params=grid_params_rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler=scaler,
    n_jobs=n_jobs,
)


Estimator: Random Forest
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  3.6min finished


	Best Training F1 score: 0.845
	Best Test F1 score: 0.889

	Best params: {'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5}

(	Runtime 235.6 seconds)


In [12]:
from joblib import dump, load

In [13]:
# NOTE(review): the grid search built inside model_fit is local to that
# function, so `gs` only exists here if the earlier fitting loop
# (`for idx, gs in enumerate(grids)`) has been run in this kernel; otherwise
# this line raises NameError, as in the recorded output. model_fit already
# dumps its own fitted grid search using the same file-name pattern.
dump(gs, 'best_{}_pipeline.joblib'.format(classifier_name))

NameError: name 'gs' is not defined

In [None]:
# from sklearn.externals import joblib
# # Save the best grid search pipeline to file with Pickle
# best_gs_pipeline = 'best_gs_pipeline.pkl'
# joblib.dump(best_gs, best_gs_pipeline, compress=1)
# print('\nSaved {} grid search pipeline to file: {}'.format(grid_dict[best_clf],best_gs_pipeline))


To Do:
* Refactor some of the repeated code above into functions
* Add other models (KNN, SVM, gradient boosting with XGBoost and AdaBoost)
* Fix the pickle/joblib saving and move it to the bottom of the pipeline code, in the same cell
* Where should SMOTE go?