# Table 2 and Table 3: 
# Random Forest and Logistic Regression Classification Results 

This notebook contains the code for the Random Forest and Logistic Regression classification results reported in Tables 2 and 3, that is, AUROC, sensitivity (recall) and specificity. To reproduce these results, adjust the parameters in the following sections (clearly indicated below):  

- in **data** section: inclusion threshold
    - 200 bets or 10 sessions
- in **pipeline** section: modeling parameters
    - full or reduced variable set
    - Random Forest Classification or Logistic Regression 
    - unbalanced (full) or balanced dataset


# libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import time
import random
import datetime 
import pickle
import os.path
import warnings

# Style presets (line styles, colours, markers); they are not referenced in the
# cells visible in this notebook.
linestyles = [
    'solid',
    'dotted',
    'dashed',
    'dashdot',
]
colors = [
    'b',
    'r',
    'k',
    '#FFFF00',
    'g',
    '#808080',
    '#56B4E9',
    '#FF7F00',
]
markers = ['.', '+', 's']

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Run the companion notebooks before anything else in this notebook.
# The helper functions that the cells below call without defining them
# (e.g. get_df_results, get_pipeline, save_obj, load_obj) and the
# VariableSelector class are presumably defined there -- TODO confirm.
%run _helper.ipynb
%run _feature_selection_class.ipynb

# parameters

In [None]:
# Path of the input CSV file; it is handed to pd.read_csv() in the data section.
input_path = ""
# Prefix prepended to the names of the objects written by save_obj().
output_path = ""

# data

In [None]:
################### SELECT INCLUSION THRESHOLD ###################

# Keep exactly one of the two assignments below active.
inclusion_var, inclusion_threshold = 'total_bets', 200  # >200 bet threshold
#inclusion_var, inclusion_threshold = 'total_number_of_sessions', 10  # sensitivity analysis: >10 session threshold

#################################################################


# Read the raw table; `header` holds every column name except the first and the last.
data = pd.read_csv(input_path)
header = list(data.columns)[1:-1]

# Keep only the rows strictly above the chosen inclusion threshold.
above_threshold = data[inclusion_var] > inclusion_threshold
data = data[above_threshold]

# Summary: sample size, number of header columns, and the row counts for
# exclusion == 0 ('non-vse') and exclusion == 1 ('vse').
print('total: ', len(data))
print('columns: ', len(header))
for label, flag in (('non-vse: ', 0), ('vse: ', 1)):
    print(label, len(data[data.exclusion == flag]))

In [None]:
# Load the results table that the analysis cells below fill in and save.
# NOTE(review): get_df_results is not defined in this notebook (presumably it
# comes from _helper.ipynb); confirm there what the 'df_results' argument means
# and which index labels and columns the returned table is expected to have.
df_results = get_df_results('df_results')

# hyperparameters

In [None]:
# Candidate hyperparameter values. None of these constants is referenced
# directly in this notebook; presumably get_param_grid() assembles them into
# the search grid -- TODO confirm in the helper notebook.

# --- Random Forest ---
BOOTSTRAP = [True, False]
CRITERION = ['entropy', 'gini']
MAX_FEATURES = [
    0.1,
    None,
    'log2',
    'sqrt',
]
N_ESTIMATORS = [
    20,
    60,
    100,
    500,
    1000,
]
MAX_DEPTH = [10, 20, 50, None]
MIN_SAMPLES_SPLIT = [2, 30, 100]
N_JOBS = [-1]
RANDOM_STATE = [123]

# --- Logit ---
C_VALUE = [
    0.1,
    0.5,
    1,
    1.5,
    2,
]
MAX_ITER = [300]

# pipeline

In [None]:
# NESTED CROSS VALIDATION
# Number of folds; used for both the inner and the outer stratified splitter.
n_splits = 10


################### SELECT MODELING PARAMETERS ###################
# For each of the three choices below keep exactly one line active.

# 1. FULL VS SELECTED VARIABLES
#featureselector = 'passthrough' # use all 20 variables
featureselector = VariableSelector() # use 6 selected variables

# 2. ESTIMATOR
estimator = RandomForestClassifier()
#estimator = LogisticRegression()

# 3. DATASET MODE
mode = 'full' # full dataset
#mode= 'balanced' # balanced dataset

##################################################################


# Build the modelling inputs from the choices made above.
X, y = get_dataset_full_or_balanced(mode=mode, data=data, header=header)
param_grid = get_param_grid(estimator)
pipe, analysis_type, variables, model = get_pipeline(
    featureselector=featureselector,
    estimator=estimator,
    inclusion_var=inclusion_var,
)


# Echo the resulting configuration so it can be checked before the long run.
for fields in (
    ('analysis type:', analysis_type),
    ('model:', model),
    ('mode:', mode, 'size:', len(X)),
    ('variables:', variables),
    ('n_splits:', n_splits),
    ('pipeline:', pipe),
    ('param_grid:', param_grid),
):
    print(*fields)


# nested cross-validation (AUROC) 

In [None]:
# Nested cross-validation of the configured pipeline (AUROC), followed by a
# final hyperparameter search on the complete dataset. The scores and the best
# hyperparameters are written into df_results, and the fitted objects are saved.
start_time = time.time()

# define cross-validation technique:
# the inner folds tune the hyperparameters, the outer folds score the tuned model
inner_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

# set up grid search
# note: refit=True refits the best estimator with the best hyperparameters on the
# entire outer training fold.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when running on a newer release.
grid_search = GridSearchCV(pipe, cv=inner_cv, n_jobs=-1, param_grid=param_grid,
                           iid=False, scoring='roc_auc', refit=True)

# nested cross-validation
# (the outer loop is left sequential; the inner grid search already uses n_jobs=-1)
nested_cv = cross_validate(estimator=grid_search,
                           X=X, y=y,
                           scoring='roc_auc',
                           cv=outer_cv,
                           error_score=np.nan,
                           return_estimator=True,
                           return_train_score=True)
runtime = time.time() - start_time
print('Runtime: ', runtime, 'seconds')

# selecting hyperparameters on the full dataset
# The search has to wrap the pipeline, not the bare estimator: the same
# param_grid is used for the pipeline search above, and the 'featureselect'
# step is read from best_estimator_ further down.
clf_final = GridSearchCV(pipe, cv=inner_cv, n_jobs=-1, param_grid=param_grid,
                         iid=False, scoring='roc_auc', refit=True).fit(X, y)
runtime = time.time() - start_time  # cumulative since the start of the cell
print('Runtime: ', runtime, 'seconds')


# AUROC values for train and test (mean and standard deviation over outer folds)
mean_train = nested_cv['train_score'].mean()
std_train = np.std(nested_cv['train_score'])
mean_test = nested_cv['test_score'].mean()
std_test = np.std(nested_cv['test_score'])

print('################## Performance Metrics for', model, '##################')

print('analysis type:', analysis_type)
print('model:', model)
# `mode` is the dataset selector ('full' / 'balanced') chosen in the pipeline section
print('dataset:', mode, 'size:', len(X))
print('variables:', variables)
print('Average ROC AUC train:', np.round(mean_train, 2), ' std:', np.round(std_train, 2))
print('Average ROC AUC test:', np.round(mean_test, 2), ' std:', np.round(std_test, 2))
print('Best hyperparms:', clf_final.best_params_)

print('##################################################################################')

# the selected columns only exist when the feature-selection step is active
if variables != 'full_vars':
    selected_columns = clf_final.best_estimator_['featureselect'].results['selected columns']
else:
    selected_columns = 'NA'

# NOTE(review): the local clock is shifted back by 7 hours, presumably a manual
# time-zone adjustment -- confirm.
now = (datetime.datetime.now() - datetime.timedelta(hours=7)).strftime("%Y-%m-%d_%H:%M")


# store output in the results dataframe (the table that is saved below and read
# by the classification section); only a row whose index equals `ml` is updated
ml = '_'.join((model, analysis_type, mode, variables))
row = df_results.index == ml
df_results.loc[row, 'auc_mean_train'] = np.round(mean_train, 2)
df_results.loc[row, 'auc_std_train'] = np.round(std_train, 2)
df_results.loc[row, 'auc_mean_test'] = np.round(mean_test, 2)
df_results.loc[row, 'auc_std_test'] = np.round(std_test, 2)
# NOTE(review): best_params_ is a dict; verify that your pandas version stores it
# as a single cell value here.
df_results.loc[row, 'best_parms_final'] = clf_final.best_params_
df_results.loc[row, 'selected_columns'] = selected_columns
df_results.loc[row, 'timestamp'] = now


# save objects
save_obj(path=output_path + '_'.join([ml, 'nested']), obj=nested_cv)
save_obj(path=output_path + '_'.join([ml, 'final']), obj=clf_final)
save_obj(path=output_path + 'df_results', obj=df_results)

print('done')

# classifications (recall, specificity)

In [None]:
# For every model listed in df_results: reload the saved final grid search,
# refit its best estimator on each outer training fold, pool the out-of-fold
# predicted probabilities, and derive recall and specificity at the mean of the
# per-fold optimal cut-off points.

# load data (unfiltered; the inclusion threshold is applied per model below)
data = pd.read_csv(input_path)
header = list(data)[1:-1]

# get list of models in df_results
# alternative: restrict to rows whose first column is already filled
#models = [i for i in df_results[df_results.iloc[:,0].notnull()].index]
models = list(df_results.index)

# get splits
n_splits = 10
inner_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

# run simple 10-fold loop
for model in models:

    # prep data: names containing 'main' use the >200 bets sample,
    # all other names use the >10 sessions sample
    if 'main' in model:
        d = data[data['total_bets'] > 200].copy()
    else:
        d = data[data['total_number_of_sessions'] > 10].copy()

    # get sample: names containing 'balanced' use the balanced dataset
    sample_mode = 'balanced' if 'balanced' in model else 'full'
    X, y = get_dataset_full_or_balanced(mode=sample_mode, data=d, header=header)

    # load model from the location the nested cross-validation section saved it to
    final = load_obj(output_path + '_'.join([model, 'final']))
    cutoff_points = []
    predictions = []
    y_true = []
    for outer_fold, (train_index, test_index) in enumerate(outer_cv.split(X, y)):

        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # refit the tuned estimator on this training fold (fit() is called on
        # best_estimator_ itself, so the stored object is refitted)
        clf = final.best_estimator_.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
        cut_opt = get_optimal_cutoff_point(fpr, tpr, thresholds)

        cutoff_points.append(cut_opt)
        predictions.extend(y_pred)
        y_true.extend(y_test)

    # classify the pooled out-of-fold predictions at the mean cut-off
    cutoff_mean = np.mean(cutoff_points)
    y_class = get_classification(predictions, cutoff=cutoff_mean)
    conf_matrix = metrics.confusion_matrix(y_true, y_class)
    recall = metrics.recall_score(y_true, y_class)
    specificity = get_specificity(conf_matrix)

    df_results.loc[model, 'mean_cutoff_opt'] = cutoff_mean
    # NOTE(review): conf_matrix is a 2-D array; verify that your pandas version
    # accepts it as a single cell value here.
    df_results.loc[model, 'conf_matrix'] = conf_matrix
    df_results.loc[model, 'specificity'] = specificity
    df_results.loc[model, 'recall'] = recall



# save objects (path first, object second, as in the nested cross-validation section)
save_obj(path=output_path + 'df_results', obj=df_results)

print('done')