# helper

This notebook contains helper functions used throughout this repository.

# utilities

In [None]:
# save and load objects

def save_obj(obj, path):
    '''Pickle an object to disk.

    The object is written to ``path + '.pkl'`` using the highest pickle
    protocol available; an existing file with that name is overwritten.

    Parameters
    ----------
    obj : object
          Any picklable python object (model, dictionary, dataframe) to store
    path : str
           Destination path WITHOUT the file extension ('.pkl' is appended)

    '''

    target = path + '.pkl'

    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    '''Read a pickled object back from disk.

    The file ``path + '.pkl'`` is opened in binary mode and unpickled.

    Parameters
    ----------
    path : str
           Source path WITHOUT the file extension ('.pkl' is appended)

    Returns
    -------
    The unpickled python object.

    '''

    # NOTE: unpickling can execute arbitrary code; only load trusted files
    source = path + '.pkl'

    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)

    return loaded

# tables 2 and 3

In [None]:
# load dataframe to store modeling results
def get_df_results(path):
    '''Load or create dataframe to store modeling results.

    If a pickled results dataframe already exists it is loaded with
    load_obj(); otherwise an empty dataframe (all values NaN) is created with
    one row per model configuration and one column per stored result.

    Parameters
    ----------
    path : str
           Path of the stored dataframe WITHOUT the '.pkl' extension, i.e.
           the same value that is passed to load_obj()

    Returns
    -------
    pandas.DataFrame indexed by model configuration name.

    '''

    # load_obj() opens path + '.pkl', so the existence check has to look at
    # that very file; checking the bare path would never find stored results
    if os.path.exists(path + '.pkl'):
        return load_obj(path)

    # row labels: <model>_<analysis type>_<dataset>_<variable set>
    models_list = [
        'random_forest_main_full_full_vars',
        'random_forest_main_full_selected_vars',
        'random_forest_main_balanced_full_vars',
        'random_forest_main_balanced_selected_vars',
        'random_forest_sens_full_full_vars',
        'random_forest_sens_full_selected_vars',
        'random_forest_sens_balanced_full_vars',
        'random_forest_sens_balanced_selected_vars',
        'logit_main_full_full_vars',
        'logit_main_full_selected_vars',
        'logit_main_balanced_full_vars',
        'logit_main_balanced_selected_vars',
        'logit_sens_full_full_vars',
        'logit_sens_full_selected_vars',
        'logit_sens_balanced_full_vars',
        'logit_sens_balanced_selected_vars',
    ]

    # result columns filled in by the modeling notebooks
    result_columns = [
        'auroc_mean_train',
        'auroc_std_train',
        'auroc_mean_test',
        'auroc_std_test',
        'best_parms_final',
        'selected_columns',
        'timestamp',
        'mean_cutoff_opt',
        'conf_matrix',
        'recall',
        'specificity',
    ]

    df_results = pd.DataFrame(index=models_list, columns=result_columns)

    return df_results
        

In [None]:
# get dataset (balanced or unbalanced)
def get_dataset_full_or_balanced(mode, data, header, target='exclusion'):
    '''Get balanced or unbalanced dataset.

    For mode 'balanced', all rows with target == 1 are kept and an equally
    sized random subset of the rows with target == 0 is drawn WITHOUT
    replacement (capped at the number of available target == 0 rows). Any
    other mode returns the full dataset unchanged.

    The draw uses the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand for reproducible samples.

    Parameters
    ----------
    mode : str
           'full' for unbalanced dataset, 'balanced' for balanced dataset
    data : pandas.DataFrame
           Dataset containing the columns in `header` and the `target` column
    header : array of strings
             Array of variable names
    target : str
             Target variable (coded 1 / 0)

    Returns
    -------
    X, y : regressor matrix (data[header]) and target column (data[target]),
           restricted to the selected rows

    '''

    # assign regressor matrix (X) and array with dependent variable (y)
    X = data[header]
    y = data[target]

    if mode == 'balanced':
        positives = y.index[y == 1]
        negatives = y.index[y == 0]

        # Sampling with replacement would yield duplicate labels that the
        # union below silently collapses, leaving fewer negatives than
        # positives. Draw distinct labels instead.
        n_draw = min(len(positives), len(negatives))
        drawn = random.sample(list(negatives), n_draw)

        keep = positives.union(drawn)
        X = X.loc[keep]
        y = y.loc[keep]

    return X, y

In [None]:
# get hyperparameter grid
def get_param_grid(estimator):
    '''Assemble hyperparameter grid for hyperparameter search.

    Get hyperparameter grid for Random Forest Classifier and Logistic Regression
    to use in sklearn.model_selection.GridSearchCV. The keys carry the
    'classify__' prefix, i.e. they address the pipeline step named 'classify'.
    The candidate values come from module-level constants (C_VALUE, MAX_ITER,
    BOOTSTRAP, ...) that are defined outside this function.

    Parameters
    ----------
    estimator : sklearn.ensemble.RandomForestClassifier or sklearn.linear_model.LogisticRegression instance

    Returns
    -------
    list containing a single dict that maps parameter names to candidate values

    Raises
    ------
    SystemExit
        If `estimator` is neither of the two supported estimator types.

    '''

    # parameter grid for Logistic Regression
    if isinstance(estimator, LogisticRegression):
        return [
            {
                'classify__C': C_VALUE,
                'classify__max_iter': MAX_ITER,
            }
        ]

    # parameter grid for Random Forest Classifier
    if isinstance(estimator, RandomForestClassifier):
        return [
            {
                'classify__bootstrap': BOOTSTRAP,
                'classify__criterion': CRITERION,
                'classify__max_features': MAX_FEATURES,
                'classify__n_estimators': N_ESTIMATORS,
                'classify__max_depth': MAX_DEPTH,
                'classify__min_samples_split': MIN_SAMPLES_SPLIT,
                'classify__n_jobs': N_JOBS,
                'classify__random_state': RANDOM_STATE,
            }
        ]

    # unsupported estimator: abort with a non-zero status so the failure is
    # not reported as a successful exit
    print('WARNING: Estimator not found.')
    sys.exit(1)

In [None]:
# define pipeline
def get_pipeline(featureselector, estimator, inclusion_var):
    '''Assemble pipeline to use in sklearn.pipeline.

    Initialize the sklearn.pipeline.Pipeline class, consisting of a
    feature selection stage ('featureselect') and the classification
    (estimator) stage ('classify'), and derive the labels that describe
    the configuration.

    Parameters
    ----------
    featureselector : str ('passthrough') or VariableSelector() class
                      If 'passthrough', no feature selection is performed.
    estimator : sklearn.ensemble.RandomForestClassifier or sklearn.linear_model.LogisticRegression instance
    inclusion_var : str
                    'total_bets' selects the main analysis, any other value
                    the sensitivity analysis (200 bet threshold vs.
                    10 session threshold)

    Returns
    -------
    pipe : sklearn.pipeline.Pipeline
    analysis_type : str
                    'main' or 'sens'
    variables : str
                'full_vars' or 'selected_vars'
    model : str
            'logit' or 'random_forest'

    Raises
    ------
    TypeError
        If `estimator` is neither a LogisticRegression nor a
        RandomForestClassifier instance.

    '''

    # get model; reject unsupported estimators explicitly instead of failing
    # later on an unassigned local variable
    if isinstance(estimator, LogisticRegression):
        model = 'logit'
    elif isinstance(estimator, RandomForestClassifier):
        model = 'random_forest'
    else:
        raise TypeError(
            'estimator must be a LogisticRegression or '
            'RandomForestClassifier instance, got '
            + type(estimator).__name__
        )

    # initialize pipeline
    pipe = Pipeline([
        ('featureselect', featureselector),
        ('classify', estimator),
    ])

    # get variables setting
    if featureselector == 'passthrough':
        variables = 'full_vars'
    else:
        variables = 'selected_vars'

    # get analysis type
    if inclusion_var == 'total_bets':
        analysis_type = 'main'
    else:
        analysis_type = 'sens'

    return pipe, analysis_type, variables, model

In [None]:
# get optimal classification threshold
def get_optimal_cutoff_point(fpr, tpr, thresholds):
    '''Pick the classification cutoff that maximizes tpr - fpr.

    For every candidate threshold the difference between true positive rate
    and false positive rate (the two axes of a ROC curve) is computed; the
    threshold with the largest difference is returned. On ties the first
    maximum wins (np.argmax behaviour).

    Parameters
    ----------
    fpr, tpr, thresholds : arrays of equal length, as returned by the
                           sklearn.metrics.roc_curve method

    Returns
    -------
    The element of `thresholds` at the position of the maximal difference.

    '''

    rate_gap = tpr - fpr
    best_position = np.argmax(rate_gap)

    return thresholds[best_position]

In [None]:
# get classification based on cutoff and array of predicted probabilities
def get_classification(array, cutoff=0.5):
    '''Turn predicted probabilities into 0/1 class labels.

    Entries strictly greater than `cutoff` are labelled 1, all others 0.

    Parameters
    ----------
    array : array of float
            Array of predicted probabilities
    cutoff : float
             Classification threshold value

    Returns
    -------
    numpy array of floats (0.0 / 1.0) with the same length as `array`.

    '''

    labels = np.zeros(len(array))
    above_cutoff = array > cutoff
    labels[above_cutoff] = 1

    return labels

In [None]:
# calculate specificity
def get_specificity(confusion_matrix):
    '''Calculate specificity from confusion matrix.

    Specificity is computed as the first entry of the first row divided by
    the sum of the first row. This assumes the sklearn layout, in which
    row 0 holds the actual negatives and column 0 the predicted negatives.

    Parameters
    ----------
    confusion_matrix : array-like, as returned by sklearn.metrics.confusion_matrix

    Returns
    -------
    Specificity (true negatives / all actual negatives).

    '''

    first_row = np.array(confusion_matrix)[0]
    true_negatives = first_row[0]

    return true_negatives / first_row.sum()

# Figure 1 and Supplementary Figure 2

In [None]:
def get_feature_importances(clf, features):
    '''Extract feature importances from fitted Random Forest Classification model.

    Parameters
    ----------
    clf : fitted model
          Must expose feature_importances_, estimators_ and the number of
          features seen during fit (n_features_in_, or n_features_ on older
          scikit-learn versions)
    features : array of features
               Feature names in the order of the columns the model was
               fitted on

    Returns
    -------
    pandas.DataFrame with the columns 'features', 'importances' and 'std'
    (standard deviation of the importance across all estimators), sorted by
    importance in descending order.

    Raises
    ------
    SystemExit
        If the number of feature names differs from the number of features
        the model was fitted on.

    '''

    # scikit-learn replaced n_features_ by n_features_in_; prefer the new
    # attribute and fall back to the legacy one for older fitted models
    n_fitted = getattr(clf, 'n_features_in_', None)
    if n_fitted is None:
        n_fitted = clf.n_features_

    if len(features) != n_fitted:
        print('WARNING: len(features)!=len(n_features_) is True. Fitted model must correspond to indicated features.')
        # non-zero status: this is an error exit
        sys.exit(1)

    # clf.feature_importances_:
    # one importance value per feature, in the order the features appear in X
    importances = clf.feature_importances_

    # standard deviation for each feature's importance across all estimators
    std = np.std([tree.feature_importances_ for tree in clf.estimators_],
                 axis=0)

    # create dataframe w/ feature importance, most important feature first
    feat_imp = pd.DataFrame({'features': features,
                             'importances': importances,
                             'std': std}).sort_values(by='importances', ascending=False)

    return feat_imp

In [None]:
def get_full_variable_names(header):
    '''Create dictionary with variable names and their full names.

    The variable names are sorted and paired position by position with the
    list of full names below; the caller's `header` is left untouched.

    Parameters
    ----------
    header : iterable of str
             Variable names

    Returns
    -------
    dict mapping each variable name to its full (display) name. If `header`
    and the list of full names differ in length, the surplus entries of the
    longer one are dropped (zip behaviour).

    '''

    # NOTE(review): this list presumably follows the alphabetical order of the
    # short variable names used in this repository - verify when adding or
    # renaming variables, as the pairing below is purely positional
    features_fullnames = [
        'Average Session Length',
        'Bets per Day',
        'Bets per Session',
        'Days Gambled',
        'Distinct Games per Session',
        'Money Bet per Session',
        'Net Loss per Session',
        'Net Win per Session',
        'Money Bet from Promotional Offers per Session',
        'Sessions per Day',
        'Total Bets',
        'Total Money Bet',
        'Total Net Loss',
        'Total Net Win',
        'Total Sessions',
        'Total Money Bet from Promotional Offers',
        'Variance in Bets per Session',
        'Variance in Distinct Games per Session',
        'Variance in Money Bet per Session',
        'Variance in Average Session Length',
    ]

    # sorted() works on a copy: no in-place reordering of the caller's
    # sequence, and tuples / pandas Index objects are accepted as well
    ordered_names = sorted(header)

    return dict(zip(ordered_names, features_fullnames))