In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Combine Train and Test Sets

We need to create stumps for both the train and test sets, so we combine them first and record a `train_index` flag (1 = train row, 0 = test row) that is used to split the data back into train and test sets in later steps.

In [34]:
## data path
## NOTE(review): these are absolute, user-specific locations; consider a configurable data directory.
train_set = "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/train.csv"
test_set = "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/test.csv"

## get rid of extra columns
## (a single list, so train and test are guaranteed to lose the same columns)
drop_cols = ['PersonID', 'screening_date', 'fta_risk_score_raw', 'nca_risk_score_raw',
             'pvf_risk_score_raw', 'fta_calc', 'nca_calc', 'pvf_calc']
train = pd.read_csv(train_set).drop(columns=drop_cols)
test = pd.read_csv(test_set).drop(columns=drop_cols)

## stack train on top of test; ignore_index gives every row a unique label
combined = pd.concat([train, test], ignore_index=True)

## features are every column up to and including 'current_violence'
features = combined.loc[:, :'current_violence']
X = features.values

## copy the labels: a later cell rewrites Y in place, and that must not
## write through to (or be blocked by) the underlying `combined` frame
Y = combined.loc[:, 'recid_two_year'].values.copy()

## add training set index for future use (1 = train row, 0 = test row)
train_index = [1] * len(train) + [0] * len(test)

## feature names come from the same slice as X, so they cannot drift apart
cols = features.columns

##### sanity check

In [40]:
## shapes of the two inputs, the stacked frame, the feature matrix and the label vector
## (the stacked frame is named `combined`; `data` is not defined anywhere in this notebook)
train.shape, test.shape, combined.shape, X.shape, Y.shape

((162513, 46), (18058, 46), (180571, 46), (180571, 37), (180571,))

In [41]:
## feature names, in the order they are later passed to create_stumps
cols

Index(['Gender', 'age_at_current_charge', 'p_arrest', 'p_charges',
       'p_violence', 'p_felony', 'p_misdemeanor', 'p_property', 'p_murder',
       'p_assault', 'p_sex_offense', 'p_weapon', 'p_felprop_viol',
       'p_felassult', 'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui',
       'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass',
       'ADE', 'Treatment', 'p_prison', 'p_jail', 'p_fta_two_year',
       'p_fta_two_year_plus', 'p_pending_charge', 'p_probation',
       'p_SentMonths', 'six_month', 'one_year', 'three_year', 'five_year',
       'current_violence'],
      dtype='object')

#### Function

In [42]:
def create_stumps(data, columns, cutpoints):

    """
    Turn every feature into a set of binary threshold indicators ("stumps").

    @parameters:

    - data: features; 2-D np.array whose i-th column holds the values of columns[i]
    - columns: feature names, one per column of `data`
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of thresholds for columns[i]

    @returns:

    - pd.DataFrame with one row per row of `data` and one float column
      (1.0 / 0.0) per (feature, threshold) pair. Columns are named
      '<feature><=<cut>' for 'age_at_current_charge' and '<feature>>=<cut>'
      for every other feature, in the order of `columns` and `cutpoints`.
      If `columns` is empty, the frame has the right number of rows and no columns.
    """

    n = data.shape[0]
    blocks = []   # one (n, len(cutoff)) float array per feature
    names = []    # flat list of stump names, aligned with the stacked blocks

    ## loop through features
    for i, feature in enumerate(columns):

        ## subset feature
        feature_values = np.asarray(data[:, i])
        cutoff = cutpoints[i]
        thresholds = np.asarray(cutoff)

        ## create stumps
        ### 'age_at_current_charge' uses '<=' intervals; every other variable uses '>=' intervals.
        ### binary variables simply get the single cutoff 1.
        ### Broadcasting (n, 1) against (1, k) compares every row with every
        ### threshold at once; a NaN value compares False and so yields 0,
        ### just as an element-by-element comparison would.
        if feature == 'age_at_current_charge':
            op = '<='
            hits = feature_values[:, None] <= thresholds[None, :]
        else:
            op = '>='
            hits = feature_values[:, None] >= thresholds[None, :]

        ## store stumps (names use the caller's original cutoff objects)
        blocks.append(hits.astype(float))
        names.extend(feature + op + str(c) for c in cutoff)

    ## post process: assemble the frame once, after all features are done
    if not blocks:
        return pd.DataFrame(index=pd.RangeIndex(n))

    return pd.DataFrame(np.hstack(blocks), columns=names)

#### Cutoffs from GA2M Model -- version 1

- avg. train auc: 0.7062
- avg. validation auc: 0.7065
- holdout auc: 0.7182
- holdout acc: 0.7490


Logistic
- avg. train auc: 0.7429
- avg. validation auc: 0.7419

In [12]:
## Version-1 thresholds: one inner list per feature, in the same order as `cols`
## (create_stumps pairs cutoffs[i] with cols[i]).
## NOTE: the version-2 cell below assigns `cutoffs` again; run only the version you want
## before creating the stumps.
cutoffs = [
    [1],                                                                      # 0  Gender
    [18, 19, 20, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 53, 55, 57, 60, 63, 66],  # 1  age_at_current_charge
    list(range(1, 11)),                                                       # 2  p_arrest: 1..10
    list(range(1, 22, 2)),                                                    # 3  p_charges: 1,3,..,21
    list(range(1, 8, 2)),                                                     # 4  p_violence: 1,3,5,7
    list(range(1, 10, 2)),                                                    # 5  p_felony: 1,3,..,9
    list(range(1, 14, 2)),                                                    # 6  p_misdemeanor: 1,3,..,13
    [1, 2],                                                                   # 7  p_property
    [1],                                                                      # 8  p_murder
    [1, 2],                                                                   # 9  p_assault
    [1],                                                                      # 10 p_sex_offense
    [1],                                                                      # 11 p_weapon
    [1],                                                                      # 12 p_felprop_viol
    [1],                                                                      # 13 p_felassult
    [1, 2],                                                                   # 14 p_misdeassult
    [1, 2, 3],                                                                # 15 p_traffic
    list(range(1, 8, 2)),                                                     # 16 p_drug: 1,3,5,7
    [1],                                                                      # 17 p_dui
    [1],                                                                      # 18 p_stalking
    [1],                                                                      # 19 p_voyeurism
    [1, 2],                                                                   # 20 p_fraud
    [1, 2],                                                                   # 21 p_stealing
    [1],                                                                      # 22 p_trespass
    [1],                                                                      # 23 ADE
    [1],                                                                      # 24 Treatment
    [1, 2],                                                                   # 25 p_prison
    list(range(1, 6)),                                                        # 26 p_jail: 1..5
    [1, 2],                                                                   # 27 p_fta_two_year
    [1],                                                                      # 28 p_fta_two_year_plus
    list(range(1, 5)),                                                        # 29 p_pending_charge: 1..4
    [1, 2],                                                                   # 30 p_probation
    list(range(0, 49, 3)),                                                    # 31 p_SentMonths: 0,3,..,48
    [1],                                                                      # 32 six_month
    [1],                                                                      # 33 one_year
    [1],                                                                      # 34 three_year
    [1],                                                                      # 35 five_year
    [1],                                                                      # 36 current_violence
]

#### Cutoffs from GA2M Model -- version 2

- avg. train auc: 0.7062
- avg. validation auc: 0.7065
- holdout auc: 0.7182
- holdout acc: 0.7490


Logistic
- avg. train auc: 0.7429
- avg. validation auc: 0.7419

In [50]:
## Version-2 thresholds: one inner list per feature, in the same order as `cols`
## (create_stumps pairs cutoffs[i] with cols[i]).
## Differs from version 1 only in p_arrest and p_charges, which no longer include the cutoff 1.
cutoffs = [
    [1],                                                                      # 0  Gender
    [18, 19, 20, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 53, 55, 57, 60, 63, 66],  # 1  age_at_current_charge
    list(range(2, 11)),                                                       # 2  p_arrest: 2..10
    list(range(3, 22, 2)),                                                    # 3  p_charges: 3,5,..,21
    list(range(1, 8, 2)),                                                     # 4  p_violence: 1,3,5,7
    list(range(1, 10, 2)),                                                    # 5  p_felony: 1,3,..,9
    list(range(1, 14, 2)),                                                    # 6  p_misdemeanor: 1,3,..,13
    [1, 2],                                                                   # 7  p_property
    [1],                                                                      # 8  p_murder
    [1, 2],                                                                   # 9  p_assault
    [1],                                                                      # 10 p_sex_offense
    [1],                                                                      # 11 p_weapon
    [1],                                                                      # 12 p_felprop_viol
    [1],                                                                      # 13 p_felassult
    [1, 2],                                                                   # 14 p_misdeassult
    [1, 2, 3],                                                                # 15 p_traffic
    list(range(1, 8, 2)),                                                     # 16 p_drug: 1,3,5,7
    [1],                                                                      # 17 p_dui
    [1],                                                                      # 18 p_stalking
    [1],                                                                      # 19 p_voyeurism
    [1, 2],                                                                   # 20 p_fraud
    [1, 2],                                                                   # 21 p_stealing
    [1],                                                                      # 22 p_trespass
    [1],                                                                      # 23 ADE
    [1],                                                                      # 24 Treatment
    [1, 2],                                                                   # 25 p_prison
    list(range(1, 6)),                                                        # 26 p_jail: 1..5
    [1, 2],                                                                   # 27 p_fta_two_year
    [1],                                                                      # 28 p_fta_two_year_plus
    list(range(1, 5)),                                                        # 29 p_pending_charge: 1..4
    [1, 2],                                                                   # 30 p_probation
    list(range(0, 49, 3)),                                                    # 31 p_SentMonths: 0,3,..,48
    [1],                                                                      # 32 six_month
    [1],                                                                      # 33 one_year
    [1],                                                                      # 34 three_year
    [1],                                                                      # 35 five_year
    [1],                                                                      # 36 current_violence
]

In [13]:
## build the stump columns from the feature matrix
new_data = create_stumps(X, cols, cutoffs)

## recode the response in place: every 0 label becomes -1
Y[Y == 0] = -1

## one-column frames holding the response and the train/test flag
y = pd.DataFrame({'recid_two_year': Y})
index = pd.DataFrame({'index': train_index})

## final table, column-wise: flag, response, then all stumps
new_data = pd.concat(objs=[index, y, new_data], axis='columns')

In [14]:
## preview the first 10 rows: train/test flag, response, then the stump columns
new_data.head(10)

Unnamed: 0,index,recid_two_year,Gender>=1,age_at_current_charge<=18,age_at_current_charge<=19,age_at_current_charge<=20,age_at_current_charge<=21,age_at_current_charge<=24,age_at_current_charge<=27,age_at_current_charge<=30,...,p_SentMonths>=36,p_SentMonths>=39,p_SentMonths>=42,p_SentMonths>=45,p_SentMonths>=48,six_month>=1,one_year>=1,three_year>=1,five_year>=1,current_violence>=1
0,1,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
4,1,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
5,1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,-1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


In [15]:
## rows flagged 1 go back to the training set, rows flagged 0 to the test set;
## the helper 'index' column is removed from both
new_train = new_data.loc[new_data['index'] == 1].drop(columns=['index'])
new_test = new_data.loc[new_data['index'] == 0].drop(columns=['index'])

In [17]:
## write both stump tables to the working directory, with a header row and without the row index
new_train.to_csv(path_or_buf="new_train.csv", index=False, header=True)
new_test.to_csv(path_or_buf="new_test.csv", index=False, header=True)