In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [110]:
### locations and column choices shared by the train and test splits
DATA_DIR = "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data"
DROP_COLS = ['PersonID', 'screening_date', 'fta_risk_score_raw', 'nca_risk_score_raw',
             'pvf_risk_score_raw', 'fta_calc', 'nca_calc', 'pvf_calc', 'Race']
LAST_FEATURE_COL = 'current_violence'   # features are every column up to and including this one
FIRST_LABEL_COL = 'recid_two_year'      # labels are this column and everything after it


def load_split(filename):
    """
    Read one split from DATA_DIR and separate features from labels.

    @parameters:

    - filename: name of the csv file inside DATA_DIR

    @returns:

    - (frame, X, Y): the frame with DROP_COLS removed, its feature columns
      (up to and including LAST_FEATURE_COL) and its label columns
      (from FIRST_LABEL_COL onwards)
    """
    frame = pd.read_csv(f"{DATA_DIR}/{filename}").drop(columns=DROP_COLS)
    return frame, frame.loc[:, :LAST_FEATURE_COL], frame.loc[:, FIRST_LABEL_COL:]


### train data
train_data, train_X, train_Y = load_split("train.csv")

### test data
test_data, test_X, test_Y = load_split("test.csv")

### both splits are sliced by column position/label and then stacked, so they must share one layout
assert list(train_data.columns) == list(test_data.columns), "train and test columns differ"

### column names (taken from the sliced frames so they always match data_X / data_Y)
cols_X = train_X.columns
cols_Y = train_Y.columns

### stack train and test
data_X = pd.concat([train_X, test_X]).values
data_Y = pd.concat([train_Y, test_Y]).values
### 1 marks a row that came from train.csv, 0 a row from test.csv (same order as the stacking above)
train_index = [1] * len(train_data) + [0] * len(test_data)

##### sanity check

In [111]:
## fail fast if the stacked arrays, the column names and the split indicator disagree
assert data_X.shape[0] == data_Y.shape[0] == len(train_index), "row counts differ"
assert data_X.shape[1] == len(cols_X), "cols_X does not match data_X"
assert data_Y.shape[1] == len(cols_Y), "cols_Y does not match data_Y"
data_X.shape, data_Y.shape

((146003, 36), (146003, 14))

In [112]:
## display the feature column names (order matters: cutoffs are matched to them by position)
cols_X

Index(['Gender', 'age_at_current_charge', 'p_arrest', 'p_charges',
       'p_violence', 'p_felony', 'p_misdemeanor', 'p_property', 'p_murder',
       'p_assault', 'p_sex_offense', 'p_weapon', 'p_felprop_viol',
       'p_felassult', 'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui',
       'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass',
       'ADE', 'Treatment', 'p_prison', 'p_jail30', 'p_fta_two_year',
       'p_fta_two_year_plus', 'p_pending_charge', 'p_probation', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence'],
      dtype='object')

In [113]:
## display the label column names
cols_Y

Index(['recid_two_year', 'recid_drug_two_year', 'recid_traffic_two_year',
       'recid_violence_two_year', 'recid_F_two_year', 'recid_M_two_year',
       'recid_property_two_year', 'recid_six_month', 'recid_drug_six_month',
       'recid_traffic_six_month', 'recid_violence_six_month',
       'recid_F_six_month', 'recid_M_six_month', 'recid_property_six_month'],
      dtype='object')

#### Function

In [114]:
def create_stumps(data, columns, cutpoints):

    """
    Turn every feature column into binary threshold indicators ("stumps").

    @parameters:

    - data: features; 2-D np.array whose column i holds the values of columns[i]
    - columns: feature names, in the same order as the columns of data
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of thresholds applied to columns[i]

    @returns:

    - pd.DataFrame of 0.0 / 1.0 floats with one row per row of data and one
      column per (feature, cutoff) pair, in the order given.
      'age_at_current_charge' uses '<=' intervals (column named
      feature + '<=' + cutoff); every other feature uses '>=' intervals
      (feature + '>=' + cutoff). For a binary feature, pass the single cutoff 1.
      If columns is empty, a frame with the right number of rows and no
      columns is returned.
    """

    data = np.asarray(data)
    n = data.shape[0]

    stump_blocks = []   # one (n, number_of_cutoffs) float array per feature
    stump_names = []    # flat list of column names, aligned with the stacked blocks

    ## loop through features
    for i, feature in enumerate(columns):

        feature_values = data[:, i]
        cutoff = cutpoints[i]

        ## age is the only feature encoded as "at most"; everything else is "at least"
        if feature == 'age_at_current_charge':
            symbol, compare = '<=', np.less_equal
        else:
            symbol, compare = '>=', np.greater_equal

        ## broadcast (n, 1) values against (1, k) thresholds -> (n, k) indicators in one shot
        indicators = compare(feature_values[:, None], np.asarray(cutoff)[None, :])
        stump_blocks.append(np.asarray(indicators, dtype=np.float64).reshape(n, len(cutoff)))

        ## names are built from the caller's original cutoff values
        stump_names.extend(feature + symbol + str(c) for c in cutoff)

    ## nothing to stack: keep the row count, return no columns
    if not stump_blocks:
        return pd.DataFrame(index=pd.RangeIndex(n))

    ## assemble the frame once, after all features are processed
    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

#### Cutoffs from GA2M Model -- version 1

- sparse

In [121]:
## One list of thresholds per feature, matched BY POSITION to cols_X when passed to create_stumps
## (entry i is applied to cols_X[i]). The number before each name is that position in cols_X.
cutoffs = [[1], ## 0. Gender
           [18,19,20,24,29,30,38,43,46,52,55,61,67], ## 1. age_at_current_charge
           [1,2,3,4,5,6,7,8,9,10], ## 2. p_arrest
           [1,2,3,4,5,6], ## 3. p_charges
           [1], ## 4. p_violence
           [1,2], ## 5. p_felony
           [1,2,3], ## 6. p_misdemeanor
           [1], ## 7. p_property
           [1], ## 8. p_murder
           [1], ## 9. p_assault
           [1], ## 10. p_sex_offense
           [1], ## 11. p_weapon
           [1], ## 12. p_felprop_viol
           [1], ## 13. p_felassult
           [1], ## 14. p_misdeassult
           [1,2], ## 15. p_traffic
           [1,2], ## 16. p_drug
           [1,], ## 17. p_dui
           [1],  ## 18. p_stalking
           [1], ## 19. p_voyeurism
           [1], ## 20. p_fraud
           [1], ## 21. p_stealing
           [1], ## 22. p_trespass
           [1], ## 23. ADE
           [1], ## 24. Treatment
           [1], ## 25. p_prison
           [1,2,3], ## 26. p_jail30
           [1], ## 27. p_fta_two_year
           [1], ## 28. p_fta_two_year_plus
           [1,2], ## 29. p_pending_charge
           [1,2], ## 30. p_probation
           [1],[1],[1],[1], ## 31, 32, 33, 34: six_month; one_year; three_year; five_year
           [1]]  ## 35. current_violence

In [141]:
## binary stump features for the stacked train + test rows
new_data = create_stumps(data_X, cols_X, cutoffs)

## labels: every 0 in data_Y is overwritten with -1 (in place), then wrapped in a named frame
zero_label_mask = (data_Y == 0)
data_Y[zero_label_mask] = -1
y = pd.DataFrame(data_Y, columns=cols_Y)

## features | labels | split indicator, side by side
new_data = pd.concat([new_data, y], axis=1).assign(train_index=train_index)

In [142]:
## preview the first 10 rows: stump columns, then label columns, then train_index
new_data.head(10)

Unnamed: 0,Gender>=1,age_at_current_charge<=18,age_at_current_charge<=19,age_at_current_charge<=20,age_at_current_charge<=24,age_at_current_charge<=29,age_at_current_charge<=30,age_at_current_charge<=38,age_at_current_charge<=43,age_at_current_charge<=46,...,recid_M_two_year,recid_property_two_year,recid_six_month,recid_drug_six_month,recid_traffic_six_month,recid_violence_six_month,recid_F_six_month,recid_M_six_month,recid_property_six_month,train_index
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,-1,-1,-1,-1,-1,-1,-1,-1,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
6,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
7,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1,-1,-1,-1,-1,-1,-1,-1,-1,1
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
9,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,-1,-1,-1,-1,-1,-1,-1,-1,1


In [146]:
## split the stacked frame back apart using the indicator column (1 = train rows, 0 = test rows)
split_flag = new_data['train_index']
train_data = new_data.loc[split_flag.eq(1)]
test_data = new_data.loc[split_flag.eq(0)]

In [147]:
## write the full frame and each split to csv in the working directory (with header, without the index)
for frame, filename in [(new_data, 'kentucky_stumps.csv'),
                        (train_data, 'kentucky_train_stumps.csv'),
                        (test_data, 'kentucky_test_stumps.csv')]:
    frame.to_csv(filename, header=True, index=False)