In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
### train data: read the CSV, discard the raw risk scores, split features / labels by column position
train_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_train_v1.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw'])
train_X = train_data.loc[:, :'current_violence20']
train_Y = train_data.loc[:, 'general_two_year':]

### test data: same treatment as the train file
test_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_test_v1.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw'])
test_X = test_data.loc[:, :'current_violence20']
test_Y = test_data.loc[:, 'general_two_year':]

### stack train on top of test; train_index flags the train rows with 1 and the test rows with 0
data_X = pd.concat([train_X, test_X])
data_Y = pd.concat([train_Y, test_Y])
train_index = [1] * len(train_data) + [0] * len(test_data)

## pull id, race and date out as one-column frames, then remove them from the features
person_id = data_X['person_id'].to_frame()
race = data_X['race'].to_frame()
screening_date = data_X['screening_date'].to_frame()
data_X = data_X.drop(columns=['person_id', 'race', 'screening_date'])

### remember the column names before the features become a bare array
cols_Y = data_Y.columns
cols_X = data_X.columns
data_X = data_X.values

##### sanity check

In [45]:
data_X.shape, data_Y.shape

((176466, 37), (176466, 12))

In [46]:
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_assault',
       'p_sex_offense', 'p_weapon', 'p_felprop_viol', 'p_felassult',
       'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui', 'p_stalking',
       'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass', 'ADE',
       'Treatment', 'p_fta_two_year', 'p_fta_two_year_plus',
       'p_pending_charge', 'p_probation', 'p_incarceration', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence',
       'current_pending_charge', 'current_violence20'],
      dtype='object')

In [47]:
cols_Y

Index(['general_two_year', 'drug_two_year', 'violent_two_year',
       'felony_two_year', 'misdemeanor_two_year', 'property_two_year',
       'general_six_month', 'drug_six_month', 'violent_six_month',
       'felony_six_month', 'misdemeanor_six_month', 'property_six_month'],
      dtype='object')

#### Function

In [48]:
def create_stumps(data, columns, cutpoints):
    
    """
    Turn every feature column into binary threshold indicators ("stumps").
    
    @parameters:
    
    - data: features; 2-D np.array whose i-th column holds the values of columns[i]
    - columns: feature names, one per column of data
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of thresholds applied to columns[i]
    
    @returns:
    
    - pd.DataFrame with one float 0/1 column per (feature, cut point) pair,
      ordered by feature and then by cut point, named feature + str(cut point).
      With no features at all, a frame with data.shape[0] rows and no columns.
    
    """
    
    n = data.shape[0]
    stump_blocks = []
    stump_names = []
    
    ## loop through features
    for i, feature in enumerate(columns):
        
        ## subset feature
        feature_values = data[:, i]
        cutoff = cutpoints[i]
        thresholds = np.asarray(cutoff)
        
        ## create stumps
        ### 'age_at_current_charge' uses '<=' intervals; every other variable uses '>=' intervals
        ### (for a binary variable the single cut point 1 reproduces the variable itself).
        ### Broadcasting an (n, 1) column against (1, k) thresholds gives all k stumps at once.
        if feature == 'age_at_current_charge':
            hits = feature_values[:, None] <= thresholds[None, :]
        else:
            hits = feature_values[:, None] >= thresholds[None, :]
        
        ## store stumps as 0.0 / 1.0
        stump_blocks.append(hits.astype(float))
        stump_names.extend(feature + str(c) for c in cutoff)
    
    ## nothing to stack: keep the row count, return no columns
    if not stump_blocks:
        return pd.DataFrame(index=range(n))
    
    ## post process: assemble the frame once, after all features are done
    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

In [49]:
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_assault',
       'p_sex_offense', 'p_weapon', 'p_felprop_viol', 'p_felassult',
       'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui', 'p_stalking',
       'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass', 'ADE',
       'Treatment', 'p_fta_two_year', 'p_fta_two_year_plus',
       'p_pending_charge', 'p_probation', 'p_incarceration', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence',
       'current_pending_charge', 'current_violence20'],
      dtype='object')

In [50]:
## Cut points per feature, in the same order as cols_X: create_stumps reads cutpoints[i]
## for columns[i], so the position in this list (not the trailing label) decides which
## feature an entry applies to. 'age_at_current_charge' is cut with '<=', all others with '>='.
cutoffs = [[1], ## 0. sex
           [18,19,20,24,27,30,35,39,43,47,51,55,61,66], ## 1. age_at_current_charge
           ##[1,2,3,4,5,6,7,8,9,10], # 2. p_arrest  (disabled alternative that also cuts at 1)
           ##[1,2,3,4,5,6], ## 3. p_charges         (disabled alternative that also cuts at 1)
           [2,3,4,5,6,7,8,9,10], # 2. p_arrest
           [2,3,4,5,6], ## 3. p_charges
           [1,2,3], ## 4. p_violence
           [1,2,3], ## 5. p_felony
           [1,2,3], ## 6. p_misdemeanor
           [1,2,3], ## 7. p_property
           [1], ## 8. p_murder
           [1], ## 9. p_assault
           [1], ## 10. p_sex_offense
           [1], ## 11. p_weapon
           [1], ## 12. p_felprop_viol
           [1], ## 13. p_felassult
           [1], ## 14. p_misdeassult
           [1], ## 15. p_traffic
           [1,2,3], ## 16. p_drug
           [1], ## 17. p_dui
           [1],  ## 18. p_stalking
           [1], ## 19. p_voyeurism
           [1], ## 20. p_fraud
           [1], ## 21. p_stealing
           [1], ## 22. p_trespass
           [1], ## 23. ADE
           [1], ## 24. Treatment
           #[1], ## p_prison -- disabled: not a column of cols_X in this data version
           #[1], ## p_jail30 -- disabled: not a column of cols_X in this data version
           [1,2], ## 25. p_fta_two_year
           [1], ## 26. p_fta_two_year_plus
           [1], ## 27. p_pending_charge
           [1,2], ## 28. p_probation
           [1], ## 29. p_incarceration
           [1], ## 30. six_month
           [1], ## 31. one_year
           [1], ## 32. three_year
           [1], ## 33. five_year
           [1], ## 34. current_violence
           [1], ## 35. current_pending_charge
           [1]] ## 36. current_violence20

#### Make Stumps

In [52]:
## build the threshold indicators from the raw feature matrix
new_data = create_stumps(data_X, cols_X, cutoffs)

## recode the labels from {0, 1} to {-1, 1}, then glue ids, stumps and labels side by side
data_Y[data_Y == 0] = -1
new_cols = ['person_id', 'race', 'screening_date', *new_data.columns, *cols_Y]
new_data1 = pd.DataFrame(
    np.column_stack([person_id, race, screening_date, new_data, data_Y]),
    columns=new_cols,
)
new_data1['train_index'] = train_index

In [53]:
new_data1.head(10)

Unnamed: 0,person_id,race,screening_date,sex1,age_at_current_charge18,age_at_current_charge19,age_at_current_charge20,age_at_current_charge24,age_at_current_charge27,age_at_current_charge30,...,felony_two_year,misdemeanor_two_year,property_two_year,general_six_month,drug_six_month,violent_six_month,felony_six_month,misdemeanor_six_month,property_six_month,train_index
0,6,White,2014-06-16,0,0,0,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,8,White,2015-07-09,1,0,0,0,0,1,1,...,1,1,-1,1,1,-1,1,-1,-1,1
2,11,White,2013-07-07,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,12,White,2015-07-22,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
4,23,White,2015-07-21,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
5,27,Black,2014-01-09,1,0,0,0,0,0,0,...,1,1,-1,1,-1,-1,1,1,-1,1
6,33,White,2015-07-08,1,0,0,0,0,0,1,...,-1,1,-1,1,1,1,-1,1,-1,1
7,35,White,2015-02-02,1,0,0,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
8,40,White,2014-11-07,1,0,0,0,0,1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
9,42,White,2013-11-07,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


### Save Results

In [54]:
## train_index is 1 for rows that came from the train file and 0 for the test file
train_data = new_data1.loc[new_data1['train_index'].eq(1)]
test_data = new_data1.loc[new_data1['train_index'].eq(0)]

In [55]:
## the membership flag has done its job; remove it from all three frames before saving
new_data1 = new_data1.drop(columns=['train_index'])
train_data = train_data.drop(columns=['train_index'])
test_data = test_data.drop(columns=['train_index'])

In [56]:
## write the stacked frame and the two splits, with a header row and without the row index
new_data1.to_csv('kentucky_stumps_v1.csv', index=False, header=True)
train_data.to_csv('kentucky_train_stumps_v1.csv', index=False, header=True)
test_data.to_csv('kentucky_test_stumps_v1.csv', index=False, header=True)
#train_index.to_csv('stumps_train_index.csv', header=False, index=False)
#test_index.to_csv('stumps_test_index.csv', header=False, index=False)

# Appendix

## Manually Create Stumps

In [29]:
### train data: read the CSV, discard the raw and calculated risk scores, split features / labels
train_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_train.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw',
                'fta_calc', 'nca_calc', 'pvf_calc'])
train_X = train_data.loc[:, :'current_violence']
train_Y = train_data.loc[:, 'recid_two_year':]

### test data: same treatment as the train file
test_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_test.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw',
                'fta_calc', 'nca_calc', 'pvf_calc'])
test_X = test_data.loc[:, :'current_violence']
test_Y = test_data.loc[:, 'recid_two_year':]

### stack train on top of test; train_index flags the train rows with 1 and the test rows with 0
data_X = pd.concat([train_X, test_X])
data_Y = pd.concat([train_Y, test_Y])
train_index = [1] * len(train_data) + [0] * len(test_data)

## pull id, race and date out as one-column frames, then remove them from the features
person_id = data_X['person_id'].to_frame()
race = data_X['race'].to_frame()
screening_date = data_X['screening_date'].to_frame()
data_X = data_X.drop(columns=['person_id', 'race', 'screening_date'])

### column names (data_X stays a DataFrame in this section)
cols_Y = data_Y.columns
cols_X = data_X.columns

In [28]:
data_X.shape

(96010, 36)

In [27]:
data_X.columns

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_assault',
       'p_sex_offense', 'p_weapon', 'p_felprop_viol', 'p_felassult',
       'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui', 'p_stalking',
       'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass', 'ADE',
       'Treatment', 'p_prison', 'p_jail30', 'p_fta_two_year',
       'p_fta_two_year_plus', 'p_pending_charge', 'p_probation', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence'],
      dtype='object')

#### Version 1

In [8]:
## sex: indicator for sex == 1
sex1 = data_X['sex'].eq(1).mul(1)

## age_at_current_charge: "at most this age" indicators
current_age19 = data_X['age_at_current_charge'].le(19).mul(1)
current_age22 = data_X['age_at_current_charge'].le(22).mul(1)
current_age25 = data_X['age_at_current_charge'].le(25).mul(1)
current_age29 = data_X['age_at_current_charge'].le(29).mul(1)
current_age32 = data_X['age_at_current_charge'].le(32).mul(1)
current_age35 = data_X['age_at_current_charge'].le(35).mul(1)
current_age38 = data_X['age_at_current_charge'].le(38).mul(1)
current_age41 = data_X['age_at_current_charge'].le(41).mul(1)
current_age44 = data_X['age_at_current_charge'].le(44).mul(1)
current_age47 = data_X['age_at_current_charge'].le(47).mul(1)
current_age51 = data_X['age_at_current_charge'].le(51).mul(1)
current_age55 = data_X['age_at_current_charge'].le(55).mul(1)
current_age61 = data_X['age_at_current_charge'].le(61).mul(1)
current_age65 = data_X['age_at_current_charge'].le(65).mul(1)

## p_arrest: exact matches for 1 and 2, "at least" indicators from 3 upward
p_arrest1 = data_X['p_arrest'].eq(1).mul(1)
p_arrest2 = data_X['p_arrest'].eq(2).mul(1)
p_arrest3 = data_X['p_arrest'].ge(3).mul(1)
p_arrest5 = data_X['p_arrest'].ge(5).mul(1)
p_arrest7 = data_X['p_arrest'].ge(7).mul(1)
p_arrest10 = data_X['p_arrest'].ge(10).mul(1)

## p_charges: exact matches for 1 and 2, "at least" indicators from 3 upward.
## These must read the 'p_charges' column; reading 'p_arrest' here would make every
## p_charges* stump a duplicate of the corresponding p_arrest* stump.
p_charges1 = (data_X['p_charges'] == 1)*1
p_charges2 = (data_X['p_charges'] == 2)*1
p_charges3 = (data_X['p_charges'] >= 3)*1
p_charges5 = (data_X['p_charges'] >= 5)*1
p_charges7 = (data_X['p_charges'] >= 7)*1

## p_violence
p_violence0 = data_X['p_violence'].eq(0).mul(1)
p_violence1 = data_X['p_violence'].eq(1).mul(1)
p_violence2 = data_X['p_violence'].ge(2).mul(1)

## p_felony
p_felony0 = data_X['p_felony'].eq(0).mul(1)
p_felony1 = data_X['p_felony'].eq(1).mul(1)
p_felony2 = data_X['p_felony'].ge(2).mul(1)
p_felony4 = data_X['p_felony'].ge(4).mul(1)

## p_misdemeanor
p_misdemeanor0 = data_X['p_misdemeanor'].eq(0).mul(1)
p_misdemeanor1 = data_X['p_misdemeanor'].eq(1).mul(1)
p_misdemeanor2 = data_X['p_misdemeanor'].ge(2).mul(1)
p_misdemeanor4 = data_X['p_misdemeanor'].ge(4).mul(1)

## p_property
p_property0 = data_X['p_property'].eq(0).mul(1)
p_property1 = data_X['p_property'].ge(1).mul(1)

## p_murder
p_murder0 = data_X['p_murder'].eq(0).mul(1)
p_murder1 = data_X['p_murder'].ge(1).mul(1)

## p_assault
p_assault0 = data_X['p_assault'].eq(0).mul(1)
p_assault1 = data_X['p_assault'].ge(1).mul(1)

## p_sex_offense
p_sex_offense0 = data_X['p_sex_offense'].eq(0).mul(1)
p_sex_offense1 = data_X['p_sex_offense'].ge(1).mul(1)

## p_weapon
p_weapon0 = data_X['p_weapon'].eq(0).mul(1)
p_weapon1 = data_X['p_weapon'].ge(1).mul(1)

## p_felprop (source column: p_felprop_viol)
p_felprop0 = data_X['p_felprop_viol'].eq(0).mul(1)
p_felprop1 = data_X['p_felprop_viol'].ge(1).mul(1)

## p_felassault (source column is spelled p_felassult)
p_felassault0 = data_X['p_felassult'].eq(0).mul(1)
p_felassault1 = data_X['p_felassult'].ge(1).mul(1)

## p_misassault (source column is spelled p_misdeassult)
p_misassault0 = data_X['p_misdeassult'].eq(0).mul(1)
p_misassault1 = data_X['p_misdeassult'].ge(1).mul(1)

## p_traffic
p_traffic0 = data_X['p_traffic'].eq(0).mul(1)
p_traffic1 = data_X['p_traffic'].ge(1).mul(1)

## p_drug
p_drug0 = data_X['p_drug'].eq(0).mul(1)
p_drug1 = data_X['p_drug'].eq(1).mul(1)
p_drug2 = data_X['p_drug'].ge(2).mul(1)

## p_dui
p_dui0 = data_X['p_dui'].eq(0).mul(1)
p_dui1 = data_X['p_dui'].ge(1).mul(1)

## p_stalking
p_stalk0 = data_X['p_stalking'].eq(0).mul(1)
p_stalk1 = data_X['p_stalking'].ge(1).mul(1)

## p_voyeurism
p_voye0 = data_X['p_voyeurism'].eq(0).mul(1)
p_voye1 = data_X['p_voyeurism'].ge(1).mul(1)

## p_fraud
p_fraud0 = data_X['p_fraud'].eq(0).mul(1)
p_fraud1 = data_X['p_fraud'].ge(1).mul(1)

## p_stealing
p_stealing0 = data_X['p_stealing'].eq(0).mul(1)
p_stealing1 = data_X['p_stealing'].ge(1).mul(1)

## p_trespass
p_trespass0 = data_X['p_trespass'].eq(0).mul(1)
p_trespass1 = data_X['p_trespass'].ge(1).mul(1)

## ADE
ADE0 = data_X['ADE'].eq(0).mul(1)
ADE1 = data_X['ADE'].ge(1).mul(1)

## Treatment
treatment0 = data_X['Treatment'].eq(0).mul(1)
treatment1 = data_X['Treatment'].ge(1).mul(1)

## p_prison
p_prison0 = data_X['p_prison'].eq(0).mul(1)
p_prison1 = data_X['p_prison'].ge(1).mul(1)

## p_jail30
p_jail0 = data_X['p_jail30'].eq(0).mul(1)
p_jail1 = data_X['p_jail30'].eq(1).mul(1)
p_jail2 = data_X['p_jail30'].ge(2).mul(1)

## p_fta_two_year
p_fta0 = data_X['p_fta_two_year'].eq(0).mul(1)
p_fta1 = data_X['p_fta_two_year'].eq(1).mul(1)
p_fta2 = data_X['p_fta_two_year'].ge(2).mul(1)

## p_fta_two_year_plus
p_fta_plus0 = data_X['p_fta_two_year_plus'].eq(0).mul(1)
p_fta_plus1 = data_X['p_fta_two_year_plus'].ge(1).mul(1)

## p_pending_charge
p_pending_charge0 = data_X['p_pending_charge'].eq(0).mul(1)
p_pending_charge1 = data_X['p_pending_charge'].eq(1).mul(1)
p_pending_charge2 = data_X['p_pending_charge'].ge(2).mul(1)

## p_probation
p_probation0 = data_X['p_probation'].eq(0).mul(1)
p_probation1 = data_X['p_probation'].eq(1).mul(1)
p_probation2 = data_X['p_probation'].ge(2).mul(1)

## remaining columns are carried over unchanged
six_month = data_X.loc[:, 'six_month']
one_year = data_X.loc[:, 'one_year']
three_year = data_X.loc[:, 'three_year']
five_year = data_X.loc[:, 'five_year']
current_violence = data_X.loc[:, 'current_violence']

In [18]:
## combine data sets
data_Y[data_Y == 0] = -1
new_cols = ["person_id", "race", "screening_date", "sex1", "current_age19", "current_age22", "current_age25", 
            "current_age29", "current_age32", "current_age35", "current_age38", "current_age41", "current_age44", 
            "current_age47", "current_age51", "current_age55", "current_age61", "current_age65", "p_arrest1", 
            "p_arrest2", "p_arrest3", "p_arrest5", "p_arrest7", "p_arrest10", "p_charges1", "p_charges2", "p_charges3", 
            "p_charges5", "p_charges7", "p_violence0", "p_violence1", "p_violence2", "p_felony0", "p_felony1", "p_felony2", 
            "p_felony4", "p_misdemeanor0", "p_misdemeanor1", "p_misdemeanor2", "p_misdemeanor4", "p_property0", "p_property1", 
            "p_murder0", "p_murder1", "p_assault0", "p_assault1", "p_sex_offense0", "p_sex_offense1", "p_weapon0", 
            "p_weapon1", "p_felprop0", "p_felprop1", "p_felassault0", "p_felassault1", "p_misassault0", "p_misassault1", 
            "p_traffic0", "p_traffic1", "p_drug0", "p_drug1", "p_drug2", "p_dui0", "p_dui1", "p_stalk0", "p_stalk1", 
            "p_voye0", "p_voye1", "p_fraud0", "p_fraud1", "p_stealing0", "p_stealing1", "p_trespass0", "p_trespass1", 
            "ADE0", "ADE1", "treatment0", "treatment1", "p_prison0", "p_prison1", "p_jail0", "p_jail1", "p_jail2", 
            "p_fta0", "p_fta1", "p_fta2", "p_fta_plus0", "p_fta_plus1", "p_pending_charge0", "p_pending_charge1", 
            "p_pending_charge2", "p_probation0", "p_probation1", "p_probation2", "six_month", "one_year", "three_year", 
            "five_year", "current_violence"] + list(cols_Y)

In [19]:
## stack ids, hand-made stumps and labels column-wise, in exactly the order listed in new_cols
new_data1 = pd.DataFrame(
    np.column_stack([
        person_id, race, screening_date,
        sex1,
        current_age19, current_age22, current_age25, current_age29, current_age32,
        current_age35, current_age38, current_age41, current_age44, current_age47,
        current_age51, current_age55, current_age61, current_age65,
        p_arrest1, p_arrest2, p_arrest3, p_arrest5, p_arrest7, p_arrest10,
        p_charges1, p_charges2, p_charges3, p_charges5, p_charges7,
        p_violence0, p_violence1, p_violence2,
        p_felony0, p_felony1, p_felony2, p_felony4,
        p_misdemeanor0, p_misdemeanor1, p_misdemeanor2, p_misdemeanor4,
        p_property0, p_property1,
        p_murder0, p_murder1,
        p_assault0, p_assault1,
        p_sex_offense0, p_sex_offense1,
        p_weapon0, p_weapon1,
        p_felprop0, p_felprop1,
        p_felassault0, p_felassault1,
        p_misassault0, p_misassault1,
        p_traffic0, p_traffic1,
        p_drug0, p_drug1, p_drug2,
        p_dui0, p_dui1,
        p_stalk0, p_stalk1,
        p_voye0, p_voye1,
        p_fraud0, p_fraud1,
        p_stealing0, p_stealing1,
        p_trespass0, p_trespass1,
        ADE0, ADE1,
        treatment0, treatment1,
        p_prison0, p_prison1,
        p_jail0, p_jail1, p_jail2,
        p_fta0, p_fta1, p_fta2,
        p_fta_plus0, p_fta_plus1,
        p_pending_charge0, p_pending_charge1, p_pending_charge2,
        p_probation0, p_probation1, p_probation2,
        six_month, one_year, three_year, five_year, current_violence,
        data_Y,
    ]),
    columns=new_cols,
)
new_data1['train_index'] = train_index

In [21]:
new_data1.shape

(96010, 113)

In [20]:
new_data1.head()

Unnamed: 0,person_id,race,screening_date,sex1,current_age19,current_age22,current_age25,current_age29,current_age32,current_age35,...,recid_M_two_year,recid_property_two_year,recid_six_month,recid_drug_six_month,recid_traffic_six_month,recid_violence_six_month,recid_F_six_month,recid_M_six_month,recid_property_six_month,train_index
0,8,White,2015-07-09,1,0,0,0,1,1,1,...,1,-1,1,1,-1,-1,1,-1,-1,1
1,12,White,2015-07-22,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
2,23,White,2015-03-29,1,0,0,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,33,White,2015-07-08,1,0,0,0,0,1,1,...,1,-1,1,-1,-1,1,-1,1,-1,1
4,35,White,2015-02-02,1,0,0,0,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


## Create Stumps -- with 10/9/2019 data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
### train data: read the CSV, discard the raw and calculated risk scores, split features / labels
train_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_train_2019109.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw',
                'fta_calc', 'nca_calc', 'pvf_calc'])
train_X = train_data.loc[:, :'current_pending_charge']
train_Y = train_data.loc[:, 'recid_two_year':]

### test data: same treatment as the train file
test_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_test_2019109.csv"
).drop(columns=['fta_risk_score_raw', 'nca_risk_score_raw', 'pvf_risk_score_raw',
                'fta_calc', 'nca_calc', 'pvf_calc'])
test_X = test_data.loc[:, :'current_pending_charge']
test_Y = test_data.loc[:, 'recid_two_year':]

### stack train on top of test; train_index flags the train rows with 1 and the test rows with 0
data_X = pd.concat([train_X, test_X])
data_Y = pd.concat([train_Y, test_Y])
train_index = [1] * len(train_data) + [0] * len(test_data)

## pull id, race and date out as one-column frames, then remove them from the features
person_id = data_X['person_id'].to_frame()
race = data_X['race'].to_frame()
screening_date = data_X['screening_date'].to_frame()
data_X = data_X.drop(columns=['person_id', 'race', 'screening_date'])

### remember the column names before both frames become bare arrays
cols_Y = data_Y.columns
cols_X = data_X.columns

## features and labels as plain arrays
data_Y = data_Y.values
data_X = data_X.values

In [8]:
data_X.shape

(96010, 38)

In [9]:
len(cols_X)

38

#### Function

In [11]:
def create_stumps(data, columns, cutpoints):
    
    """
    Turn every feature column into binary threshold indicators ("stumps").
    
    @parameters:
    
    - data: features; 2-D np.array whose i-th column holds the values of columns[i]
    - columns: feature names, one per column of data
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of thresholds applied to columns[i]
    
    @returns:
    
    - pd.DataFrame with one float 0/1 column per (feature, cut point) pair,
      ordered by feature and then by cut point, named feature + '<=' + str(cut point)
      for 'age_at_current_charge' and feature + '>=' + str(cut point) otherwise.
      With no features at all, a frame with data.shape[0] rows and no columns.
    
    """
    
    n = data.shape[0]
    stump_blocks = []
    stump_names = []
    
    ## loop through features
    for i, feature in enumerate(columns):
        
        ## subset feature
        feature_values = data[:, i]
        cutoff = cutpoints[i]
        thresholds = np.asarray(cutoff)
        
        ## create stumps
        ### 'age_at_current_charge' uses '<=' intervals; every other variable uses '>=' intervals
        ### (for a binary variable the single cut point 1 reproduces the variable itself).
        ### Broadcasting an (n, 1) column against (1, k) thresholds gives all k stumps at once.
        if feature == 'age_at_current_charge':
            operator_label = '<='
            hits = feature_values[:, None] <= thresholds[None, :]
        else:
            operator_label = '>='
            hits = feature_values[:, None] >= thresholds[None, :]
        
        ## store stumps as 0.0 / 1.0
        stump_blocks.append(hits.astype(float))
        stump_names.extend(feature + operator_label + str(c) for c in cutoff)
    
    ## nothing to stack: keep the row count, return no columns
    if not stump_blocks:
        return pd.DataFrame(index=range(n))
    
    ## post process: assemble the frame once, after all features are done
    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

#### Cutoffs from GA2M Model -- version 1

- sparse

In [12]:
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_felony',
       'p_misdemeanor', 'p_property', 'p_murder', 'p_assault', 'p_sex_offense',
       'p_weapon', 'p_felprop_viol', 'p_felassult', 'p_misdeassult',
       'p_traffic', 'p_drug', 'p_dui', 'p_stalking', 'p_voyeurism', 'p_fraud',
       'p_stealing', 'p_trespass', 'ADE', 'Treatment', 'p_prison', 'p_jail30',
       'p_fta_two_year', 'p_fta_two_year_plus', 'p_probation',
       'p_incarceration', 'six_month', 'one_year', 'three_year', 'five_year',
       'p_violence', 'p_pending_charge', 'current_violence',
       'current_pending_charge'],
      dtype='object')

In [13]:
## Cut points per feature, in the same order as cols_X: create_stumps reads cutpoints[i]
## for columns[i], so the position in this list (not the trailing label) decides which
## feature an entry applies to. 'age_at_current_charge' is cut with '<=', all others with '>='.
cutoffs = [[1], ## 0. sex
           [18,19,20,24,29,30,38,43,47,52,55,61,67], ## 1. age_at_current_charge
           [1,2,3,4,5,6,7,8,9,10], # 2. p_arrest
           [1,2,3,4,5,6], ## 3. p_charges
           [1,2], ## 4. p_felony
           [1,2,3], ## 5. p_misdemeanor
           [1], ## 6. p_property
           [1], ## 7. p_murder
           [1], ## 8. p_assault
           [1], ## 9. p_sex_offense
           [1], ## 10. p_weapon
           [1], ## 11. p_felprop_viol
           [1], ## 12. p_felassult
           [1], ## 13. p_misdeassult
           [1,2], ## 14. p_traffic
           [1,2], ## 15. p_drug
           [1,], ## 16. p_dui
           [1],  ## 17. p_stalking
           [1], ## 18. p_voyeurism
           [1], ## 19. p_fraud
           [1], ## 20. p_stealing
           [1], ## 21. p_trespass
           [1], ## 22. ADE
           [1], ## 23. Treatment
           [1], ## 24. p_prison
           [1,2,3], ## 25. p_jail30
           [1], ## 26. p_fta_two_year
           [1], ## 27. p_fta_two_year_plus
           [1,2], ## 28. p_probation
           [1], ## 29. p_incarceration
           [1], ## 30. six_month
           [1], ## 31. one_year
           [1], ## 32. three_year
           [1], ## 33. five_year
           [1], ## 34. p_violence
           [1,2], ## 35. p_pending_charge
           [1], ## 36. current_violence
           [1]] ## 37. current_pending_charge

#### Make Stumps

In [15]:
## build the threshold indicators from the raw feature matrix
new_data = create_stumps(data_X, cols_X, cutoffs)

## recode the labels from {0, 1} to {-1, 1}, then glue ids, stumps and labels side by side
data_Y[data_Y == 0] = -1
new_cols = ['person_id', 'race', 'screening_date', *new_data.columns, *cols_Y]
new_data1 = pd.DataFrame(
    np.column_stack([person_id, race, screening_date, new_data, data_Y]),
    columns=new_cols,
)
new_data1['train_index'] = train_index

In [16]:
new_data1.head(10)

Unnamed: 0,person_id,race,screening_date,sex>=1,age_at_current_charge<=18,age_at_current_charge<=19,age_at_current_charge<=20,age_at_current_charge<=24,age_at_current_charge<=29,age_at_current_charge<=30,...,recid_M_two_year,recid_property_two_year,recid_six_month,recid_drug_six_month,recid_traffic_six_month,recid_violence_six_month,recid_F_six_month,recid_M_six_month,recid_property_six_month,train_index
0,8,White,2015-07-09,1,0,0,0,0,1,1,...,1,-1,1,1,-1,-1,1,-1,-1,1
1,12,White,2015-07-22,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
2,23,White,2015-03-29,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,33,White,2015-07-08,1,0,0,0,0,0,1,...,1,-1,1,-1,-1,1,-1,1,-1,1
4,35,White,2015-02-02,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
5,36,Asian,2015-05-07,0,0,0,0,0,0,0,...,-1,-1,1,-1,-1,1,-1,-1,-1,1
6,40,White,2014-10-31,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
7,43,White,2015-07-22,1,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
8,47,White,2015-06-03,0,0,0,0,0,0,0,...,1,-1,1,1,-1,1,1,1,-1,1
9,62,White,2015-06-15,1,0,0,0,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


### Save Results

In [23]:
## train_index is 1 for rows that came from the train file and 0 for the test file
train_data = new_data1.loc[new_data1['train_index'].eq(1)]
test_data = new_data1.loc[new_data1['train_index'].eq(0)]

In [24]:
## the membership flag has done its job; remove it from all three frames before saving
new_data1 = new_data1.drop(columns=['train_index'])
train_data = train_data.drop(columns=['train_index'])
test_data = test_data.drop(columns=['train_index'])

In [25]:
## write the stacked frame and the two splits, with a header row and without the row index
new_data1.to_csv('kentucky_stumps_2019109.csv', index=False, header=True)
train_data.to_csv('kentucky_train_stumps_2019109.csv', index=False, header=True)
test_data.to_csv('kentucky_test_stumps_2019109.csv', index=False, header=True)
#train_index.to_csv('stumps_train_index.csv', header=False, index=False)
#test_index.to_csv('stumps_test_index.csv', header=False, index=False)