In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
### shared loader: read one split, drop the three arnold_*_raw columns,
### then slice features (up to 'current_violence20') and labels (from 'general_two_year')
def _load_split(csv_path):
    """Return (full frame, feature frame, label frame) for the CSV at csv_path."""
    frame = pd.read_csv(csv_path)
    frame = frame.drop(['arnold_fta_raw','arnold_nca_raw','arnold_nvca_raw'], axis=1)
    features = frame.loc[:, :'current_violence20']
    labels = frame.loc[:, 'general_two_year':]
    return frame, features, labels

### train data
train_data, train_X, train_Y = _load_split("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_train.csv")

### test data
test_data, test_X, test_Y = _load_split("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_test.csv")

### stack train and test (train rows first); train_index flags train rows with 1, test rows with 0
data_X = pd.concat([train_X, test_X])
data_Y = pd.concat([train_Y, test_Y])
train_index = [1] * train_data.shape[0] + [0] * test_data.shape[0]

## extract id, race and date as single-column frames, then remove them from the features
person_id = data_X[['person_id']]
race = data_X[['race']]
screening_date = data_X[['screening_date']]
data_X = data_X.drop(['person_id', 'race', 'screening_date'], axis=1)

### column names (captured before data_X is turned into a plain array)
cols_X = data_X.columns
cols_Y = data_Y.columns
data_X = data_X.values

##### sanity check

In [4]:
## show the feature column names that remain after person_id, race and screening_date were dropped
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_assault',
       'p_sex_offense', 'p_weapon', 'p_felprop_viol', 'p_felassult',
       'p_misdeassult', 'p_traffic', 'p_drug', 'p_dui', 'p_stalking',
       'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass', 'ADE',
       'Treatment', 'p_fta_two_year', 'p_fta_two_year_plus',
       'p_pending_charge', 'p_probation', 'p_incarceration', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence',
       'current_pending_charge', 'current_violence20'],
      dtype='object')

In [5]:
## show the label column names (every column from 'general_two_year' onward)
cols_Y

Index(['general_two_year', 'drug_two_year', 'violent_two_year',
       'felony_two_year', 'misdemeanor_two_year', 'property_two_year',
       'general_six_month', 'drug_six_month', 'violent_six_month',
       'felony_six_month', 'misdemeanor_six_month', 'property_six_month'],
      dtype='object')

#### Function

In [12]:
def create_stumps(data, columns, cutpoints):
    
    """
    Turn every feature into a set of binary threshold indicators ("stumps").
    
    For feature ``columns[i]`` and each value ``c`` in ``cutpoints[i]`` one output
    column named ``<feature><c>`` is created. Its entries are 1.0 where the
    comparison holds and 0.0 otherwise:
    
    - 'age_at_current_charge' uses ``value <= c``
    - every other feature uses ``value >= c`` (so a 0/1 feature with the single
      cut point 1 is reproduced unchanged)
    
    @parameters:
    
    - data: features; 2-D np.array whose i-th column belongs to columns[i]
    - columns: feature names (strings)
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of thresholds for columns[i]
    
    @returns:
    
    - pd.DataFrame of float 0/1 stumps with data.shape[0] rows, columns ordered
      feature by feature and, within a feature, cut point by cut point. When
      ``columns`` is empty the frame has the right number of rows and no columns.
    """
    
    n = data.shape[0]
    stump_blocks = []
    stump_names = []
    
    ## loop through features
    for i in range(len(columns)):
        
        feature = columns[i]
        cutoff = cutpoints[i]
        
        ## column vector (n, 1) against a row of thresholds (1, k) broadcasts to (n, k),
        ## i.e. one comparison per (row, cut point) without Python-level loops
        feature_values = data[:, i][:, None]
        thresholds = np.asarray(cutoff).reshape(1, -1)
        
        ## age is the only feature encoded with '<=' intervals; all others use '>='
        if feature == 'age_at_current_charge':
            hits = feature_values <= thresholds
        else:
            hits = feature_values >= thresholds
        
        stump_blocks.append(hits.astype(float))
        ## names are built from the original cut point objects so their text is unchanged
        stump_names.extend(feature + str(c) for c in cutoff)
    
    ## no features requested: return an empty frame instead of failing on an unbound name
    if not stump_blocks:
        return pd.DataFrame(index=range(n))
    
    ## assemble once, after the loop, rather than re-concatenating on every feature
    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

In [14]:
## (feature name, cut points) pairs, written in the order of the feature columns.
## Binary features carry the single cut point 1.
feature_cutoffs = [
    ('sex', [1]),
    ('age_at_current_charge', [18,19,20,21,24,27,30,35,39,43,47,51,55,61,66]),
    ('p_arrest', [2,3,4,5,6,7,8,9,10]),
    ('p_charges', [2,3,4,5,6]),
    ('p_violence', [1,2,3]),
    ('p_felony', [1,2,3,4]),
    ('p_misdemeanor', [1,2,3,4]),
    ('p_property', [1,2]),
    ('p_murder', [1]),
    ('p_assault', [1]),
    ('p_sex_offense', [1]),
    ('p_weapon', [1]),
    ('p_felprop_viol', [1]),
    ('p_felassult', [1]),
    ('p_misdeassult', [1]),
    ('p_traffic', [1,2,3]),
    ('p_drug', [1,2,3]),
    ('p_dui', [1]),
    ('p_stalking', [1]),
    ('p_voyeurism', [1]),
    ('p_fraud', [1]),
    ('p_stealing', [1,2]),
    ('p_trespass', [1]),
    ('ADE', [1]),
    ('Treatment', [1]),
    ('p_fta_two_year', [1,2]),
    ('p_fta_two_year_plus', [1]),
    ('p_pending_charge', [1,2,3]),
    ('p_probation', [1,2,3]),
    ('p_incarceration', [1]),
    ('six_month', [1]),
    ('one_year', [1]),
    ('three_year', [1]),
    ('five_year', [1]),
    ('current_violence', [1]),
    ('current_pending_charge', [1]),
    ('current_violence20', [1]),
]

## positional list of cut-point lists: entry i holds the thresholds for feature i
cutoffs = [cuts for _name, cuts in feature_cutoffs]

#### Make Stumps

In [16]:
## build the 0/1 stump columns from the raw feature matrix
new_data = create_stumps(data_X, cols_X, cutoffs)

## recode the labels in place: every 0 becomes -1
zero_labels = data_Y == 0
data_Y[zero_labels] = -1

## lay out identifiers, stumps and labels side by side in one frame
id_cols = ['person_id', 'race', 'screening_date']
new_cols = id_cols + list(new_data.columns) + list(cols_Y)
stacked = np.c_[person_id, race, screening_date, new_data, data_Y]
new_data1 = pd.DataFrame(stacked, columns=new_cols)

## keep the train/test flag so the two partitions can be separated again
new_data1['train_index'] = train_index

In [17]:
## preview the first 10 rows of the combined ids + stumps + labels frame
new_data1.head(10)

Unnamed: 0,person_id,race,screening_date,sex1,age_at_current_charge18,age_at_current_charge19,age_at_current_charge20,age_at_current_charge21,age_at_current_charge24,age_at_current_charge27,...,felony_two_year,misdemeanor_two_year,property_two_year,general_six_month,drug_six_month,violent_six_month,felony_six_month,misdemeanor_six_month,property_six_month,train_index
0,6,White,6/16/2014,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,8,White,7/9/2015,1,0,0,0,0,0,1,...,1,1,-1,1,1,-1,1,-1,-1,1
2,11,White,7/7/2013,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,12,White,7/22/2015,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
4,23,White,7/21/2015,1,0,0,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
5,27,Black,1/9/2014,1,0,0,0,0,0,0,...,1,1,-1,1,-1,-1,1,1,-1,1
6,33,White,7/8/2015,1,0,0,0,0,0,0,...,-1,1,-1,1,1,1,-1,1,-1,1
7,35,White,2/2/2015,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
8,40,White,11/7/2014,1,0,0,0,0,0,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
9,42,White,11/7/2013,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


### Save Results

In [18]:
## recover the two partitions from the combined frame using the 1/0 train_index flag
split_flag = new_data1['train_index']
train_data = new_data1.loc[split_flag == 1]
test_data = new_data1.loc[split_flag == 0]

In [19]:
## remove the helper 'train_index' column from all three frames before saving.
## errors='ignore' keeps this cell re-runnable: each name is overwritten with the
## dropped result, so a second execution would otherwise raise KeyError because
## the column is already gone.
new_data1 = new_data1.drop(['train_index'], axis=1, errors='ignore')
train_data = train_data.drop(['train_index'], axis=1, errors='ignore')
test_data = test_data.drop(['train_index'], axis=1, errors='ignore')

In [20]:
## write the combined, train and test stump tables to CSV (header row, no index column)
outputs = [
    ('kentucky_stumps.csv', new_data1),
    ('kentucky_train_stumps.csv', train_data),
    ('kentucky_test_stumps.csv', test_data),
]
for out_path, frame in outputs:
    frame.to_csv(out_path, header=True, index=False)