In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
### train data
train_data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/broward_train.csv")
train_data = train_data.drop(['person_id', 'screening_date', 'race'], axis=1)

train_X = train_data.loc[:,:'five_year']
train_Y = train_data.loc[:, 'recid_two_year':]

### test data
test_data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/broward_test.csv")
test_data = test_data.drop(['person_id', 'screening_date', 'race'], axis=1)
test_X = test_data.loc[:,:'five_year']
test_Y = test_data.loc[:,'recid_two_year':]

### column names
cols_X = train_data.columns[:-12]
cols_Y = train_data.columns[-12:]

### stack train and test
data_X = pd.concat([train_X, test_X]).values
data_Y = pd.concat([train_Y, test_Y]).values
train_index = np.repeat(1, train_data.shape[0]).tolist() + np.repeat(0, test_data.shape[0]).tolist()

##### sanity check

In [9]:
data_X.shape, data_Y.shape

((1927, 40), (1927, 12))

In [10]:
cols_X

Index(['sex', 'p_current_age', 'p_age_first_offense', 'p_charge', 'p_jail30',
       'p_prison', 'p_probation', 'p_juv_fel_count', 'p_felprop_violarrest',
       'p_murder_arrest', 'p_felassault_arrest', 'p_misdemassault_arrest',
       'p_famviol_arrest', 'p_sex_arrest', 'p_weapons_arrest',
       'fail_appear_two_yr', 'fail_appear_two_plus', 'current_violent',
       'current_violent20', 'pending_charge', 'prior_conviction_F',
       'prior_conviction_M', 'violent_conviction', 'total_convictions',
       'p_arrest', 'p_property', 'p_traffic', 'p_drug', 'p_dui', 'p_domestic',
       'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass',
       'years_since_last_crime', 'six_month', 'one_year', 'three_year',
       'five_year'],
      dtype='object')

#### Function

In [11]:
def create_stumps(data, columns, cutpoints):
    
    """
    @parameters:
    
    - data: featres; np.array
    - columns: feature names
    - cutpoints: cut off points used to create stumps
    
    """
    
    ## data dimension
    final_data = []
    final_names = []
    n, p = data.shape[0], data.shape[1]
    
    ## loop through features
    for i in range(len(columns)):
        
        ## subset feature
        feature = columns[i]
        feature_values = data[:,i]
        cutoff = cutpoints[i]
        cutoff_length = len(cutoff)
        names = []
        
        ## create stumps
        ### if the variable is 'p_current_age' or 'p_age_first_offense', then we would want to use '<=' intervals. For other variables, we use '>=' intervals
        ### if the variable is binary, then set the cutoff point value to be 1.
        
        if ((feature == 'p_current_charge') | (feature == 'p_age_first_offense')):
            stumps = np.zeros([n, cutoff_length])
            for k in range(cutoff_length):
                for j in range(n):
                    if feature_values[j] <= cutoff[k]: stumps[j,k] = 1
                names.append(feature + '<=' + str(cutoff[k]))
        else: 
            stumps = np.zeros([n, cutoff_length])
            for k in range(cutoff_length):
                for j in range(n):
                    if feature_values[j] >= cutoff[k]: stumps[j,k] = 1
                names.append(feature + '>=' + str(cutoff[k]))
        
        ## store stumps
        final_data.append(stumps)
        final_names.append(names)
        
        ## post process
        new_data = pd.DataFrame(final_data[0], columns=final_names[0])
        for s in range(len(final_data)-1):
            a = pd.DataFrame(final_data[s+1], columns=final_names[s+1])
            new_data = pd.concat([new_data, a], axis=1)
    
    return new_data
    #return final_data, final_names

#### Cutoffs from GA2M Model -- version 1

In [12]:
cutoffs = [[1], ## 0. sex
           [18,19,23,27,29,30,31,33,37,43,49,53,57], ## 2. p_current_charge
           [18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,44,48], ## 3. p_age_first_offense 
           [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], ## p_charge
           [1], # jail30
           [1,2], # prison
           [1,2,3,4,5], # probation
           [1], # jul_fel_count
           [1,2,3], # felprop_violence
           [1], # murder
           [1,2], # felassault
           [1,2], # misassault
           [1], # family_viol
           [1], # sex_arrest
           [1], # weapon
           [1,2], # fta2
           [1], # fta2+
           [1], # current_violent
           [1], # current_violent20
           [1,2,3,4,5,6,7,8], # pending_charge
           [1,2,3,4], # prior_F
           [1,2,3,4,5,6,7,8], # prior_M
           [1,2,3,4,5], # violent_conv
           [1,2,3,4,5,6,7,8,9,11,13,15], # total_conv
           [1,2,3,4,5,6,10,15,20,25,30,40], # arrest
           [1,2,3,4,5,6,7], # property
           [1], # traffic
           [1,2,3,4], # drug
           [1], # dui
           [1], # domestic
           [1], # stalking
           [1], # voye
           [1], # fraud
           [1,2,3,4], # stealing
           [1,2,3], # trespass
           [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5], # year_since_last
           [1],[1],[1],[1]]

In [15]:
## make stumps
new_data = create_stumps(data_X, cols_X, cutoffs)

## add labels
data_Y[data_Y == 0] = -1
y = pd.DataFrame(data_Y, columns=cols_Y)
new_data = pd.concat([new_data,y], axis=1)
new_data['train_index'] = train_index

In [16]:
new_data.head(10)

Unnamed: 0,sex>=1,p_current_age>=18,p_current_age>=19,p_current_age>=23,p_current_age>=27,p_current_age>=29,p_current_age>=30,p_current_age>=31,p_current_age>=33,p_current_age>=37,...,recid_property2,recid_M2,recid_F2,recid_violent2,recid_drug6,recid_property6,recid_M6,recid_F6,recid_violent6,train_index
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-1,1,-1,1,-1,-1,-1,-1,-1,1
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
4,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,1,-1,-1,-1,-1,-1,-1,1
5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
6,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,1,-1,-1,-1,-1,1,-1,-1,1
7,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,-1,1,-1,1,1,-1,1,1
8,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


In [17]:
train_data = new_data[new_data['train_index'] == 1]
test_data = new_data[new_data['train_index'] == 0]

In [18]:
new_data.to_csv('broward_stumps.csv', header=True, index=False)
train_data.to_csv('broward_train_stumps.csv', header=True, index=False)
test_data.to_csv('broward_test_stumps.csv', header=True, index=False)