In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
### raw train / test tables
train_data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/broward_train.csv")
test_data = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/broward_test.csv")

### features are every column up to and including 'five_year';
### labels are every column from 'recid_two_year' onwards (.loc label slices include both ends)
train_X, train_Y = train_data.loc[:, :'five_year'], train_data.loc[:, 'recid_two_year':]
test_X, test_Y = test_data.loc[:, :'five_year'], test_data.loc[:, 'recid_two_year':]

### stack train on top of test; train_index flags train rows with 1 and test rows with 0
data_X = pd.concat([train_X, test_X], axis=0)
data_Y = pd.concat([train_Y, test_Y], axis=0)
train_index = [1] * len(train_data) + [0] * len(test_data)

## set id, race and date aside as one-column frames, then remove them from the features
person_id = data_X[['person_id']]
race = data_X[['race']]
screening_date = data_X[['screening_date']]
data_X = data_X.drop(columns=['person_id', 'race', 'screening_date'])

### remember the column labels, then reduce the features to a bare numpy array
cols_X, cols_Y = data_X.columns, data_Y.columns
data_X = data_X.values

##### sanity check

In [14]:
# Shapes of the stacked feature array and the stacked label frame; the row counts should agree.
data_X.shape, data_Y.shape

((1954, 40), (1954, 12))

In [15]:
# Feature names, captured from data_X before it was converted to a numpy array (same column order).
cols_X

Index(['sex', 'p_current_age', 'p_age_first_offense', 'p_charge', 'p_jail30',
       'p_prison', 'p_probation', 'p_juv_fel_count', 'p_felprop_violarrest',
       'p_murder_arrest', 'p_felassault_arrest', 'p_misdemassault_arrest',
       'p_famviol_arrest', 'p_sex_arrest', 'p_weapons_arrest',
       'fail_appear_two_yr', 'fail_appear_two_plus', 'current_violent',
       'current_violent20', 'pending_charge', 'prior_conviction_F',
       'prior_conviction_M', 'violent_conviction', 'total_convictions',
       'p_arrest', 'p_property', 'p_traffic', 'p_drug', 'p_dui', 'p_domestic',
       'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass',
       'years_since_last_crime', 'six_month', 'one_year', 'three_year',
       'five_year'],
      dtype='object')

#### Function

In [16]:
def create_stumps(data, columns, cutpoints):

    """
    Turn every feature column into binary threshold indicators ("stumps").

    @parameters:

    - data: features; 2-D np.array with one row per sample and one column
      per entry of `columns`, in the same order
    - columns: feature names, positionally aligned with the columns of `data`
    - cutpoints: cut off points used to create stumps; `cutpoints[i]` is the
      sequence of thresholds for `columns[i]`. For a binary feature, pass
      `[1]` as its cutoffs.

    @returns:

    - pd.DataFrame of float 0/1 indicators with one column per
      (feature, cutoff) pair, ordered by feature and then by cutoff.
      'p_current_age' and 'p_age_first_offense' use '<=' intervals
      (e.g. column 'p_current_age<=18' is 1 where the value is <= 18);
      every other feature uses '>=' intervals (e.g. 'p_charge>=2').
      If `columns` is empty the frame has `data.shape[0]` rows and no columns.

    @raises:

    - IndexError if `cutpoints` has fewer entries than `columns`.
    """

    n_rows = data.shape[0]

    ## features whose stumps are "value <= cutoff"; all others are "value >= cutoff"
    le_features = ('p_current_age', 'p_age_first_offense')

    blocks = []
    names = []

    ## loop through features
    for i, feature in enumerate(columns):

        cutoff = cutpoints[i]
        thresholds = np.asarray(cutoff)

        ## column vector (n, 1) against thresholds (k,) broadcasts to an (n, k) grid
        values = data[:, i].reshape(-1, 1)

        if feature in le_features:
            hits, op = values <= thresholds, '<='
        else:
            hits, op = values >= thresholds, '>='

        ## store as floats so the result matches a zero-initialised float matrix
        blocks.append(np.asarray(hits, dtype=float).reshape(n_rows, len(cutoff)))

        ## names are built from the caller's own cutoff values, not the numpy copies
        names.extend(feature + op + str(c) for c in cutoff)

    ## nothing to stack: return an empty frame that still has one row per sample
    if not blocks:
        return pd.DataFrame(index=range(n_rows))

    ## assemble the final frame once, after all features have been processed
    return pd.DataFrame(np.hstack(blocks), columns=names)

#### Cutoffs from GA2M Model -- version 1

In [17]:
# Cut-off points handed to create_stumps, one inner list per feature.
# The list is positional: entry i is applied to column i of cols_X, so the
# order below must match the order of cols_X exactly.
cutoffs = [[1], # 0. sex
           [18,19,23,27,29,30,31,33,37,43,49,53,57], # 1. p_current_age
           [18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,44,48], # 2. p_age_first_offense
           [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], # 3. p_charge
           [1], # 4. p_jail30
           [1,2], # 5. p_prison
           [1,2,3,4,5], # 6. p_probation
           [1], # 7. p_juv_fel_count
           [1,2,3], # 8. p_felprop_violarrest
           [1], # 9. p_murder_arrest
           [1,2], # 10. p_felassault_arrest
           [1,2], # 11. p_misdemassault_arrest
           [1], # 12. p_famviol_arrest
           [1], # 13. p_sex_arrest
           [1], # 14. p_weapons_arrest
           [1,2], # 15. fail_appear_two_yr
           [1], # 16. fail_appear_two_plus
           [1], # 17. current_violent
           [1], # 18. current_violent20
           [1,2,3,4,5,6,7,8], # 19. pending_charge
           [1,2,3,4], # 20. prior_conviction_F
           [1,2,3,4,5,6,7,8], # 21. prior_conviction_M
           [1,2,3,4,5], # 22. violent_conviction
           [1,2,3,4,5,6,7,8,9,11,13,15], # 23. total_convictions
           [1,2,3,4,5,6,10,15,20,25,30,40], # 24. p_arrest
           [1,2,3,4,5,6,7], # 25. p_property
           [1], # 26. p_traffic
           [1,2,3,4], # 27. p_drug
           [1], # 28. p_dui
           [1], # 29. p_domestic
           [1], # 30. p_stalking
           [1], # 31. p_voyeurism
           [1], # 32. p_fraud
           [1,2,3,4], # 33. p_stealing
           [1,2,3], # 34. p_trespass
           [1,2,3,4,5], # 35. years_since_last_crime
           [1],[1],[1],[1]] # 36-39. six_month, one_year, three_year, five_year

In [18]:
## build the binary stump features from the raw feature matrix
new_data = create_stumps(data_X, cols_X, cutoffs)

## recode labels in place: every 0 becomes -1, all other values are left alone
data_Y[data_Y == 0] = -1

## glue ids, stumps and labels side by side, then flag train (1) vs test (0) rows
new_cols = ['person_id', 'race', 'screening_date'] + new_data.columns.tolist() + cols_Y.tolist()
new_data1 = pd.DataFrame(
    np.c_[person_id, race, screening_date, new_data, data_Y],
    columns=new_cols,
)
new_data1['train_index'] = train_index

In [19]:
# Preview the first ten rows of the combined id / stump / label table.
new_data1.head(10)

Unnamed: 0,person_id,race,screening_date,sex>=1,p_current_age<=18,p_current_age<=19,p_current_age<=23,p_current_age<=27,p_current_age<=29,p_current_age<=30,...,recid_property2,recid_M2,recid_F2,recid_violent2,recid_drug6,recid_property6,recid_M6,recid_F6,recid_violent6,train_index
0,1001,African-American,2014-02-03,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,101,Hispanic,2013-01-13,1,0,0,0,0,0,0,...,-1,1,-1,1,-1,-1,-1,-1,-1,1
2,101,Hispanic,2014-02-02,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,1016,Other,2013-04-15,1,0,0,0,1,1,1,...,-1,-1,1,1,-1,-1,-1,1,1,1
4,1016,Other,2013-05-11,1,0,0,0,1,1,1,...,-1,-1,1,-1,-1,-1,-1,-1,-1,1
5,102,Hispanic,2013-05-25,1,0,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
6,1036,African-American,2013-04-27,1,0,0,0,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
7,1044,African-American,2013-09-25,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
8,1048,African-American,2013-10-09,1,0,0,0,0,0,0,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
9,1049,Caucasian,2013-05-03,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1


In [20]:
# Recover the original partition from the 1 (train) / 0 (test) marker column.
train_data = new_data1.loc[new_data1['train_index'].eq(1)]
test_data = new_data1.loc[new_data1['train_index'].eq(0)]

In [21]:
# Remove the helper split marker from all three tables before they are written out.
# errors='ignore' makes this cell safe to re-run: once the column has been dropped,
# a second execution would otherwise stop with a KeyError.
new_data1 = new_data1.drop(columns=['train_index'], errors='ignore')
train_data = train_data.drop(columns=['train_index'], errors='ignore')
test_data = test_data.drop(columns=['train_index'], errors='ignore')

In [22]:
# Write the full table and its train / test parts to the working directory,
# keeping the header row and leaving out the row index.
for frame, filename in (
    (new_data1, 'broward_stumps.csv'),
    (train_data, 'broward_train_stumps.csv'),
    (test_data, 'broward_test_stumps.csv'),
):
    frame.to_csv(filename, header=True, index=False)