In [1]:
import os

## Move to the project root, i.e. the folder that contains ./broward (every data path
## in this notebook is written relative to it). The guard makes the cell idempotent:
## without it, re-running the cell would climb two more directory levels each time.
if not os.path.isdir('./broward'):
    os.chdir('../../')
print("Current working directory is now: ", os.getcwd())

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Current working directory is now:  C:\Users\binha\Documents\Duke\Cynthia Research\interpretable-machine-learning


In [2]:
## load the full feature/label table and the file holding the test-row indices
data = pd.read_csv("./broward/data/broward_data.csv")
index_data = pd.read_csv("./broward/data/broward_test_index.csv")

## extract x, y and index
## .loc label slices are inclusive: X runs up to and including 'five_year',
## Y starts at 'general_two_year'
X = data.loc[:,:'five_year']
## take an explicit copy: Y is relabelled later in the notebook, and writing into a
## .loc slice of `data` raises SettingWithCopyWarning and may write through to `data`
Y = data.loc[:, 'general_two_year':].copy()
## shift the stored indices down by one so they can be used as 0-based row positions
## (presumably the index file was written with 1-based indexing -- TODO confirm)
test_index = index_data['x'].values-1

## extract person_id, race, and screening_date
## these identifier columns are kept aside and are not turned into stumps
person_id = X['person_id']
race = X['race']
screening_date = X['screening_date']
X = X.drop(['person_id', 'race', 'screening_date'], axis=1)

### save column names
## must happen before X becomes a bare array, which carries no column labels
cols_X = X.columns
cols_Y = Y.columns
X = X.values

##### sanity check

In [3]:
## sanity check: (rows, columns) of the feature matrix and of the label frame
(X.shape, Y.shape)

((1954, 38), (1954, 12))

In [4]:
## feature names, in the same order as the columns of the X array
cols_X

Index(['sex', 'age_at_current_charge', 'age_at_first_charge', 'p_charges',
       'p_incarceration', 'p_probation', 'p_juv_fel_count', 'p_felprop_viol',
       'p_murder', 'p_felassault', 'p_misdeassault', 'p_famviol',
       'p_sex_offense', 'p_weapon', 'p_fta_two_year', 'p_fta_two_year_plus',
       'current_violence', 'current_violence20', 'p_pending_charge',
       'p_felony', 'p_misdemeanor', 'p_violence', 'total_convictions',
       'p_arrest', 'p_property', 'p_traffic', 'p_drug', 'p_dui', 'p_domestic',
       'p_stalking', 'p_voyeurism', 'p_fraud', 'p_stealing', 'p_trespass',
       'six_month', 'one_year', 'three_year', 'five_year'],
      dtype='object')

#### Function: turn each feature into binary threshold stumps

In [5]:
def create_stumps(data, columns, cutpoints,
                  leq_features=('age_at_current_charge', 'age_at_first_charge')):

    """
    Turn every feature into a set of binary threshold indicators ("stumps").

    For feature i and each cutoff c in cutpoints[i], one 0/1 column named
    feature name + str(c) is produced. Its value is 1 where the feature
    satisfies the comparison against c and 0 elsewhere.

    @parameters:

    - data: features; 2-D np.array (rows are samples) whose column i holds
      the values of columns[i]
    - columns: feature names, one per column of data that should be used
    - cutpoints: cut off points used to create stumps; cutpoints[i] is the
      sequence of cutoffs for columns[i]
    - leq_features: names of the features whose stumps are 'value <= cutoff';
      every other feature uses 'value >= cutoff'. Defaults to the two age
      features. For a binary feature, use the single cutoff 1.

    @returns:

    - pd.DataFrame of float 0/1 stumps with one row per row of data and the
      stump columns ordered by feature, then by cutoff

    @raises:

    - ValueError if columns and cutpoints differ in length, because the two
      are paired by position and a mismatch would mislabel the stumps

    """

    data = np.asarray(data)
    if len(columns) != len(cutpoints):
        raise ValueError(
            "columns and cutpoints must have the same length, got "
            + str(len(columns)) + " and " + str(len(cutpoints)))

    ## data dimension
    n = data.shape[0]
    stump_blocks = []
    stump_names = []

    ## loop through features
    for i, feature in enumerate(columns):

        cutoff = cutpoints[i]
        thresholds = np.asarray(cutoff)
        ## column vector, so one comparison broadcasts against all cutoffs at once
        feature_values = data[:, i][:, np.newaxis]

        ## create stumps
        if feature in leq_features:
            satisfied = feature_values <= thresholds
        else:
            satisfied = feature_values >= thresholds

        ## store stumps as float 0/1; names come from the cutoffs as given
        stump_blocks.append(satisfied.astype(float))
        stump_names.extend(feature + str(c) for c in cutoff)

    ## post process: assemble the frame once, after every feature is done
    if not stump_blocks:
        ## no features requested: empty frame that still has one row per sample
        return pd.DataFrame(index=range(n))

    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

#### Cutoffs from GA2M Model -- version 1

In [6]:
## Cutoffs per feature, matched to the feature columns BY POSITION: entry i is used
## for column i of cols_X, so the order here must follow the order of cols_X exactly.
## The trailing comment on each entry gives that 0-based position and column name.
cutoffs = [[1], ## 0. sex
           [18,19,23,24,27,29,30,31,33,37,43,45,49,53,57], ## 1. age_at_current_charge ('<=' stumps)
           [18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,44,48], ## 2. age_at_first_charge ('<=' stumps)
           [3,4,5,6,7,8,9,10], ## 3. p_charges
           [1], # 4. p_incarceration
           [1,2,3], # 5. p_probation
           [1], # 6. p_juv_fel_count
           [1], # 7. p_felprop_viol
           [1], # 8. p_murder
           [1], # 9. p_felassault
           [1,2], # 10. p_misdeassault
           [1], # 11. p_famviol
           [1], # 12. p_sex_offense
           [1], # 13. p_weapon
           [1], # 14. p_fta_two_year
           [1], # 15. p_fta_two_year_plus
           [1], # 16. current_violence
           [1], # 17. current_violence20
           [1,2,3,4,5,6,7], # 18. p_pending_charge
           [1,2], # 19. p_felony
           [1,2,3,4,5,6], # 20. p_misdemeanor
           [1,2,3,4,5], # 21. p_violence
           [1,2,3,4,5,6,7], # 22. total_convictions
           [1,2,3,4,5,6,7], # 23. p_arrest
           [1,2,3,4,5], # 24. p_property
           [1], # 25. p_traffic
           [1,2,3,4], # 26. p_drug
           [1], # 27. p_dui
           [1], # 28. p_domestic
           [1], # 29. p_stalking
           [1], # 30. p_voyeurism
           [1], # 31. p_fraud
           [1,2,3], # 32. p_stealing
           [1,2], # 33. p_trespass
           [1], # 34. six_month
           [1], # 35. one_year
           [1], # 36. three_year
           [1]] ## 37. five_year

In [7]:
## make stumps
new_data = create_stumps(X, cols_X, cutoffs)

## recode label value 0 as -1
## mask() returns a new frame instead of writing into a slice of `data`, so this does
## not raise SettingWithCopyWarning and gives the same result when the cell is re-run
Y = Y.mask(Y == 0, -1)

## combine data sets
## Concatenate column-wise with pandas so each column keeps its own dtype; stacking
## with np.c_ coerced ids, strings, stumps and labels into a single object array.
## Indexes are reset first so the pieces are joined by row position, as np.c_ did.
new_cols = ['person_id'] + ['race'] + ['screening_date'] + list(new_data.columns) + list(cols_Y)
pieces = [piece.reset_index(drop=True)
          for piece in (person_id, race, screening_date, new_data, Y)]
new_data1 = pd.concat(pieces, axis=1)
new_data1.columns = new_cols

In [8]:
## preview the first 10 rows of the combined table (identifier columns, stumps, labels)
new_data1.head(10)

Unnamed: 0,person_id,race,screening_date,sex1,age_at_current_charge18,age_at_current_charge19,age_at_current_charge23,age_at_current_charge24,age_at_current_charge27,age_at_current_charge29,...,drug_two_year,property_two_year,misdemeanor_two_year,felony_two_year,violent_two_year,drug_six_month,property_six_month,misdemeanor_six_month,felony_six_month,violent_six_month
0,1001,African-American,2014-02-03,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101,Hispanic,2013-01-13,1,0,0,0,0,0,0,...,-1,-1,1,-1,1,-1,-1,-1,-1,-1
2,101,Hispanic,2014-02-02,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1015,African-American,2014-01-22,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1016,Other,2013-04-15,1,0,0,0,0,1,1,...,-1,-1,-1,1,1,-1,-1,-1,1,1
5,1016,Other,2013-05-11,1,0,0,0,0,1,1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,-1
6,102,Hispanic,2013-05-25,1,0,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7,1027,African-American,2013-04-04,1,0,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,1032,Hispanic,2013-09-23,1,0,0,0,0,1,1,...,-1,-1,1,-1,-1,-1,-1,1,-1,-1
9,1034,African-American,2013-01-14,1,0,1,1,1,1,1,...,-1,1,1,-1,1,-1,1,1,-1,1


In [9]:
## test_index holds 0-based row POSITIONS. drop() works on index LABELS, so translate
## the positions to labels first; both splits are then positional and stay correct
## even if new_data1 ever stops having a default 0..n-1 index.
test_data = new_data1.iloc[test_index, :]
train_data = new_data1.drop(new_data1.index[test_index])

## every row must land in exactly one of the two splits
assert len(train_data) + len(test_data) == len(new_data1), \
    "train/test split does not partition the data (duplicate test indices?)"

In [10]:
## write the full, training and test stump tables to CSV (header row, no index column)
outputs = [
    (new_data1, './broward/data/broward_stumps.csv'),
    (train_data, './broward/data/broward_train_stumps.csv'),
    (test_data, './broward/data/broward_test_stumps.csv'),
]
for frame, destination in outputs:
    frame.to_csv(destination, header=True, index=False)