In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
## read the KY and FL files separately
KY_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/KY_cleaned")
FL_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/data preparation/FL_cleaned")

## stack them row-wise: every KY row first, then every FL row
whole_data = pd.concat([KY_data, FL_data], axis=0)

## one source tag per row, in that same KY-then-FL order
index = ['KY'] * KY_data.shape[0] + ['FL'] * FL_data.shape[0]

## features: every column up to and including 'current_violence'
## (.loc label slices are inclusive on both ends)
X = whole_data.loc[:, :'current_violence']
cols_X = X.columns
X = X.values ## np.array

## labels: every column from 'recid_two_year' to the last one
Y = whole_data.loc[:, 'recid_two_year':]
cols_Y = Y.columns
Y = Y.values ## np.array

#### Function

In [5]:
def create_stumps(data, columns, cutpoints, le_features=('age_at_current_charge',)):

    """
    Expand every feature into binary threshold indicators ("stumps").

    For feature `columns[i]` and each cutoff `c` in `cutpoints[i]` one 0/1
    column is produced:

    - `feature <= c` when the feature name is listed in `le_features`
    - `feature >= c` for every other feature

    A binary feature is handled by giving it the single cutoff 1.

    @parameters:

    - data: features; 2-D np.array, one row per sample, column i holds `columns[i]`
    - columns: feature names, in the same order as the columns of `data`
    - cutpoints: cut off points used to create stumps; `cutpoints[i]` is the
      sequence of cutoffs for `columns[i]`
    - le_features: names of the features that get '<=' stumps instead of '>=';
      a single name may be passed as a plain string

    @returns:

    - pd.DataFrame of floats (1.0 / 0.0) with a default integer index and one
      column per (feature, cutoff) pair, named feature + '<=' + str(cutoff) or
      feature + '>=' + str(cutoff), in the order of `columns` / `cutpoints`.
      If `columns` is empty, a frame with `data.shape[0]` rows and no columns.

    """

    ## a bare string would otherwise be matched as a substring by `in` below
    if isinstance(le_features, str):
        le_features = (le_features,)

    n = data.shape[0]
    stump_blocks = []  ## one (n, len(cutoff)) 0/1 array per feature
    stump_names = []   ## flat list of column names, same order as the blocks

    ## loop through features
    for i, feature in enumerate(columns):

        cutoff = cutpoints[i]

        ## (n, 1) values against (1, k) cutoffs broadcast to an (n, k) mask,
        ## i.e. every row is compared with every cutoff in one step
        feature_values = data[:, i].reshape(-1, 1)
        thresholds = np.asarray(cutoff).reshape(1, -1)

        if feature in le_features:
            hits, symbol = feature_values <= thresholds, '<='
        else:
            hits, symbol = feature_values >= thresholds, '>='

        ## store stumps; names use the cutoffs exactly as given so that
        ## e.g. 1 stays '1' and 18.0 stays '18.0'
        stump_blocks.append(hits.astype(float))
        stump_names.extend(feature + symbol + str(c) for c in cutoff)

    ## no features: return an empty frame that still has one row per sample
    if not stump_blocks:
        return pd.DataFrame(index=pd.RangeIndex(n))

    ## assemble the frame once, after all features have been processed
    return pd.DataFrame(np.hstack(stump_blocks), columns=stump_names)

#### Cutoffs from GA2M Model -- version 1

- sparse

In [8]:
## show the feature names in order; create_stumps pairs cutpoints[i] with columns[i],
## so the cutoffs list defined below has to follow exactly this order
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_sex_offense',
       'p_weapon', 'p_felprop_viol', 'p_felassult', 'p_misdeassult',
       'p_traffic', 'p_drug', 'p_dui', 'p_stalking', 'p_voyeurism', 'p_fraud',
       'p_stealing', 'p_trespass', 'p_prison', 'p_jail30', 'p_fta_two_year',
       'p_fta_two_year_plus', 'p_probation', 'six_month', 'one_year',
       'three_year', 'five_year', 'current_violence'],
      dtype='object')

In [9]:
## Cutoffs are positional: entry i is used for feature i of cols_X, so the
## (feature name, cutoffs) pairs below are listed in exactly that column order.
## Only the cutoffs end up in `cutoffs`; the names are there for the reader.
cutoffs = [
    points
    for _feature, points in [
        ('sex', [1]),
        ('age_at_current_charge', np.linspace(18, 70, 53)),  ## 18.0, 19.0, ..., 70.0
        ('p_arrest', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        ('p_charges', [1, 2, 3, 4, 5, 6]),
        ('p_violence', [1]),
        ('p_felony', [1, 2]),
        ('p_misdemeanor', [1, 2, 3]),
        ('p_property', [1]),
        ('p_murder', [1]),
        ('p_sex_offense', [1]),
        ('p_weapon', [1]),
        ('p_felprop_viol', [1]),
        ('p_felassult', [1]),
        ('p_misdeassult', [1]),
        ('p_traffic', [1, 2]),
        ('p_drug', [1, 2]),
        ('p_dui', [1]),
        ('p_stalking', [1]),
        ('p_voyeurism', [1]),
        ('p_fraud', [1]),
        ('p_stealing', [1]),
        ('p_trespass', [1]),
        ('p_prison', [1]),
        ('p_jail30', [1, 2, 3]),
        ('p_fta_two_year', [1]),
        ('p_fta_two_year_plus', [1]),
        ('p_probation', [1, 2]),
        ('six_month', [1]),
        ('one_year', [1]),
        ('three_year', [1]),
        ('five_year', [1]),
        ('current_violence', [1]),
    ]
]

In [10]:
## recode the labels in place: 0 becomes -1, every other value is left as is
Y[Y == 0] = -1
y = pd.DataFrame(data=Y, columns=cols_Y)

## stump features on the left, labels on the right, then the per-row
## source tag ('KY' / 'FL') as a final column named 'index'
new_data = pd.concat(
    [create_stumps(X, cols_X, cutoffs), y],
    axis=1,
).assign(**{'index': index})

In [11]:
new_data.head(10)

Unnamed: 0,sex>=1,age_at_current_charge<=18.0,age_at_current_charge<=19.0,age_at_current_charge<=20.0,age_at_current_charge<=21.0,age_at_current_charge<=22.0,age_at_current_charge<=23.0,age_at_current_charge<=24.0,age_at_current_charge<=25.0,age_at_current_charge<=26.0,...,recid_F_two_year,recid_M_two_year,recid_property_two_year,recid_six_month,recid_drug_six_month,recid_violence_six_month,recid_F_six_month,recid_M_six_month,recid_property_six_month,index
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,-1,1,1,-1,1,-1,-1,KY
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,1,-1,1,-1,1,-1,1,-1,KY
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,1,-1,1,-1,-1,-1,KY
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,-1,1,1,1,1,1,-1,KY


In [12]:
## split the combined stump table back out by its 'index' tag
## (this rebinds FL_data / KY_data, which held the raw CSV frames until now)
FL_data = new_data.loc[new_data['index'].eq('FL')]
KY_data = new_data.loc[new_data['index'].eq('KY')]

In [13]:
## write the combined table and the two per-state tables to the working
## directory, keeping the header row and dropping the row labels
new_data.to_csv(path_or_buf='whole_stumps', index=False, header=True)
FL_data.to_csv(path_or_buf='FL_stumps', index=False, header=True)
KY_data.to_csv(path_or_buf='KY_stumps', index=False, header=True)