In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## load two data separately
KY_data = pd.read_csv("KY_cleaned")
FL_data = pd.read_csv("FL_cleaned")
whole_data = pd.concat([KY_data, FL_data])

## combined them
index = np.repeat('KY', KY_data.shape[0]).tolist() + np.repeat('FL', FL_data.shape[0]).tolist()
person_id = whole_data['person_id'].astype('object')
whole_data = whole_data.drop(['person_id'], axis=1)

## split X and Y
X, Y = whole_data.loc[:, :'current_violence20'], whole_data.loc[:, 'general_two_year':]
cols_X, cols_Y = X.columns, Y.columns
X = X.values

#### Function

In [4]:
def create_stumps(data, columns, cutpoints):
    
    """
    @parameters:
    
    - data: featres; np.array
    - columns: feature names
    - cutpoints: cut off points used to create stumps
    
    """
    
    ## data dimension
    final_data = []
    final_names = []
    n, p = data.shape[0], data.shape[1]
    
    ## loop through features
    for i in range(len(columns)):
        
        ## subset feature
        feature = columns[i]
        feature_values = data[:,i]
        cutoff = cutpoints[i]
        cutoff_length = len(cutoff)
        names = []
        
        ## create stumps
        ### if the variable is 'p_current_age' or 'p_age_first_offense', then we would want to use '<=' intervals. For other variables, we use '>=' intervals
        ### if the variable is binary, then set the cutoff point value to be 1.
        
        if (feature == 'age_at_current_charge'):
            stumps = np.zeros([n, cutoff_length])
            for k in range(cutoff_length):
                for j in range(n):
                    if feature_values[j] <= cutoff[k]: stumps[j,k] = 1
                names.append(feature + str(cutoff[k]))
        else: 
            stumps = np.zeros([n, cutoff_length])
            for k in range(cutoff_length):
                for j in range(n): 
                    if feature_values[j] >= cutoff[k]: stumps[j,k] = 1
                names.append(feature + str(cutoff[k]))
        
        ## store stumps
        final_data.append(stumps)
        final_names.append(names)
        
        ## post process
        new_data = pd.DataFrame(final_data[0], columns=final_names[0])
        for s in range(len(final_data)-1):
            a = pd.DataFrame(final_data[s+1], columns=final_names[s+1])
            new_data = pd.concat([new_data, a], axis=1)
    
    return new_data

#### Cutoffs from GA2M Model -- version 1

In [5]:
cols_X

Index(['sex', 'age_at_current_charge', 'p_arrest', 'p_charges', 'p_violence',
       'p_felony', 'p_misdemeanor', 'p_property', 'p_murder', 'p_sex_offense',
       'p_weapon', 'p_felprop_viol', 'p_felassult', 'p_misdeassult',
       'p_traffic', 'p_drug', 'p_dui', 'p_stalking', 'p_voyeurism', 'p_fraud',
       'p_stealing', 'p_trespass', 'p_fta_two_year', 'p_fta_two_year_plus',
       'p_pending_charge', 'p_probation', 'p_incarceration', 'six_month',
       'one_year', 'three_year', 'five_year', 'current_violence',
       'current_violence20'],
      dtype='object')

In [6]:
cutoffs = [[1], ## 0. sex
           [18,19,20,21,24,27,30,36,40,43,47,51,55,59,61,66], ## 1. age_at_current_charge
           [1,2,3,4,5,6,7,8,9,10], ## 2. p_arrests
           [1,2,3,4,5,6], ## 3.p_charge
           [1,2,3], ## 4.p_violence
           [1,2,3], ## 5. p_felony
           [1,2,3], ## 6. p_misdemeanor
           [1,2,3], ## 7. p_property
           [1], ##  8. p_murder
           [1], ## 9. p_sex_offense
           [1], ## 10. p_weapon
           [1], ## 11. p_felprop_viol
           [1], ## 12. p_felassul
           [1], ## 13. p_misdeassul
           [1,2,3], ## 14. p_traffic
           [1,2,3], ## 15. p_drug
           [1,], ## 16. p_dui
           [1],  ## 17. p_stalking
           [1], ## 18. p_voyeuriam
           [1], ## 19. p_fraud
           [1], ## 20. p_stealing
           [1], ## 21. p_trespass
           [1,2], ## 22. p_fta_two_year
           [1], ## 23. p_fta_two_year_plus
           [1,2,3,4,5], ## 24. p_pending_charge
           [1,2,3], ## 25. p_probation
           [1,2,3], ## 26. p_incarceration
           [1], ## 27. six-month
           [1], ## 28. one-year
           [1], ## 29. three-year
           [1], ## 30. five_year
           [1], ## 31. current_violence
           [1]] ## 32. current_violence20

In [18]:
## make stumps
new_data = create_stumps(X, cols_X, cutoffs)

## add labels
Y[Y == 0] = -1
new_cols = ['person_id'] + list(new_data.columns) + list(cols_Y)
new_data1 = pd.DataFrame(np.c_[person_id, new_data, Y], columns=new_cols)
new_data1['index'] = index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


In [19]:
new_data1.head(10)

Unnamed: 0,person_id,sex1,age_at_current_charge18,age_at_current_charge19,age_at_current_charge20,age_at_current_charge21,age_at_current_charge24,age_at_current_charge27,age_at_current_charge30,age_at_current_charge36,...,felony_two_year,misdemeanor_two_year,property_two_year,general_six_month,drug_six_month,violent_six_month,felony_six_month,misdemeanor_six_month,property_six_month,index
0,5,1,0,0,0,0,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
1,6,0,0,0,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
2,8,1,0,0,0,0,0,1,1,1,...,1,1,1,1,1,-1,1,-1,-1,KY
3,11,1,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
4,12,1,0,0,0,0,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
5,23,1,0,0,0,0,0,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
6,27,1,0,0,0,0,0,0,0,0,...,1,1,-1,1,1,-1,1,1,-1,KY
7,33,1,0,0,0,0,0,0,1,1,...,-1,1,1,1,1,1,-1,1,1,KY
8,35,1,0,0,0,0,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,KY
9,36,0,0,0,0,0,0,0,0,1,...,-1,-1,-1,1,-1,1,-1,-1,-1,KY


In [20]:
FL_data_new = new_data1[new_data1['index'] == 'FL']
KY_data_new = new_data1[new_data1['index'] == 'KY']

In [21]:
new_data1.to_csv('whole_stumps', header=True, index=False)
FL_data_new.to_csv('FL_stumps', header=True, index=False)
KY_data_new.to_csv('KY_stumps', header=True, index=False)