In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import numpy.linalg as la
import pandas as pd
from scipy.linalg import svd
from mpl_toolkits.mplot3d import Axes3D
import random
from pipeline import generate_basic_exploration

In [2]:
# Complaint-level table; the 'board_disposition' column is discarded right after loading.
df = pd.read_csv('allegations_cleaned2.csv').drop(columns='board_disposition')

In [3]:
odf = pd.read_csv('officer_df.csv')

In [4]:
# NOTE(review): rebinding `odf` makes this cell non-idempotent -- running it a
# second time raises KeyError because the column has already been removed.
odf = odf.drop('Unnamed: 0.1', axis='columns')

In [5]:
odf.head(15)

Unnamed: 0.1,Unnamed: 0,officer_id,c_black,c_unknown,c_white,c_asian,c_hispanic,c_american_indian,c_female,c_male,...,Profane Gesture,Animal,Gender Identity,officer_charged,substantiated,ever_charged,mos_gender,mos_age_incident,rank_abbrev_incident,complaint_count
0,0,2,6.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,...,0.0,0.0,0.0,1,1,1,0.0,33.7,2.0,10
1,1,5,0.0,1.0,2.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0,1,0,1.0,33.666667,1.666667,3
2,2,11,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,2,2,1,0.0,27.8,1.0,5
3,3,21,0.0,0.0,3.0,1.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,4,4,1,0.0,29.75,1.0,8
4,4,23,6.0,1.0,0.0,0.0,1.0,0.0,2.0,6.0,...,0.0,0.0,0.0,1,1,1,0.0,24.857143,1.0,14
5,5,28,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0,3,0,0.0,29.4,1.0,5
6,6,38,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1,1,1,0.0,27.0,1.0,3
7,7,43,2.0,0.0,2.0,0.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,1,1,1,1.0,26.8,1.0,5
8,8,73,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0,2,0,0.0,25.0,1.0,2
9,9,83,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,2,3,1,0.0,32.25,1.0,8


In [6]:
# Officer-level columns presumably meant to be excluded from the features.
# NOTE(review): nothing else in the visible notebook reads this list.
officer_exclude_cols = ['substantiated', 'ever_charged']

In [7]:
len(odf)

3996

In [8]:
np.mean(odf['ever_charged'])

0.43743743743743746

In [9]:
charged_offs = odf[odf['ever_charged']==1]

In [10]:
np.mean(charged_offs['promotion'])

0.6590389016018307

In [11]:
np.mean(df['closure_time'])

296.2687211463517

In [12]:
np.mean(odf['demotion']), np.mean(odf['promotion'])

(0.017767767767767766, 0.5147647647647647)

In [17]:
# helper functions
def get_svd(trainX,regularization_param,param=0, verbose=False):
    """Return a regularized pseudo-inverse of ``trainX`` built from its thin SVD.

    With ``trainX = U @ diag(S) @ Vt`` the result is ``Vt.T @ diag(S_inv) @ U.T``,
    so ``get_svd(X, ...) @ y`` yields the fitted weight vector.

    Parameters
    ----------
    trainX : array-like, shape (n_samples, n_features)
        Design matrix.
    regularization_param : {'trunc', 'rls'}
        'trunc' -- truncated SVD: keep the reciprocals of the first ``param``
        singular values and zero the rest.
        'rls'   -- ridge / regularized least squares: ``S / (S**2 + param)``.
    param : int or float, default 0
        Number of singular values kept ('trunc', must be usable as a slice
        bound) or the ridge penalty ('rls').
    verbose : bool, default False
        Print progress and the shapes of the three factors.

    Returns
    -------
    numpy.ndarray, shape (n_features, n_samples)

    Raises
    ------
    ValueError
        If ``regularization_param`` is neither 'trunc' nor 'rls'.
    """
    # Fail fast with a clear message instead of hitting an unbound S_inv later.
    if regularization_param not in ('trunc', 'rls'):
        raise ValueError(
            "regularization_param must be 'trunc' or 'rls', got {!r}".format(
                regularization_param))

    if verbose:
        print('Generating SVD')
    U, S, Vt = la.svd(trainX, full_matrices=False)

    if regularization_param == 'trunc':
        # Invert only non-zero singular values: 1/0 would give inf and then
        # NaN weights for rank-deficient inputs.
        S_inv = np.divide(1.0, S, out=np.zeros_like(S), where=S > 0)
        S_inv[param:] = 0
    else:
        # Ridge filter factors; a 0/0 entry (zero singular value with
        # param == 0) is mapped to 0 instead of NaN.
        denom = np.square(S) + param
        S_inv = np.divide(S, denom, out=np.zeros_like(S), where=denom != 0)

    if verbose:
        print(Vt.T.shape, (S_inv.size, S_inv.size), U.T.shape)

    # Scaling the columns of V by S_inv equals multiplying by diag(S_inv),
    # without materialising the diagonal matrix.
    w_hat = (Vt.T * S_inv) @ U.T

    return w_hat
    
def key_min_val(d):
    """Return the key of ``d`` whose value is smallest.

    When several keys share the minimum, the one that comes first in the
    dict's iteration order is returned. An empty dict raises ValueError.
    """
    return min(d, key=lambda k: d[k])


def rmse(y_hat, y):
    """Root-mean-squared error between predictions ``y_hat`` and targets ``y``."""
    residual = y_hat - y
    squared = residual ** 2
    return np.sqrt(squared.mean())

def mse(y_hat, y):
    """Mean-squared error between predictions ``y_hat`` and targets ``y``."""
    residual = y_hat - y
    squared = residual ** 2
    return squared.mean()



def get_error_rate(w_hat,testX,testY,outcome_var_type):
    """Score the linear predictor ``testX @ w_hat`` against ``testY``.

    Parameters
    ----------
    w_hat : array-like
        Fitted weights, one entry (or one row) per column of ``testX``.
    testX : array-like, shape (n_samples, n_features)
    testY : array-like holding n_samples targets
    outcome_var_type : str
        "binary"      -- predictions <= 0.5 are labelled 0, the rest 1.
        "three-class" -- predictions <= -0.5 are labelled -1, those strictly
                         between -0.5 and 0.5 are labelled 0, the rest 1.
        anything else -- treated as regression.

    Returns
    -------
    float
        Misclassification rate for the two classification types, otherwise
        the RMSE returned by ``rmse``.
    """
    y_hat = testX@w_hat

    # Any type that is not a declared classification task is scored by RMSE.
    if outcome_var_type not in ("binary", "three-class"):
        return rmse(y_hat,testY)

    # Threshold the whole prediction array at once instead of looping over
    # rows and relying on the truth value of one-element arrays.
    scores = np.asarray(y_hat)
    if outcome_var_type == "binary":
        y_hat_encode = np.where(scores <= 0.5, 0, 1)
    else:
        y_hat_encode = np.where(scores <= -0.5, -1,
                                np.where(scores < 0.5, 0, 1))

    n_test = len(testY)
    equal = np.sum(y_hat_encode.reshape(n_test,) == np.array(testY).reshape(n_test,))
    error_rate = (n_test-equal)/n_test

    return error_rate
    

In [18]:
def get_errors(df,regularization_param,outcome_var,outcome_var_type="binary",
               num_trials=10, officer_df=False, vb=False):
    """Repeat a grouped train/test split and report the best grid point per trial.

    For each trial the frame is split with ``train_test_split`` (seeds 2, 3,
    4, ... so repeated calls reuse the same splits), a model is fitted for
    every value of the regularization grid, and the grid value with the
    lowest held-out error is recorded.

    NOTE: the recorded error is measured on the same held-out split that was
    used to pick the grid value, so it is an optimistic estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'officer_id', 'Unnamed: 0' and every column named in the
        outcome list selected by ``officer_df``; all of those are dropped
        from the feature matrix.
    regularization_param : {'trunc', 'rls'}
        'trunc' searches k = 1..9 retained singular values; 'rls' searches
        lambda in [0, 0.5, 1, 2, 4, 8, 16].
    outcome_var : str
        Column to predict.
    outcome_var_type : str, default "binary"
        Forwarded to ``get_error_rate``.
    num_trials : int, default 10
    officer_df : bool, default False
        Selects the officer-level outcome-column list instead of the
        complaint-level one.
    vb : bool, default False
        Verbose flag forwarded to the helpers.

    Returns
    -------
    list of (best_param, error) tuples, one per trial.

    Raises
    ------
    ValueError
        If ``regularization_param`` is neither 'trunc' nor 'rls'.
    """
    # Reject unknown regularizers before doing any work; otherwise the error
    # only surfaces later as min() on an empty sequence.
    if regularization_param not in ("trunc", "rls"):
        raise ValueError(
            "regularization_param must be 'trunc' or 'rls', got {!r}".format(
                regularization_param))

    if officer_df:
        outcomeList = ['substantiated', 'ever_charged', 'demotion',
                       'officer_charged', 'promotion']
    else:
        outcomeList = ['outcome','demotion', 'closure_time',
                       'promotion', 'officer_charged']
    # Identifier columns plus every outcome column are kept out of X.
    drop_cols = ['officer_id','Unnamed: 0']+outcomeList

    if regularization_param == "trunc":
        param_grid = range(1,10)
    else:
        param_grid = np.array([0, 0.5, 1, 2, 4, 8, 16])

    outer_error_rates = list()
    randomState = 2
    for _ in range(num_trials):
        train,test = train_test_split(df,randomState=randomState, verbose=vb)
        randomState += 1

        trainY = np.array(train[[outcome_var]])
        trainX = np.array(train.drop(drop_cols,axis=1))
        testY = np.array(test[[outcome_var]])
        testX = np.array(test.drop(drop_cols,axis=1))

        inner_error_rates = dict()
        for grid_value in param_grid:
            w_hat = get_svd(trainX,regularization_param,param=grid_value, verbose=vb)@trainY
            inner_error_rates[grid_value] = get_error_rate(w_hat,testX,testY,outcome_var_type)

        min_key = key_min_val(inner_error_rates)
        outer_error_rates.append((min_key,inner_error_rates[min_key]))

    return outer_error_rates


In [19]:
# Split by officer ID so that every officer lands entirely in train or in test (see Piazza post).
def train_test_split(df,holdOut=0.2, randomState = 1, verbose=False):
    """Split ``df`` into train and test frames without splitting any officer.

    Unique ``officer_id`` values are sampled, and every row belonging to a
    sampled ID goes to the train frame; all remaining rows form the test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``officer_id`` column.
    holdOut : float, default 0.2
        Target share of unique IDs placed in the test frame; the train side
        receives ``round(n_ids * (1 - holdOut))`` IDs.
    randomState : int, default 1
        Seed passed to ``DataFrame.sample``.
    verbose : bool, default False
        Print sanity checks about the ID partition.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
    """
    unique_ids = df.officer_id.unique()
    n_ID = len(unique_ids)

    # Same sampling call as before (a one-column frame of IDs), so a given
    # seed keeps selecting the same officers.
    sampled = pd.DataFrame(unique_ids).sample(round(n_ID*(1-holdOut)),
                                              random_state = randomState)
    train_index = sampled[0].tolist()

    # Rows of sampled officers form the train set, everything else the test set.
    in_train = df.officer_id.isin(train_index)
    train_data = df[in_train]
    test_data = df[~in_train]

    # Sanity check
    if verbose:
        n_test_ids = len(test_data.officer_id.unique())
        n_train_ids = len(train_data.officer_id.unique())
        test_share = n_test_ids/n_ID if n_ID else 0.0
        print("Total Number of Unique IDs:" , n_ID)
        print("Total Number of IDs in Test Data:" , n_test_ids)
        print("Total Number of IDs in Train Data:" , n_train_ids)
        print("Do the IDs add up?" , n_test_ids + n_train_ids  ==  n_ID)
        # round() shifts the train count by at most half an ID, so accept that
        # much slack instead of an exact float comparison (almost never True).
        print("Does Test Represent {:.0%} of the data?".format(holdOut),
              abs(n_test_ids - n_ID*holdOut) <= 0.5 + 1e-9)
        print("Test Represents X% of the data:", test_share)

    return train_data, test_data

In [100]:
# Closure-time regression on the complaint-level frame. "time" is neither
# "binary" nor "three-class", so get_error_rate reports RMSE.
ridge_closure = get_errors(
    df,
    regularization_param="rls",
    outcome_var="closure_time",
    outcome_var_type="time",
    num_trials=50,
)
trunc_closure = get_errors(
    df,
    regularization_param="trunc",
    outcome_var="closure_time",
    outcome_var_type="time",
    num_trials=50,
)

In [114]:
ridge_closure, trunc_closure, np.mean(df['closure_time'])

([(2.0, 131.5778457861212),
  (1.0, 137.61129946435318),
  (16.0, 130.97976856682877),
  (0.5, 128.5108532952614),
  (0.5, 134.66732753731944),
  (4.0, 128.6932359778302),
  (0.0, 132.1113666375659),
  (0.5, 128.26330213493864),
  (1.0, 131.07086145972542),
  (0.5, 137.38991856832368),
  (1.0, 126.28512647449068),
  (2.0, 131.03668669124633),
  (4.0, 131.5679792871582),
  (1.0, 134.3134940733905),
  (2.0, 131.57429322910838),
  (2.0, 133.3286714859839),
  (8.0, 131.37191130253296),
  (0.5, 133.55658363630533),
  (8.0, 133.04755557176435),
  (0.5, 136.17838828072786),
  (2.0, 134.7637274416195),
  (2.0, 141.01045374099118),
  (0.5, 128.8633145886162),
  (0.5, 127.82013786229864),
  (4.0, 131.278093982553),
  (0.5, 140.32259959041752),
  (2.0, 133.21514739390193),
  (2.0, 129.31571322381788),
  (1.0, 132.5964822890891),
  (1.0, 138.42038257990063),
  (1.0, 127.88026193911332),
  (1.0, 139.54020346063055),
  (4.0, 138.0503604340281),
  (2.0, 135.4019130582826),
  (0.5, 137.99602987904962)

In [122]:
# Three-class 'outcome' target on the complaint-level frame; predictions are
# thresholded at -0.5 / 0.5 and scored by misclassification rate.
trunc_outcome = get_errors(
    df,
    regularization_param="trunc",
    outcome_var="outcome",
    outcome_var_type="three-class",
    num_trials=50,
)
ridge_outcome = get_errors(
    df,
    regularization_param="rls",
    outcome_var="outcome",
    outcome_var_type="three-class",
    num_trials=50,
)

In [123]:
trunc_outcome, ridge_outcome

([(9, 0.6990720913633119),
  (9, 0.7073674885575078),
  (9, 0.7173847544441097),
  (9, 0.6967836257309942),
  (9, 0.7112492770387507),
  (9, 0.7009276984243852),
  (1, 0.706058339566193),
  (1, 0.7163298421596106),
  (9, 0.7052797478329393),
  (1, 0.7110294117647059),
  (9, 0.7131172372008482),
  (9, 0.7120633988846492),
  (9, 0.7120335551055829),
  (9, 0.7115706249049719),
  (9, 0.7103068450039339),
  (9, 0.7072393382910912),
  (9, 0.7134283570892723),
  (1, 0.7076137558191921),
  (9, 0.7084860032243882),
  (7, 0.7184170471841704),
  (9, 0.7134591961023142),
  (9, 0.7153779572994806),
  (7, 0.7081420263238445),
  (1, 0.7232966379053853),
  (1, 0.7153207547169811),
  (9, 0.7007610350076103),
  (7, 0.6944735659727422),
  (3, 0.7037993920972644),
  (9, 0.725),
  (9, 0.7163586375439133),
  (1, 0.7037146226415094),
  (9, 0.7106033977738723),
  (1, 0.7237407862407862),
  (9, 0.7117045091141669),
  (9, 0.7050108932461874),
  (1, 0.7149300155520996),
  (9, 0.6943516550258123),
  (1, 0.6996413

In [102]:
# Officer-level binary target 'ever_charged', ridge vs. truncated SVD, 50 splits each.
o_ridge_ever_charged = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='ever_charged',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)
o_trunc_ever_charged = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='ever_charged',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)

In [117]:
o_ridge_ever_charged, o_trunc_ever_charged, np.mean(odf['ever_charged'])

([(1.0, 0.3341677096370463),
  (2.0, 0.36795994993742176),
  (0.0, 0.3379224030037547),
  (16.0, 0.37296620775969963),
  (4.0, 0.327909887359199),
  (4.0, 0.344180225281602),
  (16.0, 0.3541927409261577),
  (8.0, 0.37922403003754696),
  (1.0, 0.3579474342928661),
  (8.0, 0.3642052565707134),
  (4.0, 0.35294117647058826),
  (8.0, 0.344180225281602),
  (0.5, 0.34167709637046306),
  (16.0, 0.35168961201501875),
  (16.0, 0.3742177722152691),
  (0.0, 0.36295369211514394),
  (16.0, 0.360450563204005),
  (16.0, 0.3316645807259074),
  (1.0, 0.360450563204005),
  (16.0, 0.36295369211514394),
  (0.0, 0.32665832290362956),
  (16.0, 0.36545682102628285),
  (4.0, 0.3617021276595745),
  (0.5, 0.3579474342928661),
  (2.0, 0.35544430538172717),
  (0.5, 0.32540675844806005),
  (16.0, 0.3504380475594493),
  (16.0, 0.3617021276595745),
  (8.0, 0.3579474342928661),
  (4.0, 0.3742177722152691),
  (0.0, 0.3591989987484355),
  (2.0, 0.34668335419274093),
  (4.0, 0.34918648310387984),
  (16.0, 0.3266583229036

In [103]:
# Officer-level binary target 'promotion', ridge vs. truncated SVD, 50 splits each.
o_ridge_promoted = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)
o_trunc_promoted = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)

In [118]:
o_trunc_promoted, o_ridge_promoted, np.mean(odf['promotion'])

([(9, 0.3742177722152691),
  (9, 0.3667083854818523),
  (8, 0.35544430538172717),
  (9, 0.37546933667083854),
  (9, 0.376720901126408),
  (9, 0.34668335419274093),
  (9, 0.3504380475594493),
  (9, 0.3717146433041302),
  (9, 0.33917396745932415),
  (9, 0.327909887359199),
  (9, 0.3642052565707134),
  (9, 0.35544430538172717),
  (9, 0.3667083854818523),
  (9, 0.35544430538172717),
  (9, 0.36545682102628285),
  (8, 0.3504380475594493),
  (8, 0.3742177722152691),
  (9, 0.34167709637046306),
  (9, 0.3692115143929912),
  (9, 0.3566958698372966),
  (6, 0.36795994993742176),
  (9, 0.344180225281602),
  (9, 0.37797246558197745),
  (9, 0.38172715894868586),
  (9, 0.3642052565707134),
  (9, 0.34918648310387984),
  (6, 0.3804755944931164),
  (9, 0.3379224030037547),
  (9, 0.37546933667083854),
  (9, 0.36295369211514394),
  (8, 0.3704630788485607),
  (9, 0.35168961201501875),
  (9, 0.3642052565707134),
  (8, 0.38172715894868586),
  (9, 0.3579474342928661),
  (9, 0.37296620775969963),
  (9, 0.381727

In [144]:
# Group the per-trial ridge/promotion errors by the lambda that won each trial.
# The loop variables are deliberately not called `test`: that name is also used
# at notebook level for the held-out frame returned by train_test_split, and
# looping with it would silently replace that frame with a (lambda, error) tuple.
orp = {}
for best_lambda, trial_error in o_ridge_promoted:
    orp.setdefault(best_lambda, []).append(trial_error)

In [146]:
# Summarise each winning lambda as [mean held-out error, number of trials it won].
orp2 = {}
for key, errors in orp.items():
    orp2[key] = [np.mean(errors), len(errors)]

In [147]:
orp2

{16.0: [0.3038413401367094, 13],
 8.0: [0.30844110693922955, 9],
 0.5: [0.3025657071339174, 4],
 2.0: [0.3043804755944931, 5],
 4.0: [0.3043577198771192, 11],
 0.0: [0.30246141009595323, 6],
 1.0: [0.29662077596996245, 2]}

In [104]:
# Officer-level binary target 'demotion', ridge vs. truncated SVD, 50 splits each.
o_ridge_demoted = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)
o_trunc_demoted = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=50,
    officer_df=True,
    vb=False,
)

In [134]:
# 10-trial rerun of the 'demotion' experiment on the same frame. get_errors
# always seeds its splits from 2 upward, so these trials reuse the first ten
# splits of the 50-trial run.
o_ridge_demoted2 = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_trunc_demoted2 = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [116]:
# 'promotion' experiment on the reduced-column officer frame, 10 splits each.
# NOTE(review): `odf_small` is only created by a cell further down the notebook
# (executed as In [99]), so this cell fails with NameError on Restart & Run All.
o_ridge_promoted2 = get_errors(
    odf_small,
    regularization_param='rls',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_trunc_promoted2 = get_errors(
    odf_small,
    regularization_param='trunc',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [117]:
o_ridge_promoted2, o_trunc_promoted2, np.mean(odf['promotion'])

([(8.0, 0.31289111389236546),
  (0.5, 0.35168961201501875),
  (0.0, 0.33667083854818525),
  (0.5, 0.33667083854818525),
  (0.5, 0.32916145181476847),
  (0.0, 0.3304130162703379),
  (0.0, 0.32290362953692114),
  (0.0, 0.35168961201501875),
  (0.5, 0.3341677096370463),
  (0.5, 0.3241551939924906)],
 [(8, 0.31289111389236546),
  (9, 0.35168961201501875),
  (9, 0.3341677096370463),
  (9, 0.3354192740926158),
  (8, 0.32916145181476847),
  (9, 0.32665832290362956),
  (8, 0.3216520650813517),
  (8, 0.33917396745932415),
  (9, 0.32665832290362956),
  (8, 0.3141426783479349)],
 0.5147647647647647)

In [135]:
o_ridge_demoted2, o_trunc_demoted2, np.mean(odf['demotion'])

([(0.5, 0.023779724655819776),
  (0.5, 0.017521902377972465),
  (0.5, 0.017521902377972465),
  (0.5, 0.02002503128911139),
  (0.0, 0.017521902377972465),
  (0.5, 0.016270337922403004),
  (0.5, 0.015018773466833541),
  (0.0, 0.01877346683354193),
  (8.0, 0.026282853566958697),
  (0.5, 0.02002503128911139)],
 [(1, 0.023779724655819776),
  (1, 0.017521902377972465),
  (1, 0.017521902377972465),
  (1, 0.02002503128911139),
  (1, 0.017521902377972465),
  (1, 0.016270337922403004),
  (1, 0.015018773466833541),
  (1, 0.01877346683354193),
  (1, 0.025031289111389236),
  (1, 0.02002503128911139)],
 0.017767767767767766)

In [119]:
o_trunc_demoted, o_ridge_demoted, np.mean(odf['demotion'])

([(1, 0.023779724655819776),
  (1, 0.017521902377972465),
  (1, 0.017521902377972465),
  (1, 0.02002503128911139),
  (1, 0.017521902377972465),
  (1, 0.016270337922403004),
  (1, 0.015018773466833541),
  (1, 0.01877346683354193),
  (1, 0.025031289111389236),
  (1, 0.02002503128911139),
  (1, 0.015018773466833541),
  (1, 0.015018773466833541),
  (1, 0.016270337922403004),
  (1, 0.02127659574468085),
  (1, 0.016270337922403004),
  (1, 0.016270337922403004),
  (1, 0.012515644555694618),
  (1, 0.011264080100125156),
  (1, 0.01376720901126408),
  (1, 0.02002503128911139),
  (1, 0.011264080100125156),
  (1, 0.017521902377972465),
  (1, 0.01877346683354193),
  (1, 0.015018773466833541),
  (1, 0.025031289111389236),
  (1, 0.01376720901126408),
  (1, 0.023779724655819776),
  (1, 0.02127659574468085),
  (1, 0.016270337922403004),
  (1, 0.02127659574468085),
  (1, 0.015018773466833541),
  (1, 0.01877346683354193),
  (1, 0.017521902377972465),
  (1, 0.02252816020025031),
  (1, 0.02127659574468085)

In [105]:
# Officer-level 'substantiated' target treated as regression: "continuous" is
# not a classification type, so the reported error is RMSE.
o_trunc_substantiated = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='substantiated',
    outcome_var_type="continuous",
    num_trials=50,
    officer_df=True,
    vb=False,
)
o_ridge_substantiated = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='substantiated',
    outcome_var_type="continuous",
    num_trials=50,
    officer_df=True,
    vb=False,
)

In [120]:
o_trunc_substantiated, o_ridge_substantiated, np.mean(odf['substantiated'])

([(9, 1.4445545601389942),
  (9, 1.4662648699605993),
  (5, 1.3666337344841948),
  (9, 1.4429534947391456),
  (7, 1.5661161972939974),
  (7, 1.4166632670068346),
  (9, 1.534965727536938),
  (9, 1.4478032972957566),
  (9, 1.3986373487005446),
  (9, 1.5154031254523574),
  (9, 1.2399723368838889),
  (9, 1.3493200379010535),
  (9, 1.4689592301511711),
  (9, 1.5684799607970166),
  (5, 1.2532617301171565),
  (9, 1.4338740093684146),
  (9, 1.4507500623793055),
  (9, 1.4360428015706106),
  (9, 1.3993205579616614),
  (9, 1.355302599993845),
  (9, 1.389724525932441),
  (9, 1.3802180634429864),
  (6, 1.3428509748328403),
  (5, 1.3706177514742268),
  (6, 1.2922298510525936),
  (9, 1.489813656685988),
  (5, 1.4591231988850863),
  (9, 1.370199602145142),
  (7, 1.3766398579333754),
  (9, 1.432333004433521),
  (5, 1.507679274972666),
  (9, 1.408887889906318),
  (8, 1.4053319751615532),
  (5, 1.374146642609442),
  (9, 1.3761223402025182),
  (9, 1.3994290946552235),
  (5, 1.4857176005748922),
  (9, 1.42

In [106]:
# Officer-level 'officer_charged' target treated as regression: "continuous"
# is not a classification type, so the reported error is RMSE.
o_trunc_charged = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='officer_charged',
    outcome_var_type="continuous",
    num_trials=50,
    officer_df=True,
    vb=False,
)
o_ridge_charged = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='officer_charged',
    outcome_var_type="continuous",
    num_trials=50,
    officer_df=True,
    vb=False,
)

In [121]:
o_trunc_charged, o_ridge_charged, np.mean(odf['officer_charged'])

([(9, 1.3335991451183489),
  (6, 1.5927084851564315),
  (5, 1.4137456994490714),
  (9, 1.4405635032738957),
  (9, 1.4892897458071737),
  (7, 1.4418533842150576),
  (7, 1.531970643381867),
  (9, 1.5657789701602123),
  (8, 1.4128095429212708),
  (9, 1.507112583159253),
  (8, 1.313037191318337),
  (7, 1.303791304493412),
  (8, 1.366971536447982),
  (5, 1.552402031438257),
  (6, 1.318993499710545),
  (8, 1.5134664749089148),
  (8, 1.3457327677833668),
  (8, 1.4901228089394116),
  (5, 1.3998612697298671),
  (7, 1.4398983242214),
  (8, 1.3254374093274939),
  (9, 1.4087972671555928),
  (7, 1.4620058179175623),
  (4, 1.390552681822451),
  (9, 1.3742066372420494),
  (8, 1.444924424798773),
  (8, 1.6296281832152197),
  (5, 1.4214381926020025),
  (9, 1.4349001425017556),
  (8, 1.392052469199597),
  (9, 1.5330958550946359),
  (5, 1.447010249686909),
  (8, 1.5236188904849537),
  (8, 1.3814652902865985),
  (8, 1.288096337723136),
  (9, 1.3663868429528903),
  (5, 1.5470044491684618),
  (8, 1.41686677

In [133]:
list(odf.columns)

['Unnamed: 0',
 'officer_id',
 'c_black',
 'c_unknown',
 'c_white',
 'c_asian',
 'c_hispanic',
 'c_american_indian',
 'c_female',
 'c_male',
 'c_transman_(ftm)',
 'c_transwoman_(mtf)',
 'c_gender_non-conforming',
 'demotion',
 'promotion',
 'dispute',
 'vehicle',
 'suspected_violation',
 'phone',
 'warrant',
 'aided_case',
 'checkpoint',
 'demonstrations_protests',
 'report',
 'disturbance',
 'precinct_complaint_info',
 'vehicle_violation',
 'arrest_resist',
 'arrest_obstruct',
 'other_summons',
 'Arrest/Complainant',
 'C/V intervened on behalf of/observed encounter w/3rd party',
 'C/V at PCT to file complaint of crime',
 'Regulatory inspection',
 'Assist ACS or other agency',
 'CV already in custody',
 'Report-gun possession/shots fired',
 'Stop/Question/Frisk',
 'C/V requested info from officer',
 'Patrol Encounter',
 'Arrest/Not Complainant',
 'Summons/Complainant',
 'Complainant Witnessing Incident',
 'C/V at PCT to retrieve property',
 'Parade/special event',
 'Victim Subject of S

In [140]:
# Single ridge fit (one split, one lambda) that mirrors the body of get_errors,
# so the fitted weights can be inspected column by column in the cells below.
outcome_var = "promotion"
lambda_ = 0.5
officer_df = True
vb = False
# NOTE(review): this rebinds `df`, which the earlier cells use for the
# allegations frame loaded from 'allegations_cleaned2.csv'. `odf_small` itself
# is only created by a cell further down the notebook (executed as In [99]).
df = odf_small
# def get_errors(df,regularization_param,outcome_var,outcome_var_type="binary",
#                num_trials=10, officer_df=False, vb=False)

# Extra feature columns removed on top of the identifier and outcome columns.
test_drop = ['c_transwoman_(mtf)','c_gender_non-conforming','c_american_indian']

# NOTE(review): `outcomeList` is only assigned when officer_df is truthy; with
# officer_df = False the drops below would rely on a value left in the kernel.
if officer_df:
    outcomeList = ['substantiated', 'ever_charged', 'demotion',
                   'officer_charged', 'promotion']


train,test = train_test_split(df,randomState=12345, verbose=vb)
# train.drop(['c_unknown',"c_transman_(ftm)","c_white","Animal","Victim Subject of Sex Crime","Mace"],axis=1,inplace=True)
trainY = np.array(train[[outcome_var]])
trainX = np.array(train.drop(['officer_id','Unnamed: 0']+outcomeList+test_drop,axis=1))
testY = np.array(test[[outcome_var]])
testX = np.array(test.drop(['officer_id','Unnamed: 0']+outcomeList+test_drop,axis=1))

# `train` is rebound to the feature-only frame (same drop list as trainX), so
# train.columns[i] names the feature behind row i of w_hat.
train = train.drop(['officer_id','Unnamed: 0']+outcomeList+test_drop,axis=1)


#w_hat = la.inv(trainX.T@trainX)@trainX.T@trainY
# Ridge weights: regularized pseudo-inverse of trainX applied to trainY.
w_hat = get_svd(trainX,"rls",param=lambda_, verbose=vb)@trainY





In [141]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [142]:
train.describe()

Unnamed: 0,c_black,c_asian,c_hispanic,c_female,c_male,abuse_of_authority,discourtesy,offensive_language,force,mos_gender,mos_age_incident,rank_abbrev_incident,complaint_count
count,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0,3197.0
mean,2.069753,0.068815,0.767282,0.758211,2.821082,5.075696,1.16578,0.184235,1.9071,0.093525,31.962781,1.458859,8.332812
std,2.346371,0.269926,1.094539,1.026571,2.623046,5.216576,1.536797,0.526068,2.452516,0.291213,5.473064,0.933971,7.92818
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.5,1.0,1.0
25%,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,28.0,1.0,3.0
50%,1.0,0.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,0.0,31.0,1.0,6.0
75%,3.0,0.0,1.0,1.0,4.0,7.0,2.0,0.0,3.0,0.0,35.333333,1.666667,11.0
max,24.0,3.0,8.0,8.0,23.0,57.0,21.0,6.0,20.0,1.0,58.0,8.0,75.0


In [143]:
w_hat = pd.DataFrame(w_hat)

In [144]:
w_hat.columns

RangeIndex(start=0, stop=1, step=1)

In [145]:
# Rank the features by weight magnitude (column 0 holds the fitted weights).
w_hat['abs'] = w_hat[0].abs()
w_hat.sort_values(by='abs')

Unnamed: 0,0,abs
12,-0.000109,0.000109
10,0.005535,0.005535
5,0.006248,0.006248
7,-0.010235,0.010235
6,-0.012739,0.012739
4,0.014585,0.014585
3,-0.014733,0.014733
8,0.016616,0.016616
2,0.017973,0.017973
0,0.023244,0.023244


In [146]:
train.columns[11],train.columns[9],train.columns[1],train.columns[0],train.columns[2]

('rank_abbrev_incident', 'mos_gender', 'c_asian', 'c_black', 'c_hispanic')

In [138]:
train.columns[1],train.columns[7],train.columns[0],train.columns[4],train.columns[2]

('c_asian', 'offensive_language', 'c_black', 'c_male', 'c_hispanic')

In [71]:
# NOTE(review): stale lookup from execution In [71]. These positions (up to 152)
# only exist in a much wider feature frame than the 13-column `train`
# summarised by train.describe() earlier in the notebook.
train.columns[75], train.columns[135],train.columns[152],train.columns[107],train.columns[144],train.columns[128],train.columns[44],train.columns[90],train.columns[122],train.columns[120]

('Sexual orientation',
 'Physical disability',
 'rank_abbrev_incident',
 'Threat re: removal to hospital',
 'Sh Refuse Cmp',
 'Punch/Kick',
 'Harrassment/Arrested/Summons',
 'Gun Drawn',
 'Other - Abuse',
 'Nightstick/Billy/Club')

In [139]:
np.mean(train[['offensive_language']]),np.sum(train[['offensive_language']])

(offensive_language    0.184235
 dtype: float64,
 offensive_language    589.0
 dtype: float64)

In [87]:
list(train.columns)

['c_black',
 'c_asian',
 'c_hispanic',
 'c_american_indian',
 'c_female',
 'c_male',
 'c_transwoman_(mtf)',
 'c_gender_non-conforming',
 'dispute',
 'vehicle',
 'suspected_violation',
 'phone',
 'warrant',
 'aided_case',
 'checkpoint',
 'demonstrations_protests',
 'report',
 'disturbance',
 'precinct_complaint_info',
 'vehicle_violation',
 'arrest_resist',
 'arrest_obstruct',
 'other_summons',
 'Arrest/Complainant',
 'C/V intervened on behalf of/observed encounter w/3rd party',
 'C/V at PCT to file complaint of crime',
 'Regulatory inspection',
 'Assist ACS or other agency',
 'CV already in custody',
 'Report-gun possession/shots fired',
 'Stop/Question/Frisk',
 'C/V requested info from officer',
 'Patrol Encounter',
 'Arrest/Not Complainant',
 'Summons/Complainant',
 'Complainant Witnessing Incident',
 'C/V at PCT to retrieve property',
 'Parade/special event',
 'No contact',
 'Arrest - other violation/crime',
 'No arrest made or summons issued',
 'Assault/Arrested',
 'Arrest - harras

In [97]:
# Columns kept in the reduced officer frame, assembled group by group.
keep_cols = (
    # identifier columns
    ['Unnamed: 0', 'officer_id']
    # c_* columns (presumably complainant demographic counts -- confirm)
    + ['c_black', 'c_asian', 'c_hispanic', 'c_american_indian',
       'c_female', 'c_male', 'c_transwoman_(mtf)', 'c_gender_non-conforming']
    # allegation-category columns
    + ['abuse_of_authority', 'discourtesy', 'offensive_language', 'force']
    # officer / complaint attributes
    + ['mos_gender', 'mos_age_incident', 'rank_abbrev_incident', 'complaint_count']
)

In [98]:
outcomeList

['substantiated', 'ever_charged', 'demotion', 'officer_charged', 'promotion']

In [99]:
# Reduced officer frame: the selected feature columns followed by the outcome columns.
# NOTE(review): `outcomeList` is only assigned at notebook level by the single-fit
# cell (In [140]), which itself reads `odf_small` -- the two cells depend on each
# other, so the notebook cannot be run top to bottom from a fresh kernel.
odf_small = odf.loc[:, keep_cols + outcomeList]

In [100]:
odf_small.columns

Index(['Unnamed: 0', 'officer_id', 'c_black', 'c_asian', 'c_hispanic',
       'c_american_indian', 'c_female', 'c_male', 'c_transwoman_(mtf)',
       'c_gender_non-conforming', 'abuse_of_authority', 'discourtesy',
       'offensive_language', 'force', 'mos_gender', 'mos_age_incident',
       'rank_abbrev_incident', 'complaint_count', 'substantiated',
       'ever_charged', 'demotion', 'officer_charged', 'promotion'],
      dtype='object')