In [20]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import numpy.linalg as la
import pandas as pd
from scipy.linalg import svd
from mpl_toolkits.mplot3d import Axes3D
import random
from pipeline import generate_basic_exploration

In [14]:
# Read the allegations CSV and discard the 'board_disposition' column in one step.
df = pd.read_csv('allegations_cleaned2.csv').drop(columns='board_disposition')

In [87]:
# Read officer_df.csv into `odf`; the later get_errors(..., officer_df=True)
# calls are run on this frame.
odf = pd.read_csv('officer_df.csv')

In [33]:
# Two column names kept in a module-level list. None of the cells visible in
# this notebook section read it.
officer_exclude_cols = ['substantiated', 'ever_charged']

In [5]:
# Number of rows in the officer frame.
len(odf)

3996

In [7]:
# Mean of 'ever_charged'. If the column is 0/1 coded this is the share of
# positives (assumed -- confirm the coding against the data).
np.mean(odf['ever_charged'])

0.43743743743743746

In [64]:
# Mean of 'closure_time' in the allegations frame (units are not shown here).
np.mean(df['closure_time'])

296.2687211463517

In [9]:
# Means of 'demotion' and 'promotion' in the officer frame, shown as a tuple.
np.mean(odf['demotion']), np.mean(odf['promotion'])

(0.017767767767767766, 0.5147647647647647)

In [42]:
# helper functions
def get_svd(trainX, regularization_param, param=0, verbose=False):
    """Build a regularized pseudo-inverse of ``trainX`` from its thin SVD.

    With ``trainX = U @ diag(S) @ Vh`` the result is
    ``Vh.T @ diag(S_inv) @ U.T``, where ``S_inv`` is chosen by
    ``regularization_param``:

    * ``'trunc'`` -- ``1 / S`` for the first ``param`` entries of ``S``
      (numpy returns singular values in descending order) and 0 for the rest.
      With the default ``param=0`` every entry is zeroed.
    * ``'rls'``   -- ``S / (S**2 + param)``, the ridge filter with penalty
      ``param``.

    A singular value whose filter denominator is exactly zero gets weight 0
    (as in a Moore-Penrose pseudo-inverse) instead of producing inf/nan that
    would contaminate every entry of the result.

    Parameters
    ----------
    trainX : array_like, shape (m, n)
        Matrix to (pseudo-)invert.
    regularization_param : {'trunc', 'rls'}
        Which filter to apply to the singular values.
    param : int or float, default 0
        Number of singular values kept ('trunc') or ridge penalty ('rls').
    verbose : bool, default False
        Print progress and the shapes of the three factors.

    Returns
    -------
    numpy.ndarray, shape (n, m)
        The regularized pseudo-inverse; right-multiply by the targets to get
        the fitted weights.

    Raises
    ------
    ValueError
        If ``regularization_param`` is not 'trunc' or 'rls'.
    """
    if regularization_param not in ('trunc', 'rls'):
        raise ValueError(
            "regularization_param must be 'trunc' or 'rls', got {!r}".format(
                regularization_param))

    if verbose:
        print('Generating SVD')
    # numpy returns the third factor already transposed (Vh, not V).
    U, S, Vh = la.svd(trainX, full_matrices=False)

    # Start from zeros so entries skipped by `where=` keep weight 0.
    S_inv = np.zeros(S.shape, dtype=np.float64)
    if regularization_param == 'trunc':
        np.divide(1.0, S, out=S_inv, where=S > 0)
        S_inv[param:] = 0
    else:  # 'rls'
        denom = np.square(S) + param
        np.divide(S, denom, out=S_inv, where=denom != 0)

    new_S = np.diag(S_inv)
    if verbose:
        print(Vh.T.shape, new_S.T.shape, U.T.shape)
    w_hat = Vh.T @ new_S.T @ U.T

    return w_hat
    
def key_min_val(d):
    """Return the key of ``d`` that maps to the smallest value.

    When several keys share the minimum, the one that comes first in the
    dict's iteration order is returned. An empty dict raises ValueError.
    """
    return min(d, key=d.get)


def mse(y_hat, y):
    """Mean of the squared element-wise differences between y_hat and y."""
    residual = y_hat - y
    return (residual ** 2).mean()


def rmse(y_hat, y):
    """Root-mean-squared error: the square root of mse(y_hat, y)."""
    return np.sqrt(mse(y_hat, y))



def get_error_rate(w_hat, testX, testY, outcome_var_type):
    """Score the linear predictions ``testX @ w_hat`` against ``testY``.

    Predictions and labels are both flattened to 1-D before comparison, so a
    column vector of predictions and a flat vector of labels are compared
    element by element rather than being broadcast against each other.

    Parameters
    ----------
    w_hat : array_like
        Fitted weights.
    testX : array_like
        Feature matrix of the evaluation rows.
    testY : array_like
        True outcomes, one per prediction.
    outcome_var_type : str
        * ``"binary"``      -- predictions <= 0.5 become 0, all others 1.
        * ``"three-class"`` -- predictions <= -0.5 become -1, those strictly
          between -0.5 and 0.5 become 0, all others 1.
        * anything else     -- the outcome is treated as numeric.

    Returns
    -------
    float
        Fraction of misclassified rows for the two classification types,
        otherwise ``rmse(predictions, testY)``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    y_true = np.asarray(testY).reshape(-1)
    y_hat = np.asarray(testX @ w_hat).reshape(-1)
    if y_hat.shape != y_true.shape:
        raise ValueError(
            "got {} predictions for {} labels".format(y_hat.size, y_true.size))

    if outcome_var_type == "binary":
        y_hat_encode = np.where(y_hat <= 0.5, 0, 1)
    elif outcome_var_type == "three-class":
        y_hat_encode = np.where(y_hat <= -0.5, -1,
                                np.where(y_hat < 0.5, 0, 1))
    else:
        return rmse(y_hat, y_true)

    # Share of rows whose encoded prediction differs from the label.
    return np.mean(y_hat_encode != y_true)
    

In [62]:
def get_errors(df, regularization_param, outcome_var, outcome_var_type="binary",
               num_trials=10, officer_df=False, vb=False, param_grid=None):
    """Repeat an officer-grouped train/test split and report the best error per trial.

    In each trial the frame is split with ``train_test_split`` (seeds 2, 3, ...),
    a linear model is fitted for every value in the hyper-parameter grid via
    ``get_svd(...) @ trainY``, each fit is scored with ``get_error_rate`` on
    the test split, and the grid value with the lowest error is recorded.

    NOTE(review): the grid value is chosen using the same test split whose
    error is then reported, so the reported numbers are optimistic. A separate
    validation split would be needed for an unbiased estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'officer_id', 'Unnamed: 0', ``outcome_var`` and every
        column in the outcome list used below; all remaining columns are
        used as features.
    regularization_param : {'trunc', 'rls'}
        Regularizer passed on to ``get_svd``.
    outcome_var : str
        Column to predict.
    outcome_var_type : str, default "binary"
        Passed to ``get_error_rate`` ("binary", "three-class", or anything
        else for RMSE).
    num_trials : int, default 10
        Number of independent splits.
    officer_df : bool, default False
        Selects the list of outcome columns removed from the features.
    vb : bool, default False
        Verbose flag forwarded to the helpers.
    param_grid : iterable, optional
        Hyper-parameter values to try. Defaults to ``range(1, 10)`` for
        'trunc' and ``[0, 0.5, 1, 2, 4, 8, 16]`` for 'rls'.

    Returns
    -------
    list of tuple
        One ``(best_value, error_at_best_value)`` pair per trial.

    Raises
    ------
    ValueError
        If ``regularization_param`` is unknown or ``param_grid`` is empty.
    """
    default_grids = {
        "trunc": range(1, 10),
        "rls": np.array([0, 0.5, 1, 2, 4, 8, 16]),
    }
    if regularization_param not in default_grids:
        raise ValueError(
            "regularization_param must be 'trunc' or 'rls', got {!r}".format(
                regularization_param))
    if param_grid is None:
        param_grid = default_grids[regularization_param]
    else:
        param_grid = list(param_grid)
    if len(param_grid) == 0:
        raise ValueError("param_grid must contain at least one value")

    # Every outcome column is removed from the features, not only the target.
    outcomeList = ['outcome', 'demotion', 'closure_time',
                   'promotion', 'officer_charged']
    if officer_df:
        outcomeList = ['substantiated', 'ever_charged', 'demotion',
                       'officer_charged', 'promotion']
    non_feature_cols = ['officer_id', 'Unnamed: 0'] + outcomeList

    outer_error_rates = []
    randomState = 2
    for _ in range(num_trials):
        train, test = train_test_split(df, randomState=randomState, verbose=vb)
        randomState += 1  # a different split for every trial

        trainY = np.array(train[[outcome_var]])
        trainX = np.array(train.drop(non_feature_cols, axis=1))
        testY = np.array(test[[outcome_var]])
        testX = np.array(test.drop(non_feature_cols, axis=1))

        inner_error_rates = {}
        for value in param_grid:
            w_hat = get_svd(trainX, regularization_param,
                            param=value, verbose=vb) @ trainY
            inner_error_rates[value] = get_error_rate(
                w_hat, testX, testY, outcome_var_type)

        min_key = key_min_val(inner_error_rates)
        outer_error_rates.append((min_key, inner_error_rates[min_key]))

    return outer_error_rates


In [39]:
def train_test_split(df, holdOut=0.2, randomState=1, verbose=False):
    """Split ``df`` into train and test rows, grouped by ``officer_id``.

    Officer IDs (not rows) are sampled, so all rows of one officer land on the
    same side of the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``officer_id`` column.
    holdOut : float, default 0.2
        Target share of unique officer IDs placed in the test set; the train
        set receives ``round(n_ids * (1 - holdOut))`` IDs.
    randomState : int, default 1
        Seed passed to ``DataFrame.sample``.
    verbose : bool, default False
        Print sanity checks about the resulting ID counts.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``, both row subsets of ``df``.
    """
    unique_ids = df.officer_id.unique()
    n_ID = len(unique_ids)
    ids = pd.DataFrame(unique_ids)

    # Sample the IDs that go to the training side.
    n_train = round(n_ID * (1 - holdOut))
    sampled = ids.sample(n_train, random_state=randomState)
    train_index = sampled.to_numpy().ravel().tolist()

    # Train rows belong to a sampled ID; test rows are everything else.
    in_train = df.officer_id.isin(train_index)
    train_data = df[in_train]
    test_data = df[~in_train]

    # Sanity check
    if verbose:
        n_train_ids = len(train_data.officer_id.unique())
        n_test_ids = len(test_data.officer_id.unique())
        test_share = n_test_ids / n_ID if n_ID else float('nan')
        print("Total Number of Unique IDs:", n_ID)
        print("Total Number of IDs in Test Data:", n_test_ids)
        print("Total Number of IDs in Train Data:", n_train_ids)
        print("Do the IDs add up?", n_test_ids + n_train_ids == n_ID)
        # Exact float equality with holdOut is almost never true because the
        # ID count is rounded, so allow a deviation of up to one ID.
        print("Does Test Represent {:.4g}% of the data (to within one ID)?".format(holdOut * 100),
              abs(n_test_ids - holdOut * n_ID) <= 1)
        print("Test Represents X% of the data:", test_share)

    return train_data, test_data

In [44]:
# Ridge ("rls") fits of closure_time over 5 officer-grouped splits. The type
# "time" is neither "binary" nor "three-class", so get_error_rate reports RMSE.
ridge_closure = get_errors(
    df,
    regularization_param="rls",
    outcome_var="closure_time",
    outcome_var_type="time",
    num_trials=5,
)

In [45]:
# One (selected lambda, test-split RMSE) pair per trial.
ridge_closure

[(2.0, 131.5778457861212),
 (1.0, 137.61129946435318),
 (16.0, 130.97976856682877),
 (0.5, 128.5108532952614),
 (0.5, 134.66732753731944)]

In [48]:
# Truncated-SVD fits of closure_time over 5 officer-grouped splits, scored by RMSE.
trunc_closure = get_errors(
    df,
    regularization_param="trunc",
    outcome_var="closure_time",
    outcome_var_type="time",
    num_trials=5,
)

In [49]:
# One (selected k, test-split RMSE) pair per trial.
# NOTE(review): every trial selects k=9, the largest k that get_errors tries
# by default, so the best truncation level may lie beyond the searched range.
trunc_closure

[(9, 148.4993023563817),
 (9, 155.54348688428993),
 (9, 149.1783522445068),
 (9, 146.16344743252546),
 (9, 152.70707625702337)]

In [51]:
# Truncated-SVD fits of the three-class 'outcome' column over 5 splits,
# scored by misclassification rate.
trunc_outcome = get_errors(
    df,
    regularization_param="trunc",
    outcome_var="outcome",
    outcome_var_type="three-class",
    num_trials=5,
)

In [52]:
# One (selected k, misclassification rate) pair per trial.
# NOTE(review): k=9 (the edge of the default search range) wins every trial.
trunc_outcome

[(9, 0.6990720913633119),
 (9, 0.7073674885575078),
 (9, 0.7173847544441097),
 (9, 0.6967836257309942),
 (9, 0.7112492770387507)]

In [53]:
# Ridge ("rls") fits of the three-class 'outcome' column over 5 splits,
# scored by misclassification rate.
ridge_outcome = get_errors(
    df,
    regularization_param="rls",
    outcome_var="outcome",
    outcome_var_type="three-class",
    num_trials=5,
)

In [54]:
# One (selected lambda, misclassification rate) pair per trial.
ridge_outcome

[(0.5, 0.5912919343326195),
 (0.0, 0.5989960135833456),
 (0.0, 0.5979210605604097),
 (0.0, 0.5752923976608187),
 (0.0, 0.6015037593984962)]

In [88]:
# Officer-level frame: predict the binary 'ever_charged' column with ridge
# and with truncated SVD, 10 splits each.
o_ridge_ever_charged = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='ever_charged',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_trunc_ever_charged = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='ever_charged',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [89]:
# Shown in order: ridge results, truncated-SVD results, mean of 'ever_charged'.
# Each result is one (selected hyper-parameter, misclassification rate) per trial.
o_ridge_ever_charged, o_trunc_ever_charged, np.mean(odf['ever_charged'])

([(1.0, 0.3341677096370463),
  (2.0, 0.36795994993742176),
  (0.0, 0.3379224030037547),
  (16.0, 0.37296620775969963),
  (4.0, 0.327909887359199),
  (4.0, 0.344180225281602),
  (16.0, 0.3541927409261577),
  (8.0, 0.37922403003754696),
  (1.0, 0.3579474342928661),
  (8.0, 0.3642052565707134)],
 [(9, 0.37797246558197745),
  (7, 0.40425531914893614),
  (9, 0.36545682102628285),
  (7, 0.3742177722152691),
  (9, 0.36545682102628285),
  (5, 0.37797246558197745),
  (9, 0.3667083854818523),
  (8, 0.38172715894868586),
  (9, 0.3742177722152691),
  (9, 0.3692115143929912)],
 0.43743743743743746)

In [90]:
# Officer-level frame: predict the binary 'promotion' column with ridge and
# with truncated SVD, 10 splits each.
o_ridge_promoted = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_trunc_promoted = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='promotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [99]:
# Shown in order: truncated-SVD results, ridge results, mean of 'promotion'.
# Note the order is trunc-then-ridge here, unlike the 'ever_charged' cell.
o_trunc_promoted, o_ridge_promoted, np.mean(odf['promotion'])

([(9, 0.3742177722152691),
  (9, 0.3667083854818523),
  (8, 0.35544430538172717),
  (9, 0.37546933667083854),
  (9, 0.376720901126408),
  (9, 0.34668335419274093),
  (9, 0.3504380475594493),
  (9, 0.3717146433041302),
  (9, 0.33917396745932415),
  (9, 0.327909887359199)],
 [(16.0, 0.2903629536921151),
  (8.0, 0.3191489361702128),
  (0.5, 0.3141426783479349),
  (2.0, 0.3078848560700876),
  (4.0, 0.3016270337922403),
  (8.0, 0.2878598247809762),
  (0.5, 0.27784730913642053),
  (4.0, 0.3091364205256571),
  (2.0, 0.3041301627033792),
  (0.0, 0.28660826032540676)],
 0.5147647647647647)

In [92]:
# Officer-level frame: predict the binary 'demotion' column with ridge and
# with truncated SVD, 10 splits each.
o_ridge_demoted = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_trunc_demoted = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='demotion',
    outcome_var_type="binary",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [93]:
# Shown in order: truncated-SVD results, ridge results, mean of 'demotion'.
# NOTE(review): the error rates are about the same size as the column mean,
# which is what predicting 0 for every officer would give if 'demotion' is
# 0/1 coded -- worth confirming that the model predicts any positives at all.
o_trunc_demoted, o_ridge_demoted, np.mean(odf['demotion'])

([(1, 0.023779724655819776),
  (1, 0.017521902377972465),
  (1, 0.017521902377972465),
  (1, 0.02002503128911139),
  (1, 0.017521902377972465),
  (1, 0.016270337922403004),
  (1, 0.015018773466833541),
  (1, 0.01877346683354193),
  (1, 0.025031289111389236),
  (1, 0.02002503128911139)],
 [(0.0, 0.023779724655819776),
  (0.5, 0.017521902377972465),
  (0.0, 0.017521902377972465),
  (0.5, 0.02002503128911139),
  (0.5, 0.017521902377972465),
  (0.0, 0.016270337922403004),
  (0.5, 0.015018773466833541),
  (1.0, 0.01877346683354193),
  (8.0, 0.026282853566958697),
  (0.0, 0.02002503128911139)],
 0.017767767767767766)

In [94]:
# Officer-level frame: predict 'substantiated' as a numeric outcome (RMSE)
# with truncated SVD and with ridge, 10 splits each.
o_trunc_substantiated = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='substantiated',
    outcome_var_type="continuous",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_ridge_substantiated = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='substantiated',
    outcome_var_type="continuous",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [96]:
# Shown in order: truncated-SVD results, ridge results, mean of 'substantiated'.
# Each result is one (selected hyper-parameter, test-split RMSE) per trial.
# NOTE(review): ridge selects lambda=16 in every trial, the largest value in
# the default grid, so a stronger penalty might do better still.
o_trunc_substantiated, o_ridge_substantiated, np.mean(odf['substantiated'])

([(9, 1.4445545601389942),
  (9, 1.4662648699605993),
  (5, 1.3666337344841948),
  (9, 1.4429534947391456),
  (7, 1.5661161972939974),
  (7, 1.4166632670068346),
  (9, 1.534965727536938),
  (9, 1.4478032972957566),
  (9, 1.3986373487005446),
  (9, 1.5154031254523574)],
 [(16.0, 1.4872164868531255),
  (16.0, 1.4349182735773764),
  (16.0, 1.3823168045560326),
  (16.0, 1.3822093372710078),
  (16.0, 1.5000962538591305),
  (16.0, 1.4334361716628754),
  (16.0, 1.514022349840476),
  (16.0, 1.4008208003330032),
  (16.0, 1.394520810082202),
  (16.0, 1.529046140451915)],
 2.0773273273273274)

In [97]:
# Officer-level frame: predict 'officer_charged' as a numeric outcome (RMSE)
# with truncated SVD and with ridge, 10 splits each.
o_trunc_charged = get_errors(
    odf,
    regularization_param='trunc',
    outcome_var='officer_charged',
    outcome_var_type="continuous",
    num_trials=10,
    officer_df=True,
    vb=False,
)
o_ridge_charged = get_errors(
    odf,
    regularization_param='rls',
    outcome_var='officer_charged',
    outcome_var_type="continuous",
    num_trials=10,
    officer_df=True,
    vb=False,
)

In [98]:
# Shown in order: truncated-SVD results, ridge results, mean of 'officer_charged'.
# Each result is one (selected hyper-parameter, test-split RMSE) per trial.
# NOTE(review): ridge again selects lambda=16 (the edge of the default grid)
# in every trial.
o_trunc_charged, o_ridge_charged, np.mean(odf['officer_charged'])

([(9, 1.3335991451183489),
  (6, 1.5927084851564315),
  (5, 1.4137456994490714),
  (9, 1.4405635032738957),
  (9, 1.4892897458071737),
  (7, 1.4418533842150576),
  (7, 1.531970643381867),
  (9, 1.5657789701602123),
  (8, 1.4128095429212708),
  (9, 1.507112583159253)],
 [(16.0, 1.3633941321227059),
  (16.0, 1.5463872984359848),
  (16.0, 1.407617882606474),
  (16.0, 1.4139583932463144),
  (16.0, 1.4134495030689416),
  (16.0, 1.477761262450351),
  (16.0, 1.5484799534831641),
  (16.0, 1.5056049378008),
  (16.0, 1.459661843478969),
  (16.0, 1.5230677344943782)],
 0.94994994994995)