In [31]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import numpy.linalg as la
import pandas as pd
from scipy.linalg import svd
from mpl_toolkits.mplot3d import Axes3D
import random
from pipeline import generate_basic_exploration

In [32]:
df = pd.read_csv('allegations_cleaned2.csv')

In [48]:
# sum(df['promotion'])/len(df['promotion'])

0.47376941063612926

In [33]:
sum(df['demotion'])/len(df['demotion'])

0.009622879069488579

In [42]:
# helper functions
def get_svd(trainX,regularization_param,param=0):
    # Calculate truncated SVD
    print('Generating SVD')
    U, S, V = la.svd(trainX, full_matrices=False)
#     print('Initial U: {}'.format(str(U[:5])))
#     print('Initial S: {}'.format(str(S[:5])))
#     print('Initial V: {}'.format(str(V[:5])))
    
    if regularization_param =='trunc':
        # take inverse of each elemant
        S_inv = np.reciprocal(S)
#         print('First values of inverse: {}'.format(str(S_inv[:5])))
        S_inv[param:] = 0
    if regularization_param == 'rls':
        S_inv = S/(np.square(S)+param)

    new_S = np.zeros((U.T.shape[0],V.shape[0])) 
    np.fill_diagonal(new_S,S_inv)
    print(V.T.shape, new_S.T.shape, U.T.shape)
    w_hat = V.T@new_S.T@U.T

    return w_hat
    
def key_min_val(d):
#      a) create a list of the dict's keys and values; 
#      b) return the key with the min value
    v=list(d.values())
    k=list(d.keys())
    
    return k[v.index(min(v))]


def rmse(y_hat, y):
    return np.sqrt(((y_hat - y) ** 2).mean())

def mse(y_hat, y):
    return ((y_hat - y) ** 2).mean()



def get_error_rate(w_hat,testX,testY,outcome_var_type):
#     print('testX shape is {}, w_hat shape is {}'.format(testX.shape, w_hat.shape))
    y_hat = testX@w_hat
#     print(y_hat[:20])
    
    if outcome_var_type == "binary":
        y_hat_encode = [0 if x <= 0.5
                        else 1 
                        for x in y_hat]
        
    if outcome_var_type == 'three-class':    
        y_hat_encode = [-1 if x <= -0.5 
                      else 0 if x > -0.5 and x < 0.5 
                      else 1
                     for x in y_hat]

    if outcome_var_type not in ("binary", "three-class"):
        return rmse(y_hat,testY)

    else:
        equal = np.sum(np.array(y_hat_encode).reshape(len(testY),) == np.array(testY).reshape(len(testY),))
#         print(y_hat_encode[:5], testY[:5])
#         print('sum y_hat_encode is {}'.format(sum(y_hat_encode)), 
#               'len y_hat_encode is {}'.format(len(y_hat_encode)))
#         print('mean equal is {}'.format(equal.mean()))
#         print('len testY = {}'.format(len(testY)))
        error_rate = (len(testY)-equal)/len(testY)

        return error_rate
    

In [52]:
def get_errors(df,regularization_param,outcome_var,outcome_var_type="binary",
               num_trials=10):
    outer_error_rates = list()

    outcomeList = ['outcome','demotion', 'closure_time', 'promotion']
    randomState = 2
    for i in range(num_trials):
        
        inner_error_rates = dict()
        
        train,test = train_test_split(df,randomState=randomState)
        trainY = np.array(train[[outcome_var]])
        trainX = np.array(train.drop(['officer_id','Unnamed: 0']+outcomeList,axis=1))
        testY = np.array(test[[outcome_var]])
        testX = np.array(test.drop(['officer_id','Unnamed: 0']+outcomeList,axis=1))
        
        randomState += 1

        if regularization_param == "trunc":
            for k in range(1,10):
                w_hat = get_svd(trainX,"trunc",param=k)@trainY
                inner_error_rates[k] = get_error_rate(w_hat,testX,testY,outcome_var_type)

        if regularization_param == "rls":
            for lambda_ in np. array ([0 , 0.5 , 1, 2, 4, 8, 16]):
                w_hat = get_svd(trainX,"rls",param=lambda_)@trainY
                inner_error_rates[lambda_] = get_error_rate(w_hat,testX,testY,outcome_var_type)

        min_key = key_min_val(inner_error_rates)
        outer_error_rates.append((min_key,inner_error_rates[min_key]))
            
    return outer_error_rates


In [39]:
# Group train/test by officer ID somehow? See Piazza post
def train_test_split(df,holdOut=0.2, randomState = 1):
    # Train, val, test split:
    # get number of unique ids and the uniqe IDs
    n_ID = len(df.officer_id.unique())
    ids = pd.DataFrame(df.officer_id.unique())

    # sample from IDs
    train_index = ids.sample(round(n_ID*(1-holdOut)),random_state = randomState ).values.tolist()
    #train_index = [item for sublist in train_index for item in sublist]
    train_index = [x[0] for x in train_index]
    # train data is data from any IDs that show up in train index
    train_data = df[df.officer_id.isin(train_index)]
    # test data is data from any IDs that don't show up in train index
    test_data = df[~df.officer_id.isin(train_index)]

    # Sanity check
    print("Total Number of Unique IDs:" , len(df.officer_id.unique()))
    print("Total Number of IDs in Test Data:" , len(test_data.officer_id.unique()))
    print("Total Number of IDs in Train Data:" , len(train_data.officer_id.unique()))
    print("Do the IDs add up?" , len(test_data.officer_id.unique()) + len(train_data.officer_id.unique())  ==  len(df.officer_id.unique()))
    print("Does Test Represent 20% of the data?", (len(test_data.officer_id.unique())/len(df.officer_id.unique())) == holdOut)
    print("Test Represents X% of the data:", (len(test_data.officer_id.unique())/len(df.officer_id.unique())))
    
    return train_data, test_data

In [43]:
ridge_closure = get_errors(df,regularization_param="rls",outcome_var="closure_time",
                   outcome_var_type="time", num_trials = 5)

Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(166, 166) (166, 166) (166, 26585)
Generating SVD
(166, 166) (166, 166) (166, 26585)
Generating SVD
(166, 166) (166, 166) (166, 26585)
Generating SVD
(166, 166) (166, 166) (16

In [49]:
trunc_demote = get_errors(df,regularization_param="trunc",outcome_var="demotion",
                   outcome_var_type="binary", num_trials = 5)

Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Generating SVD
(166, 166) (166, 166) (166, 26353)
Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(166, 166) (166, 166) (166, 26585)
Generating SVD
(166, 166) (166, 166) (16

In [50]:
trunc_demote

[(1, 0.003854389721627409),
 (1, 0.011959249963088735),
 (1, 0.00783368484483278),
 (1, 0.011988304093567251),
 (1, 0.006651243493348756)]

In [53]:
trunc_promote = get_errors(df,regularization_param="trunc",outcome_var="promotion",
                   outcome_var_type="binary", num_trials = 5)

Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(165, 165) (165, 165) (165, 26585)
Generating SVD
(165, 165) (165, 165) (16

In [54]:
trunc_promote

[(9, 0.11277658815132048),
 (9, 0.10896205521925291),
 (9, 0.12910515215426333),
 (6, 0.1260233918128655),
 (9, 0.12377096587622903)]

In [55]:
ridge_promote = get_errors(df,regularization_param="rls",outcome_var="promotion",
                   outcome_var_type="binary", num_trials = 5)

Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Generating SVD
(165, 165) (165, 165) (165, 26353)
Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
Generating SVD
(165, 165) (165, 165) (165, 26585)
Generating SVD
(165, 165) (165, 165) (165, 26585)
Generating SVD
(165, 165) (165, 165) (165, 26585)
Generating SVD
(165, 165) (165, 165) (16

In [56]:
ridge_promote

[(1.0, 0.08636688079942897),
 (2.0, 0.08297652443525765),
 (0.5, 0.09912624284423019),
 (0.5, 0.0912280701754386),
 (0.5, 0.07938114517061885)]