In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import numpy.linalg as la
import pandas as pd
from scipy.linalg import svd
from mpl_toolkits.mplot3d import Axes3D
import random
from pipeline import generate_basic_exploration

In [2]:
df = pd.read_csv('allegations_cleaned2.csv')

In [6]:
# helper functions
def get_svd(trainX,regularization_param,param=0):
    # Calculate truncated SVD
    U, S, V = la.svd(trainX)
    
    if regularization_param =='trunc':
        # take inverse of each elemant
        S_inv = np.reciprocal(S)
        S_inv[param:] = 0
    if regularization_param == 'rls':
        S_inv = S/(np.square(S)+param)

    new_S = np.zeros((U.shape[0],V.shape[0])) 
    np.fill_diagonal(new_S,S_inv)

    return V.T@new_S.T@U.T

    
def key_min_val(d):
#      a) create a list of the dict's keys and values; 
#      b) return the key with the min value
    v=list(d.values())
    k=list(d.keys())
    
    return k[v.index(min(v))]


def rmse(y_hat, y):
    return np.sqrt(((y_hat - y) ** 2).mean())

def mse(y_hat, y):
    return ((y_hat - y) ** 2).mean()



def get_error_rate(w_hat,testX,testY,outcome_var_type):
    y_hat = testX@w_hat
    
    if outcome_var_type == "binary":
        y_hat_encode = [0 if x <= 0.5
                        else 1 
                        for x in y_hat]
        
    if outcome_var_type == 'three-class':    
        y_hat_encode = [-1 if x <= -0.5 
                      else 0 if x > -0.5 and x < 0.5 
                      else 1
                     for x in y_hat]

    if outcome_var_type != "binary" & outcome_var_type != "three-class":
        return mse(y_hat,testY)

    else:
        equal = np.sum(y_hat_encode == testY)
        error_rate = (len(testY)-equal)/len(testY)

        return error_rate
    

In [7]:
def get_errors(df,regularization_param,outcome_var,outcome_var_type="binary"):
    outer_error_rates = list()

    outcomeList = ['outcome','demotion']
    randomState = 1 
    for i in range(50):
        
        inner_error_rates = dict()
        
        train,test = train_test_split(df,randomState=randomState)
        trainY = train[[outcome_var]]
        trainX = train.drop(['officer_id','Unnamed: 0']+outcomeList,axis=1)
        testY = test[[outcome_var]]
        testX = test.drop(['officer_id','Unnamed: 0']+outcomeList,axis=1)
        
        randomState += 1

        if regularization_param == "trunc":
            for k in range(1,10):
                w_hat = get_svd(trainX,"trunc",param=k)@trainY
                inner_error_rates[k] = get_error_rate(w_hat,testX,testY,outcome_var_type)

        if regularization_param == "rls":
            for lambda_ in np. array ([0 , 0.5 , 1, 2, 4, 8, 16]):
                w_hat = get_svd(trainX,"rls",param=lambda_)@trainY
                inner_error_rates[lambda_] = get_error_rate(w_hat,testX,testY,outcome_var_type)

        min_key = key_min_val(inner_error_rates)
        outer_error_rates.append((min_key,inner_error_rates[min_key]))
            
    return outer_error_rates


In [8]:
# Group train/test by officer ID somehow? See Piazza post
def train_test_split(df,holdOut=0.2, randomState = 1):
    # Train, val, test split:
    # get number of unique ids and the uniqe IDs
    n_ID = len(df.officer_id.unique())
    ids = pd.DataFrame(df.officer_id.unique())

    # sample from IDs
    train_index = ids.sample(round(n_ID*(1-holdOut)),random_state = randomState ).values.tolist()
    #train_index = [item for sublist in train_index for item in sublist]
    train_index = [x[0] for x in train_index]
    # train data is data from any IDs that show up in train index
    train_data = df[df.officer_id.isin(train_index)]
    # test data is data from any IDs that don't show up in train index
    test_data = df[~df.officer_id.isin(train_index)]

    # Sanity check
    print("Total Number of Unique IDs:" , len(df.officer_id.unique()))
    print("Total Number of IDs in Test Data:" , len(test_data.officer_id.unique()))
    print("Total Number of IDs in Train Data:" , len(train_data.officer_id.unique()))
    print("Do the IDs add up?" , len(test_data.officer_id.unique()) + len(train_data.officer_id.unique())  ==  len(df.officer_id.unique()))
    print("Does Test Represent 20% of the data?", (len(test_data.officer_id.unique())/len(df.officer_id.unique())) == holdOut)
    print("Test Represents X% of the data:", (len(test_data.officer_id.unique())/len(df.officer_id.unique())))
    
    return train_data, test_data

In [None]:
trunc = get_errors(df,regularization_param="trunc",outcome_var="outcome",outcome_var_type="three-class")

Total Number of Unique IDs: 3996
Total Number of IDs in Test Data: 799
Total Number of IDs in Train Data: 3197
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.19994994994994994
