# Notebook Setup

In [14]:
# Mount Google Drive at /content/drive (Colab-only); the dataset read later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
%%capture
!pip install dmba

In [16]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)}) #SET SIZE OF ALL SEABORN PLOTS
import sklearn as sk

from dmba import classificationSummary, gainsChart
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Binarizer, Normalizer, OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# -----
from sklearn import set_config
set_config(display="diagram")

# Functions

In [17]:
def _reportConfusion(model, X, y_true, header, plot_title, class_names):
    """Print accuracy/F1 for one data split and draw its confusion-matrix heatmap."""
    print(header)

    y_hat = model.predict(X)
    # Pin the label order explicitly so row/column i always corresponds to
    # class_names[i], and the matrix stays square even if a class is absent
    # from this split.
    class_codes = list(range(len(class_names)))
    matrix = confusion_matrix(y_true, y_hat, labels=class_codes)

    acc = accuracy_score(y_true, y_hat)
    # NOTE: for single-label multi-class problems micro-averaged F1 is
    # numerically equal to accuracy; it is kept to preserve the printed output.
    f1 = f1_score(y_true, y_hat, average='micro')
    print("Accuracy Score: ", round(acc, 3))
    print("F1 Score: ", round(f1, 3))

    fig, ax = plt.subplots(figsize=(20, 15))
    sns.heatmap(matrix,
                annot=True,
                fmt='d',  # show integer counts instead of scientific notation
                cmap='Reds',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')

    plt.show()


def confusionMatrices(model, title):
    """Report a fitted classifier's performance on the train and test splits.

    For each split this prints the accuracy and micro-averaged F1 score and
    shows a confusion-matrix heatmap.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    title : str
        Prefix used for the printed section headers.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    notebook's global namespace.
    """
    # Class names listed in *encoded-label* order (0..6). This must match the
    # integer encoding applied to df['race'] in the Feature Setup cell.
    # confusion_matrix orders rows/columns by label value, so an alphabetical
    # list here would attach the wrong name to almost every row and column.
    class_names = ['White',
                   'Hispanic/Latino/a',
                   'Black/African American',
                   'Asian',
                   'Middle Eastern or South Asian',
                   'Pacific Islander',
                   'Native American']

    _reportConfusion(model, X_train, y_train,
                     title + ' - training results',
                     'Seaborn Confusion Matrix with labels for training data\n\n',
                     class_names)
    _reportConfusion(model, X_test, y_test,
                     title + ' - Test results',
                     'Seaborn Confusion Matrix with labels for testing dataset\n\n',
                     class_names)

# Read in Data

In [18]:
# Load the stop-level dataset from the mounted Drive folder and preview it.
path_ = '/content/drive/MyDrive/Capstone/Data/pure_ripa_final_dec2.csv'
df = pd.read_csv(filepath_or_buffer=path_)
print('DF Shape', df.shape)
df.head(n=5)

  exec(code_obj, self.user_global_ns, self.user_ns)


DF Shape (702106, 26)


Unnamed: 0,stop_id,pid,reason_for_stop,reason_for_stopcode,resultkey,result,code,resulttext,exp_years,date_stop,...,perceived_limited_english,perceived_age,perceived_gender,gender_nonconforming,gend,perceived_lgbt,race,time_period,light,temp
0,10000,1,Traffic Violation,54115.0,3,Citation for infraction,65002.0,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,26,2018-07-15,...,0,50,Male,0,1,No,White,2,1,74.5
1,10001,1,Reasonable Suspicion,29050.0,6,Custodial Arrest without warrant,35143.0,11550 HS - UNDER INFLUENCE CNTL SUB (M) 35143,1,2018-07-15,...,0,25,Male,0,1,No,Hispanic/Latino/a,1,1,80.7
2,10002,1,Reasonable Suspicion,53072.0,10,Psychiatric hold,,,10,2018-07-15,...,0,40,Male,0,1,No,White,2,1,74.5
3,10002,1,Reasonable Suspicion,53072.0,10,Psychiatric hold,,,10,2018-07-15,...,0,40,Male,0,1,No,White,2,1,74.5
4,10003,1,Investigation to determine whether the person ...,,10,Psychiatric hold,,,1,2018-07-15,...,0,30,Female,0,2,No,White,1,1,80.7


# Feature Setup

In [19]:
# Encode the target: each race category becomes an integer code (0..6) whose
# position in `ethnicities` is its label.
ethnicities = ['White',
               'Hispanic/Latino/a',
               'Black/African American',
               'Asian',
               'Middle Eastern or South Asian',
               'Pacific Islander',
               'Native American']
labels = [0, 1, 2, 3, 4, 5, 6]
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form raises warnings on pandas 2.x and
# no longer updates `df` once Copy-on-Write is enabled.
df['race'] = df['race'].replace(ethnicities, labels)

# One-hot encode the stop reason and append the indicator columns.
# get_dummies keeps the input index, so the concat needs no reindexing.
dummy_df = pd.get_dummies(df['reason_for_stop'])
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0,stop_id,pid,reason_for_stop,reason_for_stopcode,resultkey,result,code,resulttext,exp_years,date_stop,...,light,temp,Consensual Encounter resulting in a search,Determine whether the student violated school policy,Investigation to determine whether the person was truant,Knowledge of outstanding arrest warrant/wanted person,Known to be on Parole / Probation / PRCS / Mandatory Supervision,"Possible conduct warranting discipline under Education Code sections 48900, 48900.2, 48900.3, 48900.4 and 48900.7",Reasonable Suspicion,Traffic Violation
0,10000,1,Traffic Violation,54115.0,3,Citation for infraction,65002.0,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,26,2018-07-15,...,1,74.5,0,0,0,0,0,0,0,1
1,10001,1,Reasonable Suspicion,29050.0,6,Custodial Arrest without warrant,35143.0,11550 HS - UNDER INFLUENCE CNTL SUB (M) 35143,1,2018-07-15,...,1,80.7,0,0,0,0,0,0,1,0
2,10002,1,Reasonable Suspicion,53072.0,10,Psychiatric hold,,,10,2018-07-15,...,1,74.5,0,0,0,0,0,0,1,0
3,10002,1,Reasonable Suspicion,53072.0,10,Psychiatric hold,,,10,2018-07-15,...,1,74.5,0,0,0,0,0,0,1,0
4,10003,1,Investigation to determine whether the person ...,,10,Psychiatric hold,,,1,2018-07-15,...,1,80.7,0,0,1,0,0,0,0,0


## Dropping Unwanted Columns

In [20]:
# Columns removed before modelling. Each name appears exactly once
# ('reason_for_stop' was previously listed twice); the one-hot indicator
# columns derived from 'reason_for_stop' are kept.
cols_2_drop = ['assignment',
               'code',
               'date_stop',
               'isstudent',
               'officer_assignment_key',
               'perceived_age',
               'perceived_gender',
               'perceived_lgbt',
               'perceived_limited_english',
               'pid',
               'reason_for_stop',
               'reason_for_stopcode',
               'result',
               'resulttext',
               'stop_id',
               'time_stop']
df = df.drop(columns=cols_2_drop)
df.head(2)

Unnamed: 0,resultkey,exp_years,stopduration,beat,gender_nonconforming,gend,race,time_period,light,temp,Consensual Encounter resulting in a search,Determine whether the student violated school policy,Investigation to determine whether the person was truant,Knowledge of outstanding arrest warrant/wanted person,Known to be on Parole / Probation / PRCS / Mandatory Supervision,"Possible conduct warranting discipline under Education Code sections 48900, 48900.2, 48900.3, 48900.4 and 48900.7",Reasonable Suspicion,Traffic Violation
0,3,26,5,313.0,0,1,0,2,1,74.5,0,0,0,0,0,0,0,1
1,6,1,120,721.0,0,1,1,1,1,80.7,0,0,0,0,0,0,1,0


# Train/Test Split

In [21]:
# Separate the encoded target from the predictors, then hold out 30% of the
# rows for testing (fixed seed so the split is repeatable).
X = df.drop(columns=['race'])
y = df['race']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=12345,
)

In [22]:
# Row counts of each split.
print('train size', len(X_train))
print('test size', len(X_test))

train size 491474
test size 210632


# Models

## Decision Tree Classifier

In [23]:
# Coarse grid search over the main pruning parameters.
# (The earlier unconstrained DecisionTreeClassifier fit was removed: it grew a
# full tree on the whole training set and its result was never used.)
param_grid = {
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [20, 40, 60, 80, 100],
    'min_impurity_decrease': [0, 0.0005, 0.001, 0.005, 0.01],
}
# Seed the tree so tie-breaking between equally good splits is repeatable.
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=12345),
                          param_grid, cv=5, n_jobs=-1)
gridSearch.fit(X_train, y_train)
print('Initial score: ', gridSearch.best_score_)
print('Initial parameters: ', gridSearch.best_params_)

# Keep `classTree` defined and fitted for later cells: GridSearchCV refits the
# best configuration on the full training set by default.
classTree = gridSearch.best_estimator_

Initial score:  0.5157383729743945
Initial parameters:  {'max_depth': 10, 'min_impurity_decrease': 0, 'min_samples_split': 100}


In [24]:
# Second pass: search a finer grid around the best values found by the coarse
# search (max_depth 3-15, min_samples_split 96-104).
# NOTE(review): the recorded run of this cell ended with a PicklingError; the
# cause is not visible here - confirm before relying on its results.
param_grid = dict(
    max_depth=list(range(3, 16)),
    min_samples_split=list(range(96, 105)),
    min_impurity_decrease=[0, 0.0005, 0.001, 0.005, 0.01],
)
gridSearch = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(X_train, y_train)
print('Improved score: ', gridSearch.best_score_)
print('Improved parameters: ', gridSearch.best_params_)

# Keep the refitted best tree and report its train/test confusion matrices.
classTree = gridSearch.best_estimator_
confusionMatrices(classTree, 'Decision tree')

PicklingError: ignored

In [None]:
# Single-point "grid": cross-validate one fixed parameter combination and
# refit it on the full training set.
param_grid = dict(
    max_depth=[3],
    min_samples_split=[103],
    min_impurity_decrease=[0.0005],
)
gridSearch = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(X_train, y_train)
print('Improved score: ', gridSearch.best_score_)
print('Improved parameters: ', gridSearch.best_params_)

# Keep the refitted tree and report its train/test confusion matrices.
classTree = gridSearch.best_estimator_
confusionMatrices(classTree, 'Decision tree')

In [None]:
# Hold-out accuracy of the tuned decision tree.
y_pred = classTree.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', tree_accuracy)

## Bagging Classifier

In [None]:
# Bag the tuned tree: each ensemble member is trained on half of the rows and
# half of the features.
bagging = BaggingClassifier(
    classTree,
    max_samples=0.5,
    max_features=0.5,
)
bagging.fit(X_train, y_train)
confusionMatrices(bagging, 'Bagged Decision tree')

In [None]:
# Hold-out accuracy of the bagged ensemble.
y_pred = bagging.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', bagging_accuracy)

## AdaBoost

In [None]:
# Boost the tuned tree with 10 rounds. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old name was later removed; the first positional
# slot works on both sides of the rename.
adaboost = AdaBoostClassifier(classTree, n_estimators=10)
adaboost.fit(X_train, y_train)
confusionMatrices(adaboost, 'Boosted decision tree')

# Lift Chart Comparison

In [None]:
# plotting function for Lift Chart
def addLiftChart(predict_proba, label, ax=None, color="blue", positive_class=1):
    """Add a one-vs-rest cumulative gains curve for one model to ``ax``.

    Test records are ranked by the predicted probability of ``positive_class``
    and the gain of each record is 1 if its true label equals
    ``positive_class`` and 0 otherwise. Using the raw multi-class label codes
    as gains (as was done before) sums arbitrary codes 0..6 and does not
    produce a meaningful gains curve.

    Parameters
    ----------
    predict_proba : array-like of shape (n_test, n_classes)
        Output of ``model.predict_proba(X_test)``.
    label : str
        Legend label for the curve.
    ax : matplotlib Axes, optional
        Axes to draw on; passed through to ``gainsChart``.
    color : str
        Line colour.
    positive_class : int, default 1
        Encoded class treated as the "positive" outcome. Assumes the
        probability column index equals the encoded label, i.e. the model
        saw every class 0..k-1 during training - confirm via ``classes_``.

    Returns
    -------
    The Axes returned by ``gainsChart``.

    Notes
    -----
    Reads ``y_test`` from the notebook's global namespace.
    """
    proba = np.asarray(predict_proba)
    is_positive = (y_test == positive_class).astype(int)
    ranked = pd.DataFrame(data={'prob': proba[:, positive_class],
                                'actual': is_positive})
    ranked = ranked.sort_values(by=['prob'], ascending=False)
    return gainsChart(ranked.actual, ax=ax, label=label, color=color)

# classTree is already fitted by the grid-search cells, so it is scored
# directly rather than being refitted on the full training set here.
ax = addLiftChart(classTree.predict_proba(X_test), 'Classification Tree')
# Give each model its own colour (the tree uses the default blue).
addLiftChart(bagging.predict_proba(X_test), 'Bagging', ax=ax, color='green')
addLiftChart(adaboost.predict_proba(X_test), 'AdaBoost', ax=ax, color='red')

# Mark the 30% and 70% points of the ranked test set. axvline spans the whole
# y-axis; the previous fixed -40..80 extent was unrelated to the chart's
# cumulative-count scale.
for cutoff in (0.3, 0.7):
    ax.axvline(x=len(y_test) * cutoff, linestyle='dotted')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()