In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('notebook')

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix,classification_report, roc_curve, log_loss, brier_score_loss, roc_auc_score
from timeit import default_timer as timer

In [2]:
#Cross Validation Function

def CrossVal(model):

    start_cross_val = timer()

    scores = cross_val_score(model, X_train, y_train)

    print('Mean Accuracy:',round(scores.mean()*100,2),'%')
    print('Accuracy Standard Deviation',round(scores.std()*100,2),'%')

    end_cross_val = timer()
    print('Cross Validation Time:', round(end_cross_val - start_cross_val,1), 'seconds')

In [3]:
#murders_full = pd.read_csv('C:\\Users\\Classy\\Desktop\\murders_short.csv')
murders_no_unknowns = pd.read_csv('C:\\Users\\Classy\\Desktop\\murders_no_unknowns.csv')
#murders_unknown_ethnic = pd.read_csv('C:\\Users\\Classy\\Desktop\\murders_no_unknowns_except_VicEthnic.csv')

In [4]:
murders_no_unknowns.Solved.value_counts()

Yes    220148
No      88813
Name: Solved, dtype: int64

In [5]:
# Balance

solved_count, unsolved_count = murders_no_unknowns.Solved.value_counts()
solved = murders_no_unknowns[murders_no_unknowns.Solved == 'Yes']
unsolved = murders_no_unknowns[murders_no_unknowns.Solved == 'No']
solved_sample = solved.sample(unsolved_count)

murders_no_unknowns = pd.concat([unsolved, solved_sample], axis = 0)

murders_no_unknowns.Solved.value_counts()

Yes    88813
No     88813
Name: Solved, dtype: int64

Models using no_unknowns (smallest and cleanest) dataset (with MSA excluded for now):

In [8]:
#Full Training Set Random Forest

model = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    #'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
df_y = pd.DataFrame(murders_no_unknowns.Solved)
    
start_preprocessing = timer()

#Create dummy columns for categorical features
model_df = pd.get_dummies(df_categorical,
                          drop_first = True)

#Add numerical features to model dataframe
model_df = pd.concat([model_df, df_numerical], axis = 1)

#Encode prediction variable and add to dataframe
ord_enc = OrdinalEncoder().fit(df_y)
y = ord_enc.transform(df_y)

#print(y.shape)

#Create scaled training and validation datasets
X = StandardScaler().fit_transform(model_df)

X_train, X_validation, y_train, y_validation = train_test_split(X, y, stratify = y, random_state = 33)

end_preprocessing = timer()
print('Preprocessing Time:', round(end_preprocessing - start_preprocessing,1), 'seconds')

#Train model passed to function
start_training = timer()

#model.fit(X_train, y_train)

end_training = timer()
print('Training Time:', round(end_training - start_training,1), 'seconds')

Preprocessing Time: 1.6 seconds
Training Time: 0.0 seconds


In [None]:
#Use trained model to predict test results
start_prediction = timer()

y_pred = model.predict(X_validation)
y_pred_prob = model.predict_proba(X_validation)[:,1]

end_prediction = timer()
print('Prediction Time:', round(end_prediction - start_prediction,1), 'seconds')

#Output test reports
start_reports = timer()

print('Confusion Matrix:\n', confusion_matrix(y_validation, y_pred))
print('Classification Report:\n', classification_report(y_validation, y_pred))
print('Accuracy:', round(model.score(X_validation, y_validation) *100,2), "%")
print('Log Loss:', log_loss(y_validation, y_pred_prob))
print('Brier Score Loss:', brier_score_loss(y_validation, y_pred_prob))

fpr, tpr, thresholds = roc_curve(y_validation, y_pred_prob)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

print('ROC AUC Score:', roc_auc_score(y_validation, y_pred_prob))

forest_feature_importance = pd.DataFrame(zip(model_df.columns,model.feature_importances_))
forest_feature_importance.sort_values(1, inplace = True)
forest_feature_importance.reset_index(inplace = True, drop = True)

plt.bar(forest_feature_importance[0][-20:],forest_feature_importance[1][-20:])
plt.xticks(rotation = 'vertical')
plt.title('20 Largest Importances')
plt.show()

plt.bar(forest_feature_importance[0][0:20],forest_feature_importance[1][0:20])
plt.xticks(rotation = 'vertical')
plt.title('20 Smallest Importances')
plt.show()

proba_df = pd.DataFrame(y_validation, columns = ['Solved'])
proba_df['Predicted Probability'] = model.predict_proba(X_validation)[:,1]



In [None]:
chart_df

In [None]:
chart_df.Actual - chart_df.Predicted

In [None]:
#The below bins by Predicted Probability
bins = np.arange(0,1.05,0.05)
labels = ['0.05','0.1','0.15','0.2','0.25','0.3','0.35','0.4','0.45','0.5',
          '0.55','0.6','0.65','0.7','0.75','0.8','0.85','0.9','0.95','1']
proba_df['Bin'] = pd.cut(proba_df['Predicted Probability'], bins = bins, labels = labels)
proba_df.reset_index(drop = True, inplace = True)
chart_df = pd.DataFrame(proba_df.groupby('Bin')['Solved', 'Predicted Probability'].mean())
chart_df['Count'] = proba_df.groupby('Bin')['Solved'].count()
chart_df.reset_index(inplace = True)
chart_df.columns = ['Bin', 'Actual', 'Predicted', 'Count']

print(chart_df)

plt.bar(chart_df.Bin, chart_df.Count)
plt.xticks(rotation = 45)
plt.title('Frequency of Test Cases by Predicted Clearance Probability')
plt.xlabel('Case Clearance Probability')
plt.ylabel('Count')
plt.show()

plt.scatter(chart_df.Bin, chart_df.Actual, label = 'Actual', alpha = 1)
plt.scatter(chart_df.Bin, chart_df.Predicted, color = 'r', marker = 'D', label = 'Predicted', alpha = 0.75)
plt.legend()
plt.title('Actual and Predicted Clearance Rate by 5% Bin')
plt.xticks(rotation = 45)
plt.xlabel('Bin (Range = Value - 5% to Value)')
plt.yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
plt.ylabel('Clearance Rate')
plt.show()

plt.scatter(chart_df.Predicted, chart_df.Actual)
plt.plot([0,1],[0,1], linestyle = '--', color = 'g')
plt.title('Actual vs. Predicted Clearance Rate by 5% bin')
plt.xticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],rotation = 45)
plt.xlabel('Predicted')
plt.yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
plt.ylabel('Actual')
plt.show()

plt.bar(chart_df.Bin, chart_df.Actual - chart_df.Predicted,)
plt.title('Actual - Predicted Clearance Rate by 5% Bin')
plt.xticks(rotation = 45)
plt.xlabel('Bin (Range = Value - 5% to Value)')
plt.yticks([-0.05,-0.04,-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05])
plt.ylabel('Delta Clearance Rate')
plt.show()

In [None]:
binned_sum_of_squares = sum((chart_df.Actual - chart_df.Predicted).fillna(0) ** 2)
print('Binned Sum of Squares:', binned_sum_of_squares)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[['Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon']],
                                                drop_first = True)

df_numerical = murders_no_unknowns[['VicAge','VicCount','OriCases','WhiteMurderPercent']]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

For some reason, the above two results show that the random state is fixed for this model, but the results differ somewhat from the 'Murder Machine Learning 6' notebook

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    #'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    #'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    #'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    #'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                #'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                #'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                #'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                #'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                #'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                #'VicRace',
                                                'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                #'VicEthnic',
                                                'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)

In [None]:
#Full Training Set Random Forest

forest_1 = RandomForestClassifier(n_estimators = 1000,
                                  max_depth = 20,
                                  random_state = 33,
                                  n_jobs = 3)

df_categorical = pd.get_dummies(murders_no_unknowns[[
                                                'Agentype',
                                                'Month',
                                                'Homicide',
                                                'Situation',
                                                'VicSex',
                                                'VicRace',
                                                'VicEthnic',
                                                #'Weapon'
                                                    ]],
                                                drop_first = True)

df_numerical = murders_no_unknowns[[
                                    'VicAge',
                                    'VicCount',
                                    'OriCases',
                                    'WhiteMurderPercent'
                                    ]]
                                                    
y = pd.DataFrame(murders_no_unknowns.Solved)

TrainVal(forest_1, df_categorical, df_numerical, y)