In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the crimes CSV from DBFS (header row inferred) and preview the first five rows.
crimes_csv_path = "/dbfs/FileStore/tables/Crimes___2001_to_present-9be1b.csv"
crimes = pd.read_csv(crimes_csv_path, header='infer')
crimes.head(n=5)

In [3]:
# Inspect the raw District column on its own.
crimes.loc[:, 'District']

In [5]:
# Mean of the District column (the same statistic is reused later as the fill value for missing districts).
crimes.District.mean()

In [6]:
# Confirm what kind of object read_csv returned.
type(crimes)

In [7]:
# True when at least one cell anywhere in the frame is missing.
pd.isnull(crimes).values.any()

In [8]:
# List the dtype pandas assigned to each column.
crimes.dtypes

In [9]:
# Build a lowercase 'district' column in which missing District values are replaced
# by the column mean; the original 'District' column is left untouched.
# NOTE(review): District looks like an identifier code, so a mean fill may produce a
# value that is not a real district -- confirm this imputation is intended.
district_mean = crimes['District'].mean()
crimes['district'] = crimes['District'].fillna(district_mean)

In [10]:
#As in the previous part, the goal is to see whether an arrest can be predicted,
#this time taking the new zip code information into account.
#NOTE(review): no zip-code feature is visible in the predictors selected in this notebook -- confirm.
#The first model is a GLM, chosen because the data has concrete boundaries on values.
from sklearn import linear_model
#PCA is used further down to transform the predictors before fitting
from sklearn.decomposition import PCA
#Least-squares regressor used as the GLM
reg = linear_model.LinearRegression()
#PCA created with its default settings
pca = PCA()

In [11]:
# Integer copy of the Arrest column; it becomes the response variable Y further down.
arrest_as_int = crimes['Arrest'].astype(int)
crimes['ArrestNumber'] = arrest_as_int

In [12]:
# Preview the first five rows again, now including the derived columns.
crimes.head()

In [13]:
# Display the rows that contain no missing values at all.
# NOTE: dropna() returns a new frame and the result is not assigned here, so
# `crimes` itself is unchanged and later cells still see every row.
crimes.dropna(axis=0, how='any')

In [14]:
# Number of missing values in each column.
crimes.isna().sum()

In [15]:
# Display District with any " " entries swapped for "a".
# NOTE: the result is only displayed, never assigned, so `crimes` is not modified by this cell.
crimes['District'].replace(" ", "a")

In [17]:
# Add lowercase integer copies of the Domestic and Beat columns.
# The same cast for Location Description, Ward, Community Area, FBI Code and
# Primary Type was left disabled by the author and is not applied.
for raw_name, model_name in (('Domestic', 'domestic'), ('Beat', 'beat')):
    crimes[model_name] = crimes[raw_name].astype(int)

In [18]:
# Count the entries of the 'Primary Type' column and show the result.
Primary_Type = crimes.loc[:, 'Primary Type'].count()
Primary_Type

In [19]:
# Predictor frame: the three derived lowercase columns only.
predictor_columns = ['domestic', 'beat', 'district']
crimes1 = crimes[predictor_columns]

In [20]:
# Display the predictor frame (domestic, beat, district) that feeds the PCA step.
crimes1

In [21]:
#Separating Data into predictors and response dataframes
Y = crimes['ArrestNumber']
#Project the predictors onto their principal components before fitting each algorithm.
#fit_transform() fits the PCA and transforms in one call, so a separate fit() beforehand
#only repeated the same work and has been dropped.
#NOTE(review): the PCA is fitted on every row before the train/test split, so the held-out
#rows influence the projection; fitting on the training rows only would avoid that.
PCAData = pca.fit_transform(crimes1)
X = PCAData
#Share of the total variance carried by each component
print(pca.explained_variance_ratio_)

In [22]:

#Random 80/20 train/test partition with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, Y, test_size = 0.2, random_state = 30)
X_train, X_test, Y_train, Y_test = split_parts

In [23]:
#GLM Least Squares Regression: fit on the 0/1 arrest label, then convert the continuous
#prediction into a class label by cutting along the 0.5 decision border.
reg.fit(X_train,Y_train)
Reg_Expected_Y = reg.predict(X_test)
#Vectorised threshold; produces the same 1.0 / 0.0 float labels as assigning
#True/False into the float prediction array one element at a time.
Reg_Expected_Y = (Reg_Expected_Y >= 0.5).astype(float)

In [24]:
#Confusion-matrix and accuracy helpers from scikit-learn, plus itertools
#(used by the plotting function to walk over the matrix cells)
import itertools
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#Confusion matrix of the thresholded GLM predictions against the held-out labels
cm = confusion_matrix(y_true = Y_test, y_pred = Reg_Expected_Y)

In [25]:
#Adapted from the confusion matrix plotting example in the scikit-learn documentation
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn with pyplot's implicit current-figure interface and the
    function returns nothing. The names `np`, `plt`, `itertools` and `sns` are
    looked up at call time, so seaborn must be imported before the first call.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are drawn along the y axis ('True Arrest') and
        columns along the x axis ('Predicted Arrest').
    classes : sequence
        Tick labels, one per row/column, in the same order as the matrix.
    normalize : bool
        When True, every row is divided by its own total so each cell is a
        fraction of that row. Rows whose total is zero stay all-zero instead
        of turning into NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero total; divide by 1 there so the
        # row stays 0 rather than becoming NaN (NaN would also break `thresh` below).
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    with sns.axes_style("white"):
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True Arrest')
    plt.xlabel('Predicted Arrest')

In [26]:
import seaborn as sns

In [27]:
#Author's observation: accuracy greatly increased compared to the previous part that analysed
#crime as a whole; population data for each zipcode is a helpful predictor of whether an arrest
#will be made.
#NOTE(review): the predictors selected in this notebook do not visibly include zip-code data -- confirm.
plt.figure()
#confusion_matrix orders its rows/columns by sorted label value, whereas Series.unique() returns
#values in order of first appearance; sort the labels so the tick labels match the matrix.
plot_confusion_matrix(cm, normalize = True, classes = sorted(Y.unique()), title = 'Arrest Predictions Confusion Matrix Using GLM')
plt.show()
display()
print("Accuracy of Model is: %f"%accuracy_score(Y_test, Reg_Expected_Y))

In [28]:
#Reference value, presumably recorded from an earlier run of the GLM cell above -- Accuracy of Model is: 0.815194

In [29]:
#Next model: Naive Bayes, to predict whether an arrest is made
#Bernoulli variant chosen because it deals with discrete values better than the other types in scikit-learn
#NOTE(review): the model is later fitted on X, which holds PCA-transformed values rather than
#the original discrete columns -- confirm the Bernoulli variant is still appropriate.
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()

In [30]:
#Train the Naive Bayes model on the training split and predict the held-out rows
#(fit() hands back the fitted estimator, so predict() can be chained onto it)
BNB_Expected_Y = BNB.fit(X_train, Y_train).predict(X_test)

In [31]:
#Confusion matrix and accuracy for the Bernoulli Naive Bayes predictions
cm = confusion_matrix(Y_test, BNB_Expected_Y)
plt.figure()
#confusion_matrix orders its rows/columns by sorted label value, whereas Series.unique() returns
#values in order of first appearance; sort the labels so the tick labels match the matrix.
plot_confusion_matrix(cm, normalize = True, classes = sorted(Y.unique()),
                      title = 'Arrest Predictions Confusion Matrix Using Bernoulli Naive Bayes')
plt.show()
display()
print("Accuracy of Model is: %f"%accuracy_score(Y_test, BNB_Expected_Y))

In [32]:
#Will now do Random Forest
from sklearn.ensemble import RandomForestClassifier

In [33]:
#Out-of-bag error (1 - OOB score) of random forests with 50 to 99 trees, one forest per size.
#OOB_Err[k] belongs to the forest with 50 + k trees.
#random_state pins the bootstrap sampling so the curve is reproducible across runs.
#NOTE: this refits 50 separate forests and can take a while on a large training set.
OOB_Err = []
for n_trees in range(50,100):
    rfc = RandomForestClassifier(n_estimators = n_trees, oob_score = True, n_jobs = -1, random_state = 30)
    rfc.fit(X_train,Y_train)
    OOB_Err.append(1 - rfc.oob_score_)

In [34]:
#Draw the OOB error curve against the number of trees (sizes 50 through 99)
tree_counts = list(range(50,100))
plt.figure(figsize = (10,10))
with sns.axes_style("white"):
    plt.plot(tree_counts, OOB_Err)
plt.xlabel('Number of Trees')
plt.ylabel('OOB Error')
plt.title('OOB Errors Over Number of Trees')
plt.show()
display()

In [35]:
#Final random forest with 90 trees, trained on the training split and used to predict the held-out rows.
#random_state pins the bootstrap/feature sampling so the reported accuracy and importances are reproducible.
rforest = RandomForestClassifier(n_estimators = 90, n_jobs = -1, random_state = 30)
rforest.fit(X_train, Y_train)
RF_Expected_Y = rforest.predict(X_test)

In [36]:
#Author's observation: very accurate compared to the previous project and much faster;
#running for cumulative crimes led to a 4-hour wait and an accuracy below 50%
cm = confusion_matrix(Y_test, RF_Expected_Y)
plt.figure()
#confusion_matrix orders its rows/columns by sorted label value, whereas Series.unique() returns
#values in order of first appearance; sort the labels so the tick labels match the matrix.
plot_confusion_matrix(cm, normalize = True, classes = sorted(Y.unique()),
                      title = 'Arrest Predictions Confusion Matrix Using Random Forest')
plt.show()
display()
print("Accuracy of Model is: %f"%accuracy_score(Y_test, RF_Expected_Y))

In [37]:
#Rank the random forest's inputs by the importance scores stored on the fitted model
#(rforest.feature_importances_).
#The forest was trained on the PCA-transformed matrix, so each score belongs to a principal
#component rather than to one of the original columns; the bars are labelled PC1, PC2, ...
#accordingly. (The previous code read the labels from an undefined name `data`, which
#raised a NameError.) Inspect pca.components_ to relate a component back to the original columns.
importances = list(rforest.feature_importances_)
component_names = ['PC%d' % (k + 1) for k in range(len(importances))]
Col_Imp = pd.DataFrame({'Predictors': component_names, 'Feature Importances': importances},
                       columns = ['Predictors','Feature Importances'])

#plot feature importance, smallest at the bottom, one colour per bar
Col_Imp.index = Col_Imp['Predictors']
colors = plt.cm.RdYlGn(np.linspace(0,1,len(Col_Imp)))
plt.title('Feature Importances of Each Predictor')
plt.xlabel('Importance')
with sns.axes_style("white"):
    Col_Imp['Feature Importances'].sort_values().plot(figsize = (10,10), kind = 'barh', color = colors)
plt.show()
display()