# Setup and Imports

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
import seaborn as sns
sns.set
import warnings
import re
from pandas.io import gbq
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import xgboost
import pickle
from sklearn.model_selection import ParameterSampler
from scipy import sparse
#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc

import PrepareChicago as pc

# Get the Data

In [12]:
%%time
#Specify input values for fetching the data
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
project_id = "gdac-327115"
excel_file = "ChicagoCommunityAreas.xlsx"

#Fetch the data
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 20s


# Clean the Data

In [99]:
%%capture --no-stdout
#Clean the full data set
cd.chicago_data_cleaner(chicago, verbose = True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Load in Production Model

In [104]:
best_model = pickle.load(open("best_model.sav", 'rb'))
best_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.45195454681591674,
              enable_categorical=False, gamma=0.546708263364187, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.38768070515882624, max_delta_step=0, max_depth=7,
              min_child_weight=25, missing=nan, monotone_constraints='()',
              n_estimators=195, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.6338249886045665,
              scale_pos_weight=1, subsample=0.7838501639099957,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [13]:
chicago.columns

Index(['unique_key', 'date', 'primary_type', 'location_description', 'arrest',
       'domestic', 'year', 'district_name', 'community_name', 'Month', 'Hour'],
      dtype='object')

In [136]:
chicago.dtypes

unique_key                            Int64
date                    datetime64[ns, UTC]
primary_type                         object
location_description                 object
arrest                              boolean
domestic                            boolean
year                                  Int64
district_name                        object
community_name                       object
Month                                 int64
Hour                                  int64
dtype: object

In [17]:
chicago.head()

Unnamed: 0,unique_key,date,primary_type,location_description,arrest,domestic,year,district_name,community_name,Month,Hour
0,10225218,2015-09-05 23:00:00+00:00,ROBBERY,CTA,False,False,2015,Far Southeast,RIVERDALE,9,23
1,10341077,2015-12-10 05:15:00+00:00,BURGLARY,RESIDENCE,False,False,2015,Far Southeast,RIVERDALE,12,5
2,11861007,2019-10-14 22:47:00+00:00,OTHER OFFENSE,STREET,True,False,2019,Far Southeast,RIVERDALE,10,22
3,10599541,2016-07-15 22:30:00+00:00,INTERFERENCE WITH PUBLIC OFFICER,PARKING,True,False,2016,Far Southeast,RIVERDALE,7,22
4,10707134,2016-10-04 17:00:00+00:00,DECEPTIVE PRACTICE,STREET,False,False,2016,Far Southeast,RIVERDALE,10,17


In [27]:
def get_district(community):
    district = dict(zip(chicago["community_name"], chicago["district_name"]))
    return district[community]

In [14]:
def str_to_date(date_str, time_str):
    crime_time = date_str + " " + time_str
    return datetime.strptime(crime_time, "%m/%d/%Y %H:%M:%S")


datetime.datetime(2022, 2, 13, 2, 30, 15)

In [29]:
def user_input():
    community_name = str.upper(input("Enter the Community Name: "))
    district = get_district(community_name)
    
    primary_type = str.upper(input("Enter the crime committed: "))
    
    location_description = str.upper(input("Enter the crime's location (street, residence, etc.): "))
    
    domestic = (str.upper(input("Was the crime domestic? (Yes/No): ")) == "YES")
    
    date = input("Date of Crime (Ex. 01/01/2022): ")
    
    hour = input("Time of Crime: ")
    
    date_time = str_to_date(date, hour)
    
    return [community_name, district, primary_type, location_description, domestic, date_time]

In [101]:
def unique_column_values(df):
    crime_values = list(np.sort(df["primary_type"].value_counts().index))
    location_values = list(np.sort(df["location_description"].value_counts().index))
    domestic_values = list(np.sort(df["domestic"].value_counts().index))
    district_values = list(np.sort(df["district_name"].value_counts().index))
    community_values = list(np.sort(df["community_name"].value_counts().index))
    Month_values = list(np.sort(df["Month"].value_counts().index))
    Hour_values = list(np.sort(df["Hour"].value_counts().index))

    column_values = [crime_values, location_values, domestic_values, district_values, community_values, Month_values, Hour_values]
    return column_values

In [30]:
def input_to_category(df):
    crime_details = user_input()[::-1]
    
    communities = pd.Categorical(chicago["community_name"]).categories
    community_name = pd.Categorical(crime_details.pop(), categories=communities)
    
    districts = pd.Categorical(chicago["district_name"]).categories
    district = pd.Categorical(crime_details.pop(), categories=districts)
    
    crime_grps = pd.Categorical(chicago["primary_type"]).categories
    crime = pd.Categorical(crime_details.pop(), categories=crime_grps)
    
    location_grps = pd.Categorical(chicago["location_description"]).categories
    location = pd.Categorical(crime_details.pop(), categories=location_grps)
    
    domestic_grp = pd.Categorical(chicago["domestic"]).categories
    domestic = pd.Categorical(crime_details.pop(), categories=domestic_grp)
    
    crime_instance = pd.DataFrame({"date": crime_details.pop(), "primary_type": crime, 
                                   "location_description": location, "domestic": domestic,
                                   "district_name": district, "community_name": community_name},
                                 index = [1])
    return crime_instance

In [110]:
temp = input_to_category(chicago)

Enter the Community Name: austin
Enter the crime committed: narcotics
Enter the crime's location (street, residence, etc.): sidewalk
Was the crime domestic? (Yes/No): no
Date of Crime (Ex. 01/01/2022): 02/16/2022
Time of Crime: 19:00:00


In [111]:
temp

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name
1,2022-02-16 19:00:00,NARCOTICS,SIDEWALK,False,West,AUSTIN


In [112]:
cd.chicago_data_cleaner(temp, verbose=True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


In [113]:
temp

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name,Month,Hour
0,2022-02-16 19:00:00,NARCOTICS,SIDEWALK,False,West,AUSTIN,2,19


In [114]:
unique_vals = unique_column_values(chicago)
cat_encoder = OneHotEncoder(categories=unique_vals)
X = cat_encoder.fit_transform(temp[attribs])

In [149]:
def prepare_new_instance(df, unique_vals_per_attrib):
    cat_encoder = OneHotEncoder(categories=unique_vals)
    X = cat_encoder.fit_transform(df[attribs])
    return X
    

In [131]:
unique_vals = unique_column_values(chicago)
X = prepare_new_instance(temp, unique_vals)

In [132]:
X.shape

(1, 183)

In [150]:
def make_prediction(X, model, probability = True):
    if probability:
        return model.predict_proba(X)
    else:
        return model.predict(X)
    

In [151]:
def chicago_crime_prediction(df, model):
    new_crime = input_to_category(df)
    
    clean_new_crime = cd.chicago_data_cleaner(new_crime, verbose=False)
    
    unique_vals = unique_column_values(df)
    prepared_crime = prepare_new_instance(df, unique_vals)
    
    crime_prediction = make_prediction(prepared_crime, model, probability=True)
    chance = np.round(crime_prediction[0][1], 4) * 100
    
    print(f"\nThe model predicts a {chance:.2f}% chance of making an arrest")

In [153]:
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: rogers park
Enter the crime committed: narcotics
Enter the crime's location (street, residence, etc.): sidewalk
Was the crime domestic? (Yes/No): no
Date of Crime (Ex. 01/01/2022): 02/16/2022
Time of Crime: 19:00:00

The model predicts a 2.72% chance of making an arrest


In [134]:
y_prob = make_prediction(X, best_model)

In [140]:
y_prob

array([[4.8506260e-04, 9.9951494e-01]], dtype=float32)

In [143]:
chance = np.round(y_prob[0][1], 4) * 100
print(f"The model predicts a a {chance:.2f}% chance of making an arrest")

The model predicts a a 99.95% chance of making an arrest


In [118]:
best_model.predict(X)

array([1])