# Setup and Imports

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import pickle
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc
#Custom Python Module to prepare new crime instances
import PrepareChicago as pc

# Get the Data

In [2]:
%%time
#Specify input values for fetching the data
#The query pulls eight columns from the crimes table and keeps only rows with year >= 2011
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
#Project identifier handed to the fetch helper (presumably the Google Cloud project billed for the query)
project_id = "gdac-327115"
#Excel workbook handed to the fetch helper; its verbose log reports joining Chicago districts to the main data
excel_file = "ChicagoCommunityAreas.xlsx"

#Fetch the data; verbose=True makes the helper print a progress message for each step
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 37s


# Clean the Data

In [3]:
%%capture --no-stdout
#Clean the full data set
#The return value is not assigned, yet later cells read chicago["Month"] and chicago["Hour"], so the cleaner
#presumably modifies the dataframe in place -- confirm against ChicagoDataCleaningFunctions
cd.chicago_data_cleaner(chicago, verbose = True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Load in Traditional Model

In [109]:
#Load the fitted traditional model from disk
#NOTE: unpickling can execute arbitrary code, so only load model files that come from a trusted source
#The context manager closes the file handle as soon as the model has been read
with open("best_model.sav", 'rb') as model_file:
    best_model = pickle.load(model_file)
#Display the loaded model and its hyperparameters
best_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.45195454681591674,
              enable_categorical=False, gamma=0.546708263364187, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.38768070515882624, max_delta_step=0, max_depth=7,
              min_child_weight=25, missing=nan, monotone_constraints='()',
              n_estimators=195, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.6338249886045665,
              scale_pos_weight=1, subsample=0.7838501639099957,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

# Load in Deep Learning Model

In [105]:
#Load the model
#The path is passed to Keras exactly as written (note that it ends in "_h5" rather than ".h5")
model = keras.models.load_model("best_nn_model_fitted_h5")
#Display the loaded model object
model

<tensorflow.python.keras.engine.functional.Functional at 0x1d349f25610>

# Create Helper Functions for Traditional Model

In [4]:
def get_district(community):
    """
    This function returns the district that corresponds to the community area provided.
    The lookup is built from the module-level `chicago` dataframe.
    
    community: community name, spelled as it appears in chicago["community_name"]
    
    returns: district name as string
    
    raises: KeyError if the community name does not appear in the data
    """
    #Reduce the data to one row per community before building the lookup, instead of zipping every row
    #of the full data set. keep="last" mirrors dict(zip(...)), where the last occurrence wins.
    pairs = chicago[["community_name", "district_name"]].drop_duplicates(subset="community_name", keep="last")
    district = dict(zip(pairs["community_name"], pairs["district_name"]))
    try:
        return district[community]
    except KeyError:
        #Same exception type as before, but with a message that says what went wrong
        raise KeyError(f"Unknown community name: {community!r}") from None

In [5]:
def str_to_date(date_str, time_str):
    """
    This function joins a date string and a time string with a single space and parses the result
    into a datetime object.
    
    date_str: string containing the date in form mm/dd/yyyy
    time_str: string containing the time in form hh:mm:ss
    
    returns: datetime object 
    """
    #Format that the combined "date time" string must follow
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    combined = " ".join((date_str, time_str))
    return datetime.strptime(combined, timestamp_format)


In [6]:
def user_input():
    """
    This function asks the user for the new crime details. It then converts the input to the form found in the original
    dataframe: text answers are stripped of surrounding whitespace and upper-cased, the domestic answer becomes a
    boolean, and the date and time answers are combined into a single datetime.
    
    returns: DataFrame with a single row (index 1) holding the crime details
    
    raises: KeyError if the community name is unknown to get_district, ValueError if the date/time cannot be parsed
    """
    #Strip stray whitespace so that an answer such as "woodlawn " is still found by the district lookup
    community_name = input("Enter the Community Name: ").strip().upper()
    #Look up the district right away so an unknown community fails before the remaining prompts
    district = get_district(community_name)
    
    primary_type = input("Enter the crime committed: ").strip().upper()
    
    location_description = input("Enter the crime's location (street, residence, etc.): ").strip().upper()
    
    #Accept the common affirmative spellings; any other answer counts as not domestic
    domestic = input("Was the crime domestic?: ").strip().upper() in {"YES", "Y", "TRUE"}
    
    date = input("Date of Crime (mm/dd/yyy): ").strip()
    
    hour = input("Time of Crime (hh:mm:ss): ").strip()
    
    date_time = str_to_date(date, hour)
    
    new_crime_df = pd.DataFrame({"date": date_time, "primary_type": primary_type , 
                                 "location_description": location_description, "domestic": domestic,  
                                 "community_name": community_name,  "district_name": district}, index = [1])
    
    return new_crime_df

In [7]:
def unique_column_values(df):
    """
    This function collects the sorted distinct values of each model variable in the chicago data set.
    
    df: Dataframe containing the columns primary_type, location_description, domestic, community_name,
        district_name, Month and Hour
    
    returns: List of lists, one per variable in the order listed above, each holding that variable's sorted
             distinct values
    """
    #The order here fixes the order of the returned lists
    ordered_columns = ("primary_type", "location_description", "domestic", "community_name",
                       "district_name", "Month", "Hour")

    column_values = []
    for column in ordered_columns:
        #The index of value_counts() holds the distinct values of the column
        distinct_values = df[column].value_counts().index
        column_values.append(list(np.sort(distinct_values)))
    return column_values

In [54]:
def prepare_new_instance(df, unique_vals_per_attrib, output_sparse = True):
    """
    This function prepares a new crime instance by one hot encoding the variables. It specifies the categories from the 
    original data set. The dataframe passed in is left unchanged.
    
    df: Dataframe containing the new crime instance, including a "date" column that is excluded from the encoding
    unique_vals_per_attrib: List of list containing the unique values for each variable. The lists are matched to the
                            remaining columns by position, so they must be in the same order as those columns.
    output_sparse: If True (default) the encoder returns a sparse matrix, otherwise a dense array
    
    returns: X (sparse matrix, or dense array when output_sparse is False)
    """
    #Drop the date on a copy so the caller's dataframe is not modified as a side effect
    features_df = df.drop(columns="date")
    #scikit-learn renamed the `sparse` keyword to `sparse_output`; try the new name first and fall back to the
    #old one so the function works across library versions
    try:
        cat_encoder = OneHotEncoder(categories=unique_vals_per_attrib, sparse_output=output_sparse)
    except TypeError:
        cat_encoder = OneHotEncoder(categories=unique_vals_per_attrib, sparse=output_sparse)
    X = cat_encoder.fit_transform(features_df)
    return X
    

In [9]:
def make_prediction(X, model, probability = True):
    """
    This function makes the prediction on the new crime instance, returning either the predicted probability
    or the predicted class.
    
    X: Prepared feature matrix for the new instance
    model: Model exposing predict_proba and/or predict
    probability: If True call model.predict_proba, otherwise call model.predict
    
    returns: Whatever the selected model method returns for X
    """
    #Pick the model method first, then apply it to the input
    predictor = model.predict_proba if probability else model.predict
    return predictor(X)
    

In [108]:
def chicago_crime_prediction(df, model):
    """
    This is the main function for turning a user's input into a prediction from the traditional model. It prompts for
    the crime details, cleans them, one-hot encodes them against the values found in the past data and prints the
    predicted chance of an arrest as a percentage.
    
    df: Dataframe of past instances
    model: Model capable of giving a predicted probability
    
    returns: None (the result is printed)
    """
    #Collect the crime details from the user and run them through the cleaner
    crime_df = user_input()
    cd.chicago_data_cleaner(crime_df, verbose=False)
    #One-hot encode the new crime using the unique values of each column of the past data
    encoded_crime = prepare_new_instance(crime_df, unique_column_values(df))
    #Predicted probabilities for the single new instance
    probabilities = make_prediction(encoded_crime, model, probability=True)
    #Second entry of the first row, presumably the probability of the arrest class, as a percentage
    arrest_pct = np.round(probabilities[0][1], 4) * 100
    print(f"\nThe traditional model predicts a {arrest_pct:.2f}% chance of making an arrest")

# Create Helper Functions for Deep Learning Model

In [16]:
def data_to_array(df, attribs):
    """
    This function selects the requested columns of the dataframe and returns their values as a numpy array.
    
    df: Pandas dataframe
    attribs: List of columns to use as attributes
    
    returns: X (numpy array with one column per entry of attribs, in the order given)
    """
    selected_columns = df[attribs]
    return selected_columns.values

In [144]:
def prepare_inputs(X_old, X_new):
    """
    This function label encodes each column of the new instance, using an encoder fitted on the matching
    column of the original data, and collects the encoded columns in a list.
    
    X_old: Array of original data (2-D, one column per feature)
    X_new: Array of new instance(s) with the same columns as X_old
    
    returns: List with one encoded array per feature column of X_new
    """
    X_new_enc = list()
    #label encode each column
    for i in range(X_old.shape[1]):
        le = LabelEncoder()
        #Fit on the original data so the new instance receives the codes of the original data
        le.fit(X_old[:, i])
        #Only the new instance is encoded; the encoded original data was never returned, so transforming
        #the full original column on every call was wasted work
        X_new_enc.append(le.transform(X_new[:, i]))
    return X_new_enc

In [119]:
#Columns fed to the deep learning model, in the order used when building its inputs
features = [
    "primary_type", "location_description", "domestic", "district_name",
    "community_name", "Month", "Hour",
]

def chicago_dl_crime_prediction(model, df_orig, features = features):
    """
    This is the main function for turning a user's input into a prediction from the deep learning model. It prompts
    for the crime details, cleans them, label encodes them against the past data and prints the predicted chance of
    an arrest as a percentage.
    
    model: Deep learning model capable of giving a predicted probability
    df_orig: Dataframe of past instances
    features: Features to use for the model
    
    returns: None (the result is printed)
    """
    #Collect the crime details from the user and run them through the cleaner
    crime_df = user_input()
    cd.chicago_data_cleaner(crime_df, verbose=False)
    #Pull the feature columns out of both the past data and the new crime
    history_array = data_to_array(df_orig, features)
    crime_array = data_to_array(crime_df, features)
    #Encode the new crime with encoders fitted on the past data
    encoded_inputs = prepare_inputs(history_array, crime_array)
    #Get the prediction and flatten it to one value per instance
    raw_pred = model.predict(encoded_inputs)
    first_pred = raw_pred.reshape(raw_pred.shape[0])[0]
    #Convert the prediction to a percentage and report it
    arrest_pct = np.round(first_pred, 4) * 100
    print(f"\nThe deep learning model predicts a {arrest_pct:.2f}% chance of making an arrest")


# Examples: New Instances

### Domestic Effect on Homicide using Traditional Model

There is a common belief that a spouse, family member or loved one is the primary culprit in a homicide. Let's see an example of this.

In [110]:
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: yes
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 20:30:00

The traditional model predicts a 83.96% chance of making an arrest


In [145]:
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 20:30:00

The traditional model predicts a 49.16% chance of making an arrest


### Domestic Effect on Homicide using Deep Learning Model

In [146]:
chicago_dl_crime_prediction(model, chicago, features = features)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: yes
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 20:30:00

The deep learning model predicts a 98.87% chance of making an arrest


In [113]:
chicago_dl_crime_prediction(model, chicago, features = features)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 20:30:00

The deep learning model predicts a 46.03% chance of making an arrest


### Prediction Based on Highest Arrest Percentage

Let's take a look at an extreme example that uses the crime details corresponding to the variables with the highest value of arrest percentage. We'll substitute "gambling" for "public indecency" since it has a larger count of occurrences. We can use the graphs from Data Visualization to determine the artificial example. 

In [132]:
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: west garfield park
Enter the crime committed: gambling
Enter the crime's location (street, residence, etc.): police
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 19:00:00

The model predicts a 99.62% chance of making an arrest


In [114]:
chicago_dl_crime_prediction(model, chicago, features = features)

Enter the Community Name: west garfield park
Enter the crime committed: gambling
Enter the crime's location (street, residence, etc.): police
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 02/16/2022
Time of Crime (hh:mm:ss): 19:00:00

The deep learning model predicts a 99.99% chance of making an arrest


Therefore, both models show that if you are gambling in the police station in West Garfield Park at 7pm on February 16th, 2022, you are almost certainly going to be arrested. 

### Prediction Based on Lowest Arrest Percentage

Similar to above, let's make an extreme observation based on the variable values that correspond to the lowest arrest percentage. 

In [133]:
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: forest glen
Enter the crime committed: burglary
Enter the crime's location (street, residence, etc.): construction site
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 12/16/2022
Time of Crime (hh:mm:ss): 05:00:00

The model predicts a 2.99% chance of making an arrest


In [116]:
chicago_dl_crime_prediction(model, chicago, features = features)

Enter the Community Name: forest glen
Enter the crime committed: burglary
Enter the crime's location (street, residence, etc.): construction site
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 12/16/2022
Time of Crime (hh:mm:ss): 05:00:00

The deep learning model predicts a 6.31% chance of making an arrest


Thus, if you enter into a construction site illegally in Forest Glen at 5am on December 16th, 2022 you most likely will get away with it. Construction site owners have been warned!

# Random Crime Instances

To get more examples, we'll randomly generate crimes and take a look at their prediction. 

In [121]:
def gen_random_crime(df):
    """
    This function generates a random crime instance by drawing each feature independently from the matching column
    of the dataframe provided. It prints the drawn details and returns the new instance as a dataframe.
    
    df: Dataframe of original data; must contain the columns community_name, primary_type, location_description,
        domestic and date
    
    returns: Dataframe with a single row (index 1) holding the new instance
    """
    #Randomly select values from each feature of the dataframe that was passed in (not a global one).
    #The draw order is kept fixed so that a given np.random.seed reproduces the same crime.
    community = np.random.choice(df["community_name"])
    crime = np.random.choice(df["primary_type"])
    location = np.random.choice(df["location_description"])
    domestic = np.random.choice(df["domestic"])
    date = np.random.choice(df["date"])
    #Look the district up once and reuse it for both the printout and the dataframe
    district = get_district(community)
    
    print("Crime Details\n")
    print(f"Date: {date}")
    print(f"Community Name: {community}")
    print(f"District: {district}")
    print(f"Crime committed: {crime}")
    print(f"Location: {location}")
    print(f"Domesic?: {domestic}")
    
    #Combine values into a dataframe
    random_crime_df = pd.DataFrame({"date": date, "primary_type": crime, "location_description": location, 
                                    "domestic": domestic,  "community_name": community,  
                                    "district_name": district}, index = [1])
    return random_crime_df


In [129]:
def random_chicago_crime_prediction(df, model):
    """
    This function generates a random crime, prepares it in the same way as a manually entered one and prints the
    traditional model's predicted chance of an arrest as a percentage.
    
    df: Dataframe of past instances
    model: Model capable of giving a predicted probability
    
    returns: None (the result is printed)
    """
    #Generate the random crime and run it through the cleaner
    crime_df = gen_random_crime(df)
    cd.chicago_data_cleaner(crime_df, verbose=False)
    #One-hot encode the new crime using the unique values of each column of the past data
    encoded_crime = prepare_new_instance(crime_df, unique_column_values(df))
    #Predicted probabilities for the single new instance
    probabilities = make_prediction(encoded_crime, model, probability=True)
    #Second entry of the first row, presumably the probability of the arrest class, as a percentage
    arrest_pct = np.round(probabilities[0][1], 4) * 100
    print(f"\nThe traditional model predicts a {arrest_pct:.2f}% chance of making an arrest")

In [124]:
#Columns fed to the deep learning model, in the order used when building its inputs
features = [
    "primary_type", "location_description", "domestic", "district_name",
    "community_name", "Month", "Hour",
]

def random_chicago_dl_crime_prediction(model, df_orig, features = features):
    """
    This function generates a random crime, prepares it in the same way as a manually entered one and prints the
    deep learning model's predicted chance of an arrest as a percentage.

    model: Deep learning model capable of giving a predicted probability
    df_orig: Dataframe of past instances
    features: Features to use for the model
    
    returns: None (the result is printed)
    """
    #Generate the random crime and run it through the cleaner
    crime_df = gen_random_crime(df_orig)
    cd.chicago_data_cleaner(crime_df, verbose=False)
    #Pull the feature columns out of both the past data and the new crime
    history_array = data_to_array(df_orig, features)
    crime_array = data_to_array(crime_df, features)
    #Encode the new crime with encoders fitted on the past data
    encoded_inputs = prepare_inputs(history_array, crime_array)
    #Get the prediction and flatten it to one value per instance
    raw_pred = model.predict(encoded_inputs)
    first_pred = raw_pred.reshape(raw_pred.shape[0])[0]
    #Convert the prediction to a percentage and report it
    arrest_pct = np.round(first_pred, 4) * 100
    print(f"\nThe deep learning model predicts a {arrest_pct:.2f}% chance of making an arrest")


### Random Instance #1

In [130]:
#Set random seed for reproducibility
np.random.seed(28)
random_chicago_crime_prediction(chicago, best_model)

Crime Details

Date: 2019-06-18 13:00:00+00:00
Community Name: ROSELAND
District: Far Southeast
Crime committed: THEFT
Location: RESIDENCE
Domesic?: False

The traditional model predicts a 2.07% chance of making an arrest


In [127]:
#Check that we would get the same result entering the information manually for the traditional model
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: roseland
Enter the crime committed: theft
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 06/18/2019
Time of Crime (hh:mm:ss): 13:00:00

The traditional model predicts a 2.07% chance of making an arrest


In [125]:
#Set random seed for reproducibilty
np.random.seed(28)
random_chicago_dl_crime_prediction(model, chicago, features)

Crime Details

Date: 2019-06-18 13:00:00+00:00
Community Name: ROSELAND
District: Far Southeast
Crime committed: THEFT
Location: RESIDENCE
Domesic?: False

The deep learning model predicts a 1.98% chance of making an arrest


In [131]:
#Check that we would get the same result entering the information manually for the deep learning model
chicago_dl_crime_prediction(model, chicago, features)

Enter the Community Name: roseland
Enter the crime committed: theft
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic?: no
Date of Crime (mm/dd/yyy): 06/18/2019
Time of Crime (hh:mm:ss): 13:00:00

The deep learning model predicts a 1.98% chance of making an arrest


### Random Instance #2

In [133]:
#Set random seed for reproducibility
np.random.seed(32)
random_chicago_crime_prediction(chicago, best_model)

Crime Details

Date: 2018-02-01 10:30:00+00:00
Community Name: SOUTH CHICAGO
District: Far Southeast
Crime committed: BATTERY
Location: RESIDENCE
Domesic?: False

The traditional model predicts a 22.09% chance of making an arrest


In [132]:
#Set random seed for reproducibility
np.random.seed(32)
random_chicago_dl_crime_prediction(model, chicago)

Crime Details

Date: 2018-02-01 10:30:00+00:00
Community Name: SOUTH CHICAGO
District: Far Southeast
Crime committed: BATTERY
Location: RESIDENCE
Domesic?: False

The deep learning model predicts a 22.67% chance of making an arrest


### Random Instance #3

In [134]:
#Set random seed for reproducibility
np.random.seed(84)
random_chicago_crime_prediction(chicago, best_model)

Crime Details

Date: 2012-08-30 21:00:00+00:00
Community Name: ALBANY PARK
District: Far North
Crime committed: OTHER OFFENSE
Location: RESIDENCE
Domesic?: False

The traditional model predicts a 5.00% chance of making an arrest


In [135]:
#Set random seed for reproducibility
np.random.seed(84)
random_chicago_dl_crime_prediction(model, chicago)

Crime Details

Date: 2012-08-30 21:00:00+00:00
Community Name: ALBANY PARK
District: Far North
Crime committed: OTHER OFFENSE
Location: RESIDENCE
Domesic?: False

The deep learning model predicts a 4.80% chance of making an arrest


### Random Instance #4

In [136]:
#Set random seed for reproducibility
np.random.seed(96)
random_chicago_crime_prediction(chicago, best_model)

Crime Details

Date: 2018-10-23 22:28:00+00:00
Community Name: ENGLEWOOD
District: Southwest
Crime committed: OTHER OFFENSE
Location: APARTMENT
Domesic?: False

The traditional model predicts a 10.57% chance of making an arrest


In [137]:
#Set random seed for reproducibility
np.random.seed(96)
random_chicago_dl_crime_prediction(model, chicago)

Crime Details

Date: 2018-10-23 22:28:00+00:00
Community Name: ENGLEWOOD
District: Southwest
Crime committed: OTHER OFFENSE
Location: APARTMENT
Domesic?: False

The deep learning model predicts a 9.86% chance of making an arrest


### Random Instance #5

In [142]:
#Set random seed for reproducibility
np.random.seed(53)
random_chicago_crime_prediction(chicago, best_model)

Crime Details

Date: 2020-07-08 09:25:00+00:00
Community Name: PULLMAN
District: Far Southeast
Crime committed: NARCOTICS
Location: STREET
Domesic?: True

The traditional model predicts a 99.61% chance of making an arrest


In [143]:
#Set random seed for reproducibility
np.random.seed(53)
random_chicago_dl_crime_prediction(model, chicago)

Crime Details

Date: 2020-07-08 09:25:00+00:00
Community Name: PULLMAN
District: Far Southeast
Crime committed: NARCOTICS
Location: STREET
Domesic?: True

The deep learning model predicts a 93.81% chance of making an arrest


# Final Comments

We see that overall both the traditional model and the deep learning model give similar results. 