# Setup and Imports

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set
import warnings
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.models import Model
from keras.layers import Input
from keras.layers import Embedding
from keras.layers.merge import concatenate
#from keras.utils import plot_model

#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc
#Custom Python Module to prepare new crime instances
import PrepareChicago as pc

# Get the Data

In [2]:
%%time
#Specify input values for fetching the data
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
project_id = "gdac-327115"
excel_file = "ChicagoCommunityAreas.xlsx"

#Fetch the data
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 16s


# Split the Data into Training and Test Sets

In [3]:
chicago_train = chicago.loc[chicago["year"] != 2021]
chicago_test = chicago.loc[chicago["year"] == 2021]

# Clean the Training Data

In [4]:
%%capture --no-stdout
cd.chicago_data_cleaner(chicago_train, verbose=True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Prepare the Data

Since we are focusing on using deep learning techniques, we do more than just one hot encoding the variables. Instead, we'll use embeddings to encode the features.

In [131]:
def data_to_array(df, attribs):
    X = df[attribs].values
    y = df["arrest"].values
    
    return X, y

In [132]:
def prepare_inputs(X_train, X_test):
    """
    This function prepares the input data by ordinal encoding each one and adding it to a list.
    
    X_train: Array of training features
    X_test: Array of test features
    
    returns: Two lists of encoded training and test features
    """
    X_train_enc, X_test_enc = list(), list()
    # label encode each column
    for i in range(X_train.shape[1]):
        le = LabelEncoder()
        le.fit(X_train[:, i])
        # encode
        train_enc = le.transform(X_train[:, i])
        test_enc = le.transform(X_test[:, i])
        # store
        X_train_enc.append(train_enc)
        X_test_enc.append(test_enc)
    return X_train_enc, X_test_enc

In [30]:
def prepare_targets(y_train, y_test):
    """
    This function transforms the target by ordinal encoding the values. 
    
    y_train: Array of training targets
    y_test: Array of test targets
    
    returns: Two lists containing the transformed targets
    """
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [136]:
#Specify the features to use
features = ["primary_type", "location_description", "domestic", "district_name", "community_name", "Month", "Hour"]
#Transform the features and targets into arrays
X, y = data_to_array(chicago_train, features)
#Split the data twice to quickly train preliminary models
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .80, random_state = 42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size =.10, random_state = 42, stratify = y_train)

In [137]:
#Check the shapes
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(506634, 7)
(56293, 7)
(506634,)
(56293,)


In [138]:
#Encode the training and target features
X_train_enc, X_val_enc = prepare_inputs(X_train, X_val)
#Encode the training and validation targets
y_train_enc, y_val_enc = prepare_targets(y_train, y_val)

In [144]:
def input_to_embedding(n_embeddings, training_encodings):
    """
    This function iterates through each variable and constructs and input layer and connects it to an embedding
    layer, and stores both layers in a list. 
        
    n_embeddings: Number of embedding dimensions
    training_encodings: List of training features
    
    returns:
        in_layers: List of input layers
        em_layers: List of embedding layers
    """
    in_layers = list()
    em_layers = list()
    for i in range(len(training_encodings)):
        #Calculate the number of unique inputs
        n_labels = len(np.unique(training_encodings[i]))
        #Define input layer
        in_layer = Input(shape=(1,))
        #Define embedding layer
        em_layer = Embedding(n_labels, n_embeddings)(in_layer)
        #Store layers
        in_layers.append(in_layer)
        em_layers.append(em_layer)
    return in_layers, em_layers


# Baseline Model

Although, no one will agree on what a "baseline" neural network model looks like, we can build a simple two layer model with 10 neurons in each layer using 10 as the dimension of the embeddings. The model will use the binary crossentropy loss with the adam optimizer and run for 10 epochs with a batch size of 32. 

In [189]:
input_layers, embed_layers = input_to_embedding(n_embeddings=10, training_encodings=X_train_enc)
input_ = keras.layers.concatenate(em_layers)
hidden1 = Dense(10, activation='relu')(input_)
hidden2 = Dense(10, activation='relu')(hidden1)
output = Dense(1, activation='sigmoid')(hidden2)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [191]:
%%time
model.fit(X_train_enc, y_train_enc, epochs=10, batch_size=32, verbose=2)

Epoch 1/10
15833/15833 - 20s - loss: 0.3075 - accuracy: 0.8790
Epoch 2/10
15833/15833 - 19s - loss: 0.3073 - accuracy: 0.8789
Epoch 3/10
15833/15833 - 18s - loss: 0.3071 - accuracy: 0.8792
Epoch 4/10
15833/15833 - 17s - loss: 0.3069 - accuracy: 0.8791
Epoch 5/10
15833/15833 - 16s - loss: 0.3067 - accuracy: 0.8790
Epoch 6/10
15833/15833 - 17s - loss: 0.3066 - accuracy: 0.8793
Epoch 7/10
15833/15833 - 16s - loss: 0.3065 - accuracy: 0.8790
Epoch 8/10
15833/15833 - 16s - loss: 0.3064 - accuracy: 0.8793
Epoch 9/10
15833/15833 - 17s - loss: 0.3063 - accuracy: 0.8793
Epoch 10/10
15833/15833 - 16s - loss: 0.3062 - accuracy: 0.8793
Wall time: 2min 52s


<keras.callbacks.History at 0x1e1a285cdf0>

In [192]:
y_pred = model.predict(X_val_enc)
y_pred = np.round(y_pred.reshape((y_pred.shape[0])))
y_val = y_val.astype(np.int)
model_base_acc = np.round(accuracy_score(y_pred, y_val), 4) * 100
model_base_f1 = np.round(f1_score(y_pred, y_val), 4) * 100
print(f"Model Baseline Accuracy Score: {model_base_acc:.2f}%")
print(f"Model Baseline F1-Score: {model_base_f1:.2f}%")

Model Baseline Accuracy Score: 87.64%
Model Baseline F1-Score: 68.99%


# Hyperparameter Tuning 

In [186]:
input_ = keras.layers.concatenate(em_layers)
hidden1 = Dense(10, activation='relu', kernel_initializer='he_normal')(input_)
hidden2 = Dense(10, activation='relu', kernel_initializer='he_normal')(hidden1)
concat = keras.layers.Concatenate()([input_, hidden10])
output = Dense(1, activation='sigmoid')(concat)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [187]:
model.fit(X_train_enc, y_train_enc, epochs=10, batch_size=256, verbose=2)

Epoch 1/10
1980/1980 - 8s - loss: 0.3161 - accuracy: 0.8757
Epoch 2/10
1980/1980 - 4s - loss: 0.3095 - accuracy: 0.8783
Epoch 3/10
1980/1980 - 3s - loss: 0.3086 - accuracy: 0.8787
Epoch 4/10
1980/1980 - 4s - loss: 0.3081 - accuracy: 0.8790
Epoch 5/10
1980/1980 - 4s - loss: 0.3078 - accuracy: 0.8789
Epoch 6/10
1980/1980 - 4s - loss: 0.3074 - accuracy: 0.8792
Epoch 7/10
1980/1980 - 4s - loss: 0.3071 - accuracy: 0.8791
Epoch 8/10
1980/1980 - 3s - loss: 0.3069 - accuracy: 0.8790
Epoch 9/10
1980/1980 - 4s - loss: 0.3067 - accuracy: 0.8791
Epoch 10/10
1980/1980 - 3s - loss: 0.3066 - accuracy: 0.8792


<keras.callbacks.History at 0x1e19a5292b0>

In [188]:
y_pred = model.predict(X_val_enc)
y_pred = np.round(y_pred.reshape((y_pred.shape[0])))
y_val = y_val.astype(np.int)
model_base_acc = np.round(accuracy_score(y_pred, y_val), 4) * 100
model_base_f1 = np.round(f1_score(y_pred, y_val), 4) * 100
print(f"Model Baseline Accuracy Score: {model_base_acc:.2f}%")
print(f"Model Baseline F1-Score: {model_base_f1:.2f}%")

Model Baseline Accuracy Score: 87.62%
Model Baseline F1-Score: 69.07%


In [101]:
Model?

In [86]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(15, activation = "relu", input_shape=in_layers))
model.add(keras.layers.Dense(10, activation = "relu"))
model.add(keras.layers.Dense(1, activation = "sigmoid"))
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

TypeError: Dimension value must be integer or None or have an __index__ method, got value '<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_8')>' with type '<class 'keras.engine.keras_tensor.KerasTensor'>'

In [85]:
in_layers

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_8')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_9')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_10')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_11')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_12')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_13')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_14')>]