# Setup and Imports

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set
import warnings
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.models import Model
from keras.layers import Input
from keras.layers import Embedding
from keras.layers.merge import concatenate
#from keras.utils import plot_model

#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc
#Custom Python Module to prepare new crime instances
import PrepareChicago as pc

# Get the Data

In [2]:
%%time
#Specify input values for fetching the data
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
project_id = "gdac-327115"
excel_file = "ChicagoCommunityAreas.xlsx"

#Fetch the data
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 16s


# Split the Data into Training and Test Sets

In [3]:
chicago_train = chicago.loc[chicago["year"] != 2021]
chicago_test = chicago.loc[chicago["year"] == 2021]

# Clean the Training Data

In [4]:
%%capture --no-stdout
cd.chicago_data_cleaner(chicago_train, verbose=True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Prepare the Data

Since we are focusing on using deep learning techniques, we do more than just one hot encoding the variables. Instead, we'll use embeddings to encode the features.

In [25]:
#List of variables to use in the model
cat_attribs = ["primary_type", "location_description", "domestic", "district_name", "community_name", "Month", "Hour"]

#Prepare the data for modelling
X, y = prepare_chicago_train(df = chicago_train, attribs = cat_attribs.copy())

#Subset the data twice to quickly fit models
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size =.80, random_state = 42, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size =.10, random_state = 42, stratify = y_train)

NameError: name 'prepare_chicago_train' is not defined

In [None]:
#Check the shapes
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

In [9]:
districts = list(chicago_train["district_name"].unique())

In [10]:
districts

['Far North',
 'Southwest',
 'Far Southeast',
 'South',
 'Far Southwest',
 'Northwest',
 'North',
 'Central',
 'West']

In [15]:
indices = tf.range(len(districts), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(districts, indices)
num_oov_buckets = 3
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [17]:
new_categories = tf.constant(["North", "South", "Far Southeast", "Central", "West"])
cat_indices = table.lookup(new_categories)
cat_one_hot = tf.one_hot(cat_indices, depth = len(districts))
cat_one_hot

<tf.Tensor: shape=(5, 9), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>

In [19]:
embedding_dim = 2
embed_init = tf.random.uniform([len(districts), embedding_dim])
embedding_matrix = tf.Variable(embed_init)

In [20]:
embedding_matrix

<tf.Variable 'Variable:0' shape=(9, 2) dtype=float32, numpy=
array([[0.90956664, 0.5041356 ],
       [0.5772327 , 0.05990565],
       [0.3537835 , 0.83260906],
       [0.22642648, 0.39416325],
       [0.1990608 , 0.35851908],
       [0.17701542, 0.8149395 ],
       [0.6873293 , 0.07146227],
       [0.54479957, 0.965935  ],
       [0.01674485, 0.59335685]], dtype=float32)>

In [23]:
embedding = keras.layers.Embedding(input_dim = len(districts), output_dim = embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[ 0.04185473, -0.04508716],
       [-0.02397012,  0.0022164 ],
       [ 0.02695402, -0.02753395],
       [ 0.03512988,  0.00303012],
       [-0.01244684, -0.02148068]], dtype=float32)>

In [29]:
# prepare input data
def prepare_inputs(X_train, X_test):
    X_train_enc, X_test_enc = list(), list()
    # label encode each column
    for i in range(X_train.shape[1]):
        le = LabelEncoder()
        le.fit(X_train[:, i])
        # encode
        train_enc = le.transform(X_train[:, i])
        test_enc = le.transform(X_test[:, i])
        # store
        X_train_enc.append(train_enc)
        X_test_enc.append(test_enc)
    return X_train_enc, X_test_enc

In [30]:
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [31]:
chicago_train.head()

Unnamed: 0,unique_key,date,primary_type,location_description,arrest,domestic,year,district_name,community_name,Month,Hour
0,11228094,2017-11-20 15:00:00+00:00,DECEPTIVE PRACTICE,WATER,False,False,2017,Far North,NORTH PARK,11,15
1,12135292,2020-08-13 12:00:00+00:00,DECEPTIVE PRACTICE,BANK,False,False,2020,Far North,NORTH PARK,8,12
2,8905294,2012-11-27 17:30:00+00:00,PUBLIC PEACE VIOLATION,RESTAURANT,False,False,2012,Far North,NORTH PARK,11,17
3,8805120,2012-09-17 00:30:00+00:00,DECEPTIVE PRACTICE,VEHICLE,False,False,2012,Far North,NORTH PARK,9,0
4,9030552,2013-02-21 00:00:00+00:00,CRIMINAL SEXUAL ASSAULT,RESIDENCE,False,True,2013,Far North,NORTH PARK,2,0


In [33]:
features = ["primary_type", "location_description", "domestic", "district_name", "community_name", "Month", "Hour"]
X = chicago_train[features]
y = chicago_train["arrest"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .80, random_state = 42, stratify=y)

In [34]:
X.shape

(2814636, 7)

In [35]:
y.shape

(2814636,)