# Setup and Imports

In [173]:
import pickle
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc
#Custom Python Module to prepare new crime instances
import PrepareChicago as pc

# Get the Data

In [12]:
%%time
# Specify input values for fetching the data.
# The query keeps only the columns used in this notebook and restricts the
# rows to crimes recorded from 2011 onwards.
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
# Project identifier passed to the fetch helper together with the query.
project_id = "gdac-327115"
# Workbook passed to the fetch helper; per the helper's verbose log it is read
# and joined onto the queried crime records.
excel_file = "ChicagoCommunityAreas.xlsx"

# Fetch the data. `fetch_chicago_data` lives in the project-local
# FetchChicagoData module; verbose=True prints the progress log shown below.
# NOTE(review): the recorded run took over three minutes, so consider caching
# the result locally when iterating on the rest of the notebook.
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 20s


# Clean the Data

In [99]:
%%capture --no-stdout
# Clean the full data set.
# The return value is not captured, so the helper presumably modifies `chicago`
# in place -- TODO confirm against ChicagoDataCleaningFunctions. Later cells read
# the "Month" and "Hour" columns that the verbose log reports being added here.
cd.chicago_data_cleaner(chicago, verbose = True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Load in Production Model

In [104]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load a "best_model.sav" that comes from a trusted source.
# The context manager guarantees the file handle is closed once the model has
# been read (the bare open(...) call left it open).
with open("best_model.sav", "rb") as model_file:
    best_model = pickle.load(model_file)
best_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.45195454681591674,
              enable_categorical=False, gamma=0.546708263364187, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.38768070515882624, max_delta_step=0, max_depth=7,
              min_child_weight=25, missing=nan, monotone_constraints='()',
              n_estimators=195, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.6338249886045665,
              scale_pos_weight=1, subsample=0.7838501639099957,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

# Create Helper Functions

In [27]:
def get_district(community):
    """
    This function returns the district that corresponds to the community area provided.
    The lookup is built from the module-level ``chicago`` DataFrame.
    
    community: community name (must match the values stored in chicago["community_name"])
    
    returns: district name as string
    
    raises: KeyError if the community name does not occur in the data
    """
    # Reduce the frame to one row per community before building the lookup, so
    # the Python-level dict is not built from every crime record on each call.
    # keep="last" mirrors dict(zip(...)), where the last occurrence of a key wins.
    pairs = chicago[["community_name", "district_name"]].drop_duplicates(
        subset="community_name", keep="last"
    )
    district_by_community = dict(zip(pairs["community_name"], pairs["district_name"]))

    try:
        return district_by_community[community]
    except KeyError:
        # Still a KeyError (as before), but with a message the user can act on.
        raise KeyError(
            f"Unknown community name {community!r}; it does not appear in the "
            "community_name column of the Chicago data"
        ) from None

In [192]:
def str_to_date(date_str, time_str):
    """
    This function takes a date string and a time string, joins them and returns the
    corresponding datetime object.
    
    date_str: string containing the date in form mm/dd/yyyy (month first, e.g. 06/21/2022)
    time_str: string containing the time in form hh:mm:ss (hh:mm is also accepted)
    
    returns: datetime object
    
    raises: ValueError if the combined string matches neither accepted format
    """
    # Values typed at a prompt often carry stray leading/trailing whitespace,
    # which strptime would otherwise reject.
    crime_time = date_str.strip() + " " + time_str.strip()

    # Try the full format first so existing inputs parse exactly as before;
    # the seconds-less format is only a fallback.
    for fmt in ("%m/%d/%Y %H:%M:%S", "%m/%d/%Y %H:%M"):
        try:
            return datetime.strptime(crime_time, fmt)
        except ValueError:
            continue

    raise ValueError(
        f"time data {crime_time!r} does not match 'mm/dd/yyyy hh:mm:ss' or 'mm/dd/yyyy hh:mm'"
    )


In [174]:
def user_input():
    """
    This function asks the user for the new crime details. It then converts the input to the form found in the original
    dataframe (upper-cased text, boolean domestic flag, datetime for the date).
    
    Surrounding whitespace is removed from every answer so that a stray space does not break the district lookup or the
    date parsing. Both "yes" and "y" (any casing) count as a domestic crime; every other answer counts as not domestic.
    
    returns: single-row DataFrame (index 1) with the columns date, primary_type, location_description, domestic,
             community_name and district_name
    
    raises: whatever get_district / str_to_date raise for an unknown community or an unparsable date/time
    """
    community_name = input("Enter the Community Name: ").strip().upper()
    # Look the district up immediately so an unknown community fails before the
    # user has to answer the remaining questions.
    district = get_district(community_name)
    
    primary_type = input("Enter the crime committed: ").strip().upper()
    
    location_description = input("Enter the crime's location (street, residence, etc.): ").strip().upper()
    
    domestic = input("Was the crime domestic? (Yes/No): ").strip().upper() in ("YES", "Y")
    
    date = input("Date of Crime (Ex. 01/01/2022): ").strip()
    
    hour = input("Time of Crime: ").strip()
    
    date_time = str_to_date(date, hour)
    
    # All values are scalars, so an explicit one-element index is required.
    new_crime_df = pd.DataFrame({"date": date_time, "primary_type": primary_type,
                                 "location_description": location_description, "domestic": domestic,
                                 "community_name": community_name, "district_name": district}, index = [1])
    
    return new_crime_df

In [193]:
def unique_column_values(df):
    """
    Collect the sorted distinct values of every model input column.
    
    df: Dataframe holding the columns primary_type, location_description, domestic, district_name, community_name,
        Month and Hour
    
    returns: List of lists, one per column in the order given above, each containing that column's distinct values
             in ascending order
    """
    # The order of this tuple defines the order of the returned lists.
    model_columns = (
        "primary_type",
        "location_description",
        "domestic",
        "district_name",
        "community_name",
        "Month",
        "Hour",
    )

    column_values = []
    for column in model_columns:
        distinct = df[column].value_counts().index
        column_values.append(list(np.sort(distinct)))
    return column_values

In [149]:
def prepare_new_instance(df, unique_vals_per_attrib, attribs=None):
    """
    This function prepares a new crime instance by one hot encoding the variables. It specifies the categories from the 
    original data set so the encoded row has one column per known category value.
    
    df: Dataframe containing the new crime instance
    unique_vals_per_attrib: List of lists containing the unique values for each variable, in the same order as attribs
    attribs: optional list of column names to encode; defaults to the column order used by unique_column_values
             (primary_type, location_description, domestic, district_name, community_name, Month, Hour)
    
    returns: X (sparse matrix)
    
    raises: ValueError if the number of category lists does not match the number of columns
    """
    if attribs is None:
        # NOTE(review): this must also be the column order the model was
        # trained with -- confirm against the training notebook.
        attribs = ["primary_type", "location_description", "domestic",
                   "district_name", "community_name", "Month", "Hour"]
    else:
        attribs = list(attribs)

    if len(attribs) != len(unique_vals_per_attrib):
        raise ValueError(
            f"Expected one list of categories per column: got {len(unique_vals_per_attrib)} "
            f"lists for {len(attribs)} columns"
        )

    # Use the categories that were passed in (the previous version read the
    # undefined globals `unique_vals` and `attribs`, which only existed as
    # leftover kernel state).
    cat_encoder = OneHotEncoder(categories=unique_vals_per_attrib)
    X = cat_encoder.fit_transform(df[attribs])
    return X
    

In [150]:
def make_prediction(X, model, probability = True):
    """
    Run the model on a prepared crime instance.
    
    X: prepared instance(s) in the form the model expects (sparse matrix)
    model: object exposing predict_proba and predict
    probability: when truthy the result of model.predict_proba(X) is returned, otherwise the result of model.predict(X)
    
    returns: whatever the selected model method returns for X
    """
    # Pick the bound method first; only the selected attribute is looked up.
    predictor = model.predict_proba if probability else model.predict
    return predictor(X)
    

In [183]:
def chicago_crime_prediction(df, model):
    """
    Main entry point: turn the user's answers into a predicted chance of an arrest.
    
    The steps are: collect the crime details interactively, run them through the project's data cleaner, one hot encode
    them against the category values found in df, ask the model for class probabilities and print the probability of
    the second class as a percentage.
    
    df: Dataframe of past instances (supplies the category values for the encoding)
    model: Model capable of giving a predicted probability
    
    returns: None (the result is printed)
    """
    crime_row = user_input()

    # The cleaner's return value is not used here, so it is presumably applied
    # in place -- TODO confirm against ChicagoDataCleaningFunctions.
    cd.chicago_data_cleaner(crime_row, verbose=False)

    encoded_row = prepare_new_instance(crime_row, unique_column_values(df))
    class_probabilities = make_prediction(encoded_row, model, probability=True)

    # First (only) row, second column; rounded to four decimals before scaling
    # to a percentage.
    chance = np.round(class_probabilities[0][1], 4) * 100
    print(f"\nThe model predicts a {chance:.2f}% chance of making an arrest")

In [190]:
# Interactive run: prompts for one crime and prints the predicted arrest chance.
# Recorded session: domestic homicide in a Woodlawn residence.
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic? (Yes/No): yes
Date of Crime (Ex. 01/01/2022): 06/21/2022
Time of Crime: 15:00:00

The model predicts a 79.61% chance of making an arrest


In [191]:
# Interactive run. Recorded session: the same Woodlawn residence homicide,
# this time answered as not domestic.
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): residence
Was the crime domestic? (Yes/No): no
Date of Crime (Ex. 01/01/2022): 06/21/2022
Time of Crime: 15:00:00

The model predicts a 37.46% chance of making an arrest


In [171]:
# Interactive run. Recorded session: non-domestic homicide on a street in Austin.
# NOTE(review): the raw probability array in the recorded output is not printed by
# the current function body (its debug prints are commented out), so this output
# looks like it came from an earlier version of the function.
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: austin
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): street
Was the crime domestic? (Yes/No): no
Date of Crime (Ex. 01/01/2022): 06/20/2022
Time of Crime: 15:00:00
[[0.6885135 0.3114865]]

The model predicts a 31.15% chance of making an arrest


In [194]:
# Interactive run. Recorded session: domestic homicide at a hotel in Woodlawn.
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): hotel
Was the crime domestic? (Yes/No): yes
Date of Crime (Ex. 01/01/2022): 06/01/2022
Time of Crime: 20:00:00

The model predicts a 93.78% chance of making an arrest


In [197]:
# Interactive run. Recorded session: non-domestic hotel homicide in Woodlawn,
# entered with a non-zero-padded date and time (6/1/2022, 5:00:00).
chicago_crime_prediction(chicago, best_model)

Enter the Community Name: woodlawn
Enter the crime committed: homicide
Enter the crime's location (street, residence, etc.): hotel
Was the crime domestic? (Yes/No): no
Date of Crime (Ex. 01/01/2022): 6/1/2022
Time of Crime: 5:00:00

The model predicts a 47.33% chance of making an arrest


In [30]:
def input_to_category(df):
    """
    This function asks the user for a new crime (via user_input) and returns it as a single-row DataFrame whose
    categorical columns use the categories found in the past data.
    
    user_input is defined twice in this notebook (once returning a list, once returning a DataFrame); both return
    shapes are accepted here so the function works whichever definition is active.
    
    df: Dataframe of past instances; supplies the categories for primary_type, location_description, domestic,
        district_name and community_name
    
    returns: single-row DataFrame (index 1) with the columns date, primary_type, location_description, domestic,
             district_name and community_name
    """
    # Output column order (after "date").
    categorical_cols = ("primary_type", "location_description", "domestic",
                        "district_name", "community_name")

    details = user_input()
    if isinstance(details, pd.DataFrame):
        # DataFrame variant: one row, values keyed by column name.
        row = details.iloc[0]
        date_time = row["date"]
        raw_values = {col: row[col] for col in categorical_cols}
    else:
        # List variant: [community, district, crime, location, domestic, datetime].
        (community_name, district, primary_type,
         location_description, domestic, date_time) = details
        raw_values = {"primary_type": primary_type,
                      "location_description": location_description,
                      "domestic": domestic,
                      "district_name": district,
                      "community_name": community_name}

    crime_instance = {"date": date_time}
    for col in categorical_cols:
        # Categories come from the frame that was passed in (previously the
        # global `chicago` was read and the `df` argument ignored).
        categories = pd.Categorical(df[col]).categories
        # Wrap the scalar in a list: newer pandas releases require list-like
        # input for the Categorical constructor.
        crime_instance[col] = pd.Categorical([raw_values[col]], categories=categories)

    return pd.DataFrame(crime_instance, index=[1])

In [29]:
def user_input():
    """
    This function asks the user for the new crime details. It then converts the input to the form found in the original
    dataframe (upper-cased text, boolean domestic flag, datetime for the date).
    
    Surrounding whitespace is removed from every answer so that a stray space does not break the district lookup or the
    date parsing. Both "yes" and "y" (any casing) count as a domestic crime; every other answer counts as not domestic.
    
    NOTE(review): a function with this same name that returns a DataFrame is defined earlier in this notebook; whichever
    cell runs last replaces the other. input_to_category pops values from this list form, while chicago_crime_prediction
    passes the result to the data cleaner -- consider giving the two variants distinct names.
    
    returns: list [community_name, district, primary_type, location_description, domestic, date_time]
    
    raises: whatever get_district / str_to_date raise for an unknown community or an unparsable date/time
    """
    community_name = input("Enter the Community Name: ").strip().upper()
    # Look the district up immediately so an unknown community fails before the
    # user has to answer the remaining questions.
    district = get_district(community_name)
    
    primary_type = input("Enter the crime committed: ").strip().upper()
    
    location_description = input("Enter the crime's location (street, residence, etc.): ").strip().upper()
    
    domestic = input("Was the crime domestic? (Yes/No): ").strip().upper() in ("YES", "Y")
    
    date = input("Date of Crime (Ex. 01/01/2022): ").strip()
    
    hour = input("Time of Crime: ").strip()
    
    date_time = str_to_date(date, hour)
    
    return [community_name, district, primary_type, location_description, domestic, date_time]