# Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import london_data_functions as ldf
import warnings
import pickle
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Get the Data

In [2]:
%%time
query = """
            SELECT incident_number, timestamp_of_call, incident_group, property_category, 
                    address_qualifier, borough_name, first_pump_arriving_attendance_time as first_time,
                    num_stations_with_pumps_attending as station_pumps, num_pumps_attending as pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

london = ldf.fetch_london_data(query_string=query, project_id = "gdac-327115", location = "eu")

Wall time: 8.11 s


# Clean the Data

In [3]:
warnings.filterwarnings('ignore')
london = ldf.clean_london(london, verbose = True)

Cleaning London Data Started...

Sucessfully Added Emergency Column!
Sucessfully Added Month Column!
Sucessfully Added Hour Column!
Sucessfully Cleanded Property Category!
Sucessfully Cleaned Address!
Sucessfully Cleaned Borough Names!
Sucessfully Cleaned Station Arriving Time!
Sucessfully Cleaned Number of Stations with Pumps!
Sucessfully Cleaned Number of Pumps Attending!
Sucessfully Dropped Unecessary Columns!

Sucessfully Cleaned London Data!


# Load in Model

In [4]:
model = pickle.load(open("best_model.sav", 'rb'))
model

LogisticRegression(C=0.1, max_iter=10000, random_state=42)

# Create Helper Functions

In [5]:
def str_to_date(date_str, time_str):
    """
    This function takes takes a date and time string and concatenates them together. Finally, it returns a datetime object.
    
    date_str: string containing the date in form mm/dd/yyyy
    time_str: string containing the time in form hh:mm:ss
    
    returns: datetime object 
    """
    crime_time = date_str + " " + time_str
    return datetime.strptime(crime_time, "%m/%d/%Y %H:%M:%S")

In [6]:
def user_input():
    """
    This function asks the user for the new crime details. It then converts the input to the form found in the original
    dataframe. 
    
    returns: DataFrame with the crime details
    """
    property_category = str.upper(input("Enter the property category: "))
    
    address = str.upper(input("Enter the address qualifier: "))
    
    borough = str.upper(input("Enter the borough name: "))
    
    first_time = np.float64((input("Enter the pump arriving time: ")))
    
    station_pumps = np.str(input("Enter the number of stations with pumps attending: "))
    
    pumps_attending = np.str(input("Enter the number of pumps attending: "))
    
    date = input("Date of Crime (mm/dd/yyy): ")
    
    hour = input("Time of Crime (hh:mm:ss): ")
    
    date_time = str_to_date(date, hour)
    
    new_crime_df = pd.DataFrame({"timestamp_of_call": date_time, "property_category": property_category, 
                                 "address_qualifier": address, "borough_name": borough,  
                                 "first_time": first_time, "station_pumps": station_pumps,
                                 "pumps_attending": pumps_attending}, index = [1])
    
    return new_crime_df

In [7]:
def clean_new_instance(df):
    """
    This function cleans the new instance. It uses functions from the london data functions module.
    
    params:
        df - dataframe with necessary columns
    """
    #Merge the property categories if necessary
    ldf.merge_property(df)
    #Merge the property categories if necessary
    ldf.merge_address(df)
    #Create the column for month
    ldf.create_month(df)
    #Create the column for hour
    ldf.create_hour(df)

In [8]:
def unique_column_values(df, cat_vars):
    """
    This function creates lists containing the unique values for each variable in the london data set. It returns a list of
    lists for each variable
    
    params:
        df - dataframe 
    
    returns:
        list of list with unique values for each variable
    """
    feature_list = list()
    for feature in df[cat_vars].columns:
        feature_list.append([str.upper(element) for element in df[feature].unique()])
    return feature_list

In [9]:
def prepare_new_instance(df, cat_vars, num_vars, unique_vals_per_attrib):
    """
    This function prepares a new crime instance by one hot encoding the variables. It specifies the categories from the 
    original data set.
    
    params:
        df - dataframe containing the new crime instance
        unique_vals_per_attrib: List of list containing the unique values for each variable
    
    returns: X (sparse matrix)
    """
    
    df.drop("timestamp_of_call", axis = 1, inplace = False)
 
    full_pipeline = ColumnTransformer([
        ("num", StandardScaler(), num_vars),
        ("cat", OneHotEncoder(categories=unique_vals_per_attrib), cat_vars),
    ])
    
    X = full_pipeline.fit_transform(df)
    return X
    

In [10]:
def make_prediction(X, model, probability = True):
    """
    This function makes the prediction on the new crime instance. It can return either the predicted probability or class. 
    
    X (Sparse matrix)
    model: Model capable of making returning a predicted probability or class
    probability: Specifies if the function should return a predicted probability or class
    """
    if probability:
        return model.predict_proba(X)
    else:
        return model.predict(X)

In [11]:
def london_emergency_prediction(df, model):
    """
    This is the main function for turning a user's input into a predicted response. It contains functions that get the input, 
    clean it, transform it to the correct form for the model and finally outputs a predicted probability of success. 
    
    params:
        df - dataframe of past instances
        model - model capable of giving a predicted probability
    
    """
    #Enter the new incident
    new_incident = user_input()
    #Clean the new incident
    clean_new_instance(new_incident)
    #Create two list with the categorical and numerical features to be used.
    cat_vars = ["property_category", "address_qualifier", "borough_name", "station_pumps", "pumps_attending", "Month", "Hour"]
    num_vars = ["first_time"]
    #Generate the unique values per column
    unique_vals = unique_column_values(df, cat_vars=cat_vars)
    #Prepare the new instance by one-hot encoding the features
    prepared_emergency = prepare_new_instance(new_incident, cat_vars=cat_vars, num_vars=num_vars, unique_vals_per_attrib=unique_vals)
    #Get the predicted probability
    emergency_prediction = make_prediction(prepared_emergency, model, probability=True)
    #Change the prediction to a percentage
    chance = np.round(emergency_prediction[0][1], 4) * 100
    #Print the chance of the emergency being real
    print(f"\nThe model predicts a {chance:.2f}% chance that the incident is a real emergency")

# Example: New Instances

In [130]:
london_emergency_prediction(london, model = model)

Enter the property category: residential
Enter the address qualifier: within same building
Enter the borough name: hounslow
Enter the pump arriving time: 2
Enter the number of stations with pumps attending: 1.0
Enter the number of pumps attending: 1.0
Date of Crime (mm/dd/yyy): 03/03/2022
Time of Crime (hh:mm:ss): 18:30:30

The model predicts a 89.73% chance that the incident is a real emergency


# Random Emergency Instances

In [24]:
def gen_random_emergency(df):
    """
    This function generates a random crime instance and returns the new instance as a dataframe.
    
    df: Dataframe of original data
    
    returns: Dataframe of new instance
    """
    #Randomly select values from each feature
    date = np.random.choice(df["timestamp_of_call"])
    property_cat = str.upper(np.random.choice(df["property_category"]))
    address = str.upper(np.random.choice(df["address_qualifier"]))
    borough = str.upper(np.random.choice(df["borough_name"]))
    first_time = np.random.choice(df["first_time"])
    station_pumps = np.random.choice(df["station_pumps"])
    pumps_attending = np.random.choice(df["pumps_attending"])
    
    print(f"Emergency Details\n")
    print(f"Date: {date}")
    print(f"Property Category: {property_cat}")
    print(f"Address Qualifier: {address}")
    print(f"Borough: {borough}")
    print(f"Arrival Time: {first_time}")
    print(f"Number of Stations with Pumps: {station_pumps}")
    print(f"Number Pumps: {pumps_attending}")
    
    #Combine values into a dataframe
    random_crime_df = pd.DataFrame({"timestamp_of_call": date, "property_category": property_cat, 
                                                   "address_qualifier": address, "borough_name": borough,  
                                                   "first_time": first_time, "station_pumps": station_pumps, 
                                                   "pumps_attending": pumps_attending}, index = [1])
    return random_crime_df

In [28]:
def random_london_emergency_prediction(df, model):
    """
    This is the main function for turning a user's input into a predicted response. It contains functions that get the input, 
    clean it, transform it to the correct form for the model and finally outputs a predicted probability of success. 
    
    params:
        df - dataframe of past instances
        model - model capable of giving a predicted probability
    
    """
    #Enter the new incident
    new_incident = gen_random_emergency(df)
    #Clean the new incident
    clean_new_instance(new_incident)
    #Create two list with the categorical and numerical features to be used.
    cat_vars = ["property_category", "address_qualifier", "borough_name", "station_pumps", "pumps_attending", "Month", "Hour"]
    num_vars = ["first_time"]
    #Generate the unique values per column
    unique_vals = unique_column_values(df, cat_vars=cat_vars)
    #Prepare the new instance by one-hot encoding the features
    prepared_emergency = prepare_new_instance(new_incident, cat_vars=cat_vars, num_vars=num_vars, unique_vals_per_attrib=unique_vals)
    #Get the predicted probability
    emergency_prediction = make_prediction(prepared_emergency, model, probability=True)
    #Change the prediction to a percentage
    chance = np.round(emergency_prediction[0][1], 4) * 100
    #Print the chance of the emergency being real
    print(f"\nThe model predicts a {chance:.2f}% chance that the incident is a real emergency")

## Random Instance #1

In [30]:
np.random.seed(40)
random_london_emergency_prediction(london, model = model)

Emergency Details

Date: 2017-01-07 00:47:48+00:00
Property Category: RESIDENTIAL
Address Qualifier: CORRECT INCIDENT LOCATION
Borough: SUTTON
Arrival Time: 399.0
Number of Stations with Pumps: 1.0
Number Pumps: 2.0

The model predicts a 37.55% chance that the incident is a real emergency


## Random Instance #2

In [32]:
np.random.seed(41)
random_london_emergency_prediction(london, model = model)

Emergency Details

Date: 2017-02-22 05:42:14+00:00
Property Category: NON RESIDENTIAL
Address Qualifier: GAZETTER
Borough: EALING
Arrival Time: 427.0
Number of Stations with Pumps: 1.0
Number Pumps: 1.0

The model predicts a 40.98% chance that the incident is a real emergency


## Random Instance #3

In [33]:
np.random.seed(42)
random_london_emergency_prediction(london, model = model)

Emergency Details

Date: 2017-04-19 08:49:48+00:00
Property Category: OUTDOOR
Address Qualifier: WITHIN SAME BUILDING
Borough: KINGSTON UPON THAMES
Arrival Time: 294.0
Number of Stations with Pumps: 2.0
Number Pumps: 2.0

The model predicts a 78.86% chance that the incident is a real emergency


## Random Instance #4

In [34]:
np.random.seed(44)
random_london_emergency_prediction(london, model = model)

Emergency Details

Date: 2017-02-16 11:49:56+00:00
Property Category: RESIDENTIAL
Address Qualifier: CORRECT INCIDENT LOCATION
Borough: WALTHAM FOREST
Arrival Time: 343.0
Number of Stations with Pumps: 1.0
Number Pumps: 1.0

The model predicts a 76.58% chance that the incident is a real emergency


## Random Instance #5

In [35]:
np.random.seed(45)
random_london_emergency_prediction(london, model = model)

Emergency Details

Date: 2017-01-05 23:00:08+00:00
Property Category: NON RESIDENTIAL
Address Qualifier: GAZETTER
Borough: LEWISHAM
Arrival Time: 322.6451204055767
Number of Stations with Pumps: 1.0
Number Pumps: 1.0

The model predicts a 31.43% chance that the incident is a real emergency
