# Setup and Imports

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default plot theme. This has to be a call: a bare `sns.set`
# only references the function object and leaves the plotting style unchanged.
sns.set()
import warnings
from pandas.io import gbq
from sklearn.model_selection import train_test_split

# Get the Data

In [5]:
%%time
# Download the incident columns used in this notebook from the BigQuery table
# `gdac-327115.LondonFire.fire_brigade` into the DataFrame `london`.
# The %%time cell magic has to remain the first line of the cell; it prints the
# wall time taken by the whole cell.
# NOTE(review): the query has no ORDER BY, so row order is presumably not
# guaranteed to be the same between runs -- confirm before relying on it.
query = """
            SELECT incident_number, timestamp_of_call, incident_group, stop_code_description, property_category, 
                    property_type, address_qualifier, postcode_district, borough_name, ward_name, 
                    first_pump_arriving_attendance_time, first_pump_arriving_deployed_from_station,
                    second_pump_arriving_attendance_time, second_pump_arriving_deployed_from_station, 
                    num_stations_with_pumps_attending, num_pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

# Run the query under project "gdac-327115" with the job location set to "eu".
london = gbq.read_gbq(query, project_id = "gdac-327115", location="eu")

Wall time: 9.05 s


In [7]:
# Check the shape of the downloaded frame as (rows, columns)
london.shape

(32247, 16)

In [6]:
# Preview the first few rows to sanity-check the columns returned by the query
london.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,second_pump_arriving_attendance_time,second_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending
0,000320-01012017,2017-01-01 15:30:22+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Within same building,IG9,NOT GEO-CODED,Not geo-coded,,,,,2,2
1,003727-09012017,2017-01-09 10:41:54+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
2,003762-09012017,2017-01-09 12:28:39+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
3,006403-16012017,2017-01-16 04:02:14+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
4,010947-26012017,2017-01-26 11:14:03+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG10,NOT GEO-CODED,Not geo-coded,,,,,1,1


In [13]:
# List the dtype pandas assigned to each column
london.dtypes

incident_number                                            object
timestamp_of_call                             datetime64[ns, UTC]
incident_group                                             object
stop_code_description                                      object
property_category                                          object
property_type                                              object
address_qualifier                                          object
postcode_district                                          object
borough_name                                               object
ward_name                                                  object
first_pump_arriving_attendance_time                         Int64
first_pump_arriving_deployed_from_station                  object
second_pump_arriving_attendance_time                        Int64
second_pump_arriving_deployed_from_station                 object
num_stations_with_pumps_attending                           Int64
num_pumps_

# Split the Data into Training and Test Sets

Since we are going to be predicting whether a call is a false alarm, we'll want our test observations to be the newest in the data set.

In [15]:
# Count incidents per calendar year of the call timestamp
london["timestamp_of_call"].dt.year.value_counts()

2017    32247
Name: timestamp_of_call, dtype: int64

We see that all of our observations are from the same year, therefore we can just randomly split the data. However, let's first check the proportion of calls that ended up being a false alarm.  

In [18]:
# Count incidents in each incident_group category
london["incident_group"].value_counts()

False Alarm        15732
Special Service    10081
Fire                6434
Name: incident_group, dtype: int64

Not all incidents were either a Fire or a False Alarm. There is a third category, Special Service, as well. Since the entire purpose of this project is to reduce the number of false alarms that the department wastes its resources on, we'll combine Special Service and Fire into one category called Emergency. We could have dropped all Special Service incidents. However, the fire department has to respond to those types of calls, so it's important to include them.

In [19]:
# Collapse the two genuine-incident classes into a single "Emergency" label so
# the target becomes binary: "Emergency" vs. "False Alarm".
emergency = ["Special Service", "Fire"]

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on london["incident_group"]. The in-place form
# mutates an intermediate Series: newer pandas versions warn about it, and with
# Copy-on-Write enabled it leaves `london` unchanged. Re-running this cell is
# harmless because the original labels are already gone after the first run.
london["incident_group"] = london["incident_group"].replace(emergency, "Emergency")

In [20]:
# Share of all rows that falls into each incident_group class
class_counts = london["incident_group"].value_counts()
class_counts / len(london)

Emergency      0.512141
False Alarm    0.487859
Name: incident_group, dtype: float64

In [29]:
# Hold out 15% of the rows as a test set, stratified on the target so both
# splits keep the same Emergency / False Alarm proportions.
# random_state pins the shuffle; without it every run of this cell produces a
# different split, and results further down the notebook cannot be reproduced.
london_train, london_test = train_test_split(
    london,
    test_size = .15,
    stratify = london["incident_group"],
    random_state = 42,
)

In [30]:
# Show the (rows, columns) shape of the training split, then of the test split
print(london_train.shape, london_test.shape, sep="\n")

(27409, 16)
(4838, 16)


# Clean the Data

### Check for missing values

In [31]:
# Per-column count of missing values in the training split
missing_mask = london_train.isnull()
missing_mask.sum()

incident_number                                   0
timestamp_of_call                                 0
incident_group                                    0
stop_code_description                             0
property_category                                 0
property_type                                     0
address_qualifier                                 0
postcode_district                                 0
borough_name                                      0
ward_name                                         0
first_pump_arriving_attendance_time            1555
first_pump_arriving_deployed_from_station      1555
second_pump_arriving_attendance_time          17196
second_pump_arriving_deployed_from_station    17196
num_stations_with_pumps_attending                58
num_pumps_attending                              58
dtype: int64

Looks like we won't be able to use "second_pump_arriving_attendance_time" and "second_pump_arriving_deployed_from_station" since nearly two-thirds of their values are missing (17,196 of 27,409 training rows).

### Incident Number

In [33]:
# Row count of the training split
len(london_train)

27409

In [32]:
# Count of distinct incident numbers in the training split
distinct_incidents = london_train["incident_number"].unique()
len(distinct_incidents)

27409

As we'd hope, each observation in our data is a different response and there are no duplicates.