# Setup and Imports

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default plotting theme. The call parentheses are required:
# a bare `sns.set` only references the function and changes nothing.
sns.set()
import warnings
from pandas.io import gbq
from sklearn.model_selection import train_test_split

# Get the Data

In [81]:
%%time
# SQL sent to BigQuery: selects 16 incident-level columns (identifiers, call
# timestamp, incident/property descriptors, location fields, first/second pump
# attendance details and pump counts) from the `LondonFire.fire_brigade` table.
query = """
            SELECT incident_number, timestamp_of_call, incident_group, stop_code_description, property_category, 
                    property_type, address_qualifier, postcode_district, borough_name, ward_name, 
                    first_pump_arriving_attendance_time, first_pump_arriving_deployed_from_station,
                    second_pump_arriving_attendance_time, second_pump_arriving_deployed_from_station, 
                    num_stations_with_pumps_attending, num_pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

# Run the query against the "gdac-327115" project in the EU location and load
# the result into the `london` DataFrame used by the rest of the notebook.
# NOTE(review): this needs BigQuery credentials for that project — confirm how
# they are supplied before a Restart & Run All.
london = gbq.read_gbq(query, project_id = "gdac-327115", location="eu")

Wall time: 8.37 s


In [7]:
#Check the shape
london.shape

(32247, 16)

In [6]:
#View the first few rows
london.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,second_pump_arriving_attendance_time,second_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending
0,000320-01012017,2017-01-01 15:30:22+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Within same building,IG9,NOT GEO-CODED,Not geo-coded,,,,,2,2
1,003727-09012017,2017-01-09 10:41:54+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
2,003762-09012017,2017-01-09 12:28:39+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
3,006403-16012017,2017-01-16 04:02:14+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
4,010947-26012017,2017-01-26 11:14:03+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG10,NOT GEO-CODED,Not geo-coded,,,,,1,1


In [82]:
london.dtypes

incident_number                                            object
timestamp_of_call                             datetime64[ns, UTC]
incident_group                                             object
stop_code_description                                      object
property_category                                          object
property_type                                              object
address_qualifier                                          object
postcode_district                                          object
borough_name                                               object
ward_name                                                  object
first_pump_arriving_attendance_time                         Int64
first_pump_arriving_deployed_from_station                  object
second_pump_arriving_attendance_time                        Int64
second_pump_arriving_deployed_from_station                 object
num_stations_with_pumps_attending                           Int64
num_pumps_

# Split the Data into Training and Test Sets

Since we are going to be predicting whether a call is a false alarm, we'll want our test observations to be the newest in the data set.

In [15]:
london["timestamp_of_call"].dt.year.value_counts()

2017    32247
Name: timestamp_of_call, dtype: int64

We see that all of our observations are from the same year, therefore we can just randomly split the data. However, let's first check the proportion of calls that ended up being a false alarm.  

In [18]:
london["incident_group"].value_counts()

False Alarm        15732
Special Service    10081
Fire                6434
Name: incident_group, dtype: int64

Not all incidents were either a Fire or a False Alarm; there is a third category, Special Service. Since the entire purpose of this project is to reduce the number of false alarms that the department wastes its resources on, we'll combine Special Service and Fire into one category called Emergency. We could have dropped all Special Service incidents; however, the fire department still has to respond to those calls, so it's important to include them.

In [83]:
# Incident groups that represent a genuine call-out; both are folded into a
# single "Emergency" class so the target becomes Emergency vs. False Alarm.
emergency = ["Special Service", "Fire"]

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on `london["incident_group"]`: the in-place form
# operates on a selected column (chained assignment), which emits a warning in
# pandas 2.x and does not modify `london` once Copy-on-Write is enabled.
london["incident_group"] = london["incident_group"].replace(emergency, "Emergency")

In [84]:
london["incident_group"].value_counts() / london.shape[0]

Emergency      0.512141
False Alarm    0.487859
Name: incident_group, dtype: float64

In [85]:
# Stratified 85/15 split on the target so both sets keep the same
# Emergency / False Alarm proportions; random_state fixes the shuffle.
# Each split is copied explicitly: the frames returned by train_test_split are
# flagged by pandas as slices of `london`, so adding or overwriting columns on
# them later would otherwise raise SettingWithCopyWarning.
london_train, london_test = (
    split.copy()
    for split in train_test_split(
        london,
        test_size=.15,
        stratify=london["incident_group"],
        random_state=42,
    )
)

In [86]:
# Report the (rows, columns) shape of each split, training set first.
for split_frame in (london_train, london_test):
    print(split_frame.shape)

(27409, 16)
(4838, 16)


# Clean the Data

In [89]:
# Change the object (string) columns to the categorical dtype.
# `astype` with a column -> dtype mapping returns a new frame whose columns
# really carry the new dtype. The previous `london_train.loc[:, mask] = ...`
# form writes values into the existing object columns in recent pandas
# versions, so the dtype silently stayed `object`.
object_cols = london_train.select_dtypes(["object"]).columns
london_train = london_train.astype({col: "category" for col in object_cols})

In [90]:
london_train.dtypes

incident_number                                          category
timestamp_of_call                             datetime64[ns, UTC]
incident_group                                           category
stop_code_description                                    category
property_category                                        category
property_type                                            category
address_qualifier                                        category
postcode_district                                        category
borough_name                                             category
ward_name                                                category
first_pump_arriving_attendance_time                         Int64
first_pump_arriving_deployed_from_station                category
second_pump_arriving_attendance_time                        Int64
second_pump_arriving_deployed_from_station               category
num_stations_with_pumps_attending                           Int64
num_pumps_

### Check for missing values

In [91]:
london_train.isnull().sum()

incident_number                                   0
timestamp_of_call                                 0
incident_group                                    0
stop_code_description                             0
property_category                                 0
property_type                                     0
address_qualifier                                 0
postcode_district                                 0
borough_name                                      0
ward_name                                         0
first_pump_arriving_attendance_time            1542
first_pump_arriving_deployed_from_station      1542
second_pump_arriving_attendance_time          17230
second_pump_arriving_deployed_from_station    17230
num_stations_with_pumps_attending                56
num_pumps_attending                              56
dtype: int64

Looks like we won't be able to use "second_pump_arriving_attendance_time" and "second_pump_arriving_deployed_from_station", since roughly 63% of their values (17,230 of 27,409) are missing in the training set.

### Incident Number

In [92]:
#Number of observations
london_train.shape[0]

27409

In [93]:
#Number of incidents
len(london_train["incident_number"].unique())

27409

As we'd hope, each observation in our data is a different response and there are no duplicates.

### Time

In [73]:
london_train["timestamp_of_call"].head()

25119   2017-02-25 21:25:09+00:00
9509    2017-03-12 01:28:04+00:00
14240   2017-04-06 17:03:02+00:00
18540   2017-03-21 10:43:54+00:00
24730   2017-01-06 15:10:11+00:00
Name: timestamp_of_call, dtype: datetime64[ns, UTC]

### Create a Month variable

In [94]:
%%capture --no-stdout
# Add a "Month" column holding the month number of each call's timestamp.
# NOTE(review): the %%capture magic above swallows stderr and rich output for
# this cell — presumably to hide a pandas SettingWithCopyWarning raised when
# assigning to the training split; confirm and fix at the source if so.
london_train["Month"] = london_train["timestamp_of_call"].dt.month

In [95]:
# Calls per month as a two-column table (Month, Count), ordered by month.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result does not depend on what value_counts().reset_index() calls them
# ("index"/"Month" before pandas 2.0, "Month"/"count" from 2.0 on, where the
# old rename mapping ends in a KeyError on sort).
(
    london_train["Month"]
    .value_counts()
    .rename_axis("Month")
    .reset_index(name="Count")
    .sort_values(by="Month")
)

Unnamed: 0,Month,Count
1,1,6982
3,2,6217
2,3,6779
0,4,7431


### Create an Hour variable

In [96]:
%%capture --no-stdout
# Add an "Hour" column holding the hour of day of each call's timestamp.
# NOTE(review): the %%capture magic above swallows stderr and rich output for
# this cell — presumably to hide a pandas SettingWithCopyWarning raised when
# assigning to the training split; confirm and fix at the source if so.
london_train["Hour"] = london_train["timestamp_of_call"].dt.hour

In [97]:
# Calls per hour of day as a two-column table (Hour, Count), ordered by hour.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result does not depend on what value_counts().reset_index() calls them
# ("index"/"Hour" before pandas 2.0, "Hour"/"count" from 2.0 on, where the
# old rename mapping ends in a KeyError on sort).
(
    london_train["Hour"]
    .value_counts()
    .rename_axis("Hour")
    .reset_index(name="Count")
    .sort_values(by="Hour")
)

Unnamed: 0,Hour,Count
16,0,830
18,1,744
20,2,550
21,3,482
23,4,423
22,5,434
19,6,552
17,7,745
14,8,1014
12,9,1169


### Drop the timestamp variable

In [62]:
#london_train.drop("timestamp_of_call", axis = 1, inplace = True)

### Stop Code Description

In [98]:
london_train["stop_code_description"].value_counts()

AFA                                      10017
Special Service                           8580
Primary Fire                              3031
False alarm - Good intent                 2977
Secondary Fire                            2407
False alarm - Malicious                    378
Chimney Fire                                11
Flood call attended - Batch mobilised        7
Late Call                                    1
Name: stop_code_description, dtype: int64

In [60]:
london_train.head()

Unnamed: 0,incident_number,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,second_pump_arriving_attendance_time,second_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending,Month,Hour
25119,023952-25022017,Emergency,Special Service,Road Vehicle,Car,In street outside gazetteer location,IG1,REDBRIDGE,MAYFIELD,296,Ilford,,,1,1,2,21
9509,029837-12032017,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,SW1P,WESTMINSTER,ST. JAMES'S,276,Lambeth,518.0,Lambeth,1,2,3,1
14240,041522-06042017,Emergency,Primary Fire,Dwelling,House - single occupancy,Within same building,E16,NEWHAM,CUSTOM HOUSE,335,East Ham,575.0,Poplar,2,2,4,17
18540,033936-21032017,Emergency,Special Service,Non Residential,Single shop,Correct incident location,HA8,BARNET,BURNT OAK,334,Mill Hill,,,1,1,3,10
24730,002591-06012017,Emergency,Special Service,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,E15,NEWHAM,FOREST GATE SOUTH,99,Stratford,,,1,1,1,15


### Property Category

In [99]:
london_train["property_category"].value_counts()

Dwelling             13048
Non Residential       6519
Road Vehicle          2369
Outdoor               2196
Outdoor Structure     1684
Other Residential     1537
Aircraft                21
Boat                    20
Rail Vehicle            15
Name: property_category, dtype: int64

In [102]:
# Merge the two residential property categories into a single "Residential" level.
res = ["Dwelling", "Other Residential"]
# Assign the result back to the column: replace(..., inplace=True) on a
# selected column is chained assignment, which warns in pandas 2.x and leaves
# `london_train` unchanged once Copy-on-Write is enabled.
london_train["property_category"] = london_train["property_category"].replace(res, "Residential")

In [105]:
# Merge every vehicle-type property category into a single "Vehicle" level.
vehicle = ["Road Vehicle", "Aircraft", "Boat", "Rail Vehicle"]
# Assign the result back to the column: replace(..., inplace=True) on a
# selected column is chained assignment, which warns in pandas 2.x and leaves
# `london_train` unchanged once Copy-on-Write is enabled.
london_train["property_category"] = london_train["property_category"].replace(vehicle, "Vehicle")

In [109]:
# Fold "Outdoor Structure" into the existing "Outdoor" category. The result is
# assigned back to the column because replace(..., inplace=True) on a selected
# column is chained assignment, which warns in pandas 2.x and leaves
# `london_train` unchanged once Copy-on-Write is enabled.
london_train["property_category"] = london_train["property_category"].replace("Outdoor Structure", "Outdoor")

In [110]:
london_train["property_category"].value_counts()

Residential        14585
Non Residential     6519
Outdoor             3880
Vehicle             2425
Name: property_category, dtype: int64

In [100]:
london_train["property_type"].value_counts()

Purpose Built Flats/Maisonettes - 4 to 9 storeys      3266
House - single occupancy                              3234
Purpose Built Flats/Maisonettes - Up to 3 storeys     2486
Car                                                   1403
Self contained Sheltered Housing                      1224
                                                      ... 
Theme Park                                               1
Tennis Courts                                            1
Naval vessel                                             1
Private greenhouse                                       1
Agricultural equipment                                   1
Name: property_type, Length: 256, dtype: int64