# Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set
import warnings
from pandas.io import gbq
from sklearn.model_selection import train_test_split

# Get the Data

In [2]:
%%time
query = """
            SELECT incident_number, timestamp_of_call, incident_group, stop_code_description, property_category, 
                    property_type, address_qualifier, postcode_district, borough_name, ward_name, 
                    first_pump_arriving_attendance_time, first_pump_arriving_deployed_from_station,
                    second_pump_arriving_attendance_time, second_pump_arriving_deployed_from_station, 
                    num_stations_with_pumps_attending, num_pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

london = gbq.read_gbq(query, project_id = "gdac-327115", location="eu")

Wall time: 8.63 s


In [380]:
#Check the shape: (number of rows, number of columns) returned by the query
london.shape

(32247, 16)

In [381]:
# Preview the first five rows of the raw data
london.head(n=5)

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,second_pump_arriving_attendance_time,second_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending
0,000320-01012017,2017-01-01 15:30:22+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Within same building,IG9,NOT GEO-CODED,Not geo-coded,,,,,2,2
1,003727-09012017,2017-01-09 10:41:54+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
2,003762-09012017,2017-01-09 12:28:39+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
3,006403-16012017,2017-01-16 04:02:14+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,IG7,NOT GEO-CODED,Not geo-coded,,,,,2,2
4,010947-26012017,2017-01-26 11:14:03+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,IG10,NOT GEO-CODED,Not geo-coded,,,,,1,1


In [382]:
# Inspect the dtype of each column
london.dtypes

incident_number                                            object
timestamp_of_call                             datetime64[ns, UTC]
incident_group                                             object
stop_code_description                                      object
property_category                                          object
property_type                                              object
address_qualifier                                          object
postcode_district                                          object
borough_name                                               object
ward_name                                                  object
first_pump_arriving_attendance_time                         Int64
first_pump_arriving_deployed_from_station                  object
second_pump_arriving_attendance_time                        Int64
second_pump_arriving_deployed_from_station                 object
num_stations_with_pumps_attending                           Int64
num_pumps_

In [383]:
# Count the missing values in each column
london.isna().sum()

incident_number                                   0
timestamp_of_call                                 0
incident_group                                    0
stop_code_description                             0
property_category                                 0
property_type                                     0
address_qualifier                                 0
postcode_district                                 0
borough_name                                      0
ward_name                                         0
first_pump_arriving_attendance_time            1819
first_pump_arriving_deployed_from_station      1819
second_pump_arriving_attendance_time          20281
second_pump_arriving_deployed_from_station    20281
num_stations_with_pumps_attending                68
num_pumps_attending                              68
dtype: int64

Since "second_pump_arriving_attendance_time" and "second_pump_arriving_deployed_from_station" have most of their values missing, we'll drop those from the analysis. 

In [3]:
# Drop the two second-pump columns. Rebinding the result (instead of inplace=True)
# together with errors="ignore" keeps this cell idempotent: re-running it after the
# columns are already gone no longer raises a KeyError.
second_pump_cols = ["second_pump_arriving_attendance_time", "second_pump_arriving_deployed_from_station"]
london = london.drop(columns=second_pump_cols, errors="ignore")

# Split the Data into Training and Test Sets

Since we are going to be predicting whether a call is a false alarm, we would ideally want our test observations to be the newest in the data set.

In [385]:
# Tally the calls by the calendar year of the call timestamp
call_years = london["timestamp_of_call"].dt.year
call_years.value_counts()

2017    32247
Name: timestamp_of_call, dtype: int64

We see that all of our observations are from the same year, therefore we can just randomly split the data. However, let's first check the proportion of calls that ended up being a false alarm.  

In [386]:
# Count how many calls fall into each incident_group category
london["incident_group"].value_counts()

False Alarm        15732
Special Service    10081
Fire                6434
Name: incident_group, dtype: int64

Not all incidents were either a Fire or a False Alarm. There is a third category, Special Service, as well. Since the entire purpose of this project is to reduce the number of false alarms that the department wastes its resources on, we'll combine Special Service and Fire into one category called Emergency. We could have dropped all Special Service incidents. However, the fire department has to respond to those types of calls, so it's important to include them.

In [4]:
emergency = ["Special Service", "Fire"]

# Assign the replaced values back to the column. Calling replace(..., inplace=True) on
# the `london["incident_group"]` selection is chained assignment: it mutates an
# intermediate Series and is not guaranteed to update `london` (it does not under
# pandas Copy-on-Write).
london["incident_group"] = london["incident_group"].replace(emergency, "Emergency")

In [5]:
# Share of all rows that fall in each incident_group class
group_counts = london["incident_group"].value_counts()
group_counts.div(len(london))

Emergency      0.512141
False Alarm    0.487859
Name: incident_group, dtype: float64

In [29]:
# Hold out 15% of the rows as a test set, stratified on the label so both
# partitions keep the same Emergency / False Alarm mix; seeded for reproducibility.
london_train, london_test = train_test_split(
    london,
    test_size=0.15,
    random_state=42,
    stratify=london["incident_group"],
)

In [7]:
# Show the (rows, columns) of the training and test partitions, one per line
print(london_train.shape, london_test.shape, sep="\n")

(27409, 14)
(4838, 14)


In [8]:
# Rebind to the frame returned by reset_index instead of resetting in place.
# The in-place form keeps the very object produced by train_test_split, which pandas
# still tracks as a slice of `london`; assigning new columns to it then raises
# SettingWithCopyWarning (presumably what the later `%%capture` cells hide - TODO confirm).
london_train = london_train.reset_index(drop=True)
london_train.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending
0,023952-25022017,2017-02-25 21:25:09+00:00,Emergency,Special Service,Road Vehicle,Car,In street outside gazetteer location,IG1,REDBRIDGE,MAYFIELD,296,Ilford,1,1
1,029837-12032017,2017-03-12 01:28:04+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,SW1P,WESTMINSTER,ST. JAMES'S,276,Lambeth,1,2
2,041522-06042017,2017-04-06 17:03:02+00:00,Emergency,Primary Fire,Dwelling,House - single occupancy,Within same building,E16,NEWHAM,CUSTOM HOUSE,335,East Ham,2,2
3,033936-21032017,2017-03-21 10:43:54+00:00,Emergency,Special Service,Non Residential,Single shop,Correct incident location,HA8,BARNET,BURNT OAK,334,Mill Hill,1,1
4,002591-06012017,2017-01-06 15:10:11+00:00,Emergency,Special Service,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,E15,NEWHAM,FOREST GATE SOUTH,99,Stratford,1,1


# Clean the Data

### Make indicator variable for Emergency outcome

In [9]:
%%capture --no-stdout 
london_train["Emergency"] = (london_train["incident_group"] == "Emergency").astype(np.int64)

In [11]:
#Change the object variables to categorical
# Use astype with an explicit per-column mapping rather than `.loc[:, mask] = ...`:
# whether a .loc assignment swaps in the new dtype or merely writes the values back
# into the existing object columns depends on the pandas version, so the conversion
# could silently be a no-op. astype always returns columns with the requested dtype,
# and the cell stays safe to re-run (no object columns -> empty mapping).
object_cols = london_train.select_dtypes(include="object").columns
london_train = london_train.astype({col: "category" for col in object_cols})

### Check for missing values

In [396]:
# Missing values per column in the training set
london_train.isna().sum()

incident_number                                 0
timestamp_of_call                               0
incident_group                                  0
stop_code_description                           0
property_category                               0
property_type                                   0
address_qualifier                               0
postcode_district                               0
borough_name                                    0
ward_name                                       0
first_pump_arriving_attendance_time          1542
first_pump_arriving_deployed_from_station    1542
num_stations_with_pumps_attending              56
num_pumps_attending                            56
Emergency                                       0
dtype: int64

### Incident Number

In [397]:
# Number of rows (observations) in the training set
len(london_train)

27409

In [398]:
# Number of distinct incident numbers (a missing value, if any, counts as one)
london_train["incident_number"].nunique(dropna=False)

27409

As we'd hope, each observation in our data is a different response and there are no duplicates.

### Time

In [399]:
# Peek at the first five call timestamps
london_train["timestamp_of_call"].head(n=5)

0   2017-02-25 21:25:09+00:00
1   2017-03-12 01:28:04+00:00
2   2017-04-06 17:03:02+00:00
3   2017-03-21 10:43:54+00:00
4   2017-01-06 15:10:11+00:00
Name: timestamp_of_call, dtype: datetime64[ns, UTC]

### Create a Month variable

In [12]:
%%capture --no-stdout
london_train["Month"] = london_train["timestamp_of_call"].dt.month

In [13]:
# Calls per month, ordered by month. The index and the counts are named explicitly
# instead of renaming whatever labels value_counts().reset_index() produces: those
# labels differ between pandas versions, and with the newer ones the old rename
# left no "Month" column to sort by.
(
    london_train["Month"]
    .value_counts()
    .rename_axis("Month")
    .reset_index(name="Count")
    .sort_values(by="Month")
)

Unnamed: 0,Month,Count
1,1,6982
3,2,6217
2,3,6779
0,4,7431


### Create an Hour variable

In [14]:
%%capture --no-stdout
london_train["Hour"] = london_train["timestamp_of_call"].dt.hour

In [15]:
# Calls per hour of day, ordered by hour. The index and the counts are named
# explicitly instead of renaming whatever labels value_counts().reset_index()
# produces: those labels differ between pandas versions, and with the newer ones
# the old rename left no "Hour" column to sort by.
(
    london_train["Hour"]
    .value_counts()
    .rename_axis("Hour")
    .reset_index(name="Count")
    .sort_values(by="Hour")
)

Unnamed: 0,Hour,Count
16,0,830
18,1,744
20,2,550
21,3,482
23,4,423
22,5,434
19,6,552
17,7,745
14,8,1014
12,9,1169


### Drop the timestamp variable

In [62]:
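# NOTE: this drop is currently disabled, so timestamp_of_call is still present in london_train below.
#london_train.drop("timestamp_of_call", axis = 1, inplace = True)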

### Stop Code Description

In [404]:
# Frequency of each stop code description in the training set
london_train["stop_code_description"].value_counts()

AFA                                      10017
Special Service                           8580
Primary Fire                              3031
False alarm - Good intent                 2977
Secondary Fire                            2407
False alarm - Malicious                    378
Chimney Fire                                11
Flood call attended - Batch mobilised        7
Late Call                                    1
Name: stop_code_description, dtype: int64

### Property Category

In [16]:
# Frequency of each raw property category before any regrouping
london_train["property_category"].value_counts()

Dwelling             13048
Non Residential       6519
Road Vehicle          2369
Outdoor               2196
Outdoor Structure     1684
Other Residential     1537
Aircraft                21
Boat                    20
Rail Vehicle            15
Name: property_category, dtype: int64

In [18]:
res = ["Dwelling", "Other Residential"]
vehicle = ["Road Vehicle", "Aircraft", "Boat", "Rail Vehicle"]

# Collapse the raw categories into Residential / Vehicle / Outdoor (Non Residential
# is left untouched). The replacements are built on a plain Series and written back
# with assign(): calling replace(..., inplace=True) on the
# `london_train["property_category"]` selection is chained assignment and is not
# guaranteed to update the frame (it does not under pandas Copy-on-Write).
property_category = london_train["property_category"]
property_category = property_category.replace(res, "Residential")
property_category = property_category.replace(vehicle, "Vehicle")
property_category = property_category.replace("Outdoor Structure", "Outdoor")
london_train = london_train.assign(property_category=property_category)

In [19]:
# Frequency of each property category after the regrouping above
london_train["property_category"].value_counts()

Residential        14585
Non Residential     6519
Outdoor             3880
Vehicle             2425
Name: property_category, dtype: int64

### Property Type

In [20]:
# Frequency of each property type in the training set
london_train["property_type"].value_counts()

Purpose Built Flats/Maisonettes - 4 to 9 storeys      3266
House - single occupancy                              3234
Purpose Built Flats/Maisonettes - Up to 3 storeys     2486
Car                                                   1403
Self contained Sheltered Housing                      1224
                                                      ... 
Theme Park                                               1
Tennis Courts                                            1
Naval vessel                                             1
Private greenhouse                                       1
Agricultural equipment                                   1
Name: property_type, Length: 256, dtype: int64

Clearly, there are way too many categories in the "property_type" variable to be used in a model. Instead of trying to logically merge categories based on group similarity, we'll group them by the percentage of their calls that were real emergencies. Each property type is assigned a rank equal to its emergency percentage divided by 10 and rounded down, so property types with similar emergency rates share a rank; types with no real emergencies at all get the special rank -1.

In [21]:
#Group by property type and summarize by the Emergency variable
# observed=True only matters when property_type is categorical: it keeps categories
# with no rows out of the result, so no NaN mean reaches the integer cast below.
pt_grp = (
    london_train.groupby("property_type", observed=True)["Emergency"]
    .aggregate(["mean", "sum"])
    .sort_values(by="mean", ascending=False)
    .reset_index()
)
#Change the proportion into a percentage
pt_grp["mean"] *= 100
#Create a new variable that ranks the values based on their percentage
# (0-9% -> 0, 10-19% -> 1, ..., 100% -> 10). The builtin int replaces the np.int
# alias, which has been removed from NumPy.
pt_grp["pt_rank"] = (pt_grp["mean"] / 10).astype(int)
#Specifically change the values with 0 percentage to -1 as a special category i.e no chance it's a real emergency
# Use the integer -1, not the string "-1", so pt_rank stays a single numeric dtype
# instead of becoming a mixed int/str object column.
pt_grp.loc[pt_grp["mean"] == 0, "pt_rank"] = -1
pt_grp

Unnamed: 0,property_type,mean,sum,pt_rank
0,Agricultural equipment,100.0,1,10
1,Camping tent,100.0,2,10
2,Freight plane,100.0,1,10
3,Human harm outdoors,100.0,5,10
4,Kiosk,100.0,1,10
...,...,...,...,...
251,Airport - hangar,0.0,0,-1
252,Greyhound stadium,0.0,0,-1
253,Airport - fuel storage,0.0,0,-1
254,Cricket ground,0.0,0,-1


In [22]:
# Attach each row's property-type rank by joining london_train to the
# (property_type, pt_rank) lookup taken from pt_grp.
pt_rank_lookup = pt_grp[["property_type", "pt_rank"]]
london_train = london_train.merge(pt_rank_lookup, how="inner", on="property_type")

In [23]:
# Spot-check the last ten property types with their assigned rank
london_train.loc[:, ["property_type", "pt_rank"]].tail(10)

Unnamed: 0,property_type,pt_rank
27399,Airfield/runway,10
27400,Bulk oil storage,-1
27401,Bulk oil storage,-1
27402,Stacked/baled crop,10
27403,Agricultural equipment,10
27404,Tennis Courts,10
27405,Other tent/marquee,5
27406,Other tent/marquee,5
27407,Airport - fuel storage,-1
27408,Minibus,10


In [416]:
# First five rows of the training set after the merge
london_train.head(n=5)

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,postcode_district,borough_name,ward_name,first_pump_arriving_attendance_time,first_pump_arriving_deployed_from_station,num_stations_with_pumps_attending,num_pumps_attending,Emergency,Month,Hour,pt_rank
0,023952-25022017,2017-02-25 21:25:09+00:00,Emergency,Special Service,Vehicle,Car,In street outside gazetteer location,IG1,REDBRIDGE,MAYFIELD,296,Ilford,1,1,1,2,21,8
1,045461-13042017,2017-04-13 10:39:29+00:00,Emergency,Special Service,Vehicle,Car,In street close to gazetteer location,BR6,BROMLEY,ORPINGTON,29,Orpington,1,1,1,4,10,8
2,031311-15032017,2017-03-15 13:27:36+00:00,Emergency,Special Service,Vehicle,Car,In street outside gazetteer location,TW13,HOUNSLOW,HANWORTH PARK,364,Feltham,1,1,1,3,13,8
3,032072-17032017,2017-03-17 09:12:15+00:00,False Alarm,False alarm - Malicious,Vehicle,Car,In street close to gazetteer location,DA14,BEXLEY,CRAY MEADOWS,980,Sidcup,1,2,0,3,9,8
4,034893-23032017,2017-03-23 14:37:02+00:00,Emergency,Special Service,Vehicle,Car,In street outside gazetteer location,N1,ISLINGTON,ST. PETER'S,219,Islington,1,1,1,3,14,8


### Address Qualifier

In [30]:
# Frequency of each raw address qualifier before any regrouping
london_train["address_qualifier"].value_counts()

Correct incident location                          16392
Within same building                                4814
In street outside gazetteer location                2216
On land associated with building                    1779
In street close to gazetteer location                948
Open land/water - nearest gazetteer location         733
On motorway / elevated road                          153
Nearby address - no building in street               131
Nearby address - street not listed in gazetteer      106
In street remote from gazetteer location              96
Railway land or rolling stock                         41
Name: address_qualifier, dtype: int64

There are not too many different categories for the address qualifier variable. We'll combine a few of them based on logical relationships. 

In [32]:
street = ["In street outside gazetteer location", "In street remote from gazetteer location", 
          "In street close to gazetteer location", "Open land/water - nearest gazetteer location"]
nearby = ["Nearby address - no building in street", "Nearby address - street not listed in gazetteer"]
other = ["On motorway / elevated road", "Railway land or rolling stock"]

# Merge the raw qualifiers into three broader labels. The replacements are built on a
# plain Series and written back with assign(): calling replace(..., inplace=True) on
# the `london_train["address_qualifier"]` selection is chained assignment and is not
# guaranteed to update the frame (it does not under pandas Copy-on-Write).
# The "Gazetter" label is kept exactly as originally spelled so existing outputs and
# any later references to it still match.
address_qualifier = london_train["address_qualifier"]
for raw_labels, merged_label in ((street, "Gazetter"), (nearby, "Nearby Address"), (other, "Other")):
    address_qualifier = address_qualifier.replace(raw_labels, merged_label)
london_train = london_train.assign(address_qualifier=address_qualifier)

In [36]:
# Frequency of each address qualifier after the regrouping above
london_train["address_qualifier"].value_counts()

Correct incident location           16392
Within same building                 4814
Gazetter                             3993
On land associated with building     1779
Nearby Address                        237
Other                                 194
Name: address_qualifier, dtype: int64