# Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set
import warnings
from pandas.io import gbq
from sklearn.model_selection import train_test_split

# Get the Data

In [59]:
%%time
query = """
            SELECT incident_number, timestamp_of_call, incident_group, stop_code_description, property_category, 
                    property_type, address_qualifier, borough_name, ward_name, 
                    first_pump_arriving_attendance_time as first_time, first_pump_arriving_deployed_from_station as first_station,
                    second_pump_arriving_attendance_time as second_time, second_pump_arriving_deployed_from_station as second_station, 
                    num_stations_with_pumps_attending as station_pumps, num_pumps_attending as pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

london = gbq.read_gbq(query, project_id = "gdac-327115", location="eu")

Wall time: 7.23 s


In [380]:
#Check the shape
london.shape

(32247, 16)

In [584]:
#View the first few rows
london.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,second_time,second_station,station_pumps,pumps_attending
0,000320-01012017,2017-01-01 15:30:22+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Within same building,NOT GEO-CODED,Not geo-coded,,,,,2,2
1,003727-09012017,2017-01-09 10:41:54+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,NOT GEO-CODED,Not geo-coded,,,,,2,2
2,003762-09012017,2017-01-09 12:28:39+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,NOT GEO-CODED,Not geo-coded,,,,,2,2
3,006403-16012017,2017-01-16 04:02:14+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,NOT GEO-CODED,Not geo-coded,,,,,2,2
4,010947-26012017,2017-01-26 11:14:03+00:00,False Alarm,AFA,Dwelling,House - single occupancy,Correct incident location,NOT GEO-CODED,Not geo-coded,,,,,1,1


In [382]:
london.dtypes

incident_number                                            object
timestamp_of_call                             datetime64[ns, UTC]
incident_group                                             object
stop_code_description                                      object
property_category                                          object
property_type                                              object
address_qualifier                                          object
postcode_district                                          object
borough_name                                               object
ward_name                                                  object
first_pump_arriving_attendance_time                         Int64
first_pump_arriving_deployed_from_station                  object
second_pump_arriving_attendance_time                        Int64
second_pump_arriving_deployed_from_station                 object
num_stations_with_pumps_attending                           Int64
num_pumps_

In [383]:
#Check for missing values
london.isnull().sum()

incident_number                                   0
timestamp_of_call                                 0
incident_group                                    0
stop_code_description                             0
property_category                                 0
property_type                                     0
address_qualifier                                 0
postcode_district                                 0
borough_name                                      0
ward_name                                         0
first_pump_arriving_attendance_time            1819
first_pump_arriving_deployed_from_station      1819
second_pump_arriving_attendance_time          20281
second_pump_arriving_deployed_from_station    20281
num_stations_with_pumps_attending                68
num_pumps_attending                              68
dtype: int64

Since "second_pump_arriving_attendance_time" and "second_pump_arriving_deployed_from_station" have most of their values missing, we'll drop those from the analysis. 

In [60]:
london.drop(["second_time", "second_station"], axis=1, inplace=True)

# Split the Data into Training and Test Sets

Since we are going to be predicting if the call is a false alarm, we'll want to our test observations to be the newest in the data set. 

In [385]:
london["timestamp_of_call"].dt.year.value_counts()

2017    32247
Name: timestamp_of_call, dtype: int64

We see that all of our observations are from the same year, therefore we can just randomly split the data. However, let's first check the proportion of calls that ended up being a false alarm.  

In [386]:
london["incident_group"].value_counts()

False Alarm        15732
Special Service    10081
Fire                6434
Name: incident_group, dtype: int64

Not all incidents were either a Fire or False Alarm. There is a third category, Special Service, as well. Since the entire purpose of this project is to reduce the number of false alarms that the department wastes its resources on, we'll combine Special Services and the Fire into one category called Emergency. We could have dropped all incidents with Special Service. However, the fire department has to respond to those types of calls so it's important to include them. 

In [61]:
emergency = ["Special Service", "Fire"]

london["incident_group"].replace(emergency, "Emergency", inplace = True)

In [5]:
london["incident_group"].value_counts() / london.shape[0]

Emergency      0.512141
False Alarm    0.487859
Name: incident_group, dtype: float64

In [62]:
#Finally split the data
london_train, london_test = train_test_split(london, test_size = .15, stratify = london["incident_group"], random_state=42)

In [95]:
print(london_train.shape)
print(london_test.shape)

(27409, 14)
(4838, 14)


In [7]:
london_train.reset_index(drop=True, inplace = True)
london_train.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,station_pumps,pumps_attending
0,023952-25022017,2017-02-25 21:25:09+00:00,Emergency,Special Service,Road Vehicle,Car,In street outside gazetteer location,REDBRIDGE,MAYFIELD,296,Ilford,1,1
1,029837-12032017,2017-03-12 01:28:04+00:00,False Alarm,AFA,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 stor...,Correct incident location,WESTMINSTER,ST. JAMES'S,276,Lambeth,1,2
2,041522-06042017,2017-04-06 17:03:02+00:00,Emergency,Primary Fire,Dwelling,House - single occupancy,Within same building,NEWHAM,CUSTOM HOUSE,335,East Ham,2,2
3,033936-21032017,2017-03-21 10:43:54+00:00,Emergency,Special Service,Non Residential,Single shop,Correct incident location,BARNET,BURNT OAK,334,Mill Hill,1,1
4,002591-06012017,2017-01-06 15:10:11+00:00,Emergency,Special Service,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,NEWHAM,FOREST GATE SOUTH,99,Stratford,1,1


# Clean the Data

### Make indicator variable for Emergency outcome

In [70]:
%%capture --no-stdout 
london_train["Emergency"] = (london_train["incident_group"] == "Emergency").astype(np.int64)

In [142]:
#Change the objects variables to categorical
#london_train.loc[:, london_train.dtypes == "object"] = london_train.select_dtypes(["object"]).apply(lambda x: x.astype("category"))

### Check for missing values

In [71]:
london_train.isnull().sum()

incident_number             0
timestamp_of_call           0
incident_group              0
stop_code_description       0
property_category           0
property_type               0
address_qualifier           0
borough_name                0
ward_name                   0
first_time               1542
first_station            1542
station_pumps              56
pumps_attending            56
Month                       0
Hour                        0
Emergency                   0
dtype: int64

### Incident Number

In [397]:
#Number of observations
london_train.shape[0]

27409

In [398]:
#Number of incidents
len(london_train["incident_number"].unique())

27409

As we'd hope, each observation in our data is a different reponse and there are no duplicates. 

### Time

In [399]:
london_train["timestamp_of_call"].head()

0   2017-02-25 21:25:09+00:00
1   2017-03-12 01:28:04+00:00
2   2017-04-06 17:03:02+00:00
3   2017-03-21 10:43:54+00:00
4   2017-01-06 15:10:11+00:00
Name: timestamp_of_call, dtype: datetime64[ns, UTC]

### Create a Month variable

In [64]:
%%capture --no-stdout
london_train["Month"] = london_train["timestamp_of_call"].dt.month

In [65]:
london_train["Month"].value_counts().reset_index().rename(columns={"index":"Month", "Month":"Count"}).sort_values(by="Month")

Unnamed: 0,Month,Count
1,1,6982
3,2,6217
2,3,6779
0,4,7431


### Create a Hour variable

In [66]:
%%capture --no-stdout
london_train["Hour"] = london_train["timestamp_of_call"].dt.hour

In [67]:
london_train["Hour"].value_counts().reset_index().rename(columns={"index":"Hour", "Hour":"Count"}).sort_values(by="Hour")

Unnamed: 0,Hour,Count
16,0,830
18,1,744
20,2,550
21,3,482
23,4,423
22,5,434
19,6,552
17,7,745
14,8,1014
12,9,1169


### Drop the timestamp variable

In [62]:
#london_train.drop("timestamp_of_call", axis = 1, inplace = True)

### Stop Code Description

In [404]:
london_train["stop_code_description"].value_counts()

AFA                                      10017
Special Service                           8580
Primary Fire                              3031
False alarm - Good intent                 2977
Secondary Fire                            2407
False alarm - Malicious                    378
Chimney Fire                                11
Flood call attended - Batch mobilised        7
Late Call                                    1
Name: stop_code_description, dtype: int64

### Property Category

In [13]:
london_train["property_category"].value_counts()

Dwelling             13048
Non Residential       6519
Road Vehicle          2369
Outdoor               2196
Outdoor Structure     1684
Other Residential     1537
Aircraft                21
Boat                    20
Rail Vehicle            15
Name: property_category, dtype: int64

In [68]:
res = ["Dwelling", "Other Residential"]
london_train["property_category"].replace(res, "Residential", inplace = True)

vehicle = ["Road Vehicle", "Aircraft", "Boat", "Rail Vehicle"]
london_train["property_category"].replace(vehicle, "Vehicle", inplace = True)

london_train["property_category"].replace("Outdoor Structure", "Outdoor", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [15]:
london_train["property_category"].value_counts()

Residential        14585
Non Residential     6519
Outdoor             3880
Vehicle             2425
Name: property_category, dtype: int64

### Property Type

In [16]:
london_train["property_type"].value_counts()

Purpose Built Flats/Maisonettes - 4 to 9 storeys      3266
House - single occupancy                              3234
Purpose Built Flats/Maisonettes - Up to 3 storeys     2486
Car                                                   1403
Self contained Sheltered Housing                      1224
                                                      ... 
Tennis Courts                                            1
Kiosk                                                    1
Freight Train                                            1
Airport - fuel storage                                   1
Agricultural equipment                                   1
Name: property_type, Length: 256, dtype: int64

Clearly, there are way too many categories in the "property_type" variable to be used in a model. Instead of trying to logically merge categories based on group similarity, we'll group based on emergency percentage. Those categories with a high percentage of real emergencies will be grouped with other high percentage emergencies. 

In [72]:
#Group by property type and summarize by the Emergency variable
pt_grp = london_train.groupby("property_type")["Emergency"].aggregate(["mean", "sum"]).sort_values(by="mean", ascending=False)
pt_grp.reset_index(inplace = True)
#Change the proportion into a percentage
pt_grp["mean"] *= 100
#Create a new variable that ranks the values based on their percentage
pt_grp["pt_rank"] = (pt_grp["mean"] / 10).astype(np.int)
#Specifically change the values with 0 percentage to -1 as a special category i.e no chance it's a real emergency
pt_grp.loc[pt_grp["mean"] == 0, "pt_rank"] = -1
pt_grp

Unnamed: 0,property_type,mean,sum,pt_rank
0,Agricultural equipment,100.0,1,10
1,Camping tent,100.0,2,10
2,Freight plane,100.0,1,10
3,Human harm outdoors,100.0,5,10
4,Kiosk,100.0,1,10
...,...,...,...,...
251,Airport - hangar,0.0,0,-1
252,Greyhound stadium,0.0,0,-1
253,Airport - fuel storage,0.0,0,-1
254,Cricket ground,0.0,0,-1


In [73]:
#Join the london data with the pt_df to match the property type rankings
london_train = pd.merge(left = london_train, right = pt_grp[["property_type", "pt_rank"]], left_on = "property_type", right_on="property_type")

In [74]:
london_train[["property_type", "pt_rank"]].tail(n = 10)

Unnamed: 0,property_type,pt_rank
27399,Airfield/runway,10
27400,Bulk oil storage,-1
27401,Bulk oil storage,-1
27402,Stacked/baled crop,10
27403,Agricultural equipment,10
27404,Tennis Courts,10
27405,Other tent/marquee,5
27406,Other tent/marquee,5
27407,Airport - fuel storage,-1
27408,Minibus,10


In [75]:
london_train["pt_rank"].value_counts().reset_index()\
                                      .rename(columns={"index":"Property Type Rank", "pt_rank":"Number of Property Types"})\
                                      .sort_values(by="Property Type Rank")

Unnamed: 0,Property Type Rank,Number of Property Types
9,-1,189
7,0,812
1,1,5456
4,2,1437
6,3,1011
10,4,181
0,5,6974
2,6,5374
8,7,770
3,8,3926


### Address Qualifier

In [21]:
london_train["address_qualifier"].value_counts()

Correct incident location                          16392
Within same building                                4814
In street outside gazetteer location                2216
On land associated with building                    1779
In street close to gazetteer location                948
Open land/water - nearest gazetteer location         733
On motorway / elevated road                          153
Nearby address - no building in street               131
Nearby address - street not listed in gazetteer      106
In street remote from gazetteer location              96
Railway land or rolling stock                         41
Name: address_qualifier, dtype: int64

There are not too many different categories for the address qualifier variable. We'll combine a few of them based on logical relationships. 

In [76]:
street = ["In street outside gazetteer location", "In street remote from gazetteer location", 
          "In street close to gazetteer location", "Open land/water - nearest gazetteer location"]
london_train["address_qualifier"].replace(street, "Gazetter", inplace = True)

nearby = ["Nearby address - no building in street", "Nearby address - street not listed in gazetteer"]
london_train["address_qualifier"].replace(nearby, "Nearby Address", inplace = True)

other = ["On motorway / elevated road", "Railway land or rolling stock"]
london_train["address_qualifier"].replace(other, "Other", inplace = True)

In [77]:
london_train["address_qualifier"].value_counts()

Correct incident location           16392
Within same building                 4814
Gazetter                             3993
On land associated with building     1779
Nearby Address                        237
Other                                 194
Name: address_qualifier, dtype: int64

### Borough Name

In [24]:
london_train["borough_name"].value_counts()

WESTMINSTER               2114
CAMDEN                    1272
SOUTHWARK                 1215
LAMBETH                   1172
TOWER HAMLETS             1149
HACKNEY                    994
CROYDON                    980
LEWISHAM                   923
NEWHAM                     899
EALING                     898
WANDSWORTH                 890
HILLINGDON                 884
BARNET                     880
GREENWICH                  857
BRENT                      856
KENSINGTON AND CHELSEA     852
ENFIELD                    850
ISLINGTON                  848
HAMMERSMITH AND FULHAM     778
BROMLEY                    768
HARINGEY                   751
WALTHAM FOREST             742
HOUNSLOW                   699
HAVERING                   607
REDBRIDGE                  604
BEXLEY                     559
BARKING AND DAGENHAM       552
MERTON                     527
RICHMOND UPON THAMES       475
HARROW                     469
SUTTON                     461
KINGSTON UPON THAMES       419
CITY OF 

We see that there is one category, "NOT GEO_CODED" that needs to be cleaned up for the borough name variable. Let's check if it has a corresponding postcode. 

We see that the postcode won't be useful in finding the correct borough for the missing values. Let's look at the ward. 

In [78]:
london_train.loc[london_train["borough_name"] == " NOT GEO-CODED", ["ward_name", "borough_name"]].value_counts()

ward_name       borough_name  
 Not geo-coded   NOT GEO-CODED    156
dtype: int64

We also see that every missing value for borough also has a corresponding missing value for ward. Therefore, we'll just drop the "NOT GEO-CODED" from borough. 

In [79]:
%%capture --no-stdout
london_train["borough_name"].replace(" NOT GEO-CODED", np.nan, inplace=True)
london_train.dropna(subset=["borough_name"], inplace = True)

In [27]:
london_train["borough_name"].isnull().sum()

0

In [28]:
#Total number of boroughs
len(london_train["borough_name"].unique())

33

### Ward Name

In [29]:
london_train["ward_name"].value_counts()

WEST END                  518
ST. JAMES'S               419
FAIRFIELD                 186
MARYLEBONE HIGH STREET    177
KILBURN                   175
                         ... 
CHEAP                       6
CORNHILL                    6
BASSISHAW                   5
QUEENHITHE                  4
DOWGATE                     4
Name: ward_name, Length: 630, dtype: int64

In [80]:
%%capture --no-stdout
london_train["ward_name"].replace(" Not geo-coded", np.nan, inplace=True)
london_train.dropna(subset=["ward_name"], inplace = True)

In [81]:
london_train["ward_name"].isnull().sum()

0

In [82]:
#Group by property type and summarize by the Emergency variable
wn_grp = london_train.groupby("ward_name")["Emergency"].aggregate(["mean", "sum"]).sort_values(by="mean", ascending=False)
wn_grp.reset_index(inplace = True)
#Change the proportion into a percentage
wn_grp["mean"] *= 100
#Create a new variable that ranks the values based on their percentage
wn_grp["wn_rank"] = (wn_grp["mean"] / 10).astype(np.int)
#Specifically change the values with 0 percentage to -1 as a special category i.e no chance it's a real emergency
wn_grp.loc[wn_grp["mean"] == 0, "wn_rank"] = -1
wn_grp

Unnamed: 0,ward_name,mean,sum,wn_rank
0,ASHBURTON,91.666667,22,9
1,TUDOR,87.500000,14,8
2,DARWIN,85.714286,18,8
3,WANDSWORTH COMMON,85.714286,36,8
4,BECONTREE,84.000000,21,8
...,...,...,...,...
625,LANGBOURN,7.142857,1,0
626,WALBROOK,6.666667,1,0
627,CORDWAINER,6.250000,1,0
628,DOWGATE,0.000000,0,-1


In [83]:
#Join the london data with the pt_df to match the property type rankings
london_train = pd.merge(left = london_train, right = wn_grp[["ward_name", "wn_rank"]], left_on = "ward_name", right_on="ward_name")

In [84]:
london_train["wn_rank"].value_counts().reset_index()\
                                      .rename(columns={"index":"Ward Name Rank", "wn_rank":"Number of Wards"})\
                                      .sort_values(by="Ward Name Rank")

Unnamed: 0,Ward Name Rank,Number of Wards
10,-1,10
8,0,72
7,1,195
5,2,1960
3,3,3518
1,4,5897
0,5,7845
2,6,5244
4,7,1973
6,8,515


### First Pump Arrving Deployed From Station

In [257]:
london_train["first_station"].isnull().sum()

1392

In [258]:
london_train.loc[london_train["first_station"].isnull()].head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,station_pumps,pumps_attending,Emergency,Month,Hour,pt_rank,wn_rank
29,014413-03022017,2017-02-03 18:36:39+00:00,Emergency,Special Service,Residential,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Within same building,BROMLEY,ORPINGTON,,,1,1,1,2,18,6,5
30,013424-01022017,2017-02-01 08:29:47+00:00,Emergency,Special Service,Residential,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Within same building,BROMLEY,ORPINGTON,,,1,1,1,2,8,6,5
32,049618-21042017,2017-04-21 15:20:47+00:00,Emergency,Special Service,Residential,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,BROMLEY,ORPINGTON,,,1,1,1,4,15,6,5
61,034258-21032017,2017-03-21 23:32:24+00:00,False Alarm,False alarm - Good intent,Residential,House - single occupancy,Correct incident location,HOUNSLOW,HANWORTH PARK,,,1,1,0,3,23,5,6
62,018581-13022017,2017-02-13 17:34:31+00:00,Emergency,Special Service,Residential,Purpose Built Flats/Maisonettes - 4 to 9 storeys,Correct incident location,HOUNSLOW,HANWORTH PARK,,,1,1,1,2,17,6,6


Since this is a categorical variable and it represents a location, we'll look at it and ward name together to see if we can impute the missing values based on the filled in stations for the wards. 

In [206]:
station_df = pd.DataFrame(london_train[["ward_name", "first_station"]].value_counts())
station_df = station_df.reset_index().rename(columns={0: "Count"})
station_df

Unnamed: 0,ward_name,first_station,Count
0,WEST END,Soho,403
1,ST. JAMES'S,Soho,275
2,FAIRFIELD,Croydon,127
3,HOLBORN AND COVENT GARDEN,Soho,119
4,BUNHILL,Shoreditch,117
...,...,...,...
2360,GLYNDON,Eltham,1
2361,GIPSY HILL,Woodside,1
2362,GIPSY HILL,Forest Hill,1
2363,SOUTH NORWOOD,Croydon,1


We see that some stations map to multiple wards. However, let's impute the missing stations based on the highest counts for the ward. For instance, if there are any missing stations in the West End ward, we'll impute the value as Soho. 

In [35]:
def station_imputer(df):
    """
    This function imputes the missing stations for the london data set by setting them to the most occurring stations in 
    their ward. 
    
    params:
        df: Dataframe containing the necessary columns
        
    returns:
        df: Dataframe with the missing values imputed
    
    """
    #Create a list of wards that have missing stations
    miss_stat_wards = list(df.loc[df["first_station"].isnull(), "ward_name"].drop_duplicates())
    #Convert the column to a string type. If not, there's a data type mismatch
    #df["first_station"] = df["first_station"].astype(str)
    df["first_station"] = df["first_station"].astype("category")
    #print("Made it here!")
    def ward_to_station(df):
        """
        This function creates a data frame that has the counts of the wards and stations to be used to find the station 
        with the most counts to be used in the imputer function
        """
        station_df = pd.DataFrame(df[["ward_name", "first_station"]].value_counts())
        station_df = station_df.reset_index().rename(columns={0: "Count"})
        
        return station_df
    #print("Made it here!")
    wts_df = ward_to_station(df=df)
    wts_df.reset_index(drop=True, inplace = True)
    #print("Made it here!")
    #Loop through all the wards that have missing stations
    #print(len(miss_stat_wards))
    for ward in miss_stat_wards:
        #print("Made it here!")
        #print(ward)
        #Set a mask based on the ward
        mask = wts_df["ward_name"].str.contains(ward)
        #Create a temporary dataframe containing only the single ward and possibly multiple stations
        temp_df = wts_df.loc[mask]
        temp_df.reset_index(drop=True, inplace=True)
        #Get the index of the station with the highest count in the ward
        max_cnt_ward = temp_df["Count"].argmax()
        #Get the station
        station = temp_df.loc[max_cnt_ward, "first_station"]
        #print(station)
        #Impute the missing station given the what ward it is in
        #df.loc[mask & df["first_station"].isnull()] = station
        df.loc[(df["ward_name"] == ward) & (df["first_station"].isnull()), "first_station"] = station
        #print("Made it here!")
    return df
  

In [85]:
london_train = station_imputer(london_train)

In [86]:
london_train["first_station"].isnull().sum()

0

Now that we have all the missing values imputed, we can go ahead and look at the different categories. 

In [38]:
london_train["first_station"].value_counts()

Soho              1062
Lambeth            616
Paddington         570
Euston             531
West Hampstead     525
                  ... 
Addington          127
Purley             108
Hainault            94
Wennington          63
Biggin Hill         26
Name: first_station, Length: 102, dtype: int64

In [87]:
#Group by property type and summarize by the Emergency variable
stat_grp = london_train.groupby("first_station")["Emergency"].aggregate(["mean", "sum", "count"]).sort_values(by="mean", ascending=False)
stat_grp.reset_index(inplace = True)
#Change the proportion into a percentage
stat_grp["mean"] *= 100
stat_grp.tail(20)

Unnamed: 0,first_station,mean,sum,count
82,Surbiton,43.979058,84,191
83,Lambeth,43.831169,270,616
84,Harrow,43.414634,89,205
85,Acton,43.369176,121,279
86,Lewisham,43.145161,107,248
87,Wennington,42.857143,27,63
88,Shoreditch,42.769857,210,491
89,Paddington,42.105263,240,570
90,North Kensington,41.970021,196,467
91,Richmond,41.081081,76,185


In [88]:
#Group by property type and summarize by the Emergency variable
stat_grp = london_train.groupby("first_station")["Emergency"].aggregate(["mean", "sum"]).sort_values(by="mean", ascending=False)
stat_grp.reset_index(inplace = True)
#Change the proportion into a percentage
stat_grp["mean"] *= 100
#Create a new variable that ranks the values based on their percentage
stat_grp["stat_rank"] = (stat_grp["mean"] / 10).astype(np.int)
stat_grp.head(10)

Unnamed: 0,first_station,mean,sum,stat_rank
0,Orpington,73.333333,154,7
1,Eltham,66.666667,114,6
2,Plaistow,66.0,198,6
3,Poplar,65.714286,184,6
4,Dagenham,65.505226,188,6
5,Biggin Hill,65.384615,17,6
6,Addington,65.354331,83,6
7,Edmonton,64.802632,197,6
8,Plumstead,64.470588,274,6
9,East Ham,63.71308,151,6


In [89]:
london_train = pd.merge(left = london_train, right = stat_grp[["first_station", "stat_rank"]], left_on = "first_station", right_on="first_station")

In [90]:
london_train["stat_rank"].value_counts()\
                         .reset_index()\
                         .rename(columns={"index":"stat_rank", "stat_rank":"Number of Stations"})\
                         .sort_values(by="stat_rank")

Unnamed: 0,stat_rank,Number of Stations
4,2,1410
3,3,1868
1,4,7620
0,5,11798
2,6,4347
5,7,210


### First Pump Arriving Time

In [91]:
london_train["first_time"].isnull().sum()

1392

Before we impute the missing values with either a mean/median, we should look at the distribution to get a sense of the spread in arriving time. 

In [92]:
london_train["first_time"].describe()

count    25861.000000
mean       317.496346
std        135.484306
min          2.000000
25%        231.000000
50%        298.000000
75%        378.000000
max       1196.000000
Name: first_time, dtype: float64

We can see that mean and median are relatively close which is good, not too much skew. However, we see that the max value is 1,196 seconds which is more than 6 standard deviations away so quite unlikely. To be conservative, lets see how many arriving times are more than 4 standard deviations above the mean. 

In [93]:
pump_mean = np.mean(london_train["first_time"])
pump_std = np.std(london_train["first_time"])
#Any pump times above this should be flagged
cutoff_time = pump_mean + 4*pump_std
cutoff_time

859.4230907364656

In [94]:
london_train.loc[london_train["first_time"] > cutoff_time].shape

(178, 19)

In [95]:
london_train.loc[london_train["first_time"] > cutoff_time].head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,station_pumps,pumps_attending,Month,Hour,Emergency,pt_rank,wn_rank,stat_rank
50,020444-18022017,2017-02-18 08:10:43+00:00,Emergency,Special Service,Outdoor,Road surface/pavement,Gazetter,REDBRIDGE,CRANBROOK,878,Ilford,1,1,2,8,1,8,6,5
81,030043-12032017,2017-03-12 14:25:25+00:00,Emergency,Secondary Fire,Outdoor,Scrub land,Gazetter,BARKING AND DAGENHAM,THAMES,920,Ilford,1,1,3,14,1,8,5,5
366,027245-05032017,2017-03-05 17:13:37+00:00,Emergency,Special Service,Non Residential,Church/Chapel,Correct incident location,REDBRIDGE,CLEMENTSWOOD,918,Ilford,1,1,3,17,1,3,5,5
423,035636-25032017,2017-03-25 10:10:03+00:00,Emergency,Secondary Fire,Outdoor,Tree scrub,On land associated with building,BARKING AND DAGENHAM,EASTBROOK,864,Dagenham,1,1,3,10,1,8,7,6
427,035773-25032017,2017-03-25 15:49:36+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,NEWHAM,EAST HAM SOUTH,952,Dagenham,2,2,3,15,1,8,8,6


We can go ahead and drop those times that are more than 4 standard deviations above the mean arriving time. 

In [50]:
london_train.shape

(27253, 19)

In [100]:
london_train = london_train.loc[(london_train["first_time"] < cutoff_time) | london_train["first_time"].isnull()]

In [101]:
london_train.shape

(27075, 19)

Now we can go ahead and impute the missing values. 

In [103]:
london_train.loc[london_train["first_time"].notnull()].head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,station_pumps,pumps_attending,Month,Hour,Emergency,pt_rank,wn_rank,stat_rank
0,023952-25022017,2017-02-25 21:25:09+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,296,Ilford,1,1,2,21,1,8,6,5
1,009056-22012017,2017-01-22 01:41:22+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,194,Ilford,1,2,1,1,1,8,6,5
2,016512-08022017,2017-02-08 20:08:17+00:00,False Alarm,False alarm - Good intent,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,179,Ilford,1,3,2,20,0,8,6,5
3,023003-23022017,2017-02-23 19:11:48+00:00,Emergency,Primary Fire,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,314,Ilford,1,1,2,19,1,8,6,5
4,025248-01032017,2017-03-01 02:13:26+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,355,Ilford,1,1,3,2,1,8,6,5


We can impute the mean by using which station the first pump came from. 

In [107]:
#Group by property type and summarize by the Emergency variable
pump_time_grp = london_train.groupby("first_station")["first_time"]\
                        .aggregate(["median","mean", "std", "max", "min","count"])\
                        .sort_values(by="mean", ascending=False)
pump_time_grp.reset_index(inplace = True)
pump_time_grp

Unnamed: 0,first_station,median,mean,std,max,min,count
0,Biggin Hill,408.0,411.24,114.158983,715,205,25
1,Ruislip,398.0,402.528571,128.494629,837,16,140
2,Orpington,372.0,377.083333,147.611853,785,3,204
3,Wennington,358.5,373.783333,146.171027,771,3,60
4,Northolt,351.0,372.523207,144.505339,858,8,237
...,...,...,...,...,...,...,...
97,Islington,260.0,269.752542,98.225358,776,2,295
98,Chelsea,256.0,266.41623,106.020956,762,8,382
99,Old Kent Road,247.0,264.566038,108.998073,812,5,265
100,Lewisham,241.5,258.1125,104.062219,831,47,240


In [133]:
london_train["first_time"] = london_train["first_time"].astype(np.float64)
london_train["first_time"] = london_train.groupby("first_station")["first_time"].\
                                                transform(lambda grp: grp.fillna(np.mean(grp)))

In [119]:
london_train["first_time"].isnull().sum()

0

In [120]:
london_train.head()

Unnamed: 0,incident_number,timestamp_of_call,incident_group,stop_code_description,property_category,property_type,address_qualifier,borough_name,ward_name,first_time,first_station,station_pumps,pumps_attending,Month,Hour,Emergency,pt_rank,wn_rank,stat_rank
0,023952-25022017,2017-02-25 21:25:09+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,296.0,Ilford,1,1,2,21,1,8,6,5
1,009056-22012017,2017-01-22 01:41:22+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,194.0,Ilford,1,2,1,1,1,8,6,5
2,016512-08022017,2017-02-08 20:08:17+00:00,False Alarm,False alarm - Good intent,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,179.0,Ilford,1,3,2,20,0,8,6,5
3,023003-23022017,2017-02-23 19:11:48+00:00,Emergency,Primary Fire,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,314.0,Ilford,1,1,2,19,1,8,6,5
4,025248-01032017,2017-03-01 02:13:26+00:00,Emergency,Special Service,Vehicle,Car,Gazetter,REDBRIDGE,MAYFIELD,355.0,Ilford,1,1,3,2,1,8,6,5


### Number of Stations with Pumps Attending

In [121]:
london_train["station_pumps"].isnull().sum()

54

In [123]:
london_train["station_pumps"].describe()

count    27021.000000
mean         1.354946
std          0.607220
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: station_pumps, dtype: float64

In [124]:
london_train["station_pumps"].value_counts()

1    18954
2     6885
3      859
4      307
5       13
6        3
Name: station_pumps, dtype: Int64

We see that only a very small amount of stations had more than 5 pumps attending. Therefore, we'll go ahead and drop any observations that have more than 5 pumps attending.

In [130]:
london_train = london_train.loc[(london_train["station_pumps"] < 5) | (london_train["station_pumps"].isnull())]

Now we can impute the missing values using the mean from which station the attending pumps came from. 

In [142]:
#Group by property type and summarize by the Emergency variable
station_pump_grp = london_train.groupby("first_station")["station_pumps"]\
                        .aggregate(["median","mean", "std", "max", "min","count"])\
                        .sort_values(by="mean", ascending=False)
station_pump_grp.reset_index(inplace = True)
station_pump_grp

Unnamed: 0,first_station,median,mean,std,max,min,count
0,Heathrow,1.0,1.650685,0.859771,4.0,1.0,146
1,Hayes,1.5,1.642857,0.759320,4.0,1.0,140
2,Deptford,1.0,1.626374,0.924179,4.0,1.0,182
3,Battersea,1.0,1.619718,0.869280,4.0,1.0,213
4,Lewisham,2.0,1.616935,0.650782,4.0,1.0,248
...,...,...,...,...,...,...,...
97,Plumstead,1.0,1.188095,0.521931,4.0,1.0,420
98,Northolt,1.0,1.186736,0.420420,3.0,1.0,242
99,Norbury,1.0,1.178988,0.440909,4.0,1.0,257
100,Orpington,1.0,1.129851,0.363578,3.0,1.0,209


In [134]:
london_train["station_pumps"] = london_train["station_pumps"].astype(np.float64)
london_train["station_pumps"] = london_train.groupby("first_station")["station_pumps"].\
                                                transform(lambda grp: grp.fillna(np.mean(grp)))

In [135]:
london_train["station_pumps"].isnull().sum()

0

In [131]:
tester.shape

(27059, 19)

### Number of Pumps Attending

In [136]:
london_train["pumps_attending"].isnull().sum()

54

In [137]:
london_train["pumps_attending"].describe()

count    27021.000000
mean         1.535842
std          0.738057
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: pumps_attending, dtype: float64

In [138]:
london_train["pumps_attending"].value_counts()

1    15679
2     9005
3     1591
4      700
5       38
6        8
Name: pumps_attending, dtype: Int64

We'll go ahead and drop any pumps that are greater than 4. 

In [139]:
london_train = london_train.loc[(london_train["pumps_attending"] < 5) | (london_train["pumps_attending"].isnull())]

In [141]:
#Group by property type and summarize by the Emergency variable
pump_grp = london_train.groupby("first_station")["pumps_attending"]\
                        .aggregate(["median","mean", "std", "max", "min","count"])\
                        .sort_values(by="mean", ascending=False)
pump_grp.reset_index(inplace = True)
pump_grp

Unnamed: 0,first_station,median,mean,std,max,min,count
0,North Kensington,2.0,1.769892,0.801908,4,1,465
1,Paddington,2.0,1.735816,0.846722,4,1,564
2,Hayes,1.5,1.728571,0.888316,4,1,140
3,Twickenham,1.0,1.671362,0.827007,4,1,213
4,Heston,2.0,1.652482,0.700643,4,1,282
...,...,...,...,...,...,...,...
97,Shadwell,1.0,1.387097,0.729291,4,1,124
98,Hornchurch,1.0,1.377358,0.558924,4,1,159
99,Mitcham,1.0,1.340909,0.512099,3,1,220
100,Dowgate,1.0,1.326531,0.763626,4,1,343


In [145]:
london_train["pumps_attending"] = london_train["pumps_attending"].astype(np.float64)
london_train["pumps_attending"] = london_train.groupby("first_station")["pumps_attending"].\
                                                transform(lambda grp: grp.fillna(np.mean(grp)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  london_train["pumps_attending"] = london_train["pumps_attending"].astype(np.float64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  london_train["pumps_attending"] = london_train.groupby("first_station")["pumps_attending"].\


In [146]:
london_train["pumps_attending"].isnull().sum()

0

In [147]:
london_train.isnull().sum()

incident_number          0
timestamp_of_call        0
incident_group           0
stop_code_description    0
property_category        0
property_type            0
address_qualifier        0
borough_name             0
ward_name                0
first_time               0
first_station            0
station_pumps            0
pumps_attending          0
Month                    0
Hour                     0
Emergency                0
pt_rank                  0
wn_rank                  0
stat_rank                0
dtype: int64