# Setup and Imports

In [8]:
import pandas as pd
import numpy as np
import london_data_functions as ldf
import warnings
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import xgboost
import pickle
from sklearn.model_selection import ParameterSampler
from scipy import sparse

# Get the Data

In [12]:
%%time
# Columns pulled for modelling; the long source names are aliased to the short
# names (first_time, first_station, station_pumps, pumps_attending) used downstream.
query = """
            SELECT incident_number, timestamp_of_call, incident_group, property_category, 
                    property_type, address_qualifier, borough_name, ward_name, 
                    first_pump_arriving_attendance_time as first_time, first_pump_arriving_deployed_from_station as first_station,
                    num_stations_with_pumps_attending as station_pumps, num_pumps_attending as pumps_attending,
                    FROM `gdac-327115.LondonFire.fire_brigade`
        """

# Run the query through the project helper and keep the result as the raw frame.
london = ldf.fetch_london_data(
    query_string=query,
    project_id="gdac-327115",
    location="eu",
)

Wall time: 7.76 s


# Split the Data into Training and Test Sets

In [13]:
# Hold out 16% of the rows as the test set, stratified on incident_group so both
# splits keep the same class mix; the fixed seed makes the split repeatable.
london_train, london_test = train_test_split(
    london,
    test_size=.16,
    stratify=london["incident_group"],
    random_state=42,
)

# Clean the Training Data

In [14]:
# Silence every warning from this point on (the filter is global, not scoped to this cell).
warnings.filterwarnings(action="ignore")

# NOTE(review): london_train is overwritten with its cleaned version, so re-running
# this cell passes already-cleaned data to clean_london — confirm that is safe.
london_train = ldf.clean_london(london_train)

Cleaning London Data Started...

Sucessfully Added Emergency Column!
Sucessfully Added Month Column!
Sucessfully Added Hour Column!
Sucessfully Cleanded Property Category!
Sucessfully Cleaned and Ranked Property Type!
Sucessfully Cleaned Address!
Sucessfully Cleaned Borough Names!
Sucessfully Cleaned and Ranked Ward Names!
Sucessfully Imputed Station Names!
Sucessfully Ranked Stations!
Sucessfully Cleaned Station Arriving Time!
Sucessfully Cleaned Number of Stations with Pumps!
Sucessfully Cleaned Number of Pumps Attending!
Sucessfully Dropped Unecessary Columns!

Sucessfully Cleaned London Data!


In [30]:
# Inspect the column dtypes after cleaning: only first_time and Emergency are
# numeric; every other column is stored as object.
london_train.dtypes

property_category     object
address_qualifier     object
borough_name          object
first_time           float64
station_pumps         object
pumps_attending       object
Emergency              int64
Month                 object
Hour                  object
pt_rank               object
wn_rank               object
stat_rank             object
dtype: object

# Prepare the Data

In [18]:
def prepare_london(df: pd.DataFrame, cat_attribs, num_attribs, target,
                   transformer=None, return_transformer=False):
    """
    Build the feature matrix and target vector from a London DataFrame.

    Numeric columns are standardised with StandardScaler and categorical columns
    are one-hot encoded with OneHotEncoder, both inside a single ColumnTransformer.
    Only the columns listed in num_attribs and cat_attribs end up in X.

    df: DataFrame
    cat_attribs: Columns to one-hot encode. Expected data structure is a list
    num_attribs: Columns to standardise. Expected data structure is a list
    target: Name of the label column in df
    transformer: Optional, already-fitted transformer. When given, it is applied
        with .transform(df) instead of fitting a new one, so held-out data is
        encoded with exactly the statistics and categories learned on the
        training data. Defaults to None (fit a fresh ColumnTransformer on df).
    return_transformer: When True, the transformer that produced X is returned
        as a third value so it can be reused on other data. Defaults to False.

    returns: X (matrix produced by the transformer), y (Series)
             or X, y, transformer when return_transformer is True
    """
    if transformer is None:
        # No fitted transformer supplied: learn scaling and categories from df.
        transformer = ColumnTransformer([
            ("num", StandardScaler(), num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
        X = transformer.fit_transform(df)
    else:
        # Reuse what was learned elsewhere; never re-fit on the data passed in.
        X = transformer.transform(df)

    y = df[target]

    if return_transformer:
        return X, y, transformer
    return X, y
    

In [19]:
# Columns treated as categorical (one-hot encoded by prepare_london).
cat_vars = [
    "property_category",
    "address_qualifier",
    "borough_name",
    "station_pumps",
    "pumps_attending",
    "Month",
    "Hour",
    "pt_rank",
    "wn_rank",
    "stat_rank",
]
# Columns treated as numeric (standardised by prepare_london).
num_vars = ["first_time"]
# Label column.
target = "Emergency"

X, y = prepare_london(
    df=london_train,
    cat_attribs=cat_vars,
    num_attribs=num_vars,
    target=target,
)

In [24]:
# Show the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep="\n")

(26719, 109)
(26719,)


In [33]:
# Cross-check the width of the encoded matrix: add up the number of distinct
# values in each categorical column, then add one for the single numeric column.
feature_dim = 0

for feature in london_train[cat_vars].columns:
    n_levels = len(london_train[feature].unique())
    print(feature)
    print(n_levels)
    feature_dim = feature_dim + n_levels

# One extra column for the scaled numeric feature.
feature_dim = feature_dim + 1

property_category
4
address_qualifier
6
borough_name
33
station_pumps
4
pumps_attending
4
Month
4
Hour
24
pt_rank
12
wn_rank
10
stat_rank
7


In [34]:
# Total expected column count; compare with the second entry of X.shape.
print(feature_dim)

109


In [26]:
# List the categorical columns as selected from the training frame.
london_train[cat_vars].columns

Index(['property_category', 'address_qualifier', 'borough_name',
       'station_pumps', 'pumps_attending', 'Month', 'Hour', 'pt_rank',
       'wn_rank', 'stat_rank'],
      dtype='object')

In [15]:
# Preview the first rows of the cleaned training data.
london_train.head()

Unnamed: 0,property_category,address_qualifier,borough_name,first_time,station_pumps,pumps_attending,Emergency,Month,Hour,pt_rank,wn_rank,stat_rank
0,Non Residential,Correct incident location,SOUTHWARK,331.0,1.0,1.0,0,4,18,1,4,4
1,Non Residential,Within same building,SOUTHWARK,525.0,1.0,1.0,1,3,17,1,4,4
2,Non Residential,Correct incident location,SOUTHWARK,258.0,1.0,1.0,1,4,17,1,4,4
3,Residential,Correct incident location,SOUTHWARK,413.0,1.0,1.0,1,2,21,5,4,4
4,Residential,Correct incident location,SOUTHWARK,220.0,2.0,2.0,0,4,10,5,4,4


In [None]:
def prepare_chicago_test(df, attribs, encoder=None):
    """
    Convenience wrapper that one-hot encodes the categorical features of the
    test data and builds the 0/1 target from the "arrest" column.

    NOTE(review): the function name and the "arrest" target look carried over
    from a different dataset; the London frame in this notebook uses "Emergency"
    as its target — confirm before calling this on London data.

    df: DataFrame
    attribs: Columns specified to be transformed. Expected data structure is a list
    encoder: Optional, already-fitted encoder. When given, it is applied with
        .transform(df[attribs]) so the test matrix has the same columns as the
        training matrix. Defaults to None, which fits a fresh OneHotEncoder on
        df[attribs] (the original behaviour).

    returns: X (Sparse Matrix when the default OneHotEncoder is used), y (Series)
    """
    features = df[attribs]
    if encoder is None:
        X = OneHotEncoder().fit_transform(features)
    else:
        # Reuse the categories learned on the training data; do not re-fit here.
        X = encoder.transform(features)

    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    y = (df["arrest"] == True).astype(int)
    return X, y