# Importing relevant libraries

In [4]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import datetime
import json
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.tseries.holiday import AbstractHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
from pandas.tseries.holiday import Holiday
from dateutil.relativedelta import SU
from dateutil.relativedelta import TH
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
plt.style.use('ggplot')

# Run this cell if running online

In [1]:
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which this notebook's files are stored in S3.
prefix = 'ems_call_volume'

# Hosted-notebook setup: a regular SageMaker session plus the execution role
# attached to this Notebook Instance. When the current AWS identity is an IAM
# user rather than a role, get_execution_role() raises ValueError -- use the
# local-debugging cell below in that case.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

Couldn't call 'get_role' to get Role ARN from role name tyler.c.buffington@utexas.edu to get Role path.


ValueError: The current AWS identity is not a role: arn:aws:iam::445861113736:user/tyler.c.buffington@utexas.edu, therefore it cannot be used as a SageMaker execution role

# This is the cell I use for debugging locally. Don't run it online

In [5]:
import os

import boto3
import sagemaker
from sagemaker import get_execution_role
import pandas as pd

# Key prefix under which this notebook's files are stored in S3.
prefix = 'ems_call_volume'

# Local-mode session used for debugging on a workstation.
# To pin a specific region, build the session from an explicit boto3 session instead:
# session = boto3.session.Session(region_name='us-east-2')
# sagemaker_session = sagemaker.LocalSession(boto_session=session)
sagemaker_session = sagemaker.LocalSession()

# get_execution_role() is not used here (the current identity is an IAM user,
# not a role), so the execution role ARN is supplied explicitly.
# Set the SAGEMAKER_ROLE_ARN environment variable to run against another
# account/role; when it is unset or empty the original role is used.
DEFAULT_ROLE_ARN = 'arn:aws:iam::445861113736:role/service-role/AmazonSageMaker-ExecutionRole-20190903T114521'
role = os.environ.get('SAGEMAKER_ROLE_ARN') or DEFAULT_ROLE_ARN

## Upload the data for training <a class="anchor" id="upload_data"></a>

I performed the following query to get the data. Again, don't run this online.


In [4]:
import json
import os

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
# tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm
from tqdm.notebook import tqdm

# Generating the dataframe from NFORS: only the fields needed below are fetched.
es = Elasticsearch()
s = Search(using=es, index='*-fire-incident-*')
s = s.source(['description.event_opened',
              'description.day_of_week',
              'NFPA.type',
              'fire_department.firecares_id'])

# Restrict the query to the two fire departments of interest (FireCARES ids).
q = Q("match", fire_department__firecares_id='79592') | Q("match", fire_department__firecares_id='93345')
results = s.query(q)

# Performing the query and converting to pandas dataframe.
# The JSON round trip turns the nested hit dicts into plain records so that
# json_normalize can flatten them into dotted column names.
df = pd.DataFrame((d.to_dict() for d in tqdm(results.scan())))
json_struct = json.loads(df.to_json(orient="records"))

# pandas >= 1.0 exposes json_normalize at the top level; older releases only
# provide it under pd.io.json.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
df = json_normalize(json_struct)

# Slice the event_opened timestamp string into date (YYYY-MM-DD), month (MM)
# and hour (HH). Vectorised .str slicing replaces the row-wise apply calls.
df['date'] = df['description.event_opened'].str[:10]
df['month'] = df['date'].str[5:7]
df['hour'] = df['description.event_opened'].str[11:13]

# Converting df dates to datetime objects
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Renaming columns for convenience: strip the weather prefixes from any column that has them.
df.columns = [
    column.replace('weather.currently.', '').replace('weather.daily.', '')
    for column in df.columns
]

# Cache the raw query result locally.
df.to_pickle('query_results')

# Number of calls per department / date / day of week / hour.
# size() counts the rows of each group and reset_index(name=...) names the
# count column directly, instead of relying on a column labelled 0.
group_cols = ['fire_department.firecares_id', 'date', 'description.day_of_week', 'hour']
hourly = df.groupby(group_cols).size().reset_index(name='calls')

# Payload written for the training job; the date column is dropped from the records.
jsondata = {
    'model_name': 'calls_by_hour',
    'model_version': 1.0,
    'prediction_data': hourly.drop('date', axis=1).to_dict(orient='records'),
}

# Make sure the upload directory exists before writing into it.
os.makedirs('./data', exist_ok=True)
with open('./data/training_data.json', 'w') as outfile:
    json.dump(jsondata, outfile)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

Once we have the data locally, we can use the tools provided by the SageMaker Python SDK to upload the data to a default bucket.

In [6]:
# Upload the contents of the local training directory under <prefix>/<directory>
# and keep the value returned by upload_data for the training step.
WORK_DIRECTORY = 'data'
train_key_prefix = '/'.join((prefix, WORK_DIRECTORY))
train_input = sagemaker_session.upload_data(WORK_DIRECTORY, key_prefix=train_key_prefix)

## Create SageMaker Scikit Estimator <a class="anchor" id="create_sklearn_estimator"></a>

To run our Scikit-learn training script on SageMaker, we construct a `sagemaker.sklearn.estimator.SKLearn` estimator, which accepts several constructor arguments:

* __entry_point__: The path to the Python script SageMaker runs for training and prediction.
* __role__: Role ARN
* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.
* __sagemaker_session__ *(optional)*: The session used to train on Sagemaker.
* __hyperparameters__ *(optional)*: A dictionary passed to the train function as hyperparameters.

To see the code for the SKLearn Estimator, see here: https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/sklearn

In [20]:
from sagemaker.sklearn.estimator import SKLearn

# Entry-point script that SageMaker runs for training and prediction.
script_path = 'hourly_call_prediction.py'

# Constructor arguments gathered in one place so they are easy to tweak.
estimator_settings = dict(
    entry_point=script_path,
    train_instance_type="ml.c4.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={'n_estimators': 1000},
)

# The variable name `sklearn` is kept because the later cells call
# sklearn.fit / sklearn.deploy / sklearn.transformer on it.
sklearn = SKLearn(**estimator_settings)

## Train SKLearn Estimator on EMS data <a class="anchor" id="train_sklearn"></a>
Training is very simple, just call `fit` on the Estimator! This will start a SageMaker Training job that will download the data for us, invoke our scikit-learn code (in the provided script file), and save any model artifacts that the script creates.

In [21]:
# Start training; the 'train' channel points at the data uploaded earlier.
training_channels = {'train': train_input}
sklearn.fit(training_channels)

Creating tmpkd_esy20_algo-1-tgf70_1 ... 
[1BAttaching to tmpkd_esy20_algo-1-tgf70_12mdone[0m
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:41,955 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:41,961 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:41,984 sagemaker_sklearn_container.training INFO     Invoking user training script.
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:42,318 sagemaker-containers INFO     Module hourly_call_prediction does not provide a setup.py. 
[36malgo-1-tgf70_1  |[0m Generating setup.py
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:42,318 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:42,318 sagemaker-containers INFO     Generating MANIFEST.in
[36malgo-1-tgf70_1  |[0m 2020-01-21 17:25:42,319 sagemaker-containers INFO     Installing module with the following 

[36mtmpkd_esy20_algo-1-tgf70_1 exited with code 0
[0mAborting on container exit...
===== Job Complete =====


## Using the trained model to make inference requests <a class="anchor" id="inference"></a>

### Deploy the model <a class="anchor" id="deploy"></a>

Deploying the model to SageMaker hosting just requires a `deploy` call on the fitted model. This call takes an instance count and instance type.

In [10]:
# Deploy the fitted model behind an endpoint: one instance of type ml.m4.xlarge.
hosting_config = {'initial_instance_count': 1, 'instance_type': "ml.m4.xlarge"}
predictor = sklearn.deploy(**hosting_config)

Attaching to tmphei5k5ux_algo-1-4e8pj_1
[36malgo-1-4e8pj_1  |[0m Processing /opt/ml/code
[36malgo-1-4e8pj_1  |[0m Building wheels for collected packages: hourly-call-prediction
[36malgo-1-4e8pj_1  |[0m   Building wheel for hourly-call-prediction (setup.py) ... [?25ldone
[36malgo-1-4e8pj_1  |[0m [?25h  Created wheel for hourly-call-prediction: filename=hourly_call_prediction-1.0.0-py2.py3-none-any.whl size=7368 sha256=c1f939daa52cdf21e278954815036c093f22f31ec578120a999542b440a1f8c4
[36malgo-1-4e8pj_1  |[0m   Stored in directory: /tmp/pip-ephem-wheel-cache-710ngrax/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3
[36malgo-1-4e8pj_1  |[0m Successfully built hourly-call-prediction
[36malgo-1-4e8pj_1  |[0m Installing collected packages: hourly-call-prediction
[36malgo-1-4e8pj_1  |[0m Successfully installed hourly-call-prediction-1.0.0
[36malgo-1-4e8pj_1  |[0m   import imp
[36malgo-1-4e8pj_1  |[0m [2020-01-21 17:17:39 +0000] [216] [INFO] Starting gunic

In [None]:
# Debugging aid only -- not part of the pipeline.
# A bare shell command is a SyntaxError in a Python cell, so it is kept here as
# a comment. To bring the local-mode containers up by hand, run it from a
# terminal (or un-comment it with the leading `!`). The /tmp/... path below is
# from one particular session -- NOTE(review): substitute the compose file
# generated by your own run.
# !docker-compose -f /tmp/tmp46n28vem/docker-compose.yaml up --build --abort-on-container-exit

In [None]:
# Make every combination of departments, days of week, and hour
import os
from itertools import product

feature_cols = ['fire_department.firecares_id', 'description.day_of_week', 'hour']

# Distinct values observed in the hourly training frame.
dep_list = hourly['fire_department.firecares_id'].unique()
days = hourly['description.day_of_week'].unique()
hours = hourly['hour'].unique()

# One row per (department, day of week, hour) combination.
test_df = pd.DataFrame(list(product(dep_list, days, hours)), columns=feature_cols)

# Same payload layout as the training file, without the call counts.
jsondata = {
    'model_name': 'calls_by_hour',
    'model_version': 1.0,
    'prediction_data': test_df.to_dict(orient='records'),
}

# Make sure the upload directory exists before writing into it.
os.makedirs('./prediction_data', exist_ok=True)
with open('./prediction_data/test_data.json', 'w') as outfile:
    json.dump(jsondata, outfile)

### Choose some data and use it for a prediction <a class="anchor" id="prediction_request"></a>

In order to do some predictions, we'll extract some of the data we used for training and do predictions against it. This is, of course, bad statistical practice, but a good way to see how the mechanism works.

Prediction is as easy as calling `predict` with the predictor we got back from `deploy` and the data we want predictions for. The endpoint returns a numerical prediction for each input row. (The template this text was adapted from describes a flower-classification dataset; in this notebook the inputs are department / day-of-week / hour combinations and the target is the hourly call count.) We can compare the predictions against the observed values.

### Endpoint cleanup <a class="anchor" id="endpoint_cleanup"></a>

When you're done with the endpoint, you'll want to clean it up.

In [11]:
# Tear down the endpoint created by deploy(). Deletion goes through the
# predictor returned by deploy(): Estimator.delete_endpoint() is deprecated in
# the SageMaker Python SDK in favour of the predictor's delete_endpoint().
predictor.delete_endpoint()

Gracefully stopping... (press Ctrl+C again to force)


## Batch Transform <a class="anchor" id="batch_transform"></a>
We can also use the trained model for asynchronous batch inference on S3 data using SageMaker Batch Transform.

In [22]:
# Build a batch Transformer from the trained SKLearn estimator:
# a single ml.m4.xlarge instance.
transform_resources = dict(instance_count=1, instance_type='ml.m4.xlarge')
transformer = sklearn.transformer(**transform_resources)

### Prepare Input Data <a class="anchor" id="prepare_input_data"></a>
We upload the test data written to the `prediction_data` directory earlier (every combination of department, day of week, and hour) to a given location in S3.

In [33]:
# Upload test data to s3, keyed as <prefix>/<directory>
WORK_DIRECTORY = 'prediction_data'
batch_key_prefix = '/'.join((prefix, WORK_DIRECTORY))
batch_input_s3 = sagemaker_session.upload_data(WORK_DIRECTORY, key_prefix=batch_key_prefix)

### Run Transform Job <a class="anchor" id="run_transform_job"></a>
Using the Transformer, run a transform job on the S3 input data.

In [34]:
# Launch the transform job on the uploaded JSON input, report its name,
# then block until it has finished.
transformer.transform(batch_input_s3, content_type='application/json')
job_name = transformer.latest_transform_job.job_name
print('Waiting for transform job: {}'.format(job_name))
transformer.wait()

Attaching to tmp9ck0gmpj_algo-1-o416z_1
[36malgo-1-o416z_1  |[0m Processing /opt/ml/code
[36malgo-1-o416z_1  |[0m Building wheels for collected packages: hourly-call-prediction
[36malgo-1-o416z_1  |[0m   Building wheel for hourly-call-prediction (setup.py) ... [?25ldone
[36malgo-1-o416z_1  |[0m [?25h  Created wheel for hourly-call-prediction: filename=hourly_call_prediction-1.0.0-py2.py3-none-any.whl size=7377 sha256=15f46bf3b6ffa9fed0e883247b497602f14a1aaaaac0a49508955731e2058ebb
[36malgo-1-o416z_1  |[0m   Stored in directory: /tmp/pip-ephem-wheel-cache-9kakzzf0/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3
[36malgo-1-o416z_1  |[0m Successfully built hourly-call-prediction
[36malgo-1-o416z_1  |[0m Installing collected packages: hourly-call-prediction
[36malgo-1-o416z_1  |[0m Successfully installed hourly-call-prediction-1.0.0
[36malgo-1-o416z_1  |[0m   import imp
[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:28 +0000] [216] [INFO] Starting gunic

[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:31 +0000] [4044] [INFO] Booting worker with pid: 4044
[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:31 +0000] [4045] [INFO] Booting worker with pid: 4045
[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:31 +0000] [4173] [INFO] Booting worker with pid: 4173
[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:31 +0000] [4300] [INFO] Booting worker with pid: 4300
[36malgo-1-o416z_1  |[0m 172.18.0.1 - - [21/Jan/2020:17:27:31 +0000] "GET /execution-parameters HTTP/1.1" 404 232 "-" "-"
[36malgo-1-o416z_1  |[0m [2020-01-21 17:27:31 +0000] [4367] [INFO] Booting worker with pid: 4367
[36malgo-1-o416z_1  |[0m 2020-01-21 17:27:32,226 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
[36malgo-1-o416z_1  |[0m   import imp
[36malgo-1-o416z_1  |[0m 172.18.0.1 - - [21/Jan/2020:17:27:33 +0000] "POST /invocations HTTP/1.1" 200 6667 "-" "-"
Gracefully stopping... (press Ctrl+C again to force)
Waiting for transform job: sagemaker-scikit

### Check Output Data  <a class="anchor" id="check_output_data"></a>
After the transform job has completed, download the output data from S3. For each file "f" in the input data, we have a corresponding file "f.out" containing the predicted labels from each input row. We can compare the predicted labels to the true labels saved earlier.

In [None]:
# Download the output data from S3 to local filesystem
# batch_output is interpolated into the `aws s3 cp` shell command below as $batch_output.
batch_output = transformer.output_path
# -p: do not fail when the directory already exists (safe to re-run this cell)
!mkdir -p batch_data/output
# Copy everything under the transform output location into batch_data/output/
!aws s3 cp --recursive $batch_output/ batch_data/output/
# Head to see what the batch output looks like
!head batch_data/output/*

In [None]:
# Pulling the predictions, comparing to the observed number of calls

In [None]:
# StringIO was used below without being imported, which raised NameError.
from io import StringIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Batch-transform result for test_data.json, as downloaded into batch_data/output/.
output_loc = './batch_data/output/test_data.json.out'

with open(output_loc, "r") as myfile:
    data = myfile.readlines()

# Fail with a clear message instead of an IndexError on data[0].
if not data:
    raise ValueError('No predictions found in {}'.format(output_loc))

# Only the first line is parsed; its square brackets are stripped so that it
# can be read as one flat row of comma-separated numbers.
string_output = data[0].replace('[', '').replace(']', '')

predicted = np.genfromtxt(StringIO(string_output), delimiter=',')
# NOTE(review): the 'observed' file is not written anywhere in this notebook.
# The slice skips its first row and takes its second column -- presumably a
# header row and the observed call counts; confirm against whatever produces it.
actual = np.genfromtxt('observed', delimiter=',')[1:, 1]

# Predicted vs. observed values.
plt.scatter(predicted, actual, alpha=0.3)
plt.xlabel('predicted')
plt.ylabel('actual')