# Forest Fire Predictive Analytics with AWS SageMaker

- This notebook details the training and deployment of a machine learning model on AWS.

In [None]:
import os
import boto3
import time
import re
import pandas as pd
from sklearn.metrics import accuracy_score
import sagemaker
from sagemaker import get_execution_role
from sklearn import tree, preprocessing
import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split

# Create the SageMaker session object that is later handed to the estimator.
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# Now let's define the S3 bucket we'll use for the remainder of this example.

# bucket = ''  # enter your S3 bucket where you will copy data and model artifacts
# prefix = 'sagemaker/DEMO-xgboost'  # place to upload training files within the bucket

In [None]:
# Placeholder for the S3 path of the AWS Glue ETL output; replace with a real path.
# NOTE(review): `etl_data` is not referenced by any other cell shown in this notebook.
etl_data = "path/from/s3/processed/data/from/aws/glue"

In [None]:
# Build the S3 URI of the training data (both parts are placeholders to fill in).
bucket = 'your/bucket/name'
data_key = 'the/etl/output/train/data'
data_location = 's3://{bucket}/{key}'.format(bucket=bucket, key=data_key)
print(data_location)

In [None]:
# The training channel passed to fit() uses the same S3 URI that is read locally below.
train_input = data_location

In [None]:
# Load the training CSV into a DataFrame for local inspection and preprocessing.
df = pd.read_csv(data_location)

In [None]:
# Preview the first rows of the raw training data.
df.head()

In [None]:
# Summarise the columns, their dtypes and non-null counts.
df.info()

In [None]:
# Presumably `discovery_date` stores Julian day numbers (confirm against the ETL
# output). Subtracting the Julian date of the Unix epoch gives "days since
# 1970-01-01", which to_datetime can parse with unit='D'.
unix_epoch_julian = pd.Timestamp(0).to_julian_date()
days_since_epoch = df['discovery_date'] - unix_epoch_julian
df['DATE'] = pd.to_datetime(days_since_epoch, unit='D')

In [None]:
# Derive calendar features from the parsed discovery date.
df['MONTH'] = pd.DatetimeIndex(df['DATE']).month
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names ("Monday", ...) and exists since pandas 0.23.
df['DAY_OF_WEEK'] = df['DATE'].dt.day_name()
# Snapshot of the frame before label encoding and column drops.
df_orig = df.copy()
df.head()

In [None]:
# A single LabelEncoder is re-fitted for each column in turn, so after this cell
# `le` only holds the classes of the last column it was fitted on.
le = preprocessing.LabelEncoder()
# df['STAT_CAUSE_DESCR'] = le.fit_transform(df['stat_cause_descr'])
for encoded_col, raw_col in (('STATE', 'state'), ('DAY_OF_WEEK', 'DAY_OF_WEEK')):
    df[encoded_col] = le.fit_transform(df[raw_col])

In [None]:
def set_label(cat):
    """Map a fire-cause description to a coarse integer class.

    Args:
        cat: Cause description, e.g. ``'Lightning'`` or ``'Arson'``.

    Returns:
        1 for natural causes, 2 for accidental causes, 3 for malicious
        causes, and 4 for everything else ('Missing/Undefined',
        'Miscellaneous', and any value not listed below).
    """
    natural = ('Lightning',)
    accidental = ('Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',
                  'Children', 'Campfire', 'Equipment Use', 'Debris Burning')
    malicious = ('Arson',)
    if cat in natural:
        return 1
    if cat in accidental:
        return 2
    if cat in malicious:
        return 3
    # Catch-all class: missing, miscellaneous or unrecognised causes.
    return 4
     

In [None]:
# Collapse the detailed cause text into the coarse class from set_label, then
# remove the raw text column from the frame.
df['LABEL'] = df['stat_cause_descr'].apply(set_label)
df = df.drop(columns=['stat_cause_descr'])

In [None]:
# Raw columns that are removed from the frame in place at this point.
raw_columns = ['state', 'fire_size_class', 'discovery_date', 'cont_date']
df.drop(raw_columns, axis=1, inplace=True)

In [None]:
# DATE is dropped now that MONTH and DAY_OF_WEEK have been derived from it;
# afterwards every row that still contains a missing value is discarded.
df = df.drop(columns=['DATE']).dropna()

In [None]:
# Inspect the first rows of the frame after preprocessing.
df.head()

## Training the Model with the SageMaker SKLearn Estimator API

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Script passed to the estimator as its training entry point.
script_path = 'aws_forest_wildfire_analytics.py'

# NOTE(review): `train_instance_type` is the SageMaker Python SDK v1 spelling of
# this argument. Confirm which SDK version the notebook targets; newer releases
# name it `instance_type` and expect an explicit `framework_version`.
# NOTE(review): the estimator variable is called `sklearn`, which is easy to
# confuse with the scikit-learn package used in the imports at the top.
sklearn = SKLearn(
    entry_point=script_path,
    train_instance_type="ml.m5.large",
    role=role,
    sagemaker_session=sagemaker_session)

In [None]:
# Start training; the 'train' channel points at the S3 URI held in train_input.
sklearn.fit({'train': train_input})

## Deploy Model, Run Predictions, and Evaluate

In [None]:
# Deploy the trained model to one ml.t2.medium instance and keep the returned
# predictor for the inference and clean-up cells below.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

In [None]:
# Load the held-out test set and repeat the feature engineering that the
# training cells apply to `df`.

# Read Test data
bucket = 'your/bucket/name'
data_key = 'the/etl/output/test/set'
data_location = 's3://{}/{}'.format(bucket, data_key)
print(data_location)

test_df = pd.read_csv(data_location)
test_df_orig = test_df.copy()  # untouched snapshot, displayed in a later cell
test_df['DATE'] = pd.to_datetime(test_df['discovery_date'] - pd.Timestamp(0).to_julian_date(), unit='D')
test_df['MONTH'] = pd.DatetimeIndex(test_df['DATE']).month
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names and exists since pandas 0.23.
test_df['DAY_OF_WEEK'] = test_df['DATE'].dt.day_name()
# NOTE(review): the encoder is re-fitted on the test data, so the integer codes
# only match the training encoding when both sets contain exactly the same
# states and weekdays. Confirm, or reuse encoders fitted on the training data.
le = preprocessing.LabelEncoder()
test_df['STATE'] = le.fit_transform(test_df['state'])
test_df['DAY_OF_WEEK'] = le.fit_transform(test_df['DAY_OF_WEEK'])

def set_label(cat):
    """Map a fire-cause description to a coarse integer class.

    Args:
        cat: Cause description, e.g. ``'Lightning'`` or ``'Arson'``.

    Returns:
        1 for natural causes, 2 for accidental causes, 3 for malicious
        causes, and 4 for everything else ('Missing/Undefined',
        'Miscellaneous', and any value not listed below).
    """
    natural = ('Lightning',)
    accidental = ('Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',
                  'Children', 'Campfire', 'Equipment Use', 'Debris Burning')
    malicious = ('Arson',)
    if cat in natural:
        return 1
    if cat in accidental:
        return 2
    if cat in malicious:
        return 3
    # Catch-all class: missing, miscellaneous or unrecognised causes.
    return 4
     

# Attach the coarse cause label, remove the raw columns and DATE, and keep only
# complete rows.
test_df['LABEL'] = test_df['stat_cause_descr'].apply(set_label)
columns_to_remove = ['stat_cause_descr', 'state', 'fire_size_class',
                     'discovery_date', 'DATE', 'cont_date']
test_df = test_df.drop(columns_to_remove, axis=1).dropna()

# Split into the feature matrix sent to the endpoint and the ground-truth labels.
feature_frame = test_df.drop(['LABEL'], axis=1)
test_X = feature_frame.values
test_y = test_df['LABEL'].values

In [None]:
# Preview the preprocessed test frame.
test_df.head()

In [None]:
# Preview the untouched copy of the test data for comparison.
test_df_orig.head()

In [None]:
# Invoke the endpoint once and reuse the response for both printing and scoring;
# every predict() call is a separate request to the hosted model.
# NOTE(review): the whole test matrix is sent in a single request. Confirm that
# it fits within the endpoint's payload limit.
y_preds = predictor.predict(test_X)
y_true = test_y

print(y_preds)
print(y_true)

In [None]:
# Fraction of test rows whose predicted class equals the true label.
acc = accuracy_score(y_true, y_preds)

print("Accuracy on test set: {:.2%}".format(acc))

## Delete Endpoint

In [None]:
# Tear down the hosted endpoint through the predictor returned by deploy().
# Estimator.delete_endpoint() is deprecated in SageMaker Python SDK v1 and is
# no longer available in v2, whereas the predictor method exists in both.
predictor.delete_endpoint()