# XGBoost and Linear Learner in SageMaker

## Goal:
Build a model to decide (classify) whether newly reported UFO sightings are legitimate (0: unexplained, 1: explained, 2: probable)

## Steps:
1. [Load csv from S3](#Step-1:-Loading-csv-from-S3)
1. [Cleaning, transforming, analyzing, and preparing the dataset](#Step-2:-Cleaning,-transforming,-analyize,-and-preparing-the-dataset)
1. [Create and train XGBoost](#Step-3:-Train-XGBoost)
1. [Create and train Linear Learner](#Step-4:-Linear-Learner)

## Imports

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import io
import sagemaker.amazon.common as smac

import boto3
from sagemaker import get_execution_role
import sagemaker

import matplotlib.pyplot as plt
import seaborn as sns

## Step 1: Loading csv from S3

In [None]:
# IAM execution role of this notebook; the training estimators receive it later.
role = get_execution_role()

# S3 coordinates of the raw dataset: bucket / prefix / object name.
bucket, sub_folder, data_key = 'ml_ufo_sightings', 'ufo_dataset', 'ufo_fullset.csv'
data_location = 's3://{}/{}/{}'.format(bucket, sub_folder, data_key)

# Read the CSV from its s3:// location and preview the first rows.
df = pd.read_csv(data_location, low_memory=False)
df.head()

## Step 2: Cleaning, transforming, analyize, and preparing the dataset

In [None]:
# Flag every null cell once; the mask serves both the check and the display.
null_mask = df.isnull()

# Boolean: True when at least one cell anywhere in the frame is null.
missing_values = null_mask.values.any()

# Show only the rows that contain at least one null value.
if missing_values:
    display(df[null_mask.any(axis=1)])

In [None]:
# The 'shape' feature has missing values; tally its known values by frequency.
shape_counts = df['shape'].value_counts()
shape_counts

In [None]:
# Impute missing 'shape' entries with the most frequent shape, i.e. the
# first label of the count-sorted value_counts() index.
most_common_shape = df['shape'].value_counts().index[0]
df['shape'] = df['shape'].fillna(most_common_shape)

In [None]:
# Parse the two timestamp columns into datetime dtype.
for col in ('reportedTimestamp', 'eventDate'):
    df[col] = pd.to_datetime(df[col])

# Treat the nominal features `shape` and `weather` as categoricals.
for col in ('shape', 'weather'):
    df[col] = df[col].astype('category')

# Encode the yes/no flags numerically: 'Y' -> 1, 'N' -> 0.
yes_no_codes = {'Y': 1, 'N': 0}
for col in ('physicalEvidence', 'contact'):
    df[col] = df[col].replace(yes_no_codes)

# The target attribute `researchOutcome` becomes a categorical as well.
df['researchOutcome'] = df['researchOutcome'].astype('category')

In [None]:
# Inspect the dtype of every column.
df.dtypes

## Exploration and Graphing

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Select seaborn's "paper" plotting context with fonts scaled by 1.4.
sns.set_context("paper", font_scale=1.4)

In [None]:
# Graph: Was Contact Made?
# Count the sightings per `contact` value.
m_cts = df['contact'].value_counts()
m_ctsx = m_cts.index
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
m_ctsy = m_cts.values
f, ax = plt.subplots(figsize=(5,5))

sns.barplot(x=m_ctsx, y=m_ctsy)
ax.set_title('UFO Sightings and Contact')
ax.set_xlabel('Was contact made?')
ax.set_ylabel('Number of Sightings')
ax.set_xticklabels(['No', 'Yes'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Graph: Physical Evidence?
# Count the sightings per `physicalEvidence` value.
m_cts = df['physicalEvidence'].value_counts()
m_ctsx = m_cts.index
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
m_ctsy = m_cts.values
f, ax = plt.subplots(figsize=(5,5))

sns.barplot(x=m_ctsx, y=m_ctsy)
ax.set_title('UFO Sightings and Physical Evidence')
ax.set_xlabel('Was there physical evidence?')
ax.set_ylabel('Number of Sightings')
ax.set_xticklabels(['No', 'Yes'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Graph: UFO shapes
# Count the sightings per reported shape.
m_cts = df['shape'].value_counts()
m_ctsx = m_cts.index
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
m_ctsy = m_cts.values
f, ax = plt.subplots(figsize=(9,5))

sns.barplot(x=m_ctsx, y=m_ctsy)
ax.set_title('UFO Sightings by Shape')
ax.set_xlabel('UFO Shape')
ax.set_ylabel('Number of Sightings')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Graph: Weather during Sightings
# Count the sightings per weather condition.
m_cts = df['weather'].value_counts()
m_ctsx = m_cts.index
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
m_ctsy = m_cts.values
f, ax = plt.subplots(figsize=(5,5))

sns.barplot(x=m_ctsx, y=m_ctsy)
ax.set_title('UFO Sightings by Weather')
ax.set_xlabel('Weather')
ax.set_ylabel('Number of Sightings')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Graph: Research Outcome
# Count the sightings per research outcome (the target attribute).
m_cts = df['researchOutcome'].value_counts()
m_ctsx = m_cts.index
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
m_ctsy = m_cts.values
f, ax = plt.subplots(figsize=(5,5))

sns.barplot(x=m_ctsx, y=m_ctsy)
ax.set_title('UFO Sightings and Research Outcome')
ax.set_xlabel('Research Outcome')
ax.set_ylabel('Number of Sightings')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Graph: Sightings by Year
ufo_yr = df['eventDate'].dt.year  # series with the year exclusively

## Set axes ##
years_data = ufo_yr.value_counts()  # sightings per year
years_index = years_data.index  # x ticks
# `Series.get_values()` was removed in pandas 1.0; `.values` returns the same
# ndarray of counts on old and new pandas versions alike.
years_values = years_data.values

## Create Bar Plot ##
plt.figure(figsize=(15,8))
plt.xticks(rotation = 60)
plt.title('UFO Sightings by Year')
plt.ylabel('Number of Sightings')
plt.xlabel('Year')

# Only the first 60 entries of the counts are plotted.
years_plot = sns.barplot(x=years_index[:60],y=years_values[:60])

In [None]:
# Correlation table.
# Select the numeric/boolean columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in `corr()` and raises on the string columns
# that are still present at this point.
df.select_dtypes(include=['number', 'bool']).corr()

## Remove uninformative features:
1. `sighting`, always 'Y' 
1. `firstName` and `lastName`, uninformative for predicting `researchOutcome`
1. `reportedTimestamp` uninformative for predicting `researchOutcome`
1. `eventDate` and `eventTime` are very evenly distributed. There are no discernible seasons etc.

In [None]:
# Features judged uninformative for predicting `researchOutcome`; removed in place.
uninformative = ['firstName', 'lastName', 'sighting', 'reportedTimestamp', 'eventDate', 'eventTime']
df.drop(columns=uninformative, inplace=True)

In [None]:
# Preview the first rows of the frame after the column drop.
df.head()

## One-hot encoding:

In [None]:
# Categories `weather` and `shape`
# One-hot encode both categoricals. The dtype is pinned to uint8 because
# pandas >= 2.0 defaults to bool dummy columns, which `to_csv` would write as
# True/False instead of numeric 0/1 into the training CSV files.
df = pd.get_dummies(df, columns=['weather', 'shape'], dtype=np.uint8)

## To Numeric:

In [None]:
# Numeric codes for the target `researchOutcome`: unexplained -> 0, explained -> 1, probable -> 2.
outcome_codes = {'unexplained': 0, 'explained': 1, 'probable': 2}
df['researchOutcome'] = df['researchOutcome'].replace(outcome_codes)

In [None]:
# Show a preview of the encoded frame, followed by its (rows, columns) shape.
for view in (df.head(), df.shape):
    display(view)

## Randomize and Split
This may be easier with scikit-learn.

In [None]:
# Shuffle the row order and renumber the index from zero.
df = df.sample(frac=1).reset_index(drop=True)

# One uniform draw in [0, 1) per row decides which partition the row joins.
rand_split = np.random.rand(df.shape[0])
train_list = rand_split < 0.8                         # roughly 80% training
val_list = (0.8 <= rand_split) & (rand_split < 0.9)   # roughly 10% validation
test_list = 0.9 <= rand_split                         # roughly 10% testing

data_train, data_val, data_test = df[train_list], df[val_list], df[test_list]

See the XGBoost [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#InputOutput-XGBoost) for the input format the algorithm expects.

Next, we create the training and validation files on our Notebook instance (stored as CSV) and then upload them to S3.

In [None]:
# (frame, local CSV name, S3 object key) for each XGBoost input channel.
csv_targets = [
    (data_train, 'train.csv', 'algorithms_lab/xgboost_train/train.csv'),
    (data_val, 'validation.csv', 'algorithms_lab/xgboost_validation/validation.csv'),
]

# Write each split as a header-less CSV with the target `researchOutcome` in the first column.
for frame, local_csv, s3_key in csv_targets:
    label = frame['researchOutcome']
    features = frame.drop(['researchOutcome'], axis=1)
    pd.concat([label, features], axis=1).to_csv(local_csv, index=False, header=False)

# Upload the CSV files to S3 into the train and validation folders.
for frame, local_csv, s3_key in csv_targets:
    boto3.Session().resource('s3').Bucket(bucket).Object(s3_key).upload_file(local_csv)

## Step 3: Train XGBoost

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'xgboost' training image for the region of the current boto3 session.
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

Because we train with the CSV file format, create inputs that the training function can use as pointers to the files in S3 and that also specify the content type as CSV.

In [None]:
# S3 prefixes that hold the uploaded train / validation CSV files.
train_prefix = 's3://{}/algorithms_lab/xgboost_train'.format(bucket)
validation_prefix = 's3://{}/algorithms_lab/xgboost_validation'.format(bucket)

# Wrap each prefix in an `s3_input` that declares the content type as CSV.
s3_input_train = sagemaker.s3_input(s3_data=train_prefix, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_prefix, content_type='csv')

In [None]:
# Training job name: fixed prefix plus the current timestamp (YYYYmmddHHMMSS).
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
job_name = 'ufo-xgboost-job-{}'.format(timestamp)
print('Here is the job name {}'.format(job_name))

# S3 prefix handed to the estimator as its model output path.
output_location = 's3://{}/algorithms_lab/xgboost_output'.format(bucket)

In [None]:
sess = sagemaker.Session()

# Estimator settings: a single ml.m4.xlarge training instance, with the
# output written under `output_location`.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)
xgb = sagemaker.estimator.Estimator(container, role, **estimator_settings)

# Multi-class softmax objective over the 3 target classes, with num_round=100.
xgb_hyperparameters = {'objective': 'multi:softmax', 'num_class': 3, 'num_round': 100}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Channel name -> S3 input definition.
data_channels = dict(train=s3_input_train, validation=s3_input_validation)

# `.fit()` starts the training job under the generated job name.
xgb.fit(data_channels, job_name=job_name)

print('Model name: {}/{}/output/model.tar.gz'.format(output_location, job_name))

After training the model, see the default evaluation metric in the logs. Also access detailed logs in CloudWatch.

`merror` is the multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases) and should be minimized.

## Step 4: Linear Learner

Randomize the data again and get it ready for the Linear Learner algorithm. 

Rearrange the columns so it is ready for the algorithm (first column = target attribute).

In [None]:
# Seed NumPy's global generator before drawing the split.
np.random.seed(0)

# One uniform draw in [0, 1) per row: below 0.8 -> train, 0.8 to 0.9 -> validation, rest -> test.
rand_split = np.random.rand(df.shape[0])
train_list = rand_split < 0.8
val_list = (0.8 <= rand_split) & (rand_split < 0.9)
test_list = 0.9 <= rand_split

# Column order with the label first and all other columns in their existing order.
target = 'researchOutcome'
label_first = [target] + [name for name in df.columns if name != target]

# Build the three partitions with the label moved to the first column.
data_train, data_val, data_test = (
    df[mask][label_first] for mask in (train_list, val_list, test_list)
)

# Separate each partition into a feature ndarray (X) and a target ndarray (y).
train_X, train_y = data_train.drop(columns=target).values, data_train[target].values
val_X, val_y = data_val.drop(columns=target).values, data_val[target].values
test_X, test_y = data_test.drop(columns=target).values, data_test[target].values

Pipe Mode (Training Set): Create recordIO_protobuf file and upload it to S3

In [None]:
train_file = 'ufo_sightings_train_recordIO_protobuf.data'
train_key = 'algorithms_lab/linearlearner_train/{}'.format(train_file)

# Write the float32 features and labels into an in-memory buffer.
train_features = train_X.astype('float32')
train_labels = train_y.astype('float32')
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_features, train_labels)
f.seek(0)  # rewind so the upload reads the buffer from the start

# Upload the buffer to S3 and remember its full s3:// location.
s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(train_key)
s3_object.upload_fileobj(f)
training_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_train/{}'.format(bucket, train_file)

print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))

Pipe Mode (Validation Set): Create recordIO_protobuf file and upload it to S3

In [None]:
validation_file = 'ufo_sightings_validatioin_recordIO_protobuf.data'
validation_key = 'algorithms_lab/linearlearner_validation/{}'.format(validation_file)

# Write the float32 features and labels into an in-memory buffer.
val_features = val_X.astype('float32')
val_labels = val_y.astype('float32')
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_features, val_labels)
f.seek(0)  # rewind so the upload reads the buffer from the start

# Upload the buffer to S3 and remember its full s3:// location.
s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(validation_key)
s3_object.upload_fileobj(f)
validate_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_validation/{}'.format(bucket, validation_file)

print('The Pipe mode recordIO protobuf validation data: {}'.format(validate_recordIO_protobuf_location))

Call Docker image with Linear Learner Algorithm from ECR repository.

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker

# Look up the 'linear-learner' image for the region of the current boto3
# session; "1" is passed as the third positional argument.
region = boto3.Session().region_name
container = get_image_uri(region, 'linear-learner', "1")

In [None]:
# Training job name: fixed prefix plus the current timestamp (YYYYmmddHHMMSS).
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
job_name = 'ufo-linear-learner-job-{}'.format(timestamp)
print('Here is the job name {}'.format(job_name))

# Model-artifact output path.
output_location = 's3://{}/algorithms_lab/linearlearner_output'.format(bucket)

# Every column of the training frame except the label counts as a feature.
feature_count = data_train.shape[1] - 1
print('Feature_dim hyperparameter needs to be set to {}.'.format(feature_count))

In [None]:
sess = sagemaker.Session()

# Set up the Linear Learner algorithm from the ECR container.
# `input_mode='Pipe'` matches the recordIO-protobuf files uploaded for Pipe mode.
linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess,
                                       input_mode='Pipe'
                                      )

# Set up the hyperparameters.
# `feature_dim` is derived from the training frame instead of being hard-coded,
# so it stays correct when the one-hot encoding yields a different column count.
linear.set_hyperparameters(feature_dim=data_train.shape[1] - 1, # number of attributes (minus the researchOutcome attribute)
                           predictor_type='multiclass_classifier', # type of classification problem
                           num_classes=3 # number of classes in our researchOutcome (explained, unexplained, probable)
                            )  


# Data input: channel name -> S3 location of the recordIO-protobuf file.
data_channels = {
                'train': training_recordIO_protobuf_location,
                'validation': validate_recordIO_protobuf_location
                }

# Start the fit job under the generated job name.
linear.fit(data_channels, job_name=job_name)

print('Output path of Linear Learner model: {}/{}/output/model.tar.gz'.format(output_location, job_name))

- Go to CloudWatch to see detailed model logs.
- Stop the Notebook Instance.