In [1]:
!pip install -q boto3

[31mtensorflow 2.0.0 requires opt-einsum>=2.3.2, which is not installed.[0m
[31mtensorflow 2.0.0 has requirement gast==0.2.2, but you'll have gast 0.3.3 which is incompatible.[0m
[31mawscli 1.18.11 has requirement botocore==1.15.11, but you'll have botocore 1.14.17 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [3]:
prefix_train = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/train'
prefix_validation = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/validation'
prefix_test = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test'

balanced_tfidf_without_header_train_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_train)
balanced_tfidf_without_header_validation_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_validation)
balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_test)

s3_input_train_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_train_s3_uri, content_type='text/csv')
s3_input_validation_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_validation_s3_uri, content_type='text/csv')
s3_input_test_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_test_s3_uri, content_type='text/csv')

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/feature-store/amazon-reviews/csv/balanced-tfidf-without-header/train/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/feature-store/amazon-reviews/csv/balanced-tfidf-without-header/validation/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}


In [4]:
from sagemaker.amazon.amazon_estimator import get_image_uri 

# get the URI for new container
builtin_container_uri = get_image_uri(region_name=region,                                
                                      repo_name='xgboost', 
                                      repo_version='0.90-2')

model_output_path = 's3://{}/models/built-in/training-runs'.format(bucket)

xgb_estimator = sagemaker.estimator.Estimator(image_name=builtin_container_uri, 
                                              role=role, 
                                              hyperparameters={'objective':'binary:logistic',
                                                               'num_round': 1,
                                                               'max_depth': 5},
                                              train_instance_count=1, 
                                              train_instance_type='ml.m4.xlarge', 
                                              output_path=model_output_path, 
                                              sagemaker_session=sess)

In [5]:
xgb_estimator.fit({'train': s3_input_train_data,
                   'validation': s3_input_validation_data
                  }
                  #, wait=False
                 )

2020-03-04 05:19:07 Starting - Starting the training job...
2020-03-04 05:19:09 Starting - Launching requested ML instances......
2020-03-04 05:20:16 Starting - Preparing the instances for training......
2020-03-04 05:21:20 Downloading - Downloading input data...
2020-03-04 05:22:03 Training - Downloading the training image...
2020-03-04 05:22:35 Uploading - Uploading generated training model
2020-03-04 05:22:35 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Det

In [6]:
training_job_name = xgb_estimator.latest_training_job.name
print('training_job_name:  {}'.format(training_job_name))

training_job_name:  sagemaker-xgboost-2020-03-04-05-19-07-509


In [7]:
# TODO:  This is broken
#from sagemaker.xgboost import XGBoost

#xgb_estimator = XGBoost.attach(training_job_name=training_job_name)

In [8]:
# download the model artifact from AWS S3
!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/built-in/

Completed 2.7 KiB/2.7 KiB (31.5 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-835319576252/models/built-in/training-runs/sagemaker-xgboost-2020-03-04-05-19-07-509/output/model.tar.gz to models/built-in/model.tar.gz


In [9]:
import tarfile
import pickle as pkl

# TODO:  extract to ./model/built-in/

#opens the downloaded model artifcat and loads it as 'model' variable
tar = tarfile.open('./models/built-in/model.tar.gz')
tar.extractall(path='./models/built-in/')
tar.close()

In [10]:
!ls -al ./models/built-in/

total 20
drwxrwxr-x 2 ec2-user ec2-user 4096 Mar  4 05:22 .
drwxrwxr-x 4 ec2-user ec2-user 4096 Mar  4 04:55 ..
-rw-rw-r-- 1 ec2-user ec2-user 2725 Mar  4 05:22 model.tar.gz
-rw-r--r-- 1 ec2-user ec2-user 6254 Mar  4 05:22 xgboost-model


In [11]:
model_restored = pkl.load(open('./models/built-in/xgboost-model', 'rb'))

# TODO:  Calculate Validation and Test Metrics

In [12]:
# $S3_BUCKET/feature-store/amazon-reviews/balanced-tfidf-without-header/data.csv

prefix_test = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test'

balanced_tfidf_without_header_test_path = './{}/data.csv'.format(prefix_test)

import os
os.makedirs(prefix_test, exist_ok=True)

balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_test)

In [13]:
!aws s3 cp $balanced_tfidf_without_header_test_s3_uri $balanced_tfidf_without_header_test_path

download: s3://sagemaker-us-east-1-835319576252/feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test/data.csv to feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test/data.csv


# Load the data
_Note:  `header=None`_

In [14]:
def load_dataset(path, sep, header):
    data = pd.read_csv(path, sep=sep, header=header)

    labels = data.iloc[:,0]
    features = data.drop(data.columns[0], axis=1)
    
    if header==None:
        # Adjust the column names after dropped the 0th column above
        # New column names are 0 (inclusive) to len(features.columns) (exclusive)
        new_column_names = list(range(0, len(features.columns)))
        features.columns = new_column_names

    return features, labels

In [15]:
X_test, y_test = load_dataset(path=balanced_tfidf_without_header_test_path, sep=',', header=None)


In [16]:
X_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.634733,-0.247392,0.069444,-0.523243,-0.004713,-0.448801,-0.204092,-0.2383,-0.26176,-0.219047,...,0.164698,-0.557045,-0.381416,-0.103129,-0.209871,-0.556612,1.304991,0.49982,1.384731,-0.083465
1,1.258064,-1.061612,-0.027788,-0.43596,0.519479,-0.291335,0.26339,-0.7803,-0.411519,-0.517505,...,-0.080147,0.484613,0.113549,-0.315451,0.965301,-2.017522,0.042569,-0.315694,-1.637476,1.099162
2,-0.123441,-0.381155,0.22087,-0.306529,0.295466,0.015603,0.808974,-0.886816,-0.537187,-1.609552,...,0.907523,-1.370449,-0.497941,-0.000245,-0.229369,-0.549921,-0.901384,0.611026,0.231871,-1.763118
3,-0.085429,-0.558576,-0.111424,-0.648954,-0.045818,-0.389012,-0.304378,-0.226467,0.129833,-0.374666,...,1.990565,0.674278,1.541115,-1.362709,-0.785404,2.047984,0.874032,-0.547825,-0.133409,0.407323
4,0.943603,-0.748338,-0.302065,-0.454148,-0.777959,-1.025572,-0.138429,0.392192,-0.90112,0.385676,...,-0.93025,1.317045,-1.353617,0.330193,-0.600254,0.714468,-1.320635,-2.121154,-0.086226,-0.31821


In [17]:
import matplotlib.pyplot as plt
import xgboost

fig, ax = plt.subplots(figsize=(12,12))
xgboost.plot_importance(model_restored, 
                        importance_type='gain', 
                        max_num_features=30, 
                        height=0.8, 
                        ax=ax, 
                        show_values = True)
plt.title('Feature Importance')
plt.show()

<Figure size 1200x1200 with 1 Axes>

In [18]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

#auc = model_restored.score(X_test, y_test)
#print('Test AUC ', auc)

model_restored.feature_names = X_test.columns
#preds_test = model_restored.predict(X_test)
#print('Test Accuracy: ', accuracy_score(y_test, preds_test))
#print('Test Precision: ', precision_score(y_test, preds_test, average=None))

In [19]:
print(classification_report(y_test, preds_test))

NameError: name 'preds_test' is not defined

# TODO: Deploy Endpoint

In [None]:
# https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd
from sagemaker.predictor import csv_serializer

## Deploy trained XGBoost model endpoint to perform predictions
xgb_predictor = xgb_estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

## ?? Function to chunk down test set into smaller increments

# TODO:  1) update this to do TF/IDF
#        2) use this in other versions of the model


In [None]:
def predict(data, model, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''

#        response = runtime_client.invoke_endpoint(EndpointName = endpoint_name,
#                                         ContentType = 'text/csv',
#                                         Body = single_test)
        
    for array in split_array:
        predictions = ','.join([predictions, model.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

## Generate predictions on the test set for the difference models

predictions = predict(X_test.columns.values, xgb_predictor, rows=500)

In [None]:
# https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

def calc_specificity(y_true, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_true == 0)) /sum(y_true ==0)

thresh = 0.5
y_pred = predictions
y_pred_binary = np.where(predictions > thresh, 1, 0)
y_true = test_data['OUTPUT_LABEL']

c_mat = confusion_matrix(y_true, y_pred_binary) ## Predicted vs. actual outcome
auc = round(roc_auc_score(y_true, y_pred),4)
accuracy = round(accuracy_score(y_true,(y_pred > thresh) ) ,4)
recall = round(recall_score(y_true, (y_pred > thresh)),4)
precision = round(precision_score(y_true, (y_pred > thresh)),4)
specificity = round(calc_specificity(y_true, y_pred, thresh),4)

class_names = ['Not Readmitted', 'Readmitted'] ## Different class names

def plot_conf_mat(cm, classes, title, cmap = plt.cm.Blues):                                 
                                              
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
        horizontalalignment="center",
        color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

# Plot non-normalized confusion matrix
plt.figure()
fig, ax = plt.subplots(figsize=(6,4))
plot_conf_mat(c_mat, classes=class_names, 
                          title='Confusion matrix')
plt.show()
print(f'AUC is: {auc}')
print(f'Accuracy is: {accuracy}')
print(f'Recall is: {recall}')
print(f'Precision is: {precision}')
print(f'Specificity is: {specificity}')

In [None]:
from sklearn import metrics

auc = round(roc_auc_score(y_true, y_pred), 4)
print('AUC is ' + repr(auc))

fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)

plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b',
label='AUC = %0.2f'% auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# TODO:  1) update this to do TF/IDF
#        2) use this in other versions of the model
# Derived from the following:
#   https://aim357.readthedocs.io/en/latest/GluePySparkMLFeatureEngineering/GluePySparkMLFeatureEngineering.html#deepar-deep-dive

class XGBoostPredictor(sagemaker.predictor.RealTimePredictor):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, content_type=sagemaker.content_types.CONTENT_TYPE_JSON, **kwargs)

    def predict(self, ts, cat=None, dynamic_feat=None,
                num_samples=100, return_samples=False, quantiles=["0.1", "0.5", "0.9"]):
        """Requests the prediction of for the time series listed in `ts`, each with the (optional)
        corresponding category listed in `cat`.

        ts -- `pandas.Series` object, the time series to predict
        cat -- integer, the group associated to the time series (default: None)
        num_samples -- integer, number of samples to compute at prediction time (default: 100)
        return_samples -- boolean indicating whether to include samples in the response (default: False)
        quantiles -- list of strings specifying the quantiles to compute (default: ["0.1", "0.5", "0.9"])

        Return value: list of `pandas.DataFrame` objects, each containing the predictions
        """
        prediction_time = ts.index[-1] + 1
        quantiles = [str(q) for q in quantiles]
        req = self.__encode_request(ts, cat, dynamic_feat, num_samples, return_samples, quantiles)
        res = super(DeepARPredictor, self).predict(req)
        return self.__decode_response(res, ts.index.freq, prediction_time, return_samples)

    def __encode_request(self, ts, cat, dynamic_feat, num_samples, return_samples, quantiles):
        instance = series_to_dict(ts, cat if cat is not None else None, dynamic_feat if dynamic_feat else None)

        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles", "samples"] if return_samples else ["quantiles"],
            "quantiles": quantiles
        }

        http_request_data = {
            "instances": [instance],
            "configuration": configuration
        }

        return json.dumps(http_request_data).encode('utf-8')

    def __decode_response(self, response, freq, prediction_time, return_samples):
        # we only sent one time series so we only receive one in return
        # however, if possible one will pass multiple time series as predictions will then be faster
        predictions = json.loads(response.decode('utf-8'))['predictions'][0]
        prediction_length = len(next(iter(predictions['quantiles'].values())))
        prediction_index = pd.DatetimeIndex(start=prediction_time, freq=freq, periods=prediction_length)
        if return_samples:
            dict_of_samples = {'sample_' + str(i): s for i, s in enumerate(predictions['samples'])}
        else:
            dict_of_samples = {}
        return pd.DataFrame(data={**predictions['quantiles'], **dict_of_samples}, index=prediction_index)

    def set_frequency(self, freq):
        self.freq = freq

def encode_target(ts):
    return [x if np.isfinite(x) else "NaN" for x in ts]

def series_to_dict(ts, cat=None, dynamic_feat=None):
    """Given a pandas.Series object, returns a dictionary encoding the time series.

    ts -- a pands.Series object with the target time series
    cat -- an integer indicating the time series category

    Return value: a dictionary
    """
    obj = {"start": str(ts.index[0]), "target": encode_target(ts)}
    if cat is not None:
        obj["cat"] = cat
    if dynamic_feat is not None:
        obj["dynamic_feat"] = dynamic_feat
    return obj

In [None]:
xgb_endpoint_name = prefix + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

xgb_predictor = xgb_estimator.deploy(
                     initial_instance_count=1, 
                     instance_type='ml.m4.xlarge',
                     predictor_cls=XGBoostPredictor,
                     endpoint_name=xgb_endpoint_name)

In [None]:
predictions, raw_outputs = model.predict(["""Very funny. A typical mid 50's comedy."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
predictions, raw_outputs = bert_model.predict(["""That movie was absolutely awful."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))