In [None]:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Arize-ai/client_python/blob/main/arize/examples/tutorials/Arize_Tutorial_Sagemaker_xgboost.ipynb)

In [None]:
## Install Arize SDK
# %pip (instead of !pip3) installs into the environment of the running kernel,
# so the package is importable from this notebook right after the cell finishes.
%pip install arize

In [None]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
# Key prefix shared by every S3 object this notebook reads or writes.
prefix = 'sagemaker/DEMO-xgboost-dm'
# each region has its XGBoost container
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}
my_region = boto3.session.Session().region_name # set the region of the instance
# Fail early with a readable message: without this check an unlisted (or unset)
# region surfaces as a bare KeyError from the lookup below.
if my_region not in containers:
    raise ValueError(
        f'No XGBoost container is configured for region {my_region!r}. '
        f'Supported regions: {sorted(containers)}.'
    )
print(f'Success - the MySageMakerInstance is in the {my_region} region. You will use the {containers[my_region]} container for your SageMaker endpoint.')



In [None]:
bucket_name = '<UNIQUE_BUCKET>' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    # Only regions other than us-east-1 are given an explicit LocationConstraint.
    create_bucket_kwargs = {'Bucket': bucket_name}
    if my_region != 'us-east-1':
        create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
    s3.create_bucket(**create_bucket_kwargs)
    print('S3 bucket created successfully')
except Exception as e:
    # Any failure is reported and the notebook carries on.
    print('S3 error: ',e)

In [None]:
# Fetch bank_clean.csv into the working directory.
try:
    urllib.request.urlretrieve('https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv', 'bank_clean.csv')
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)
    # A copy left behind by an earlier run can still be loaded below; with no
    # local file at all there is nothing to continue with, so surface the error.
    if not os.path.exists('bank_clean.csv'):
        raise

# Read the CSV into a dataframe, using its first column as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
    # Later cells all read model_data; stop here instead of failing further
    # down with a NameError that hides the real cause.
    raise
    

In [None]:
# Shuffle all rows with a fixed seed, then cut at the 70% mark: the first part is
# the training set, the remainder the test set.
# Positional slicing gives the same two frames as np.split(DataFrame, [cut]) but
# avoids the DataFrame.swapaxes call that np.split relies on (deprecated in
# recent pandas releases).
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_row_count = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:train_row_count]
test_data = shuffled_data.iloc[train_row_count:]
print(train_data.shape, test_data.shape)

In [None]:
# Write the training set with the y_yes label as the first column and no header
# or index (presumably the CSV layout the XGBoost container expects - confirm).
train_csv = pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
train_csv.to_csv('train.csv', index=False, header=False)
# S3 keys always use '/', so build the key directly rather than with
# os.path.join, whose separator depends on the platform running the notebook.
train_key = f'{prefix}/train/train.csv'
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')
# Training input pointing at the S3 "train" folder that now holds train.csv.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [None]:
# SageMaker session handed to the estimator below.
sess = sagemaker.Session()

# Estimator built on the region-specific XGBoost container; its output is
# written under the "output" folder of this notebook's S3 prefix.
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket_name}/{prefix}/output',
    sagemaker_session=sess,
)

# Hyperparameters for the training job (binary logistic objective, 100 rounds).
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [None]:
# Label-first view of the training frame: y_yes, followed by every column
# except the two label columns.
p_dat = pd.concat(
    [train_data['y_yes'], train_data.drop(columns=['y_no', 'y_yes'])],
    axis=1,
)

In [None]:
# Run the training job, passing the uploaded CSV input under the 'train' key.
xgb.fit(inputs={'train': s3_input_train})

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance and keep the
# predictor object that deploy() returns.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
# Feature matrix for the test rows: both label columns are removed first.
test_data_array = test_data.drop(columns=['y_no', 'y_yes']).values

# Send requests as CSV text.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# The whole test set goes out in a single predict() call; the returned bytes
# are decoded to text.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode('utf-8')

# Parse the comma-separated values into a float array. The first character of
# the response is skipped, exactly as before.
# NOTE(review): the reason for dropping that character is not visible here - confirm
# against the endpoint's actual response format.
predictions_array = np.fromstring(predictions[1:], sep=',')
print(predictions_array.shape)


In [None]:
from arize.api import Client
from arize.types import ModelTypes

# Credentials are taken from the environment when available, so real keys do not
# have to be pasted into (and saved with) the notebook. The placeholder strings
# remain as the fallback and can still be edited in place.
ORGANIZATION_KEY = os.environ.get('ARIZE_ORGANIZATION_KEY', 'YOUR ARIZE ORGANIZATION KEY')
API_KEY = os.environ.get('ARIZE_API_KEY', 'YOUR ARIZE API KEY')
arize = Client(organization_key=ORGANIZATION_KEY, api_key=API_KEY)

In [None]:
# Feature column labels, in the same order as the columns of test_data_array.
column_names = test_data.drop(columns=['y_no', 'y_yes']).columns

In [None]:
##Generate feature map per prediction

def generate_feature_map(column_names, test_data_array, pred_num):
    """Build the feature dictionary for a single prediction row.

    Args:
        column_names: Iterable of feature names; position ``i`` names column ``i``.
        test_data_array: Row-indexable collection of feature rows.
        pred_num: Index of the row to convert.

    Returns:
        A dict mapping each column name to the ``str()`` of that row's value.
    """
    return {
        col: str(test_data_array[pred_num][idx])
        for idx, col in enumerate(column_names)
    }

In [None]:
import time
import functools

# Latency of each log_prediction call, in milliseconds.
times = []

for pred_num, prediction_val in enumerate(predictions_array):
    # Every 1000 predictions, report the running average latency so far.
    if pred_num > 0 and pred_num % 1000 == 0:
        print(f'{pred_num} predictions fired. {sum(times)/pred_num}ms per call')
    # perf_counter is monotonic, so elapsed time is unaffected by wall-clock adjustments.
    start = time.perf_counter()
    # NOTE(review): prediction_val is the parsed float from the endpoint response;
    # confirm this is the label form expected for ModelTypes.BINARY.
    arize.log_prediction(
        model_id='sage-maker-batch-1',
        model_version='v0.1',
        model_type=ModelTypes.BINARY,
        prediction_id='plED4eERDCasd9797ca35'+str(pred_num),
        prediction_label=prediction_val,
        features=generate_feature_map(column_names, test_data_array, pred_num)
    )
    times.append((time.perf_counter() - start) * 1000)

# sum() handles an empty list, and the guard avoids dividing by zero when there
# were no predictions to send.
total_time = sum(times)
if len(predictions_array) > 0:
    print(f'Total batch took {total_time}ms, each message took on average {total_time/len(predictions_array)}ms')
else:
    print('No predictions were logged.')
