### 1. Create an S3 Bucket

In [None]:
import boto3

# Bucket used by the rest of the notebook, and the region it is created in.
bucket_name = 'yahoofinancestockpricedemo'
bucket_region = 'us-east-2'
s3 = boto3.resource('s3')

# Best-effort creation: any exception is printed instead of being raised,
# so the notebook keeps running.
try:
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': bucket_region},
    )
except Exception as err:
    print('s3 error: ', err)
else:
    print('s3 bucket has been created')


### 2. Load the data for training

#### i) Install the Yahoo Finance package (yfinance)

In [None]:
!pip install yfinance

#### ii) Import the training data

In [None]:
import yfinance as yf
import pandas as pd
from datetime import datetime

# Date range requested from Yahoo Finance for the training data.
start_date = datetime(2021, 1, 1)
end_date = datetime(2023, 1, 1)

# Download the NVDA price history; reset_index() turns the index into a
# regular column.
data = yf.download('NVDA', start=start_date, end=end_date)
data = data.reset_index()
# Fixed: the original called the non-existent `pd.DatafFrame`, which raises
# AttributeError before any data frame is created.
df_data = pd.DataFrame(data)

In [None]:
# Display the downloaded price data.
df_data

### 3. Feature Engineering 

In [None]:
# Remove the 'Date' and 'Adj Close' columns in a single call.
df_data = df_data.drop(columns=['Date', 'Adj Close'])

In [None]:
# Display the frame after the 'Date' and 'Adj Close' columns have been dropped.
df_data

In [None]:
# The target is the following day's 'Open', so the final row has no target:
# keep every row except the last one as the feature set.
df_data_features = df_data.iloc[:-1]

In [None]:
# Display the feature rows (every row of df_data except the last).
df_data_features

In [None]:
# Build the target: each row's target is the next day's 'Open', so take the
# 'Open' column from the second row onwards. Selecting the column by label
# rather than by position 0 keeps this correct if the column order changes.
df_data_target = df_data['Open'].iloc[1:].rename('Target')

In [None]:
# Display the target series (named 'Target', taken from the second row onwards).
df_data_target

In [None]:
# Combine features and target, with 'Target' as the first column.
# Work on a copy: df_data_features is a slice of df_data, and assigning a new
# column to that slice directly triggers pandas' SettingWithCopyWarning. The
# copy also leaves df_data_features itself unchanged.
df_data_final = df_data_features.copy()
# Pass the raw values so rows are paired by position, not by index label
# (the target's index is shifted by one relative to the features). insert()
# raises ValueError if the lengths do not match.
df_data_final.insert(0, 'Target', df_data_target.to_numpy())

In [None]:
# Display the final dataset: 'Target' first, followed by the feature columns.
df_data_final

### 4. Split train set and test set

In [None]:
import numpy as np

# Shuffle every row (frac=1) with a fixed seed so the shuffle is repeatable.
SHUFFLE_SEED = 888
df_randomize = df_data_final.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Display the shuffled dataset.
df_randomize

In [None]:
# Split the shuffled data 80/20 into a train set and a test set.
# Positional slicing replaces np.split, which goes through
# DataFrame.swapaxes (deprecated in pandas 2.1) when handed a DataFrame.
split_at = int(0.8 * len(df_randomize))
train_data = df_randomize.iloc[:split_at]
test_data = df_randomize.iloc[split_at:]
print(train_data.shape, test_data.shape)

### 5. Set path and upload dataset to S3 bucket

In [None]:
import os

# Key prefix under which everything for this demo is stored in the bucket.
prefix = 'xgboost-as-built-in-algorithm'

# Full S3 URIs of the train and test CSV files.
train_path = f's3://{bucket_name}/{prefix}/train/train.csv'
test_path = f's3://{bucket_name}/{prefix}/test/test.csv'
print(train_path)
print(test_path)

In [None]:
# Write both splits to their S3 paths as CSV, without the index and without
# a header row.
for frame, s3_uri in ((train_data, train_path), (test_data, test_path)):
    frame.to_csv(s3_uri, index=False, header=False)

### 6. Build up XGBoost Model

#### i) Import relevant libraries

In [None]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

#### ii) Find an XGBoost image URI and build up an XGBoost container

In [None]:
# Look up the XGBoost (version '1.2-2') image URI for the region of the
# current boto3 session, then show it.
region = boto3.Session().region_name
container = image_uris.retrieve(framework='xgboost', region=region, version='1.2-2')
display(container)

#### iii) Initialize the hyperparameters

In [None]:
# Hyperparameters handed to the XGBoost training job. The values keep the
# original mix of strings and integers.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    early_stopping_rounds=10,
    num_round=1000,
)

#### iv) Set an output path where the trained model will be saved

In [None]:
# S3 location passed to the estimator as its output path.
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

#### v) Construct a SageMaker Estimator to call the container

In [None]:
# Configure the training job that runs the XGBoost container.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    # Fixed: the keyword was misspelled `volum_size`, so the intended 5 GB
    # setting was not passed under the name the Estimator expects.
    volume_size=5,  # 5 GB
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

#### vi) Define the data type and paths to the training and test datasets

In [None]:
# Both channels are CSV; each one points at the S3 folder that holds its file.
input_type = 'csv'
train_input = TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/train', content_type=input_type)
test_input = TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/test', content_type=input_type)

#### vii) Execute the XGBoost training job

In [117]:
# Start the training job; the test split is supplied as the 'validation' channel.
estimator.fit({'train': train_input, 'validation': test_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-07-24-11-11-44-821


2023-07-24 11:11:44 Starting - Starting the training job...
2023-07-24 11:12:10 Starting - Preparing the instances for training.........
2023-07-24 11:13:41 Downloading - Downloading input data
2023-07-24 11:13:41 Training - Downloading the training image......
2023-07-24 11:14:32 Training - Training image download completed. Training in progress...[34m[2023-07-24 11:14:48.713 ip-10-0-130-131.us-east-2.compute.internal:6 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-07-24:11:14:48:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-07-24:11:14:48:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-07-24:11:14:48:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-07-24:11:14:48:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-07-24:11:14:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[2023-07-24:11:14:48:INFO] Determi

#### viii) Deploy the trained XGBoost model as an endpoint

In [118]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained model to a single-instance endpoint whose predictor
# serializes its input with CSVSerializer.
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-07-24-11-16-07-124
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-07-24-11-16-07-124
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-07-24-11-16-07-124


-------------------!

In [146]:
xgb_predictor.endpoint_name # name of the deployed endpoint; the Lambda handler cell hard-codes this value as ENDPOINT_NAME

'sagemaker-xgboost-2023-07-24-11-16-07-124'

#### ix) Make predictions using the endpoint

In [149]:
# Date window for the rows to predict on.
start = datetime(2023, 7, 25)
end = datetime(2023, 7, 26)

# Download the data, move the index into a column, and drop 'Adj Close' and
# 'Date' to reduce dimensionality.
df_data = (
    yf.download('NVDA', start=start, end=end)
    .reset_index()
    .drop(columns=['Adj Close', 'Date'])
)

df_data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Open,High,Low,Close,Volume
0,447.309998,451.089996,440.400085,446.119995,36667890


In [160]:
# Convert the feature frame to a NumPy array and display it.
data_features_arr = df_data.to_numpy()
data_features_arr

array([[4.47309998e+02, 4.51089996e+02, 4.40399994e+02, 4.46119995e+02,
        3.82516000e+07]])

### 7. Serialize data

#### i) Inference - Input serialized by a SageMaker function

In [121]:
from sagemaker.serializers import CSVSerializer

# Serialize one hard-coded feature row with the SageMaker CSV serializer.
sample_rows = [[1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02, 4.01277000e+07]]
Serialized_Input_Data = CSVSerializer().serialize(sample_rows)
print(Serialized_Input_Data, type(Serialized_Input_Data))

# Send the serialized row to the endpoint and decode the response as UTF-8.
raw_prediction = xgb_predictor.predict(Serialized_Input_Data)
y_pred_func = raw_prediction.decode('utf-8')
print(y_pred_func) # show serialized output

148.509995,149.960007,140.960007,143.149994,40127700.0 <class 'str'>
143.09617614746094



#### ii) Inference - Input serialized by a built-in function (Lambda-function friendly)

In [158]:
# Two hard-coded feature rows to send to the endpoint, one request per row.
Inputs = [[1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02,
        4.01277000e+07],
        [1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02,
        4.01277000e+07]]
y_pred = []
# `row` instead of `input`: at notebook scope the original loop variable
# replaced the built-in input() for every cell that runs afterwards.
for row in Inputs:
    # Build the CSV line with plain Python (str.join) instead of CSVSerializer.
    Serialized_Input = ','.join(map(str, row))
    print(Serialized_Input)
    y_pred.append(xgb_predictor.predict(Serialized_Input).decode('utf-8'))
y_pred # show serialized output

148.509995,149.960007,140.960007,143.149994,40127700.0
148.509995,149.960007,140.960007,143.149994,40127700.0


['143.09617614746094\n', '143.09617614746094\n']

### 8. Create Lambda function handler

In [144]:
import boto3

# Client used by the handler to invoke the endpoint.
runtime = boto3.client('runtime.sagemaker')
# Name of the endpoint the handler invokes.
ENDPOINT_NAME = 'sagemaker-xgboost-2023-07-24-11-16-07-124'

def lambda_handler(event, context):
    """Return one prediction string per feature row in ``event['data']``.

    Each row is joined into a comma-separated string and sent to the endpoint
    named by ``ENDPOINT_NAME`` as ``text/csv``, one request per row.

    Args:
        event: Mapping with a ``'data'`` key holding an iterable of feature
            rows; each row is an iterable of values.
        context: Not used by this handler.

    Returns:
        list[str]: The endpoint's response body for each row, decoded as
        UTF-8 with surrounding whitespace removed.

    Raises:
        KeyError: If ``event`` has no ``'data'`` key.
    """
    result = []

    # `row` rather than `input`, so the built-in input() is not shadowed.
    for row in event['data']:
        serialized_row = ','.join(map(str, row))

        response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                           ContentType='text/csv',
                                           Body=serialized_row)

        # strip() removes the trailing newline when there is one. The original
        # [:-1] slice removed the last character unconditionally, which would
        # cut a digit off any response that does not end in a newline.
        result.append(response['Body'].read().decode('utf-8').strip())

    return result

In [145]:
# Local smoke test: call the handler with an event holding three feature rows.
Input_json = {'data':[
    [1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02,4.01277000e+07],
    [1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02,4.01277000e+07],
    [1.48509995e+02, 1.49960007e+02, 1.40960007e+02, 1.43149994e+02,4.01277000e+07]
    ]
}
# The handler never reads `context`, so pass None. The original passed `_`,
# IPython's "last output" variable, whose value depends on which cell
# happened to run before this one.
result = lambda_handler(Input_json, None)
result

['143.09617614746094', '143.09617614746094', '143.09617614746094']