In [75]:
# !pip install smart_open
import boto3
import sagemaker
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.session import Session
## data preprocessing libraries
import pandas as pd
import s3fs
from sklearn.model_selection import train_test_split
from smart_open import open as s_open
# Hyperparameters handed to the XGBoost training job; every value is kept as a string.
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
}

# One session serves both lookups (default bucket and region) instead of
# constructing a separate Session object for each of them.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Everything for this experiment lives under `prefix` in the default bucket;
# `output_path` is the S3 location where the trained model will be saved.
prefix = 'sample-xgboost-prediction'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'stock-volume-xgb-framework')

In [76]:
def load_dfs(folder='stock_data_raw_with_3_mo_target', max_files=100):
    """Load CSV files from a folder of the default SageMaker bucket into one DataFrame.

    Every listed file is read with pandas, rows containing any missing value are
    dropped, and the per-file frames are concatenated with a fresh integer index.
    A file that fails to load is reported through ``display`` and skipped, so a
    single bad object does not abort the whole load.

    Parameters
    ----------
    folder : str
        Key prefix, relative to the default bucket, whose entries are listed.
    max_files : int or None
        Upper bound on how many of the listed entries are read. ``None`` reads
        all of them. Defaults to 100, the limit that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all successfully loaded files.

    Raises
    ------
    ValueError
        If not a single file could be loaded.
    """
    bucket = sagemaker.Session().default_bucket()
    s3f = s3fs.S3FileSystem()
    f_names = s3f.ls(f'{bucket}/{folder}')
    display('number of files', len(f_names))
    dfs = list()
    for file_name in f_names[:max_files]:
        try:
            # Listing entries are treated as "bucket/key" paths, so only the
            # scheme is prepended. dropna() returns a new frame (no in-place edit).
            dfs.append(pd.read_csv(f's3://{file_name}').dropna())
        except Exception as e:
            # Best effort: show which entry failed and why, then keep going.
            display(file_name, e)
    display(len(dfs))
    if not dfs:
        # pd.concat([]) would raise a generic ValueError; say what actually went wrong.
        raise ValueError(f'no files could be loaded from {bucket}/{folder}')
    return pd.concat(dfs, ignore_index=True)
def save_df_to_split(df, random_state=None):
    """Split ``df`` into train/test/validation sets and write each one to S3 as CSV.

    The frame is first split into a working set and a test set, then the working
    set is split again into train and validation, each time with scikit-learn's
    default split fraction. The files are written to
    ``s3://{bucket}/{prefix}/<name>/<name>.csv`` for ``train``, ``test`` and
    ``validation``, using the module-level ``bucket`` and ``prefix``.

    The CSVs are written without a header row and without the DataFrame index,
    because they are consumed by the built-in SageMaker XGBoost algorithm, which
    expects header-less CSV. Previously the index became the first column and
    the column names became the first row.

    NOTE(review): the built-in algorithm also expects the label in the first
    column and numeric values only. Columns are written in the order they appear
    in ``df`` -- confirm the caller arranges them accordingly.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split and upload.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls. ``None`` (default)
        keeps the original non-deterministic shuffling.
    """
    tr, test = train_test_split(df, random_state=random_state)
    train, val = train_test_split(tr, random_state=random_state)
    # Same write order and object keys as before: train, test, validation.
    for name, part in (('train', train), ('test', test), ('validation', val)):
        part.to_csv(f's3://{bucket}/{prefix}/{name}/{name}.csv', index=False, header=False)

In [77]:
# Build the combined frame from the CSV files in S3, show it, then write the
# train/test/validation CSV splits back to S3 (both helpers are defined in the
# cell above).
df = load_dfs()
display(df)
save_df_to_split(df)

'number of files'

2810

100

Unnamed: 0.1,Unnamed: 0,low_window_20_shift_0_mean_percent_change_percent_lag_90_shift_-90,open,high,low,close,volume
0,2008-06-06,-0.378243,15.90,16.01,15.20,15.20,16376
1,2008-06-09,-0.390225,14.52,14.69,13.45,14.69,62727
2,2008-06-10,-0.402747,14.33,14.68,13.75,14.39,31635
3,2008-06-11,-0.415708,14.26,14.31,14.00,14.19,4180
4,2008-06-12,-0.427644,14.15,14.40,14.15,14.31,7908
...,...,...,...,...,...,...,...
189419,2021-03-26,0.005865,103.13,111.48,103.13,111.05,278118
189420,2021-03-29,0.005850,109.31,110.47,106.03,107.53,375254
189421,2021-03-30,0.007359,106.27,108.23,103.54,105.98,389537
189422,2021-03-31,0.006390,107.98,110.56,107.09,109.17,439153


In [78]:
# Look up the URI of the XGBoost image (version "1.2-1") for this region.
# This only resolves the image location; nothing is built here.
xgboost_container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.2-1")

# Collect the estimator settings in one place, then construct the estimator
# that runs the XGBoost container.
estimator_settings = dict(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
)
estimator = sagemaker.estimator.Estimator(**estimator_settings)

# Input channels: both point at S3 prefixes under `prefix` and are declared as CSV.
content_type = "csv"
channel_uri = "s3://{}/{}/{}/"
train_input = TrainingInput(channel_uri.format(bucket, prefix, 'train'), content_type=content_type)
validation_input = TrainingInput(channel_uri.format(bucket, prefix, 'validation'), content_type=content_type)

# Launch the training job with the train and validation channels.
channels = {'train': train_input, 'validation': validation_input}
estimator.fit(channels)

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210310T221946 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


2021-04-20 18:01:45 Starting - Starting the training job...
2021-04-20 18:01:47 Starting - Launching requested ML instancesProfilerReport-1618941705: InProgress
......
2021-04-20 18:03:00 Starting - Preparing the instances for training......
2021-04-20 18:04:02 Downloading - Downloading input data
2021-04-20 18:04:02 Training - Downloading the training image...
2021-04-20 18:04:40 Training - Training image download completed. Training in progress.[34m[2021-04-20 18:04:28.713 ip-10-0-90-166.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34m

In [None]:
# Deploy the trained model to a real-time endpoint on a single ml.t2.medium instance.
# CSVSerializer makes predictor.predict() send its payload as text/csv; without an
# explicit serializer the request is not sent as CSV, which the XGBoost container
# expects for tabular rows.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer(),
)

--------

In [None]:
# Score a payload against the deployed endpoint.
# NOTE(review): this cell was left unfinished -- `data=` had no right-hand side
# (a SyntaxError) and predict() was called without its required payload argument.
# TODO: set `data` to the feature rows to score (presumably the same columns as
# the training CSV minus the label column -- confirm).
data = None
# Only call the endpoint once a payload has been supplied, so that
# "Restart & Run All" does not fail on this cell in the meantime.
predictions = predictor.predict(data) if data is not None else None
predictions