This notebook contains the code for the following steps:
1. Feature normalisation (per-item min-max scaling)
2. Model training using SageMaker
3. Model deployment as a SageMaker real-time predictor endpoint
4. Forecast creation for future dates using the deployed model

In [None]:
import sagemaker
import pandas as pd
from datetime import datetime
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

from ag_model import (
    AutoGluonSagemakerEstimator,
    AutoGluonNonRepackInferenceModel,
    AutoGluonRepackInferenceModel,
    AutoGluonSagemakerInferenceModel,
    AutoGluonRealtimePredictor,
    AutoGluonBatchPredictor,
)
from sagemaker import utils
from sagemaker.serializers import CSVSerializer
import os
import boto3

# Execution role handed to the SageMaker estimator and inference model.
role = sagemaker.get_execution_role()

# boto3 session: source of the region name and of the raw S3 client.
session = boto3.Session()
region = session.region_name
s3_client = session.client("s3")

# SageMaker session and the default bucket it resolves for this account/region.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
bucket

#### initialize variables

In [2]:
# Timestamped S3 key prefix for this run, and the output location derived from it.
s3_prefix = f"autogluon_sm/{utils.sagemaker_timestamp()}"
output_path = f"s3://{bucket}/{s3_prefix}/output/"

# Covariate columns sent to the endpoint for the forecast horizon.
known_covariates = [
    'listing_price_mean',
    'net_unit_price',
    'promo_flag',
    'holiday_flag',
    'weekend_flag',
    'holiday_ahead',
    'promo_ahead',
]

# Forecast horizon in time steps; also drives the minimum-history filter below.
prediction_length = 90
start_date = '2020-01-01'
# test_start_date="2023-11-03"
# test_end_date="2024-02-01"
TARGET = 'target'

# Trial bookkeeping. NOTE(review): model_path is an absolute, user-specific path.
trial_name = 'autogluon_ensemble_trial_1'
model_path = f'/home/sagemaker-user/Homepro_AWS_competancy/models/{trial_name}'
# NOTE(review): presumably seconds; not referenced in the cells shown here - confirm.
TRAIN_TIME_LIMIT = 12000

#### data import and preprocess data

In [3]:
# Load the aggregated sales history and rename the date/target columns to
# 'timestamp' / 'target', the names the rest of the notebook works with.
column_renames = {'transaction_date': 'timestamp', 'sales_quantity_sum': 'target'}
dataset = pd.read_csv('../data/final_aggregated_data.csv').rename(columns=column_renames)
dataset

Unnamed: 0,item_id,timestamp,target,net_revenue_sum,listing_price_mean,net_unit_price,promo_flag,holiday_flag,weekend_flag,holiday_ahead,promo_ahead,item_height,item_width,item_length,item_weight,article_category
0,ITEM0001,2020-01-01,640.0,35200.00,55.000000,55.000000,True,True,False,False,True,29.53,82.49,89.19,4.01,Electronics
1,ITEM0001,2020-01-02,838.0,46090.00,55.000000,55.000000,True,False,False,False,True,29.53,82.49,89.19,4.01,Electronics
2,ITEM0001,2020-01-03,956.0,52580.00,55.000000,55.000000,True,False,False,False,True,29.53,82.49,89.19,4.01,Electronics
3,ITEM0001,2020-01-04,1467.0,80685.00,55.000000,55.000000,True,False,True,False,True,29.53,82.49,89.19,4.01,Electronics
4,ITEM0001,2020-01-05,1372.0,75456.47,55.007605,54.997427,True,False,True,False,True,29.53,82.49,89.19,4.01,Electronics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122842,ITEM0100,2024-01-27,42.0,56700.00,1350.000000,1350.000000,True,False,True,False,False,92.30,16.26,81.72,17.57,Electronics
122843,ITEM0100,2024-01-28,55.0,74250.00,1350.000000,1350.000000,False,False,True,False,False,92.30,16.26,81.72,17.57,Electronics
122844,ITEM0100,2024-01-29,46.0,62100.00,1350.000000,1350.000000,False,False,False,False,False,92.30,16.26,81.72,17.57,Electronics
122845,ITEM0100,2024-01-30,32.0,43200.00,1350.000000,1350.000000,False,False,False,False,False,92.30,16.26,81.72,17.57,Electronics


In [4]:
# Per-item metadata (dimensions, weight, category); used later as static features.
cat_dataset = pd.read_csv('../data/article_metadata.csv')
cat_dataset

Unnamed: 0,item_id,item_height,item_width,item_length,item_weight,article_category
0,ITEM0001,29.53,82.49,89.19,4.01,Electronics
1,ITEM0002,69.36,2.95,19.64,18.04,Toys
2,ITEM0003,86.38,59.71,34.78,6.12,Home
3,ITEM0004,83.84,78.03,89.58,15.34,Clothing
4,ITEM0005,16.68,44.55,64.05,17.96,Electronics
...,...,...,...,...,...,...
495,ITEM0496,50.30,63.68,84.41,8.84,Sports
496,ITEM0497,46.07,78.31,48.13,5.50,Home
497,ITEM0498,24.89,19.40,77.16,0.74,Electronics
498,ITEM0499,5.12,90.64,37.67,6.44,Home


In [5]:
# Static item attributes are supplied separately (from cat_dataset), so any copies
# of them present in the time-varying frame are dropped here.
static_feature_cols = ['item_height','item_width','item_length','item_weight','article_category']
dataset = dataset.drop(columns=static_feature_cols, errors="ignore")

# Parse timestamps and turn boolean flags into 0/1 integers.
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'])
bool_cols = dataset.select_dtypes(include=['bool']).columns
dataset = dataset.astype({col: int for col in bool_cols})
dataset.dtypes

item_id                       object
timestamp             datetime64[ns]
target                       float64
net_revenue_sum              float64
listing_price_mean           float64
net_unit_price               float64
promo_flag                     int64
holiday_flag                   int64
weekend_flag                   int64
holiday_ahead                  int64
promo_ahead                    int64
dtype: object

In [6]:
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import pickle
import os
current_dir = os.getcwd()
# s3_prefix is deliberately NOT regenerated here: a second sagemaker_timestamp()
# call would give a different prefix from the one output_path was built with in
# the configuration cell, scattering this run's artefacts over two S3 prefixes.

# Work on a copy so the unscaled `dataset` stays available.
input_df = dataset.copy()

# Columns that are min-max scaled independently for every item.
scale_cols = ['target', 'net_revenue_sum', 'net_unit_price', 'listing_price_mean']

# scalers[item_id][column] -> fitted MinMaxScaler; reused later to scale the
# future covariates and to map forecasts back to original units.
scalers = {}

for item_id, group_df in tqdm(input_df.groupby('item_id'), desc="Scaling Groups"):
    scalers[item_id] = {}
    for col in scale_cols:
        scaler = MinMaxScaler()
        scaled_values = scaler.fit_transform(group_df[[col]])
        # Write back through the original row labels of this item's group.
        input_df.loc[group_df.index, col] = scaled_values.flatten()
        scalers[item_id][col] = scaler

# upload_file expects an object *key* relative to the bucket, not an s3:// URI.
# Passing the URI as the key would store the pickle under a key that literally
# starts with "s3://<bucket>/...", so keep the key and the URI separate.
scalers_s3_key = f"{s3_prefix}/scalers.pkl"
scalers_s3_path = f's3://{bucket}/{scalers_s3_key}'
scalers_local_path = f"{current_dir}/scalers.pkl"

# Persist the scalers locally, then push the same file to S3.
with open(scalers_local_path, 'wb') as file:
    pickle.dump(scalers, file)

s3_client.upload_file(scalers_local_path, bucket, scalers_s3_key)

# Display the scaled DataFrame
input_df

Scaling Groups:   0%|          | 0/100 [00:00<?, ?it/s]

Scaling Groups: 100%|██████████| 100/100 [00:00<00:00, 114.54it/s]


Unnamed: 0,item_id,timestamp,target,net_revenue_sum,listing_price_mean,net_unit_price,promo_flag,holiday_flag,weekend_flag,holiday_ahead,promo_ahead
0,ITEM0001,2020-01-01,0.117108,0.105665,0.000787,0.044654,1,1,0,0,1
1,ITEM0001,2020-01-02,0.212530,0.191762,0.000787,0.044654,1,0,0,0,1
2,ITEM0001,2020-01-03,0.269398,0.243072,0.000787,0.044654,1,0,0,0,1
3,ITEM0001,2020-01-04,0.515663,0.465273,0.000787,0.044654,1,0,1,0,1
4,ITEM0001,2020-01-05,0.469880,0.423935,0.002306,0.044163,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
122842,ITEM0100,2024-01-27,0.339286,0.334207,0.000000,0.066667,1,0,1,0,0
122843,ITEM0100,2024-01-28,0.455357,0.449168,0.000000,0.066667,0,0,1,0,0
122844,ITEM0100,2024-01-29,0.375000,0.369579,0.000000,0.066667,0,0,0,0,0
122845,ITEM0100,2024-01-30,0.250000,0.245775,0.000000,0.066667,0,0,0,0,0


#### convert to timeseries df

In [38]:
# Build the AutoGluon time-series frame from the scaled long-format data.
ts = TimeSeriesDataFrame.from_data_frame(input_df)
# Attach the per-item metadata as static features, indexed by item_id.
ts.static_features = cat_dataset.set_index("item_id")
ts = ts.sort_index()
print("converted to timeseries df")

converted to timeseries df


In [39]:
# Training window ends at max_date (inclusive); an item is kept only when it has
# at least 2 * prediction_length + 1 rows inside that window.
max_date = '2024-01-31'
min_count = 2 * prediction_length + 1

timestamps = ts.index.get_level_values("timestamp")
train_data = ts[timestamps <= max_date]

# Rows per item (index level 0) after the date cut-off.
train_item_counts = train_data.groupby(level=0).size()
items_with_enough_history = train_item_counts[train_item_counts >= min_count].index

keep_rows = train_data.index.get_level_values(0).isin(items_with_enough_history)
train_data = train_data[keep_rows]


In [40]:
# Flatten the index back into columns and write the training set to disk;
# this CSV is what gets uploaded to S3 for the training job.
df_train = train_data.reset_index()
df_train.to_csv("../data/train.csv",index = False)

In [41]:
# Sanity check of the column types written to train.csv.
df_train.dtypes

item_id                       object
timestamp             datetime64[ns]
target                       float64
net_revenue_sum              float64
listing_price_mean           float64
net_unit_price               float64
promo_flag                     int64
holiday_flag                   int64
weekend_flag                   int64
holiday_ahead                  int64
promo_ahead                    int64
dtype: object

In [42]:
# Restrict the item metadata to the items that survived the training filter and
# write it out as the static-feature file for the training job.
train_item_list = train_data.index.get_level_values(0).unique().tolist()
is_train_item = cat_dataset["item_id"].isin(train_item_list)
static_features = cat_dataset.loc[is_train_item]
static_features.to_csv("../data/static_feats.csv", index=False)

### Training

In [43]:
# SageMaker estimator (project-local wrapper from ag_model) that runs
# scripts/deepar_train.py on a single ml.c5.2xlarge instance.
# Profiler and debugger hooks are switched off.
# NOTE(review): dependencies=['config'] presumably ships the local config/
# directory alongside the training script - confirm against ag_model.
ag = AutoGluonSagemakerEstimator(
    role=role,
    entry_point="deepar_train.py",
    source_dir = "scripts",
    region=region,
    instance_count=1,
    instance_type="ml.c5.2xlarge",
    framework_version="1.0",
    py_version="py310",
    base_job_name="autogluon-deepar-train",
    disable_profiler=True,
    debugger_hook_config=False,
    dependencies = ['config']
)

In [44]:
def _upload_to_run_prefix(*path_parts):
    """Upload one local file under this run's s3_prefix and return its S3 URI."""
    return ag.sagemaker_session.upload_data(
        path=os.path.join(*path_parts), key_prefix=s3_prefix
    )


# Training channels: scaled history, static item features and the model config.
train_input = _upload_to_run_prefix("../data", "train.csv")
static_feat_input = _upload_to_run_prefix("../data", "static_feats.csv")
# eval_input = ag.sagemaker_session.upload_data(
#     path=os.path.join("data", "test.csv"), key_prefix=s3_prefix
# )
config_input = _upload_to_run_prefix("config", "config-med.yaml")

# Provide inference script so the script repacking is not needed later
# See more here: https://docs.aws.amazon.com/sagemaker/latest/dg/mlopsfaq.html
# Q. Why do I see a repack step in my SageMaker pipeline?
inference_script = _upload_to_run_prefix("scripts", "deepar_serve.py")

#### Fit the model

In [45]:
%time
job_name = utils.unique_name_from_base("test-autogluon-image")
ag.fit(
    {
        "config": config_input,
        "train": train_input,
        "static": static_feat_input,
        #"test": eval_input,
        "serving": inference_script,
    },
    job_name=job_name,
)

INFO:sagemaker:Creating training-job with name: test-autogluon-image-1718778021-6430


CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 6.91 μs
2024-06-19 06:20:21 Starting - Starting the training job...
2024-06-19 06:20:38 Starting - Preparing the instances for training...
2024-06-19 06:20:59 Downloading - Downloading input data...
2024-06-19 06:21:25 Downloading - Downloading the training image...
2024-06-19 06:22:10 Training - Training image download completed. Training in progress..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-06-19 06:22:20,653 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-06-19 06:22:20,653 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-19 06:22:20,654 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-06-19 06:22:20,662 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-06-19 06:22:20,664 s

#### model export and deploying endpoint

In [46]:
# Copy the trained model archive produced by the training job to the local
# working directory ({ag.model_data} is interpolated by IPython).
!aws s3 cp {ag.model_data} ./scaled_feats_model.tar.gz

download: s3://sagemaker-ap-southeast-1-039116967608/test-autogluon-image-1718778021-6430/output/model.tar.gz to ./scaled_feats_model.tar.gz


In [47]:
# NOTE(review): endpoint_name is only used below as an S3 key prefix; it is not
# passed to model.deploy(), so the endpoint gets an auto-generated name.
endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-autogluon-serving-trained-model")

# Re-upload the locally downloaded model archive under the endpoint-specific prefix.
# NOTE(review): the archive was just copied down from ag.model_data, so this is a
# download/upload round trip - confirm whether ag.model_data could be used directly.
model_data = sagemaker_session.upload_data(
    path=os.path.join(".", "scaled_feats_model.tar.gz"), key_prefix=f"{endpoint_name}/models"
)

# Instance type shared by the model definition and the deployment.
instance_type = "ml.m5.xlarge"


In [48]:
# Inference model definition (project-local wrapper from ag_model) pointing at
# the staged model archive, with scripts/deepar_serve.py as the serving entry point.
# The same config file used for training is listed as a dependency.
model = AutoGluonNonRepackInferenceModel(
    model_data=model_data,
    role=role,
    region=region,
    framework_version="1.0",
    py_version="py310",
    instance_type=instance_type,
    entry_point="scripts/deepar_serve.py",
    #config_file_path = config_input,
    dependencies = ['config/config-med.yaml'],
)

In [49]:
# Create a single-instance real-time endpoint that is configured with a CSV serializer.
# No endpoint name is passed, so one is generated automatically.
model.deploy(initial_instance_count=1, serializer=CSVSerializer(), instance_type=instance_type)

INFO:sagemaker:Creating model with name: autogluon-inference-2024-06-19-06-26-06-273


{'SAGEMAKER_PROGRAM': 'deepar_serve.py', 'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code', 'SAGEMAKER_CONTAINER_LOG_LEVEL': '20', 'SAGEMAKER_REGION': 'ap-southeast-1'}


INFO:sagemaker:Creating endpoint-config with name autogluon-inference-2024-06-19-06-26-06-957
INFO:sagemaker:Creating endpoint with name autogluon-inference-2024-06-19-06-26-06-957


------!

### Predict on future test data

In [50]:
# Client for the endpoint created by model.deploy() above.
predictor = AutoGluonRealtimePredictor(model.endpoint_name)

#### Read data

In [66]:
# Future covariates for the forecast horizon, renamed to the column names used
# during training.
# NOTE(review): absolute, user-specific path - consider a path relative to the data dir.
covariate_renames = {'transaction_date': 'timestamp', 'listing_price': 'listing_price_mean'}
data = pd.read_parquet(
    "/home/sagemaker-user/Homepro_AWS_competancy/data/known_covariates_final.parquet"
).rename(columns=covariate_renames)

# Same type handling as for the training data: parsed timestamps, 0/1 flags.
data['timestamp'] = pd.to_datetime(data['timestamp'])
bool_cols = data.select_dtypes(include=['bool']).columns
data = data.astype({col: int for col in bool_cols})
data

Unnamed: 0,item_id,listing_price_mean,net_unit_price,timestamp,holiday_flag,promo_flag,weekend_flag,holiday_ahead,promo_ahead
0,ITEM0001,57.0,54.6,2024-01-31,0,0,0,0,0
1,ITEM0001,57.0,54.6,2024-02-01,0,0,0,0,0
2,ITEM0001,57.0,54.6,2024-02-02,0,0,0,0,0
3,ITEM0001,57.0,54.6,2024-02-03,0,0,1,0,0
4,ITEM0001,57.0,54.6,2024-02-04,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
18095,ITEM0100,1420.0,1269.0,2024-07-25,0,0,0,0,0
18096,ITEM0100,1420.0,1269.0,2024-07-26,0,0,0,0,0
18097,ITEM0100,1420.0,1269.0,2024-07-27,0,0,1,1,0
18098,ITEM0100,1420.0,1269.0,2024-07-28,1,0,1,1,0


In [67]:
# Sanity check: flag columns should be integers, timestamp should be datetime.
data.dtypes

item_id                       object
listing_price_mean           float64
net_unit_price               float64
timestamp             datetime64[ns]
holiday_flag                   int64
promo_flag                     int64
weekend_flag                   int64
holiday_ahead                  int64
promo_ahead                    int64
dtype: object

#### apply the per-item scalers fitted on the training data to the price fields

In [68]:
scale_cols = ['target', 'net_revenue_sum', 'net_unit_price', 'listing_price_mean']

# Only the scaled columns that actually exist in the covariate frame.
reqd_cols = [col for col in data.columns if col in scale_cols]

# Re-use the scalers fitted on the training history (transform only, no refit),
# so future covariates are on the same scale the model was trained on. Values
# outside an item's training range therefore fall outside [0, 1].
for item_id, group_df in tqdm(data.groupby('item_id'), desc="Scaling fields"):
    if item_id not in scalers:
        # Fail loudly with context instead of a bare KeyError from the lookup.
        raise KeyError(f"No fitted scalers for item_id {item_id!r}; it was not part of the scaled training data")
    for col in reqd_cols:
        scaler = scalers[item_id][col]
        # Missing values are treated as 0 before scaling. Build a fresh frame
        # rather than assigning into the groupby slice, which only modifies a
        # temporary and can emit SettingWithCopyWarning.
        filled = group_df[[col]].fillna(0)
        scaled_values = scaler.transform(filled)
        data.loc[group_df.index, col] = scaled_values.flatten()
data

Scaling fields: 100%|██████████| 100/100 [00:00<00:00, 305.15it/s]


Unnamed: 0,item_id,listing_price_mean,net_unit_price,timestamp,holiday_flag,promo_flag,weekend_flag,holiday_ahead,promo_ahead
0,ITEM0001,0.400472,-0.031773,2024-01-31,0,0,0,0,0
1,ITEM0001,0.400472,-0.031773,2024-02-01,0,0,0,0,0
2,ITEM0001,0.400472,-0.031773,2024-02-02,0,0,0,0,0
3,ITEM0001,0.400472,-0.031773,2024-02-03,0,0,1,0,0
4,ITEM0001,0.400472,-0.031773,2024-02-04,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
18095,ITEM0100,1.000000,-1.013333,2024-07-25,0,0,0,0,0
18096,ITEM0100,1.000000,-1.013333,2024-07-26,0,0,0,0,0
18097,ITEM0100,1.000000,-1.013333,2024-07-27,0,0,1,1,0
18098,ITEM0100,1.000000,-1.013333,2024-07-28,1,0,1,1,0


In [69]:
# Keep only the identifier columns plus the known covariates, in that order.
model_input_cols = ['item_id', 'timestamp', *known_covariates]
data = data[model_input_cols]
data

Unnamed: 0,item_id,timestamp,listing_price_mean,net_unit_price,promo_flag,holiday_flag,weekend_flag,holiday_ahead,promo_ahead
0,ITEM0001,2024-01-31,0.400472,-0.031773,0,0,0,0,0
1,ITEM0001,2024-02-01,0.400472,-0.031773,0,0,0,0,0
2,ITEM0001,2024-02-02,0.400472,-0.031773,0,0,0,0,0
3,ITEM0001,2024-02-03,0.400472,-0.031773,0,0,1,0,0
4,ITEM0001,2024-02-04,0.400472,-0.031773,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
18095,ITEM0100,2024-07-25,1.000000,-1.013333,0,0,0,0,0
18096,ITEM0100,2024-07-26,1.000000,-1.013333,0,0,0,0,0
18097,ITEM0100,2024-07-27,1.000000,-1.013333,0,0,1,1,0
18098,ITEM0100,2024-07-28,1.000000,-1.013333,0,1,1,1,0


In [70]:
# Send the scaled covariate frame to the endpoint with a text/csv content type.
# NOTE(review): only covariates are sent; where the serving script gets the
# target history from is not visible in this notebook - confirm in deepar_serve.py.
output = predictor.predict(data,initial_args={'ContentType': 'text/csv'})

In [71]:
# The first element of the response is used as the header row, the rest as data rows.
# NOTE(review): assumes `output` is a sequence of rows - verify against AutoGluonRealtimePredictor.
df_result = pd.DataFrame(output[1:], columns=output[0])
df_result

Unnamed: 0,item_id,timestamp,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,ITEM0001,2024-02-01,0.291988268494606,0.21117580384016038,0.24078474342823028,0.26175227761268616,0.27796117067337034,0.291988268494606,0.30285040736198426,0.3283840805292129,0.34933605790138245,0.3805624932050705
1,ITEM0001,2024-02-02,0.36673758924007416,0.26374093890190126,0.30494551062583924,0.3313627392053604,0.346591854095459,0.36673758924007416,0.390872859954834,0.4089500278234482,0.4280054390430451,0.4625754922628403
2,ITEM0001,2024-02-03,0.4353574961423874,0.3159086525440216,0.3649071156978607,0.39573778212070465,0.42088884115219116,0.4353574961423874,0.4638036787509918,0.49186643362045285,0.5247165203094483,0.5942060649394989
3,ITEM0001,2024-02-04,0.49144598841667175,0.35284078419208526,0.40349634289741515,0.4351483225822449,0.46380217671394347,0.49144598841667175,0.524494731426239,0.5490080893039704,0.5764159440994263,0.6214187264442443
4,ITEM0001,2024-02-05,0.4012364447116852,0.2941871166229248,0.330983829498291,0.36289151906967165,0.38264173865318296,0.4012364447116852,0.4272363305091858,0.44382139742374416,0.46317067742347723,0.49974214732646943
...,...,...,...,...,...,...,...,...,...,...,...,...
8995,ITEM0100,2024-04-26,0.04365904815495014,-0.01830563899129629,0.00634605456143618,0.022942851297557354,0.03244855031371117,0.04365904815495014,0.05838563516736032,0.07566035911440849,0.10456822365522389,0.15275176316499708
8996,ITEM0100,2024-04-27,0.06858561560511589,0.002674113644752655,0.018881649151444437,0.03728825002908707,0.051238560676574715,0.06858561560511589,0.08697901219129563,0.10867150798439978,0.15086803138256075,0.22542074471712112
8997,ITEM0100,2024-04-28,0.06325497105717659,-0.006100525893270967,0.016846843063831333,0.033527249470353127,0.04814457371830941,0.06325497105717659,0.08132029026746751,0.1089185670018196,0.15588147044181835,0.22254142314195632
8998,ITEM0100,2024-04-29,0.04203391447663307,-0.015284265018999574,0.007404097821563486,0.0216066824272275,0.030341717973351483,0.04203391447663307,0.06233948022127152,0.0779719218611717,0.10812685191631319,0.1629818871617317


In [72]:
item_list = df_result['item_id'].unique().tolist()

# Forecast columns returned by the endpoint: the mean plus the 0.1 ... 0.9 quantiles.
cols = ['mean', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']

# The frame is built from the raw endpoint response, so the forecast columns may
# not be numeric dtype yet; cast once so the descaled columns end up as floats.
df_result[cols] = df_result[cols].apply(pd.to_numeric)

# Map every forecast column back to original units with the item's 'target'
# scaler (all forecast columns are on the scaled-target scale).
# NOTE: this rewrites df_result in place, so re-running the cell without
# rebuilding df_result would inverse-transform already descaled values.
items_without_scaler = []
for item_id in tqdm(item_list, desc="Descaling itemwise"):
    scaler = scalers.get(item_id, {}).get('target')
    if scaler is None:
        # Same best-effort behaviour as before (rows are left untouched), but
        # remembered so the omission is reported instead of passing silently.
        items_without_scaler.append(item_id)
        continue
    # Row mask and scaler do not depend on the column: compute them once per item.
    mask = df_result['item_id'] == item_id
    for col in cols:
        scaled_values = df_result.loc[mask, [col]].values
        unscaled_values = scaler.inverse_transform(scaled_values)
        df_result.loc[mask, col] = unscaled_values.flatten()

if items_without_scaler:
    print(f"WARNING: no 'target' scaler for {len(items_without_scaler)} item(s); "
          f"their forecasts are still in scaled units: {items_without_scaler}")
df_result

Descaling itemwise: 100%|██████████| 100/100 [00:01<00:00, 56.26it/s]


Unnamed: 0,item_id,timestamp,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,ITEM0001,2024-02-01,1002.875657,835.189793,896.628343,940.135976,973.769429,1002.875657,1025.414595,1078.396967,1121.87232,1186.667173
1,ITEM0001,2024-02-02,1157.980498,944.262448,1029.761935,1084.577684,1116.178097,1157.980498,1208.061184,1245.571308,1285.111286,1356.844146
2,ITEM0001,2024-02-03,1300.366804,1052.510454,1154.182265,1218.155898,1270.344345,1300.366804,1359.392633,1417.62285,1485.78678,1629.977585
3,ITEM0001,2024-02-04,1416.750426,1129.144627,1234.254912,1299.932769,1359.389517,1416.750426,1485.326568,1536.191785,1593.063084,1686.443857
4,ITEM0001,2024-02-05,1229.565623,1007.438267,1083.791446,1149.999902,1190.981608,1229.565623,1283.515386,1317.9294,1358.079156,1433.964956
...,...,...,...,...,...,...,...,...,...,...,...,...
8995,ITEM0100,2024-04-26,8.889813,1.949768,4.710758,6.569599,7.634238,8.889813,10.539191,12.47396,15.711641,21.108197
8996,ITEM0100,2024-04-27,11.681589,4.299501,6.114745,8.176284,9.738719,11.681589,13.741649,16.171209,20.89722,29.247123
8997,ITEM0100,2024-04-28,11.084557,3.316741,5.886846,7.755052,9.392192,11.084557,13.107873,16.19888,21.458725,28.924639
8998,ITEM0100,2024-04-29,8.707798,2.288162,4.829259,6.419948,7.398272,8.707798,10.982022,12.732855,16.110207,22.253971
