In [1]:
import pandas as pd
import numpy as np
import json
import pathlib
import boto3

import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.step_collections import RegisterModel

from sagemaker.sklearn.processing import SKLearnProcessor

from demo_helper import ModelMetrics

# --- AWS / SageMaker handles --------------------------------------------
# One boto3 session supplies the region for every client created below.
boto_session = boto3.Session()
region = boto_session.region_name
print(f"Region = {region}")

s3_client = boto3.client('s3', region_name=region)
sagemaker_boto_client = boto_session.client('sagemaker')

# SageMaker SDK session bound to the boto3 session and client created above.
sagemaker_session = sagemaker.session.Session(boto_session=boto_session,
                                              sagemaker_client=sagemaker_boto_client)

# Execution role as resolved by the SageMaker SDK for this environment.
sagemaker_role = get_execution_role()

# Alternatively, you can use your own custom bucket here.
default_bucket = sagemaker_session.default_bucket()

# --- S3 layout ----------------------------------------------------------
# Every file pertaining to this workshop is stored under this key prefix.
prefix = 'sagemaker-tutorial'

dataprefix = f'{prefix}/data'
traindataprefix = f'{prefix}/train_data'
testdataprefix = f'{prefix}/test_data'
testdatanolabelprefix = f'{prefix}/test_data_no_label'
trainheaderprefix = f'{prefix}/train_headers'

training_job_output_path = f's3://{default_bucket}/{prefix}/training_jobs'

create_dataset_script_uri = f's3://{default_bucket}/{prefix}/code/create_dataset.py'
deploy_model_script_uri = f's3://{default_bucket}/{prefix}/code/deploy_model.py'

processing_dir = "/opt/ml/processing"

# --- Tunable run parameters ----------------------------------------------
# Variables used for parameterizing the notebook run.
flow_instance_count = 1
flow_instance_type = "ml.m5.4xlarge"

deploy_model_instance_type = "ml.m4.xlarge"

Region = us-west-2


In [2]:
def prepare_credit_dataset(raw_df, event_time):
    """Shape the raw credit-card default sheet into the training CSV layout.

    Adds a constant ``EVENT_TIME`` column, casts every column to float64,
    moves ``default payment next month`` to the first position and renames
    it to ``LABEL``. The input frame is not modified.

    Args:
        raw_df: DataFrame with numeric columns, including
            ``default payment next month``.
        event_time: Value written to every row of ``EVENT_TIME``.

    Returns:
        A new float64 DataFrame whose first column is ``LABEL``.

    Raises:
        ValueError: If ``default payment next month`` is not a column.
    """
    label_col = 'default payment next month'

    # assign() returns a copy, so the caller's frame stays untouched.
    prepared = raw_df.assign(EVENT_TIME=event_time).astype(np.float64)

    # Move the label to the front; .index() raises ValueError when it is absent.
    cols = list(prepared.columns)
    cols.insert(0, cols.pop(cols.index(label_col)))

    return prepared.loc[:, cols].rename(columns={label_col: 'LABEL'})


if __name__ == "__main__":
    # Change column names and save the file as .csv
    data_path = './data/default_of_credit_card.xls'
    dataset_csv_path = './data/dataset.csv'

    # header=1: the column names are taken from the second row of the sheet.
    raw_df = pd.read_excel(data_path, header=1)

    # Take "now" explicitly in UTC. pd.to_datetime('now') yields a naive local
    # time on pandas >= 2.0, and Timestamp.timestamp() treats naive values as
    # UTC, which would shift the epoch by the local UTC offset.
    timestamp = pd.Timestamp.now(tz='UTC').timestamp()

    df = prepare_credit_dataset(raw_df, timestamp)

    df.to_csv(dataset_csv_path, index=False)

    # upload_data returns the S3 URI of the uploaded object; print it for reference.
    response = sagemaker_session.upload_data(dataset_csv_path, bucket=default_bucket, key_prefix=dataprefix)
    print(response)

s3://sagemaker-us-west-2-367158743199/sagemaker-tutorial/data/dataset.csv
