# LAB: Machine Learning with Linear Learner and Hyperparameter Tuning

NOTE: This notebook should be run in an AWS SageMaker Python environment.

Install and import required Libraries

In [None]:
# If you have an error in creation role, try to upgrade boto3
%pip install --upgrade boto3
! pip install -U numpy

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting numpy
  Downloading numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.3
    Uninstalling numpy-1.22.3:
      Successfully uninstalled numpy-1.22.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sparkmagic 0.20.3 requires nest-asyncio==1.5.5, but you have nest-asyncio 1.5.6 which is incompatible.
numba 0.56.4 requires numpy<1.24,>=1.

Import dependencies

In [None]:
import pandas as pd
import numpy as np
import boto3
import urllib.request, json, os, sagemaker
from sagemaker import get_execution_role
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

Get region

In [None]:
# Key prefix under which this lab stores its objects in the bucket.
prefix = 'sagemaker/MLI-Lab_solution'

# Region configured for the default boto3 session; used below for bucket
# creation and for looking up the algorithm image.
my_region = boto3.session.Session().region_name

print("Region: {}".format(my_region))

Region: us-west-2


Create boto3 object

In [None]:
# S3 resource handle used below to check for, and create, the lab bucket.
s3 = boto3.resource('s3')

Create bucket for model artifacts

In [None]:
# Bucket that holds the training/validation data and the model output of this lab.
# NOTE(review): S3 bucket names are globally unique — presumably this must be
# changed to a bucket you own before running; confirm.
bucket_name = 'bah-bucket-sagemaker-course-2023'

In [None]:
# Create the lab bucket unless it is already there. Any S3 failure is reported
# with a message instead of stopping the notebook.
try:
    if s3.Bucket(bucket_name).creation_date:
        print('Bucket already exists!')
    else:
        create_kwargs = {'Bucket': bucket_name}
        # us-east-1 is created without a location constraint; every other
        # region passes the region name as LocationConstraint.
        if my_region != 'us-east-1':
            create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
        s3.create_bucket(**create_kwargs)
        print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ', e)

Bucket already exists!


Read `heart disease classification dataset.csv` dataset

In [None]:
# NOTE(review): the Kaggle link originally given here
# (https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset) points to a
# breast-cancer dataset, not to the heart-disease file loaded below — confirm
# the correct source of this CSV.
import os

DATASET = 'heart disease classification dataset.csv'
DATA_FOLDER = 's3://bah-data'

# Build the S3 URI with "/" explicitly: os.path.join is meant for local
# filesystem paths and inserts the OS-specific separator.
data = pd.read_csv('{}/{}'.format(DATA_FOLDER, DATASET))

In [None]:
# Preview the first rows to check how the CSV was parsed.
data.head()

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0,63,male,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,yes
1,1,37,male,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,yes
2,2,41,female,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,yes
3,3,56,male,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,yes
4,4,57,female,0,,354.0,0,1,163.0,1,0.6,2,0,2,yes


Prepare and preprocess the dataset

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
data = data.drop(columns=['Unnamed: 0'])

In [None]:
# Per column: True if the column contains at least one missing value.
data.isnull().any()

age         False
sex         False
cp          False
trestbps     True
chol         True
fbs         False
restecg     False
thalach      True
exang       False
oldpeak     False
slope       False
ca          False
thal        False
target      False
dtype: bool

In [None]:
# Discard every row with a missing value and renumber the remaining rows from 0.
data = data.dropna().reset_index(drop=True)

In [None]:
# Column dtypes: object-typed columns still need to be encoded numerically.
data.dtypes

age           int64
sex          object
cp            int64
trestbps    float64
chol        float64
fbs           int64
restecg       int64
thalach     float64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target       object
dtype: object

In [None]:
# Number of distinct values in each column.
data.nunique()

age          41
sex           2
cp            4
trestbps     48
chol        149
fbs           2
restecg       3
thalach      89
exang         2
oldpeak      40
slope         3
ca            5
thal          4
target        2
dtype: int64

In [None]:
# Row count per target value, to see how balanced the two classes are.
data['target'].value_counts()

yes    160
no     133
Name: target, dtype: int64

Convert target column into numerical representation using LabelEncoder from sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string target as integers, then replace the 'target' column with
# a numeric 'label' column appended at the end of the frame.
le = LabelEncoder()
label = le.fit_transform(data['target'])

data = data.drop(columns=['target']).assign(label=label)

In [None]:
# Confirm that 'target' has been replaced by the numeric 'label' column.
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63,male,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,1
1,37,male,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,1
2,41,female,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,1
3,56,male,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,1
4,57,male,0,140.0,192.0,0,1,148.0,0,0.4,1,0,1,1


Convert categorical features into one-hot encoding (if there are some)

In [None]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()

# Reshape the 'sex' column into a single-column 2-D array for the encoder.
sex_values = data['sex'].values.reshape(-1, 1)
# Fit on the column and convert the encoded result to a dense array.
output = one_hot_encoder.fit_transform(sex_values).toarray()

In [None]:
# OneHotEncoder emits its columns in sorted category order ('female' before
# 'male'), so the column names are taken from the fitted encoder instead of
# being hard-coded; the previous {0: 'male', 1: 'female'} mapping attached
# each name to the wrong column. Reusing data.index keeps the rows aligned
# when the frame is concatenated onto `data`.
sex_one_hot_encoded = pd.DataFrame(
    output,
    columns=one_hot_encoder.categories_[0],
    index=data.index,
)

In [None]:
# Append the one-hot columns to the right of the existing features.
data = pd.concat(objs=[data, sex_one_hot_encoded], axis="columns")

In [None]:
# The original string column is no longer needed once it is one-hot encoded.
data = data.drop(columns=['sex'])

In [None]:
# Check the frame after encoding: all remaining columns should be numeric.
data.head(5)

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,male,female
0,63,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,1,0.0,1.0
1,37,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,1,0.0,1.0
2,41,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,1,1.0,0.0
3,56,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,1,0.0,1.0
4,57,0,140.0,192.0,0,1,148.0,0,0.4,1,0,1,1,0.0,1.0


Upload train/validation data to s3

In [None]:
# Shuffle once with a fixed seed so the split is reproducible.
shuffled = data.sample(frac=1, random_state=1729)

# Cut points: first 70% -> train, next 20% -> validation, last 10% -> test.
train_end = int(0.7 * len(data))
validation_end = int(0.9 * len(data))

# Slice with .iloc instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which recent pandas versions deprecate. The resulting
# three frames are the same as before.
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Printed order is: train, test, validation.
print(train_data.shape, test_data.shape, validation_data.shape)

(205, 15) (30, 15) (58, 15)


In [None]:
# Move the label to the first column and write the frame without header or index.
# NOTE(review): presumably this is the CSV layout the built-in algorithm
# expects — confirm against the Linear Learner input documentation.
label_column = train_data['label']
train_data = pd.concat([label_column, train_data.drop(columns=['label'])], axis=1)

train_data.to_csv('train.csv', index=False, header=False)

# S3 keys always use "/" as the separator, so build the key directly instead
# of via os.path.join, which uses the separator of the local OS.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the S3 prefix the file was uploaded to.
s3_input_train = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket_name, prefix),
    content_type='text/csv',
)

In [None]:
# Verify that 'label' is now the first column of the training frame.
train_data.head()

Unnamed: 0,label,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,male,female
178,0,50,0,150.0,243.0,0,0,128.0,0,2.6,1,0,3,0.0,1.0
26,1,65,2,140.0,417.0,1,0,157.0,0,0.8,2,1,2,1.0,0.0
196,0,58,0,150.0,270.0,0,0,111.0,1,0.8,2,0,3,0.0,1.0
248,0,45,0,142.0,309.0,0,0,147.0,1,0.0,1,3,3,0.0,1.0
107,1,57,2,150.0,126.0,1,1,173.0,0,0.2,2,1,3,0.0,1.0


In [None]:
# Same layout as the training file: label first, no header, no index.
label_column = validation_data['label']
validation_data = pd.concat(
    [label_column, validation_data.drop(columns=['label'])], axis=1
)

validation_data.to_csv('validation.csv', index=False, header=False)

# S3 keys always use "/" as the separator, so build the key directly instead
# of via os.path.join, which uses the separator of the local OS.
validation_key = '{}/validation/validation.csv'.format(prefix)
boto3.Session().resource("s3").Bucket(bucket_name).Object(validation_key).upload_file("validation.csv")

# Validation channel pointing at the S3 prefix the file was uploaded to.
s3_input_validation = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/validation'.format(bucket_name, prefix),
    content_type='text/csv',
)

In [None]:
# Verify that 'label' is now the first column of the validation frame.
validation_data.head()

Unnamed: 0,label,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,male,female
118,1,41,2,112.0,268.0,0,0,172.0,1,0.0,2,0,2,1.0,0.0
234,0,59,0,174.0,249.0,0,1,143.0,1,0.0,1,0,2,1.0,0.0
59,1,52,3,118.0,186.0,0,0,190.0,0,0.0,1,0,1,0.0,1.0
288,0,63,0,124.0,197.0,0,1,136.0,1,0.0,1,0,2,1.0,0.0
198,0,62,0,160.0,164.0,0,0,145.0,0,6.2,0,3,3,1.0,0.0


Train the model (use any built-in algorithm)

Create SageMaker session

In [None]:
# SageMaker session object; handed to the estimator below.
sess = sagemaker.Session()

Define IAM role

In [None]:
# Execution role of the current environment; passed to the estimator below.
role = get_execution_role()

Specify container

In [None]:
# Look up the Linear Learner algorithm image for the current region.
linear_learner_container = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=my_region,
)

Create Estimator



In [None]:
# Estimator wrapping the Linear Learner image. Model output is written below
# the lab prefix in the lab bucket.
# instance_count / instance_type are the sagemaker>=2 names of the former
# train_instance_count / train_instance_type arguments.
linear = sagemaker.estimator.Estimator(
    linear_learner_container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Set initial hyperparameters

In [None]:
# Starting hyperparameters for the baseline run: binary classification with
# mini-batches of 30 rows.
initial_hyperparameters = {
    "predictor_type": "binary_classifier",
    "mini_batch_size": 30,
}
linear.set_hyperparameters(**initial_hyperparameters)

Fit the model

In [None]:
# Baseline training run with the initial hyperparameters; only the "train"
# channel is supplied here.
linear.fit({"train": s3_input_train})

INFO:sagemaker:Creating training-job with name: linear-learner-2023-03-17-21-59-43-296


2023-03-17 21:59:43 Starting - Starting the training job...
2023-03-17 22:00:11 Starting - Preparing the instances for training......
2023-03-17 22:01:13 Downloading - Downloading input data...
2023-03-17 22:01:38 Training - Downloading the training image.........
2023-03-17 22:02:54 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/17/2023 22:03:11 INFO 140685723006784] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'opt

Hyperparameter tuning - tune hyperparameters (whichever you want) of the Linear Learner algorithm

NOTE: be careful about metrics used for evaluation. For example, for Linear Learner, the following `validation` metrics could be used: https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner-tuning.html

In [None]:
# Name of the objective metric handed to the tuner.
objective_metric_name = "validation:precision"
# Total number of training jobs the tuning job may launch.
MAX_JOBS = 3
# Upper bound on training jobs running at the same time.
# NOTE(review): this is larger than MAX_JOBS, so at most MAX_JOBS jobs can
# ever run in parallel — confirm the intended value.
MAX_PARALLEL_JOBS = 4
# Search strategy passed to HyperparameterTuner.
STRATEGY = 'Bayesian'
# Scaling type used for the continuous learning-rate range.
SCALING_TYPE = 'Linear'

In [None]:
# Timestamped job name; the (misspelled) variable name is kept because the
# analytics cell further down reads it.
tuninig_job_name = "linear-learner" + strftime("%Y%m%d-%H-%M-%S", gmtime())

# Only the learning rate is searched.
learning_rate_range = ContinuousParameter(0.001, 0.1, scaling_type=SCALING_TYPE)
hyperparameter_ranges_linear = {"learning_rate": learning_rate_range}

tuner_linear = HyperparameterTuner(
    estimator=linear,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges_linear,
    strategy=STRATEGY,
    max_jobs=MAX_JOBS,
    max_parallel_jobs=MAX_PARALLEL_JOBS,
)

# Tuning uses both channels: the objective is a validation metric.
tuning_channels = {"train": s3_input_train, "validation": s3_input_validation}
tuner_linear.fit(
    tuning_channels,
    job_name=tuninig_job_name,
    include_cls_metadata=False,
)

INFO:sagemaker:Creating hyperparameter tuning job with name: linear-learner20230317-22-04-00


......................................................!


Fetch all results as DataFrame

In [None]:
# Load the per-training-job results of the tuning job into a DataFrame.
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuninig_job_name)

all_results = tuner.dataframe()
# Keep only the jobs whose final objective value is greater than -inf.
has_objective = all_results["FinalObjectiveValue"] > float("-inf")
full_df = all_results[has_objective]
full_df

Unnamed: 0,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.075426,linear-learner20230317-22-04-00-003-cedde489,Completed,0.742857,2023-03-17 22:05:42+00:00,2023-03-17 22:08:04+00:00,142.0
1,0.018349,linear-learner20230317-22-04-00-002-ed7b93c6,Completed,0.764706,2023-03-17 22:05:45+00:00,2023-03-17 22:08:08+00:00,143.0
2,0.001818,linear-learner20230317-22-04-00-001-59df21f3,Completed,0.793103,2023-03-17 22:05:36+00:00,2023-03-17 22:07:58+00:00,142.0


Terminate your resources

In [None]:
# Remove only the objects this notebook wrote, i.e. everything under `prefix`
# (train/, validation/ and output/). The bucket may have existed before the
# notebook ran, so deleting every object in it could destroy unrelated data.
# The bucket itself is left in place.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.filter(Prefix=prefix + '/').delete()

[{'ResponseMetadata': {'RequestId': 'JM2QNDS28RXR17AR',
   'HostId': 'n+SJcxe2PeHuo315s6QswdpZknLy9OhoOBG1rTxRmnvnOCd5pmEhTbMIPh7YwU78XBY2PfpTRhI=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'n+SJcxe2PeHuo315s6QswdpZknLy9OhoOBG1rTxRmnvnOCd5pmEhTbMIPh7YwU78XBY2PfpTRhI=',
    'x-amz-request-id': 'JM2QNDS28RXR17AR',
    'date': 'Fri, 17 Mar 2023 22:21:00 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-59-43-296/profiler-output/system/incremental/2023031722/1679090520.algo-1.json'},
   {'Key': 'sagemaker/MLI-Lab_solution/train/train.csv'},
   {'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-59-43-296/profiler-output/system/training_job_end.ts'},
   {'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-59-43-296/profiler-output/system/incre