# IMPORT KEY LIBRARIES/DATASETS AND PREPARE THE DATA FOR TRAINING

In [2]:
# Install seaborn library
!pip install --upgrade Seaborn

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


[0m

In [3]:
# Load the fuel-economy dataset from the CSV file (path is relative to the
# notebook's working directory) into a pandas DataFrame
fuel_economy_df = pd.read_csv('FuelEconomy.csv')

In [4]:
# View the DataFrame (a bare expression on the last line of a cell is rendered as the cell output)
fuel_economy_df

Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.952010
3,187.310009,23.384546
4,218.594340,23.426739
...,...,...
95,162.810542,27.418661
96,266.869640,15.989945
97,243.831211,19.253375
98,140.959803,29.515593


In [5]:
# Split the columns into the model input (X) and the prediction target (y).
# Selecting with a list of labels keeps each result a single-column DataFrame.
X = fuel_economy_df.loc[:, ['Horse Power']]
y = fuel_economy_df.loc[:, ['Fuel Economy (MPG)']]

In [6]:
# Inspect the input column (horse power)
X

Unnamed: 0,Horse Power
0,118.770799
1,176.326567
2,219.262465
3,187.310009
4,218.594340
...,...
95,162.810542
96,266.869640
97,243.831211
98,140.959803


In [7]:
# Inspect the target column (fuel economy in MPG)
y

Unnamed: 0,Fuel Economy (MPG)
0,29.344195
1,24.695934
2,23.952010
3,23.384546
4,23.426739
...,...
95,27.418661
96,15.989945
97,19.253375
98,29.515593


In [8]:
# Check the shape of the input as (number of rows, number of columns)
X.shape

(100, 1)

In [9]:
# Check the shape of the output as (number of rows, number of columns)
y.shape

(100, 1)

In [10]:
# Turn the input and the target into float32 NumPy arrays
X = np.array(X, dtype='float32')
y = np.array(y, dtype='float32')

In [11]:
# Hold out 20% of the samples for testing using the scikit-learn library.
# A fixed random_state makes the shuffle, and therefore the split, identical on
# every run, so the train/test sets are reproducible after a kernel restart.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# TRAIN A LINEAR LEARNER MODEL USING AWS SAGEMAKER

In [12]:
import sagemaker
import boto3

# S3 location used throughout this session
bucket = "ml-lab-sagemaker"  # this bucket must already exist
prefix = "linear_learner_fuel_economy"  # sub-folder (key prefix) inside the bucket

# Open a SageMaker session
sagemaker_session = sagemaker.Session()

# Look up the SageMaker execution role and show it
role = sagemaker.get_execution_role()
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
arn:aws:iam::588738589118:role/service-role/AmazonSageMaker-ExecutionRole-20241129T190553


In [13]:
# Shape of the training input after the split
X_train.shape

(80, 1)

In [16]:
# Make sure that the target label is a 1-D vector.
# reshape(-1) flattens the (n, 1) column and leaves an already-flat array
# unchanged, so this cell can be re-run safely; `y_train[:, 0]` raises an
# IndexError on a second run because the array is 1-D by then.
y_train = y_train.reshape(-1)

In [17]:
# Confirm the training target is now one-dimensional
y_train.shape

(80,)

In [18]:
import io # The io module allows for dealing with various types of I/O (text I/O, binary I/O and raw I/O). 
import numpy as np
import sagemaker.amazon.common as smac # sagemaker common libary

# Code below converts the training data from NumPy arrays to RecordIO format
# This is the format this notebook passes to the SageMaker Linear Learner

buf = io.BytesIO() # in-memory binary buffer that the records are written into
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
buf.seek(0)
# Writing to the buffer advances its stream position past the bytes written,
# so rewind to the start; otherwise a subsequent read would begin at the end.
# seek() returns the new position, which is why this cell displays 0.


0

In [19]:
import os

# Upload the RecordIO training data to S3

# Key refers to the name of the file
key = 'linear-train-data'

# S3 object keys always use '/' as the separator, so the key is built with
# explicit forward slashes instead of os.path.join, whose separator depends on
# the operating system and could differ from the URI printed below.
train_object_key = '{}/train/{}'.format(prefix, key)

# Upload the buffer contents to the S3 bucket to be accessed later for training
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)

# The S3 URI is derived from the same object key, so it always matches the upload
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://ml-lab-sagemaker/linear_learner_fuel_economy/train/linear-train-data


In [20]:
# Shape of the test input after the split
X_test.shape

(20, 1)

In [21]:
# Shape of the test target (still a single-column 2-D array at this point)
y_test.shape

(20, 1)

In [22]:
# Make sure that the target label is a 1-D vector.
# reshape(-1) flattens the (n, 1) column and leaves an already-flat array
# unchanged, so this cell can be re-run safely; `y_test[:, 0]` raises an
# IndexError on a second run because the array is 1-D by then.
y_test = y_test.reshape(-1)


In [23]:
# Convert the test data from NumPy arrays to RecordIO format (the upload to S3 happens in the next cell)

buf = io.BytesIO() # in-memory binary buffer that the records are written into
smac.write_numpy_to_dense_tensor(buf, X_test, y_test)
buf.seek(0)
# Writing to the buffer advances its stream position past the bytes written,
# so rewind to the start; otherwise a subsequent read would begin at the end.
# seek() returns the new position, which is why this cell displays 0.


0

In [24]:
# Key refers to the name of the file
key = 'linear-test-data'

# S3 object keys always use '/' as the separator, so the key is built with
# explicit forward slashes instead of os.path.join, whose separator depends on
# the operating system and could differ from the URI printed below.
test_object_key = '{}/test/{}'.format(prefix, key)

# Upload the RecordIO test data to the S3 bucket
boto3.resource('s3').Bucket(bucket).Object(test_object_key).upload_fileobj(buf)

# Let's print out the testing data location in s3
# (the message previously said "training" for the test set: a copy-paste slip)
s3_test_data = 's3://{}/{}'.format(bucket, test_object_key)
print('uploaded testing data location: {}'.format(s3_test_data))

uploaded training data location: s3://ml-lab-sagemaker/linear_learner_fuel_economy/test/linear-test-data


In [25]:
# Build the S3 URI under which the Linear Learner output will be stored
# (this only formats a string; nothing is created in S3 by this cell)
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

Training artifacts will be uploaded to: s3://ml-lab-sagemaker/linear_learner_fuel_economy/output


In [26]:
# Look up the URI of the Linear Learner container image for the region of the current boto3 session

container = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=boto3.Session().region_name,
)

In [29]:
# Configure the training job: a single ml.m5.xlarge instance running the
# Linear Learner container, with the output written to output_location.
linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       instance_count = 1, 
                                       instance_type = 'ml.m5.xlarge',
                                       output_path = output_location,
                                       sagemaker_session = sagemaker_session)

# feature_dim = 1 because the input has a single column (horse power);
# predictor_type = 'regressor' because the target is a continuous value.
linear.set_hyperparameters(feature_dim = 1,
                           predictor_type = 'regressor',
                           mini_batch_size = 10,
                           epochs = 10,
                           num_models = 32,
                           loss = 'absolute_loss')

# Now we are ready to pass in the data from S3 to train the linear learner model.
# The held-out set uploaded earlier is supplied as the optional 'test' channel so
# that it is actually used by the training job instead of sitting unused in S3.
linear.fit({'train': s3_train_data, 'test': s3_test_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2024-12-04-21-40-35-610


2024-12-04 21:40:35 Starting - Starting the training job...
2024-12-04 21:40:53 Starting - Preparing the instances for training...
2024-12-04 21:41:23 Downloading - Downloading input data...
2024-12-04 21:41:44 Downloading - Downloading the training image......
2024-12-04 21:42:55 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/04/2024 21:43:04 INFO 139926935496512] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimi

# DEPLOY AND TEST TRAINED LINEAR LEARNER MODEL 

In [30]:
# Deploy the trained model to an endpoint to perform inference

# The serializer turns the input passed to predict() into the request body;
# CSVSerializer sends it as CSV text.
# The deserializer turns the endpoint's response back into Python objects;
# JSONDeserializer parses a JSON response.

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer


linear_regressor = linear.deploy(initial_instance_count = 1,
                                 instance_type = 'ml.m5.xlarge',
                                 serializer=CSVSerializer(),
                                 deserializer=JSONDeserializer())

INFO:sagemaker:Creating model with name: linear-learner-2024-12-04-21-44-51-263
INFO:sagemaker:Creating endpoint-config with name linear-learner-2024-12-04-21-44-51-263
INFO:sagemaker:Creating endpoint with name linear-learner-2024-12-04-21-44-51-263


-------!

In [31]:
# Send the held-out test inputs to the deployed endpoint and keep the deserialized response
result = linear_regressor.predict(X_test)

In [32]:
# Inspect the raw response: a dict with one {'score': ...} entry per test sample
result

{'predictions': [{'score': 22.368885040283203},
  {'score': 21.541534423828125},
  {'score': 21.120811462402344},
  {'score': 20.69001579284668},
  {'score': 22.733306884765625},
  {'score': 30.64632225036621},
  {'score': 25.734302520751953},
  {'score': 27.1295166015625},
  {'score': 26.36477279663086},
  {'score': 23.969985961914062},
  {'score': 31.055259704589844},
  {'score': 26.480239868164062},
  {'score': 21.163394927978516},
  {'score': 20.658857345581055},
  {'score': 28.576061248779297},
  {'score': 29.764095306396484},
  {'score': 24.369625091552734},
  {'score': 16.928207397460938},
  {'score': 23.075531005859375},
  {'score': 25.62125015258789}]}

In [33]:
# The response is a dict parsed from JSON; collect the 'score' of every entry
# under 'predictions' into a NumPy array
predictions = np.array([r['score'] for r in result['predictions']])

In [34]:
# Inspect the predicted fuel-economy values
predictions

array([22.36888504, 21.54153442, 21.12081146, 20.69001579, 22.73330688,
       30.64632225, 25.73430252, 27.1295166 , 26.3647728 , 23.96998596,
       31.0552597 , 26.48023987, 21.16339493, 20.65885735, 28.57606125,
       29.76409531, 24.36962509, 16.9282074 , 23.07553101, 25.62125015])

In [35]:
# One prediction per test sample is expected
predictions.shape

(20,)

In [None]:
# VISUALIZE TEST SET RESULTS
# Actual test values are drawn as blue points, model predictions as the red line.
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(X_test, y_test, color = 'blue')
ax.plot(X_test, predictions, color = 'red')
ax.set_xlabel('HorsePower')
ax.set_ylabel('Fuel Economy [MPG]')
ax.set_title('HorsePower Vs. Fuel Economy')
ax.grid()

In [47]:
# Test the deployed model on a single input:
# the nested list is one row containing one feature value (horse power = 175)
horse_power = [[175]]
fuel_economy = linear_regressor.predict(horse_power)
print(fuel_economy)

{'predictions': [{'score': 25.96426010131836}]}


In [48]:
# Delete the endpoint once testing is done so it is not left running
# (the logged output shows the endpoint configuration being deleted as well)
linear_regressor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: linear-learner-2024-12-02-12-03-52-191
INFO:sagemaker:Deleting endpoint with name: linear-learner-2024-12-02-12-03-52-191
