# Automated Machine Learning using Autopilot

## Create buckets

In [2]:
import boto3
import sagemaker

In [3]:
# Create the SageMaker session and collect the settings the rest of the notebook uses.
sess = sagemaker.Session()
bucket = sess.default_bucket()          # default S3 bucket of this SageMaker session
role = sagemaker.get_execution_role()   # IAM role passed to the AutoML job below
# `region_name` is an instance property: reading it off the class (boto3.Session.region_name)
# yields a property object instead of the region string, so instantiate a session first.
region = boto3.Session().region_name

## Configuring autopilot job

### uploading data to s3 bucket

In [4]:
import pandas as pd

# Load the reviews dataset from the working directory and preview the first rows.
df = pd.read_csv('Reviews.csv')
df.head(5)

Unnamed: 0,sentiment,review_body
0,-1,This suit did nothing for me. the top has zero...
1,-1,Like other reviewers i saw this dress on the ...
2,-1,I wish i had read the reviews before purchasin...
3,-1,I ordered these pants in my usual size (xl) an...
4,-1,I noticed this top on one of the sales associa...


In [5]:
# Upload the local CSV under the `data/` prefix of the session bucket;
# the returned value is displayed as the cell output.
data_uri = sess.upload_data(path='./Reviews.csv', bucket=bucket, key_prefix='data')
data_uri

's3://sagemaker-us-east-1-115472653629/data/Reviews.csv'

In [6]:
# List the uploaded object via the AWS CLI to confirm the upload; IPython expands $data_uri.
!aws s3 ls $data_uri

2022-03-12 12:38:03    2253749 Reviews.csv


### S3 output for generated assets

In [7]:
# S3 prefix passed to the AutoML job as its output path.
output_uri = 's3://{}/autopilot'.format(bucket)
output_uri

's3://sagemaker-us-east-1-115472653629/autopilot'

### Autopilot name
The Autopilot job name has to be unique, so a timestamp is appended to it.

In [8]:
import time

# Suffix the job name with the current Unix timestamp (whole seconds) so each run gets a new name.
auto_ml_job_name = 'auto-ml-job-{}'.format(int(time.time()))
auto_ml_job_name

'auto-ml-job-1647088684'

### Configuring the autopilot job

In [9]:
# Number of candidates the job may generate; kept small here.
max_candidates = 3

# Configure the Autopilot job: predict the `sentiment` column and write all
# generated assets under `output_uri`.
auto_ml = sagemaker.automl.automl.AutoML(
    role=role,
    target_attribute_name='sentiment',
    output_path=output_uri,
    base_job_name=auto_ml_job_name,
    sagemaker_session=sess,
    # Pass the variable (not a repeated literal) so changing it above takes effect.
    max_candidates=max_candidates,
    max_runtime_per_training_job_in_seconds=1200,  # 20 minutes per training job
    total_job_runtime_in_seconds=7200,             # 2 hours for the whole job
)

## Launch the autopilot job

In [11]:
# Start the job without blocking the notebook (wait=False) and without streaming logs;
# progress is polled explicitly in the cells that follow.
auto_ml.fit(inputs=data_uri, job_name=auto_ml_job_name, wait=False, logs=False)

## Tracking autopilot job progress
The Autopilot job's progress can be tracked using two response elements from the job description returned by the `describe_auto_ml_job` function:
1. AutoMLJobStatus: Shows the overall status of the processing job, which is one of:
    - Completed
    - InProgress
    - Failed
    - Stopped
    - Stopping
2. AutoMLJobSecondaryStatus: Shows which task is currently being processed, for example:
    - AnalyzingData
    - FeatureEngineering
    - ModelTuning
    

### Get the autopilot job description


In [12]:
# Fetch the current job description; the polling cells below read status fields from it.
job_desc_resp = auto_ml.describe_auto_ml_job(job_name = auto_ml_job_name)

### Check if the autopilot job has started

In [13]:
import json

# Poll until the job description exposes BOTH status fields. The following cells index
# both keys, so the loop must keep waiting while either one is still missing.
while not ('AutoMLJobStatus' in job_desc_resp and 'AutoMLJobSecondaryStatus' in job_desc_resp):
    print('Processing job not started')
    # Was `autto_ml` (typo), which raised NameError as soon as the loop body ran.
    job_desc_resp = auto_ml.describe_auto_ml_job(job_name=auto_ml_job_name)
    time.sleep(15)
print('[OK] AutoML processing job started')

[OK] AutoML processing job started


### Wait for the data analysis step to finish

In [None]:
%%time
job_status = job_desc_resp['AutoMLJobStatus']
job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
if job_status not in ('Failed', 'Stopped'):
    while job_status == 'InProgress' and job_sec_status in ('Starting', 'AnalyzingData'):
        job_desc_resp = auto_ml.describe_auto_ml_job(job_name = auto_ml_job_name)
        job_status = job_desc_resp['AutoMLJobStatus']
        job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(15)
    print('[OK] Data analysis completed')

### Check if artifacts have been generated


In [15]:
# Re-read the job description every 15 seconds until it contains 'AutoMLJobArtifacts'.
job_desc_resp = auto_ml.describe_auto_ml_job(job_name=auto_ml_job_name)
while True:
    if 'AutoMLJobArtifacts' in job_desc_resp:
        break
    print('[INFO] Autopilot job has not yet generated the artifacts. Please wait. ')
    job_desc_resp = auto_ml.describe_auto_ml_job(job_name=auto_ml_job_name)
    time.sleep(15)
print('[OK] AutoMLJobArtifacts generated.')

[OK] AutoMLJobArtifacts generated.


### Checking if feature engineering is complete

In [None]:
%%time
job_status = job_desc_resp['AutoMLJobStatus']
job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
if job_status not in ('Failed', 'Stopped'):
    while job_status == 'InProgress' and job_sec_status == 'FeatureEngineering':
        job_desc_resp = auto_ml.describe_auto_ml_job(job_name = auto_ml_job_name)
        job_status = job_desc_resp['AutoMLJobStatus']
        job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Feature engineering phase completed.\n')
#print(json.dumps(job_desc_resp, indent=4, sort_keys=True, default=str))

### Waiting for training and tuning to complete

In [None]:
%%time
job_status = job_desc_resp['AutoMLJobStatus']
job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
if job_status not in ('Failed', 'Stopped'):
    while job_status == 'InProgress' and job_sec_status == 'ModelTuning':
        job_desc_resp = auto_ml.describe_auto_ml_job(job_name = auto_ml_job_name)
        job_status = job_desc_resp['AutoMLJobStatus']
        job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Model tuning phase completed.\n')
#print(json.dumps(job_desc_resp, indent=4, sort_keys=True, default=str))

### Finally checking if the autopilot job has completed

In [None]:
# Poll every 15 seconds until the job reaches a terminal state.
job_desc_resp = auto_ml.describe_auto_ml_job(job_name=auto_ml_job_name)

job_status = job_desc_resp['AutoMLJobStatus']
job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']

# Two fixes over the previous condition `job_status not in ('Completed')`:
#  * ('Completed') is a plain string, so `not in` was a substring test, not a membership test;
#  * 'Failed' / 'Stopped' are now treated as terminal, so the loop can no longer spin
#    forever when the job fails or is stopped while we are polling.
while job_status not in ('Completed', 'Failed', 'Stopped'):
    job_desc_resp = auto_ml.describe_auto_ml_job(job_name=auto_ml_job_name)
    job_status = job_desc_resp['AutoMLJobStatus']
    job_sec_status = job_desc_resp['AutoMLJobSecondaryStatus']
    print('Job status:  {}'.format(job_status))
    print('Secondary job status:  {}'.format(job_sec_status))
    time.sleep(15)

if job_status == 'Completed':
    print('[OK] Autopilot job completed.\n')
else:
    # Job ended as Failed or Stopped: report the final statuses instead of claiming success.
    print('Job status:  {}'.format(job_status))
    print('Secondary job status:  {}'.format(job_sec_status))
    

## Comparing model candidates

### Getting list of candidates generated by autopilot

In [19]:
# Ask for the job's candidates, sorted by 'FinalObjectiveMetricValue'.
candidates = auto_ml.list_candidates(job_name=auto_ml_job_name, sort_by='FinalObjectiveMetricValue')

### Check if candidates exist

In [20]:
# Keep re-listing every 5 seconds until the candidate list is non-empty.
while True:
    if candidates:
        break
    candidates = auto_ml.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating candidates. Please wait.')
    time.sleep(5)
print('[OK] Candidates generated.')
    

[OK] Candidates generated.


### Check that candidate name exists

In [21]:
# Re-list every 5 seconds until the first candidate carries a 'CandidateName' entry.
while not ('CandidateName' in candidates[0]):
    candidates = auto_ml.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot is generating candidate name. Please wait')
    time.sleep(5)
print('[OK] Candidate name generared.')

[OK] Candidate name generared.


### Check that FinalAutoMLJobObjectiveMetric exists

In [22]:
# Re-list until the first candidate carries its 'FinalAutoMLJobObjectiveMetric' entry.
# An empty list is treated as "not ready yet" instead of raising IndexError on candidates[0].
while not candidates or 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:
    # Keep the same sort key as the initial listing so the candidate order stays consistent.
    candidates = auto_ml.list_candidates(job_name=auto_ml_job_name,
                                         sort_by='FinalObjectiveMetricValue')
    print('[INFO] Autopilot is generating FinalObjectivemetricValue. Please wait.')
    # Pause between polls; the loop previously had no sleep and called the API back-to-back.
    time.sleep(5)
print('[OK] FinalObjectiveMetricValue generated.')

[OK] FinalObjectiveMetricValue generated.


In [23]:
#print(json.dumps(candidates[0], indent=4, sort_keys=True, default=str))

### Get the best candidates and their metrics values

In [24]:
# One line per candidate: position, name, objective metric name and value.
for index, candidate in enumerate(candidates):
    objective = candidate['FinalAutoMLJobObjectiveMetric']
    print(index, candidate['CandidateName'], objective['MetricName'], objective['Value'])

0 auto-ml-job-1647088684nXQkORvLLO-003-15d43309 validation:accuracy 0.6254600286483765
1 auto-ml-job-1647088684nXQkORvLLO-002-4f9f6068 validation:accuracy 0.5753899812698364
2 auto-ml-job-1647088684nXQkORvLLO-001-074b5b24 validation:accuracy 0.43281999230384827


### Getting the best candidate

In [25]:
# Refresh the candidate list and, only when it is non-empty, fetch the best candidate.
candidates = auto_ml.list_candidates(job_name=auto_ml_job_name)
if candidates:
    best_candidate = auto_ml.best_candidate(job_name=auto_ml_job_name)

#### Check that CandidateName for best candidate exists

In [26]:
# Re-fetch the best candidate every 15 seconds until it has a 'CandidateName' entry.
while True:
    if 'CandidateName' in best_candidate:
        break
    best_candidate = auto_ml.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating best candidate CandidateName. Please wait.')
    time.sleep(15)
print('[OK] Best candidate CandidateName generated.')

[OK] Best candidate CandidateName generated.


#### Check that FinalAutoMLJobObjectiveMetric for best candidate exists

In [27]:
# Re-fetch the best candidate every 15 seconds until it has a 'FinalAutoMLJobObjectiveMetric' entry.
while not ('FinalAutoMLJobObjectiveMetric' in best_candidate):
    best_candidate = auto_ml.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating best candidate FInalAUtoMLJobObjectiveMetric. Please wait.')
    time.sleep(15)
print('[OK] Best candidate FInalAUtoMLJobObjectiveMetric generated.')

[OK] Best candidate FInalAUtoMLJobObjectiveMetric generated.


#### Best candidate details

In [28]:
# Summarise the best candidate: its name plus the objective metric's name and value.
print("Candidate name: {}".format(best_candidate['CandidateName']))
print("Metric name: {}".format(best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName']))
print("Metric value: {}".format(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

Candidate name: auto-ml-job-1647088684nXQkORvLLO-003-15d43309
Metric name: validation:accuracy
Metric value: 0.6254600286483765


## Review all output in S3 bucket

In [33]:
!aws s3 ls s3://$bucket/autopilot/auto-ml-job-1647088684/

                           PRE data-processor-models/
                           PRE documentation/
                           PRE preprocessed-data/
                           PRE sagemaker-automl-candidates/
                           PRE transformed-data/
                           PRE tuning/
                           PRE validations/


## Deploying and testing best candidate

In [40]:
# Keys passed to `auto_ml.deploy(inference_response_keys=...)` when the endpoint is created.
inference_response_keys = ["predicted_label", "probability"]

In [41]:
# Deploy the best candidate on a single ml.m5.large instance, using a JSON
# serializer/deserializer pair on the returned predictor.
autopilot_model = auto_ml.deploy(
    candidate=best_candidate,
    initial_instance_count=1,
    instance_type='ml.m5.large',
    inference_response_keys=inference_response_keys,
    predictor_cls=sagemaker.predictor.Predictor,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

print(f'\nEndpoint name:  {autopilot_model.endpoint_name}')

---------------!
Endpoint name:  sagemaker-sklearn-automl-2022-03-12-13-33-45-011


## Testing the model

In [53]:
# Sample reviews sent to the deployed endpoint below.
review_list = [
    'This product is great!',
    'OK, but not great.',
    'This is not the right product.',
]

In [55]:
# Low-level runtime client used by the inference loop below to call `invoke_endpoint`.
sm_runtime = boto3.client('sagemaker-runtime')

In [58]:
#sm_runtime = boto3.client('sagemaker-runtime')


# Send each sample review to the endpoint as CSV text and print the first field of the reply.
for review in review_list:
    # Commas would be read as CSV column separators, so strip them from the text.
    review = review.replace(",", "")

    response = sm_runtime.invoke_endpoint(
        EndpointName=autopilot_model.endpoint_name,
        ContentType='text/csv',   # format of the request body
        Accept='text/csv',        # format requested for the response
        Body=review,
    )

    # The reply is comma-separated text; split it into its fields.
    raw_body = response['Body'].read()
    response_body = raw_body.decode('utf-8').strip().split(',')

    print('Review: ', review, f' Predicated class: {response_body[0]}')

print("(-1 = Negative, 0=Neutral, 1=Positive)")

Review:  This product is great!  Predicated class: 1
Review:  OK but not great.  Predicated class: 0
Review:  This is not the right product.  Predicated class: -1
(-1 = Negative, 0=Neutral, 1=Positive)
