In [4]:
# Restore the bucket name and key prefix persisted with `%store`
# (presumably by an earlier notebook in this series -- confirm).
%store -r s3_bucket_name
%store -r prefix

In [5]:
# Root S3 location for everything this notebook reads and writes.
base = f's3://{s3_bucket_name}/{prefix}'
# CSV used as the baseline dataset for the monitor.
baseline_data_uri = f'{base}/input/training_data.csv'
# Where the baselining job writes its results.
baseline_results_uri = f"{base}/model-monitor/baseline-results"

In [6]:
# Copy the baseline CSV from S3 to a local path so it can be inspected.
local_file = "tmp/baseline.csv"
!aws s3 cp {baseline_data_uri} {local_file}

import pandas as pd
# Load the local copy and display it (pandas elides the middle rows).
baseline_df = pd.read_csv(local_file)
baseline_df

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/baseline.csv


Unnamed: 0,approved,sex,math,science,technology,random1,random2
0,1,1,97,97,98,93,82
1,1,1,85,68,62,92,65
2,1,1,99,100,80,71,60
3,1,1,91,79,84,60,70
4,1,1,73,86,66,70,98
...,...,...,...,...,...,...,...
595,1,1,99,86,85,98,87
596,1,1,71,97,90,86,99
597,1,1,95,86,62,69,73
598,1,1,78,71,68,72,68


In [7]:
import sagemaker
# Execution role of the current SageMaker environment; handed to the monitor
# when it is constructed.
role = sagemaker.get_execution_role()

In [8]:
from sagemaker.model_monitor import DefaultModelMonitor

# Monitor used for both the baselining job and the monitoring schedule.
# Its jobs run on a single ml.m5.large instance with a 20 GB volume and are
# limited to 3600 seconds (one hour) of runtime.
default_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

In [9]:
%%time

from sagemaker.model_monitor import dataset_format

# header=True declares that the first row of the CSV holds column names.
dsf = dataset_format.DatasetFormat.csv(header=True)

# Launch the baselining job over the baseline CSV. Results are written under
# `baseline_results_uri`; wait=True keeps the cell blocked until the job ends.
default_monitor.suggest_baseline(
    baseline_dataset=baseline_data_uri,
    dataset_format=dsf,
    output_s3_uri=baseline_results_uri,
    wait=True
)


Job Name:  baseline-suggestion-job-2021-06-13-18-42-31-427
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
.......................[34m2021-06-13 18:46:12,199 - __main__ - INFO - All params:{'ProcessingJobArn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/baseline-suggestion-job-2021-06-13-18-42-31-427', 'ProcessingJobName': 'baseline-suggestion-job-2021-06-13-18-42-31-427', 'Environment': {'dataset_format': '{"csv": {"header": true, "output_col

<sagemaker.processing.ProcessingJob at 0x7f3e3fea16d0>

In [11]:
# Handle to the baselining job that was just run; dump its attributes to see
# the job name and its inputs/outputs.
baseline_job = default_monitor.latest_baselining_job
baseline_job.__dict__

{'inputs': [<sagemaker.processing.ProcessingInput at 0x7f3e3d4baad0>],
 'outputs': [<sagemaker.processing.ProcessingOutput at 0x7f3e3d4b8d50>],
 'output_kms_key': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7f3e3d34b650>,
 'job_name': 'baseline-suggestion-job-2021-06-13-18-42-31-427'}

In [12]:
# Statistics computed by the baselining job.
stats = baseline_job.baseline_statistics()
# NOTE: despite the name, this is the "features" list (one dict per feature).
schema_dict = stats.body_dict["features"]

In [13]:
import pandas as pd
# Flatten the nested per-feature statistics into one row per feature, with
# dotted column names, and preview the first five features.
schema_df = pd.json_normalize(schema_dict)
schema_df.head(5)

Unnamed: 0,name,inferred_type,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data
0,approved,Integral,600,0,0.798333,479.0,0.401245,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,..."
1,sex,Integral,600,0,0.796667,478.0,0.402478,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0,..."
2,math,Integral,600,0,79.303333,47582.0,11.603648,60.0,100.0,"[{'lower_bound': 60.0, 'upper_bound': 64.0, 'c...",0.64,2048.0,"[[97.0, 85.0, 99.0, 91.0, 73.0, 84.0, 90.0, 73..."
3,science,Integral,600,0,79.43,47658.0,11.780284,60.0,100.0,"[{'lower_bound': 60.0, 'upper_bound': 64.0, 'c...",0.64,2048.0,"[[97.0, 68.0, 100.0, 79.0, 86.0, 78.0, 75.0, 7..."
4,technology,Integral,600,0,80.255,48153.0,11.895096,60.0,100.0,"[{'lower_bound': 60.0, 'upper_bound': 64.0, 'c...",0.64,2048.0,"[[98.0, 62.0, 80.0, 84.0, 66.0, 63.0, 83.0, 76..."


In [14]:
# Constraints suggested by the baselining job.
constraints = baseline_job.suggested_constraints()
# NOTE: despite the name, this is the "features" list (one dict per feature).
constraints_dict = constraints.body_dict["features"]

In [15]:
# Flatten the per-feature constraints into a table and show the first seven rows.
constraints_df = pd.json_normalize(constraints_dict)
constraints_df.head(7)

Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative
0,approved,Integral,1.0,True
1,sex,Integral,1.0,True
2,math,Integral,1.0,True
3,science,Integral,1.0,True
4,technology,Integral,1.0,True
5,random1,Integral,1.0,True
6,random2,Integral,1.0,True


In [16]:
# Download everything under the baseline results prefix into the local tmp/ directory.
!aws s3 cp {baseline_results_uri}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/constraints.json to tmp/constraints.json
download: s3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/statistics.json to tmp/statistics.json


In [17]:
# Print the raw constraints file downloaded from the baseline results.
!cat tmp/constraints.json

{
  "version" : 0.0,
  "features" : [ {
    "name" : "approved",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "sex",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "math",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "science",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "technology",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "random1",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "random2",
    "inferred_t

In [18]:
# Print the raw statistics file downloaded from the baseline results.
!cat tmp/statistics.json

{
  "version" : 0.0,
  "dataset" : {
    "item_count" : 600
  },
  "features" : [ {
    "name" : "approved",
    "inferred_type" : "Integral",
    "numerical_statistics" : {
      "common" : {
        "num_present" : 600,
        "num_missing" : 0
      },
      "mean" : 0.7983333333333333,
      "sum" : 479.0,
      "std_dev" : 0.40124459151772024,
      "min" : 0.0,
      "max" : 1.0,
      "distribution" : {
        "kll" : {
          "buckets" : [ {
            "lower_bound" : 0.0,
            "upper_bound" : 0.1,
            "count" : 121.0
          }, {
            "lower_bound" : 0.1,
            "upper_bound" : 0.2,
            "count" : 0.0
          }, {
            "lower_bound" : 0.2,
            "upper_bound" : 0.3,
            "count" : 0.0
          }, {
            "lower_bound" : 0.3,
            "upper_bound" : 0.4,
            "count" : 0.0
          }, {
            "lower_bound" : 0.4,
            "upper_bound" : 0.5,
            "count" : 0.0
          }, {
    

In [19]:
from sagemaker.model_monitor import CronExpressionGenerator
from time import gmtime, strftime

In [20]:
import random
from string import ascii_uppercase

def generate_schedule_name():
    """Return a monitoring-schedule name of the form ``schedule-XXXXX``.

    The suffix is five uppercase ASCII letters drawn at random with
    replacement, so repeated calls will usually produce different names.
    """
    suffix = ''.join(random.choices(ascii_uppercase, k=5))
    return 'schedule-' + suffix

In [21]:
# Pick a random name for the monitoring schedule and display it.
schedule_name = generate_schedule_name()
schedule_name

'schedule-RXZQF'

In [22]:
# S3 prefix used as the output location of the monitoring schedule.
s3_report_path = f'{base}/report-path'

In [23]:
# Baseline statistics as exposed by the monitor; supplied to the monitoring
# schedule when it is created.
baseline_statistics = default_monitor.baseline_statistics()

In [24]:
# Rebinds `constraints` (earlier taken from the baselining job) to the object
# exposed by the monitor; this is the instance that is edited, saved and
# supplied to the monitoring schedule.
constraints = default_monitor.suggested_constraints()

In [25]:
# Cron expression for an hourly schedule; displayed so the literal form is visible.
cron_expression = CronExpressionGenerator.hourly()
cron_expression

'cron(0 * ? * * *)'

In [26]:
# Restore the name of the already-deployed endpoint persisted with `%store`.
%store -r endpoint_name

from sagemaker import Predictor
# Client object used to send requests to that endpoint.
predictor = Predictor(endpoint_name=endpoint_name)

In [27]:
# Best-effort cleanup: remove any schedule still attached to this monitor
# before a new one is created. When there is nothing to delete the call may
# fail; that is acceptable here, so the reason is reported and execution
# continues. Catching `Exception` (instead of a bare `except:`) keeps
# KeyboardInterrupt and SystemExit working and makes the failure visible.
try:
    default_monitor.delete_monitoring_schedule()
except Exception as exc:
    print(f"No monitoring schedule deleted: {exc!r}")


Deleting Monitoring Schedule with name: None


In [28]:
# Restore the sample CSV request row persisted with `%store` and display it.
%store -r csv_input
csv_input

'1,92,83,86,96,67'

In [29]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Requests are sent as CSV; responses are parsed as JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [30]:
# Smoke-test the endpoint with the sample row; the parsed response is displayed.
predictor.predict(csv_input)

0.9916712045669556

In [31]:
# Inspect the constraints object before editing it.
constraints.__dict__

{'body_dict': {'version': 0.0,
  'features': [{'name': 'approved',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'sex',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'math',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'science',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'technology',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'random1',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'random2',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}}],
  'monitoring_config': {'evaluate_c

In [32]:
# Relax the suggested type of the `approved` feature from Integral to
# Fractional (presumably because the deployed model returns a probability
# such as 0.99 rather than a 0/1 label -- confirm).
# The entry is selected by feature name rather than by list position, so the
# edit still hits the right feature if the order ever changes; a missing
# `approved` feature raises StopIteration instead of silently editing another one.
approved_constraint = next(
    feature
    for feature in constraints.body_dict['features']
    if feature['name'] == 'approved'
)
approved_constraint['inferred_type'] = 'Fractional'
constraints.body_dict

{'version': 0.0,
 'features': [{'name': 'approved',
   'inferred_type': 'Fractional',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'sex',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'math',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'science',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'technology',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'random1',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'random2',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}}],
 'monitoring_config': {'evaluate_constraints': 'Enabled',
  'emit_metrics':

In [33]:
# Persist the edited constraints; the displayed return value is the S3 URI of
# the constraints.json that was written.
constraints.save()

's3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/constraints.json'

In [34]:
# Attach a monitoring schedule to the deployed endpoint. Captured traffic is
# checked against the baseline statistics and the edited constraints on the
# cadence given by `cron_expression`; reports go to `s3_report_path` and
# metrics are emitted to CloudWatch.
default_monitor.create_monitoring_schedule(
    monitor_schedule_name=schedule_name,
    # `Predictor.endpoint` was renamed in sagemaker>=2 and only survives as a
    # deprecated alias that prints a warning; `endpoint_name` is the
    # supported attribute and carries the same value.
    endpoint_input=predictor.endpoint_name,
    output_s3_uri=s3_report_path,
    statistics=baseline_statistics,
    constraints=constraints,
    schedule_cron_expression=cron_expression,
    enable_cloudwatch_metrics=True,
)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [35]:
# Show the schedule's current description (status, cron expression, endpoint).
default_monitor.describe_schedule()

{'MonitoringScheduleArn': 'arn:aws:sagemaker:us-east-1:581320662326:monitoring-schedule/schedule-rxzqf',
 'MonitoringScheduleName': 'schedule-RXZQF',
 'MonitoringScheduleStatus': 'Pending',
 'MonitoringType': 'DataQuality',
 'CreationTime': datetime.datetime(2021, 6, 13, 18, 48, 19, 131000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 6, 13, 18, 48, 19, 166000, tzinfo=tzlocal()),
 'MonitoringScheduleConfig': {'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'},
  'MonitoringJobDefinitionName': 'data-quality-job-definition-2021-06-13-18-48-18-896',
  'MonitoringType': 'DataQuality'},
 'EndpointName': 'model-VWXTC-2021-06-13-18-20-15-736',
 'ResponseMetadata': {'RequestId': 'c495cb55-6dcf-4b42-887e-b669a34cf8f0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c495cb55-6dcf-4b42-887e-b669a34cf8f0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '538',
   'date': 'Sun, 13 Jun 2021 18:48:19 GMT'},
  'RetryAttempts': 0}}

In [37]:
from time import sleep

# Pause for five minutes before sending traffic (presumably to give the
# newly created schedule time to leave its initial state -- confirm).
sleep(5 * 60)

In [38]:
def perform_good_input():
    """Send the sample row ``csv_input`` to the endpoint and log that it was sent.

    Relies on the notebook-level ``predictor`` and ``csv_input``. The
    prediction itself is discarded; nothing is returned.
    """
    predictor.predict(csv_input)
    print("good input")

In [39]:
def perform_bad_input():
    """Send a deliberately off-baseline row to the endpoint and log that it was sent.

    The row contains a negative non-integer value and a negative integer,
    unlike the non-negative integers of the sample row. Relies on the
    notebook-level ``predictor``; the prediction is discarded and nothing is
    returned.
    """
    malformed_row = '1,92,-83.3,86,-96,67'
    predictor.predict(malformed_row)
    print("bad input")

In [40]:
# Send one well-formed and one off-baseline request to the endpoint.
perform_good_input()
perform_bad_input()

good input
bad input


In [41]:
# Short alias for the monitor.
dm = default_monitor
# Ask for the most recent execution's violations and statistics. Until the
# schedule has executed at least once there is nothing to report.
monitoring_violations = dm.latest_monitoring_constraint_violations()
monitoring_statistics = dm.latest_monitoring_statistics()

No executions found for schedule. monitoring_schedule_name: schedule-RXZQF
No executions found for schedule. monitoring_schedule_name: schedule-RXZQF
No executions found for schedule. monitoring_schedule_name: schedule-RXZQF
No executions found for schedule. monitoring_schedule_name: schedule-RXZQF


In [42]:
%%time

from time import sleep

violations = monitoring_violations

while not violations:
    print("No executions yet. Sleeping for 5 minutes...")
    sleep(300)
    
    perform_good_input()
    perform_bad_input()
    
    try:
        v = dm.latest_monitoring_constraint_violations()
        violations = v
    except:
        pass
    
print("Executions found!")

No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-RXZQF
No executions yet. Sleeping for 5 minutes...
good input
bad input

Could not retrieve constraints file at location 's3://sagemaker-cookbook-bucket/chapter07/report-path/model-VWXTC-2021-06-13-18-20-15-736/schedule-RXZQF/2021/06/13/19/constraint_violations.json'. To manually retrieve ConstraintViolations object from a given uri, use 'my_model_monitor.constraints(my_s3_uri)' or 'ConstraintViolations.from_s3_uri(my_s3_uri)'
No executions yet. Sleeping for 5 minutes...
good input
bad input
Executions found!
CPU times: user 159 ms, sys: 17.1 ms, total: 176 ms
Wall time: 15min 1s


In [43]:
# Fetch the latest violations report and dump its attributes, including the
# S3 URI of the underlying file.
violations = dm.latest_monitoring_constraint_violations()
violations.__dict__

{'body_dict': {'violations': [{'feature_name': 'science',
    'constraint_check_type': 'data_type_check',
    'description': 'Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 75.0% of data is Integral.'}]},
 'file_s3_uri': 's3://sagemaker-cookbook-bucket/chapter07/report-path/model-VWXTC-2021-06-13-18-20-15-736/schedule-RXZQF/2021/06/13/19/constraint_violations.json',
 'kms_key': None,
 'session': None}

In [44]:
# Download the violations report to a local file.
!aws s3 cp {violations.file_s3_uri} tmp/violations.json

download: s3://sagemaker-cookbook-bucket/chapter07/report-path/model-VWXTC-2021-06-13-18-20-15-736/schedule-RXZQF/2021/06/13/19/constraint_violations.json to tmp/violations.json


In [45]:
# Print the raw violations report.
!cat tmp/violations.json

{
  "violations" : [ {
    "feature_name" : "science",
    "constraint_check_type" : "data_type_check",
    "description" : "Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 75.0% of data is Integral."
  } ]
}

In [46]:
# Fetch the statistics of the latest monitoring execution and dump its attributes.
monitoring_statistics = dm.latest_monitoring_statistics()
monitoring_statistics.__dict__

{'body_dict': {'version': 0.0,
  'dataset': {'item_count': 4},
  'features': [{'name': 'approved',
    'inferred_type': 'Fractional',
    'numerical_statistics': {'common': {'num_present': 4, 'num_missing': 0},
     'mean': 0.9229514747858047,
     'sum': 3.691805899143219,
     'std_dev': 0.1190260634633573,
     'min': 0.7167922854423523,
     'max': 0.9916712045669556,
     'distribution': {'kll': {'buckets': [{'lower_bound': 0.7167922854423523,
         'upper_bound': 0.7442801773548127,
         'count': 1.0},
        {'lower_bound': 0.7442801773548127,
         'upper_bound': 0.7717680692672729,
         'count': 0.0},
        {'lower_bound': 0.7717680692672729,
         'upper_bound': 0.7992559611797333,
         'count': 0.0},
        {'lower_bound': 0.7992559611797333,
         'upper_bound': 0.8267438530921936,
         'count': 0.0},
        {'lower_bound': 0.8267438530921936,
         'upper_bound': 0.8542317450046539,
         'count': 0.0},
        {'lower_bound': 0.85423

In [47]:
# Clean up: remove the monitoring schedule created by this notebook.
default_monitor.delete_monitoring_schedule()


Deleting Monitoring Schedule with name: schedule-RXZQF


In [48]:
# Clean up: delete the endpoint behind `predictor`.
predictor.delete_endpoint()