In [126]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
import os
import json

from sagemaker.model_monitor import DataCaptureConfig, DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
# Setup SageMaker session
# The SDK session and the low-level client are both created without an explicit region here.
sagemaker_session = sagemaker.Session()
sagemaker_client = boto3.client("sagemaker")

# Execution role of this notebook; later cells pass it to DefaultModelMonitor.
role = sagemaker.get_execution_role()
# NOTE(review): this prints the full role ARN (including the AWS account ID) into the
# saved output — consider clearing the output before sharing the notebook.
print(role)

arn:aws:iam::837028399719:role/iti113-team2-sagemaker-iti113-team2-domain-iti113-team2-Role


----
#### Generate the Baseline

In [17]:
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.model_monitor import DataCaptureConfig, DefaultModelMonitor

# S3 locations used by the baseline job.
bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'

monitor_output_path = f"s3://{bucket_name}/{base_folder}/monitoring"
# CSV (with a header row, see dataset_format below) that the baseline is computed from.
baseline_data = f's3://{bucket_name}/{base_folder}/processing/train/v1/train.csv'

# Create a DefaultModelMonitor instance
monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.t3.large',
    sagemaker_session=sagemaker_session
)

# Run the baseline job
# Results are written under <monitor_output_path>/baseline; the scheduling cells read
# statistics.json and constraints.json from that prefix.
baseline_job = monitor.suggest_baseline(
    baseline_dataset=baseline_data,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=f"{monitor_output_path}/baseline",
    wait=True,
    logs=True
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2025-08-27-02-24-50-812


........................[34m2025-08-27 02:28:55.574072: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2025-08-27 02:28:55.574140: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2025-08-27 02:28:58.100596: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2025-08-27 02:28:58.100650: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2025-08-27 02:28:58.100681: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-0-130-162.ap-southeast-1.compute.internal): /proc/d

#### Test Code

In [19]:
# Scratch cell: create a throw-away schedule to verify the baseline artefacts are usable.
bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'

monitor_output_path = f"s3://{bucket_name}/{base_folder}/monitoring"

endpoint_name = "Team2-predictor-endpoint"
schedule_name = 'Team2-monitor-schedule'

# Uses the `monitor` object created in the baseline cell; statistics/constraints point at
# the files written under <monitor_output_path>/baseline.
monitor.create_monitoring_schedule(
    monitor_schedule_name=schedule_name,
    endpoint_input=endpoint_name,
    output_s3_uri=f"{monitor_output_path}/output",
    statistics=f"{monitor_output_path}/baseline/statistics.json",
    constraints=f"{monitor_output_path}/baseline/constraints.json",
    schedule_cron_expression='cron(0 * ? * * *)'  # hourly
)


INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: Team2-monitor-schedule


In [22]:
import boto3
# Start the test schedule through the low-level SageMaker API.
schedule_name = 'Team2-monitor-schedule'
sm = boto3.client('sagemaker')
# As the last expression of the cell, the raw API response is echoed as the cell output.
sm.start_monitoring_schedule(MonitoringScheduleName=schedule_name)

{'ResponseMetadata': {'RequestId': '71157dc6-6ff2-4b25-929b-b4fd9ddb7c1a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '71157dc6-6ff2-4b25-929b-b4fd9ddb7c1a',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 27 Aug 2025 02:45:39 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [23]:
import boto3
# Report the schedule status and the status of its most recent execution.
sm = boto3.client("sagemaker")
schedule_name = 'Team2-monitor-schedule'
response = sm.describe_monitoring_schedule(MonitoringScheduleName=schedule_name)
print("Status:", response['MonitoringScheduleStatus'])

# Falls back to an empty dict when the response has no 'LastMonitoringExecutionSummary',
# so the line below prints None instead of raising.
last_run = response.get('LastMonitoringExecutionSummary', {})
print("Last execution:", last_run.get('MonitoringExecutionStatus'))


Status: Scheduled
Last execution: None


In [24]:
import boto3

# Print the schedule's configuration (cron expression, job definition name, monitoring type).
# NOTE(review): `client` defined here is reused by the later "list schedules" cell.
client = boto3.client("sagemaker")
schedule_name = 'Team2-monitor-schedule'
response = client.describe_monitoring_schedule(MonitoringScheduleName=schedule_name)
print(response['MonitoringScheduleConfig'])


{'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'}, 'MonitoringJobDefinitionName': 'data-quality-job-definition-2025-08-27-02-45-17-768', 'MonitoringType': 'DataQuality'}


In [26]:
#delete monitor
import boto3

# Remove the throw-away test schedule created above.
sm = boto3.client("sagemaker")

schedule_name = 'Team2-monitor-schedule'

# Delete the monitoring schedule
response = sm.delete_monitoring_schedule(MonitoringScheduleName=schedule_name)

# Only reached when the delete call above did not raise.
print(f"Monitoring schedule '{schedule_name}' deleted successfully.")


Monitoring schedule 'Team2-monitor-schedule' deleted successfully.


------
#### Re-attach to the job as a ProcessingJob

In [19]:
import sagemaker
from sagemaker.processing import ProcessingJob
from sagemaker.model_monitor import Constraints

sagemaker_session = sagemaker.Session()

# !!!! The job name from your successful run
baseline_job_name = "baseline-suggestion-job-2025-08-25-14-21-55-214"

# Re-attach as a ProcessingJob. Any failure is reported and leaves baseline_job as None,
# which makes the inspection step below a no-op.
baseline_job = None
try:
    baseline_job = ProcessingJob.from_processing_name(
        processing_job_name=baseline_job_name,
        sagemaker_session=sagemaker_session,
    )
    print(f"Successfully re-attached to job: {baseline_job_name}")
except Exception as e:
    print(f"Failed to re-attach to the job. Error: {e}")


# Now, load and inspect the constraints from S3
if baseline_job:
    # Outputs declared by the job, taken from its description
    output_config = baseline_job.describe()['ProcessingOutputConfig']['Outputs']

    # S3 URI of the first output named 'monitoring_output', or None when there is none
    baseline_output_uri = next(
        (
            output['S3Output']['S3Uri']
            for output in output_config
            if output['OutputName'] == 'monitoring_output'
        ),
        None,
    )

    if not baseline_output_uri:
        print("Could not find the baseline output path in the job description.")
    else:
        # constraints.json sits directly under the baseline output prefix
        constraints_s3_uri = f"{baseline_output_uri}/constraints.json"
        print(f"\nLoading constraints from: {constraints_s3_uri}")

        # Fetch and parse the constraints file
        suggested_constraints = Constraints.from_s3_uri(constraints_s3_uri)

        # Pretty-print the parsed constraints dictionary
        print("\n--- Sample of Generated Constraints ---")
        from pprint import pprint
        pprint(suggested_constraints.body_dict)

Successfully re-attached to job: baseline-suggestion-job-2025-08-25-14-21-55-214

Loading constraints from: s3://iti113-team2-bucket/Team2/monitoring/baseline/constraints.json

--- Sample of Generated Constraints ---
{'features': [{'completeness': 1.0,
               'inferred_type': 'Fractional',
               'name': 'restingBP',
               'num_constraints': {'is_non_negative': False}},
              {'completeness': 1.0,
               'inferred_type': 'Fractional',
               'name': 'serumcholestrol',
               'num_constraints': {'is_non_negative': False}},
              {'completeness': 1.0,
               'inferred_type': 'Fractional',
               'name': 'maxheartrate',
               'num_constraints': {'is_non_negative': False}},
              {'completeness': 1.0,
               'inferred_type': 'Fractional',
               'name': 'oldpeak',
               'num_constraints': {'is_non_negative': False}},
              {'completeness': 1.0,
               '

----
#### Schedule the Monitoring Job

In [134]:
from sagemaker.model_monitor import CronExpressionGenerator, DefaultModelMonitor
from botocore.exceptions import ClientError
import pandas as pd
import boto3
import sagemaker
from sagemaker.model_monitor import EndpointInput

sagemaker_client = boto3.client("sagemaker", region_name="ap-southeast-1")

# Confirm endpoint is InService
endpoint_name = "Team2-predictor-endpoint"
response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
print("Endpoint status:", response['EndpointStatus'])  # should be "InService"

sagemaker_session = sagemaker.Session()
print("SageMaker default region:", sagemaker_session.boto_region_name)
role = sagemaker.get_execution_role()

bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'

monitor_output_path = f"s3://{bucket_name}/{base_folder}/monitoring"
baseline_output_uri = f"{monitor_output_path}/baseline"

#schedule_name
schedule_name = "Team2-drift-schedule-main"

# Initialize model monitor (used as-is when a new schedule has to be created)
monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.t3.large',
    sagemaker_session=sagemaker_session
)

try:
    # Check if schedule exists
    sagemaker_client.describe_monitoring_schedule(MonitoringScheduleName=schedule_name)
except ClientError as e:
    if e.response['Error']['Code'] != 'ResourceNotFound':
        print("An unexpected error occurred while checking for the schedule.")
        # Bare raise keeps the original traceback intact.
        raise

    print(f"No schedule named '{schedule_name}' found. Creating a new one.")

    # Create monitoring schedule against the baseline artefacts
    monitor.create_monitoring_schedule(
        monitor_schedule_name=schedule_name,
        endpoint_input=endpoint_name,
        output_s3_uri=f"{monitor_output_path}/reports",
        statistics=f"{baseline_output_uri}/statistics.json",
        constraints=f"{baseline_output_uri}/constraints.json",
        schedule_cron_expression=CronExpressionGenerator.hourly(),
        enable_cloudwatch_metrics=True,
    )
    print(f"Monitoring schedule '{schedule_name}' created successfully.")
else:
    print(f"Found existing monitoring schedule: '{schedule_name}'")

    # `attach` is a classmethod that RETURNS a monitor bound to the existing schedule
    # (including its job definition name); calling it on an instance and discarding the
    # result does not bind anything, so the returned object must replace `monitor`.
    monitor = DefaultModelMonitor.attach(
        monitor_schedule_name=schedule_name,
        sagemaker_session=sagemaker_session,
    )
    print("Successfully attached monitor object to the schedule.")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Endpoint status: InService
SageMaker default region: ap-southeast-1
No schedule named 'Team2-drift-schedule-main' found. Creating a new one.


INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: Team2-drift-schedule-main


Monitoring schedule 'Team2-drift-schedule-main' created successfully.


In [142]:
# Now describe schedule details
# Relies on `monitor` from the scheduling cell; any error is reported rather than raised
# so the notebook keeps running.
try:
    schedule_details = monitor.describe_schedule()
    print(f"Schedule status: {schedule_details['MonitoringScheduleStatus']}")
except Exception as e:
    print(f"Could not retrieve schedule details: {e}")

Schedule status: Scheduled


-----

#### Simulate, Detect, and Analyze Drift

In [128]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
import time
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import json

# === CONFIGURATION ===
bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'
endpoint_name = "Team2-predictor-endpoint"
aws_region = "ap-southeast-1"

# === SETUP CLIENTS ===
boto_session = boto3.Session(region_name=aws_region)
sagemaker_session = sagemaker.Session(boto_session=boto_session)
sagemaker_client = boto3.client("sagemaker", region_name=aws_region)
s3_client = boto3.client('s3', region_name=aws_region)

# Processed test split; the label column is dropped so only model inputs remain.
s3_process_test_path = f"s3://{bucket_name}/{base_folder}/processing/test/v1/test.csv"
df = pd.read_csv(s3_process_test_path)
df = df.drop("target", axis=1)

# Select a few rows to manipulate (simulate drift)
drifted_data = df.head(100).copy()

# Manually introduce DRIFT: overwrite each listed feature with the constant 1.
# Means are printed with decimals because the features are scaled around 0, so
# truncating them with int() would always show 0 and hide the change.
for drift_column in ('restingBP', 'serumcholestrol'):
    print(f"Original average {drift_column}: {drifted_data[drift_column].mean():.4f}")
    drifted_data[drift_column] = 1
    print(f"New average {drift_column}: {drifted_data[drift_column].mean():.4f}")

# # Print sample payloads
# print("\nSample drifted payloads:")
# print(drifted_data.head(2))

print("\nNumber of features:", drifted_data.shape[1])

# # Create SageMaker predictor
# predictor = sagemaker.predictor.Predictor(
#     endpoint_name=endpoint_name,
#     serializer=JSONSerializer(),
#     deserializer=JSONDeserializer(),
#     sagemaker_session=sagemaker_session
# )

# # === SEND PREDICTION PAYLOADS ===
# print(f"\nSending {len(drifted_data)} drifted requests to endpoint: {endpoint_name}")
# for i, row in drifted_data.iterrows():
#     payload = {"data": [row.to_dict()]}
#     try:
#         response = predictor.predict(payload)
#         print(f"[{i}] ✅ Response: {response}")
#         time.sleep(0.1)  # Optional delay to avoid throttling
#     except Exception as e:
#         print(f"[{i}] ❌ Error sending request: {e}")

# print("\n✅ All drifted requests sent.")

Original average restingBP: 0
New average restingBP: 1.0
Original average serumcholestrol: 0
New average serumcholestrol: 1.0

Number of features: 25


In [129]:
# Test-invoke the endpoint: the whole drifted batch is sent as one JSON request of the
# form {"data": [<record>, ...]}.
sagemaker_runtime_client = boto3.client("sagemaker-runtime", region_name=aws_region)

response = sagemaker_runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps({"data": drifted_data.to_dict(orient="records")})
)
# Single quotes inside the f-string: reusing the enclosing double quote is a SyntaxError
# on Python versions before 3.12.
print(f"\nContentType: {response['ContentType']}")
print(response["Body"].read().decode("utf-8"))


ContentType: application/json
{"predictions": [1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1]}


----
#### Manually start the monitoring schedule

In [143]:
# Start the schedule that `monitor` is currently bound to (see the scheduling cell).
monitor.start_monitoring_schedule()

INFO:sagemaker:Starting Monitoring Schedule with name: Team2-drift-schedule-main


In [162]:
# List the names of the monitoring schedules visible to this account/region.
# Uses `sagemaker_client` like the rest of the notebook instead of `client`, which only
# exists if the scratch "Test Code" cell was run first.
schedules = sagemaker_client.list_monitoring_schedules()["MonitoringScheduleSummaries"]
print([s["MonitoringScheduleName"] for s in schedules])

['Team2-drift-schedule-main', 'ADL-drift-schedule-main']


----
#### Check scheduled monitor

In [154]:
# Fetch the five most recent executions of the schedule, newest first.
response = sagemaker_client.list_monitoring_executions(
    MonitoringScheduleName=schedule_name,
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=5
)

print(response['MonitoringExecutionSummaries'])
print("\n")
for job in response['MonitoringExecutionSummaries']:
    print(f"Status: {job['MonitoringExecutionStatus']}, CreationTime: {job['CreationTime']}")
    # These fields are optional in the summary; in particular 'FailureReason' is absent
    # for executions that did not fail, so indexing it directly raises KeyError.
    print(f"EndpointName: {job.get('EndpointName')}")
    print(f"MonitoringType: {job.get('MonitoringType')}")
    print(f"FailureReason: {job.get('FailureReason')}")

[{'MonitoringScheduleName': 'Team2-drift-schedule-main', 'ScheduledTime': datetime.datetime(2025, 8, 27, 3, 0, tzinfo=tzlocal()), 'CreationTime': datetime.datetime(2025, 8, 27, 3, 8, 20, 489000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2025, 8, 27, 3, 15, 58, 571000, tzinfo=tzlocal()), 'MonitoringExecutionStatus': 'Failed', 'ProcessingJobArn': 'arn:aws:sagemaker:ap-southeast-1:837028399719:processing-job/model-monitoring-202508270300-224495b8a80ed9505b8b78d0', 'EndpointName': 'Team2-predictor-endpoint', 'FailureReason': 'AlgorithmError: Error: Errors occurred when analyzing your data. Please check CloudWatch logs for more details., exit code: 255', 'MonitoringJobDefinitionName': 'data-quality-job-definition-2025-08-26-08-34-34-473', 'MonitoringType': 'DataQuality'}]


Status: Failed, CreationTime: 2025-08-27 03:08:20.489000+00:00
EndpointName: Team2-predictor-endpoint
MonitoringType: DataQuality
FailureReason: AlgorithmError: Error: Errors occurred when analyzing your d

-----
#### Manually check the baseline and the captured data

In [164]:
import boto3
import pandas as pd
import json
import io

bucket_name = "iti113-team2-bucket"
capture_prefix = "Team2/monitoring/data-capture/"

s3 = boto3.client('s3')

# List objects under data-capture folder.
# A single list_objects_v2 call returns at most one page of keys, so page through the
# listing to count every capture file once the folder grows.
paginator = s3.get_paginator('list_objects_v2')

# Filter for .jsonl files (if any)
jsonl_files = [
    item['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=capture_prefix)
    for item in page.get('Contents', [])
    if item['Key'].endswith('.jsonl')
]

print(f"Found {len(jsonl_files)} .jsonl files for captured data.")


Found 34 .jsonl files for captured data.


#### load capture jsonl, capture stats

In [156]:
import boto3
import json
import pandas as pd

bucket_name = "iti113-team2-bucket"
capture_prefix = "Team2/monitoring/data-capture/"

s3 = boto3.client('s3')

# Page through the listing so capture files beyond the first page are not missed.
paginator = s3.get_paginator('list_objects_v2')
capture_objects = [
    item
    for page in paginator.paginate(Bucket=bucket_name, Prefix=capture_prefix)
    for item in page.get('Contents', [])
    if item['Key'].endswith('.jsonl')
]
jsonl_files = [item['Key'] for item in capture_objects]

# Defined up front so this cell never leaves df_captured undefined (or stale from an
# earlier run) when there is nothing to load.
df_captured = pd.DataFrame()

if not capture_objects:
    print("No JSONL files found in the capture folder.")
else:
    # Pick the most recently written file; key order alone is only chronological
    # within a single endpoint/variant sub-folder.
    last_file_key = max(capture_objects, key=lambda item: item['LastModified'])['Key']
    print(f"Loading last file: {last_file_key}")

    obj = s3.get_object(Bucket=bucket_name, Key=last_file_key)
    content = obj['Body'].read().decode('utf-8')

    all_records = []
    for line in content.splitlines():
        # Skip blank lines (e.g. an empty file) instead of failing in json.loads.
        if not line.strip():
            continue
        event = json.loads(line)
        # The request payload is stored as a JSON string under captureData.endpointInput.data
        input_data_str = event.get("captureData", {}).get("endpointInput", {}).get("data")
        if input_data_str:
            input_data = json.loads(input_data_str)
            # Requests are sent as {"data": [<record>, ...]}
            records = input_data.get("data", [])
            all_records.extend(records)

    df_captured = pd.DataFrame(all_records)
    # Non-numeric values become NaN so they count against completeness.
    df_captured = df_captured.apply(pd.to_numeric, errors='coerce')
    print(f"Captured data from last file loaded with shape: {df_captured.shape}")
    print(df_captured.head())

def calc_stats(df):
    """Summarise every column of ``df`` as one row of descriptive statistics.

    Returns a DataFrame indexed by the column names of ``df`` with the columns
    "Mean", "Std Dev", "Min", "Max" and "Completeness" (fraction of non-null values).
    """
    def _summarise(series):
        # One record per column; keys become the columns of the transposed result.
        return {
            "Mean": series.mean(),
            "Std Dev": series.std(),
            "Min": series.min(),
            "Max": series.max(),
            "Completeness": series.notna().mean()
        }

    per_column = {column: _summarise(df[column]) for column in df.columns}
    return pd.DataFrame(per_column).T
    
# Per-feature summary of the captured requests; `df_captured` comes from the loading
# code earlier in this cell.
captured_stats = calc_stats(df_captured)
print(captured_stats)


Loading last file: Team2/monitoring/data-capture/Team2-predictor-endpoint/AllTraffic/2025/08/27/05/16-26-766-20ea9f40-7337-4d43-b630-333a3ae086c7.jsonl
Captured data from last file loaded with shape: (100, 25)
   restingBP  serumcholestrol  maxheartrate   oldpeak  gender_0  gender_1  \
0          1                1      0.980974  0.228096       0.0       1.0   
1          1                1      1.566229 -1.574342       0.0       1.0   
2          1                1      1.185813 -1.516199       1.0       0.0   
3          1                1      0.659084  1.681675       0.0       1.0   
4          1                1      1.185813 -0.527765       1.0       0.0   

   chestpain_0  chestpain_1  chestpain_2  chestpain_3  ...  exerciseangia_0  \
0          1.0          0.0          0.0          0.0  ...              1.0   
1          1.0          0.0          0.0          0.0  ...              0.0   
2          0.0          1.0          0.0          0.0  ...              1.0   
3          

#### load baseline stats

In [157]:
import boto3
import json

bucket_name = "iti113-team2-bucket"
baseline_key = "Team2/monitoring/baseline/statistics.json"

s3 = boto3.client("s3")

# Load JSON from S3
response = s3.get_object(Bucket=bucket_name, Key=baseline_key)
baseline_stats = json.loads(response['Body'].read().decode('utf-8'))

# Extract numerical stats into a DataFrame
import pandas as pd


def _round_or_none(value, ndigits=5):
    """Round ``value`` to ``ndigits`` places; a missing (None) value is passed through."""
    # round(None, 5) raises TypeError, so missing statistics must be handled explicitly.
    return None if value is None else round(value, ndigits)


def _completeness(stats):
    """Return the completeness of a feature, or None when it cannot be determined."""
    if stats.get("completeness") is not None:
        return stats["completeness"]
    # Fallback: derive it from the present/missing counts under "common".
    # NOTE(review): assumes "common" holds num_present / num_missing — confirm against
    # the statistics.json produced by the baseline job.
    common = stats.get("common", {})
    present = common.get("num_present")
    missing = common.get("num_missing")
    if present is None or missing is None or present + missing == 0:
        return None
    return present / (present + missing)


rows = []
for feature in baseline_stats["features"]:
    name = feature["name"]
    # Only features that carry numerical statistics are included.
    if "numerical_statistics" in feature:
        stats = feature["numerical_statistics"]
        rows.append({
            "Feature": name,
            "Mean": _round_or_none(stats.get("mean")),
            "Std Dev": _round_or_none(stats.get("std_dev")),
            "Min": _round_or_none(stats.get("min")),
            "Max": _round_or_none(stats.get("max")),
            "Completeness": _completeness(stats),
        })

# Explicit columns keep set_index working even when no numerical feature was found.
df_baseline = pd.DataFrame(
    rows, columns=["Feature", "Mean", "Std Dev", "Min", "Max", "Completeness"]
)
df_baseline = df_baseline.set_index("Feature")

print(df_baseline)

                        Mean  Std Dev      Min      Max  Completeness
Feature                                                              
restingBP            0.01629  1.00516 -1.92810  1.61111           1.0
serumcholestrol      0.01900  0.99702 -2.35272  2.19488           1.0
maxheartrate         0.02130  0.98055 -2.17940  1.65402           1.0
oldpeak              0.00054  1.00526 -1.57434  2.03053           1.0
gender_0             0.23500  0.42400  0.00000  1.00000           1.0
gender_1             0.76500  0.42400  0.00000  1.00000           1.0
chestpain_0          0.41625  0.49294  0.00000  1.00000           1.0
chestpain_1          0.22500  0.41758  0.00000  1.00000           1.0
chestpain_2          0.31125  0.46300  0.00000  1.00000           1.0
chestpain_3          0.04750  0.21271  0.00000  1.00000           1.0
fastingbloodsugar_0  0.69750  0.45934  0.00000  1.00000           1.0
fastingbloodsugar_1  0.30250  0.45934  0.00000  1.00000           1.0
restingrelectro_0   

#### compare stats

In [160]:
import numpy as np
import pandas as pd
import boto3
from io import StringIO

def compare_stats(baseline_df, captured_df, mean_threshold=1.5, completeness_threshold=0.05, tolerance=1e-6, s3_bucket=None, s3_key=None):
    """Compare captured feature statistics against a baseline and list violations.

    Both frames are indexed by feature name and must provide the columns
    "Mean", "Std Dev", "Min", "Max" and "Completeness". Only features present in
    both indexes are compared.

    Args:
        baseline_df: Baseline statistics, one row per feature.
        captured_df: Statistics of the captured data, one row per feature.
        mean_threshold: Allowed absolute mean shift, in multiples of the baseline
            standard deviation.
        completeness_threshold: Allowed drop in completeness (baseline minus capture).
        tolerance: Absolute tolerance for the min/max comparisons, the mean limit and
            the "near-zero variance" checks.
        s3_bucket: Optional bucket; when given together with ``s3_key`` the report is
            uploaded as a JSON array.
        s3_key: Optional object key for the uploaded report.

    Returns:
        A DataFrame with the columns "feature" and "violation", one row per violation.
        The columns are present even when no violation was found.
    """
    violations = []

    def _flag(feature, message):
        # Every violation is recorded in the same two-field shape.
        violations.append({"feature": feature, "violation": message})

    # Align feature names
    common_features = baseline_df.index.intersection(captured_df.index)
    baseline_df = baseline_df.loc[common_features]
    captured_df = captured_df.loc[common_features]

    for feature in common_features:
        base = baseline_df.loc[feature]
        cap = captured_df.loc[feature]

        # Mean drift check (skipped when the baseline has no spread to scale by)
        mean_diff = abs(cap["Mean"] - base["Mean"])
        std = base["Std Dev"]
        if std > 0:
            mean_limit = mean_threshold * std
            if mean_diff > mean_limit + tolerance:
                _flag(feature, f"Mean drift too high: Baseline mean={base['Mean']:.4f}, Captured mean={cap['Mean']:.4f}")

        # Completeness drop
        completeness_diff = base["Completeness"] - cap["Completeness"]
        if completeness_diff > completeness_threshold:
            _flag(feature, f"Completeness dropped by {completeness_diff:.2%}")

        # Min below baseline min
        if not np.isclose(cap["Min"], base["Min"], atol=tolerance) and cap["Min"] < base["Min"]:
            _flag(feature, f"Captured min {cap['Min']:.4f} below baseline min {base['Min']:.4f}")

        # Max above baseline max
        if not np.isclose(cap["Max"], base["Max"], atol=tolerance) and cap["Max"] > base["Max"]:
            _flag(feature, f"Captured max {cap['Max']:.4f} above baseline max {base['Max']:.4f}")

        # Variance collapse
        if cap["Std Dev"] < tolerance and base["Std Dev"] > 0:
            _flag(feature, f"Variance dropped to zero in capture: Baseline std={base['Std Dev']:.4f}, Captured std={cap['Std Dev']:.4f}")

        if base["Std Dev"] < tolerance and cap["Std Dev"] < tolerance:
            _flag(feature, "Both baseline and capture have near-zero variance (constant feature)")

    # Convert violations to DataFrame. Explicit columns give callers a stable schema:
    # without them an empty result has no "feature"/"violation" columns at all.
    violations_df = pd.DataFrame(violations, columns=["feature", "violation"])

    if s3_bucket and s3_key:
        s3 = boto3.client("s3")
        json_str = violations_df.to_json(orient="records", lines=False)  # JSON array string
        s3.put_object(
            Bucket=s3_bucket,
            Key=s3_key,
            Body=json_str,
            ContentType="application/json",
        )
        print(f"Violation report saved to s3://{s3_bucket}/{s3_key}")

    return violations_df


In [161]:
# violation_report = compare_stats(df_baseline, captured_stats)
# Compare the baseline and captured statistics; because a bucket and key are passed,
# compare_stats also uploads the report to S3 as JSON.
report_df = compare_stats(
    df_baseline,
    captured_stats,
    mean_threshold=1.5,
    completeness_threshold=0.05,
    s3_bucket=bucket_name,
    s3_key="Team2/monitoring/reports/violation_report.json"
)
print(report_df)


Violation report saved to s3://iti113-team2-bucket/Team2/monitoring/reports/violation_report.json
           feature                                          violation
0        restingBP  Variance dropped to zero in capture: Baseline ...
1  serumcholestrol  Variance dropped to zero in capture: Baseline ...


-----
#### Check whether DataCaptureConfig is enabled

In [39]:
import boto3

# Verify that the endpoint's configuration has data capture switched on.
sagemaker_client = boto3.client('sagemaker')

endpoint_name = "Team2-predictor-endpoint"

# Get endpoint config name
endpoint_response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
endpoint_config_name = endpoint_response["EndpointConfigName"]

config_response = sagemaker_client.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name
)

# None when the endpoint config has no DataCaptureConfig section.
capture_config = config_response.get("DataCaptureConfig")

if capture_config and capture_config.get("EnableCapture"):
    print("✅ Data capture is enabled.")
    # The decision above uses the endpoint *config*; the details printed here come from
    # the endpoint description.
    print(endpoint_response['DataCaptureConfig'])

else:
    print("❌ Data capture is NOT enabled. Monitoring cannot be scheduled.")

✅ Data capture is enabled.
{'EnableCapture': True, 'CaptureStatus': 'Started', 'CurrentSamplingPercentage': 100, 'DestinationS3Uri': 's3://iti113-team2-bucket/Team2/monitoring/data-capture'}


#### Clean Up

In [163]:
# Clean up resources
# Best-effort: a failure is printed rather than raised.
try:
    # Delete the Monitoring Schedule
    # Deletes whichever schedule `monitor` is bound to.
    # NOTE(review): the message below prints the global `schedule_name`, which may differ
    # from the schedule `monitor` is bound to if another cell reassigned it.
    monitor.delete_monitoring_schedule()
    print(f"Monitoring schedule '{schedule_name}' deleted.")
except Exception as e:
    print(f"Could not delete schedule: {e}")

INFO:sagemaker:Deleting Monitoring Schedule with name: Team2-drift-schedule-main
INFO:sagemaker.model_monitor.model_monitoring:Deleting Data Quality Job Definition with name: data-quality-job-definition-2025-08-27-05-20-52-027


Monitoring schedule 'Team2-drift-schedule-main' deleted.
