# Debugger and Profiler

In [None]:
# Install the SageMaker Debugger client library (smdebug); later cells import it
# to read the saved tensors and the profiler data.
!pip install smdebug

In [None]:
# Hyperparameters handed to the estimator for the training script. The job is
# submitted from this notebook (instead of a separate submit script) to keep
# things simple.
hyperparameters = dict(
    batch_size=2048,
    gpu=True,
    epoch=2,
    model="resnet50",
)

#### Key concepts of Amazon SageMaker Debugger

Amazon SageMaker Debugger lets you go beyond just looking at scalars like losses and accuracies during training and gives you full visibility into all tensors 'flowing through the graph' during training. Furthermore, it helps you monitor your training in near real time using rules and alerts you as soon as it detects an inconsistency in the training flow.

Concepts
- Tensors: These represent the state of the training network at intermediate points during its execution
- Debug Hook: Hook is the construct with which Amazon SageMaker Debugger looks into the training process and captures the tensors requested at the desired step intervals
- Rule: A logical construct, implemented as Python code, which helps analyze the tensors captured by the hook and reports anomalies, if any


In [None]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Built-in Debugger rules: these are created with Rule.sagemaker
debugger_rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
]

# Built-in Profiler rules: these are created with ProfilerRule.sagemaker
profiler_rules = [
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# Single list (debugger rules first, then profiler rules) passed to the estimator
rules = debugger_rules + profiler_rules

In [None]:
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile

# Debugger hook configuration: the hook parameters set the save interval
# separately for the train and eval modes.
debugger_config = DebuggerHookConfig(
    hook_parameters={"train.save_interval": "100", "eval.save_interval": "10"})

# Profiler configuration: system metrics are sampled every 500 ms, and framework
# profiling is limited to 10 steps.
# The keyword is `num_steps`; the previous spelling `num_stpes` is not a
# FrameworkProfile parameter and made this cell fail.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10))

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch

# IAM role the training job runs under
role = sagemaker.get_execution_role()

# PyTorch estimator wired up with the hyperparameters, the profiler/debugger
# configs and the rules defined in the previous cells.
estimator = PyTorch(
    role=role,
    base_job_name="smdebugger-cifar-pytorch",
    instance_type="ml.p3.2xlarge",
    # The keyword is `instance_count`; the previous spelling `intance_count`
    # is not an estimator parameter, so the instance count was never set.
    instance_count=1,
    entry_point="scripts/pytorch_cifar_profiling.py",
    framework_version="1.8",
    py_version="py36",
    hyperparameters=hyperparameters,
    profiler_config=profiler_config,
    debugger_hook_config=debugger_config,
    rules=rules,
)

In [None]:
# Launch the training job. wait=True makes this call block until the job ends,
# so the cells below run against a finished job.
estimator.fit(wait=True)

#### Prepare for debugging and profiling analysis

In [None]:
import boto3

# Name of the most recent training job launched through the estimator
training_job_name = estimator.latest_training_job.name

# Region configured on a fresh boto3 session
session = boto3.session.Session()
region = session.region_name

# Echo both values; they are needed to locate the profiling data later on
print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

In [None]:
# Fetch the training job's description from the SageMaker API, using the
# low-level client held by the estimator's session.
client = estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=training_job_name)

## Checking Training Performance

A Trial is an object used to interact with and analyze the debugging artifacts generated during a training job. SageMaker Debugger collects data such as tensors, metrics, and other relevant information at different points during the training process. These artifacts are stored in a specific path (e.g., in S3), and the Trial object allows you to access, query, and visualize this data.

In [None]:
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Create the Trial from the path where the latest job's Debugger artifacts are stored
debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debugger_artifacts_path)

#### Print the names of tracked tensors for both train and evaluation, and the number of datapoints for each tensor

In [None]:
# List the name of every tensor available in the trial
print(f'Tensor names: {trial.tensor_names()}\n')

In [None]:
# The model uses CrossEntropy loss (not NLL loss), so the loss tensor is looked up
# as "CrossEntropyLoss_output_0" instead of "nll_loss_output_0".
# NOTE(review): confirm the exact name against the output of trial.tensor_names().
loss_tensor = trial.tensor("CrossEntropyLoss_output_0")
print(f'Datapoints in the training tensor: {len(loss_tensor.steps(mode=ModeKeys.TRAIN))}\n')
print(f'Datapoints in the evaluation tensor: {len(loss_tensor.steps(mode=ModeKeys.EVAL))}')

### Plot the tracked tensors
Set up functions to plot the output tensors

In [None]:
# Retrieve the recorded steps and values of a tensor, looked up by name
def get_data(trial, tname, mode):
    """Return the steps recorded for a tensor and the value at each step.

    Args:
        trial: Object exposing ``tensor(name)``, used to look the tensor up.
        tname: Name of the tensor to read.
        mode: Mode forwarded to both ``steps`` and ``value``.

    Returns:
        A ``(steps, values)`` tuple: ``steps`` is whatever ``tensor.steps``
        returned and ``values`` is a list with one entry per step, in order.
    """
    tensor = trial.tensor(tname)
    recorded_steps = tensor.steps(mode=mode)
    values = [tensor.value(step, mode=mode) for step in recorded_steps]
    return recorded_steps, values

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot

# Plot the TRAIN and EVAL series of one tensor in a single figure
def plot_tensor(trial, tensor_name):
    """Plot a tensor's TRAIN and EVAL values against their own step axes.

    The two series are fetched with ``get_data`` and drawn on a host axes
    (TRAIN) and on its ``twiny()`` companion (EVAL). Each x-axis label and
    legend entry is tinted with the colour of the line it belongs to.

    Args:
        trial: Trial object handed to ``get_data``.
        tensor_name: Name of the tensor to plot; also used for the y-axis
            label and the legend labels.
    """
    train_steps, train_values = get_data(trial, tensor_name, mode=ModeKeys.TRAIN)
    print("loaded TRAIN data")
    eval_steps, eval_values = get_data(trial, tensor_name, mode=ModeKeys.EVAL)
    print("loaded EVAL data")

    plt.figure(figsize=(10, 7))
    train_axes = host_subplot(111)
    # Companion axes for the EVAL series, with its own x-axis label
    eval_axes = train_axes.twiny()

    train_axes.set_xlabel("Steps (TRAIN)")
    eval_axes.set_xlabel("Steps (EVAL)")
    train_axes.set_ylabel(tensor_name)

    # Each plot call is expected to produce exactly one line
    [train_line] = train_axes.plot(train_steps, train_values, label=tensor_name)
    print("completed TRAIN plot")
    [eval_line] = eval_axes.plot(eval_steps, eval_values, label="val_" + tensor_name)
    print("completed EVAL plot")
    legend = plt.legend()

    # Tint each x-axis label and its legend entry with the line's colour
    for position, (axes, line) in enumerate(
        ((train_axes, train_line), (eval_axes, eval_line))
    ):
        axes.xaxis.get_label().set_color(line.get_color())
        legend.texts[position].set_color(line.get_color())

    plt.ylabel(tensor_name)

    plt.show()

In [None]:
# Plot the loss tensor. The tensor name was previously left empty, which cannot
# match any saved tensor. The name below assumes the training script tracks a
# CrossEntropyLoss — confirm it against trial.tensor_names().
plot_tensor(trial, "CrossEntropyLoss_output_0")

## Check System Utilization

The TrainingJob object tj encapsulates all the information and utilities required to interact with the profiling data collected during the specified training job.

In [None]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Build the profiling TrainingJob helper from the job name and region gathered
# earlier, then wait until the system profiling data is available.
tj = TrainingJob(training_job_name, region)
tj.wait_for_sys_profiling_data_to_be_available()

In [None]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

# Display system metrics
# Get the system metrics reader from the training job and refresh its list of
# event files before charting.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Timeline charts built from system metrics only (no framework metrics reader),
# restricted to the "CPU" and "GPU" dimensions and the "total" events.
view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=["CPU", "GPU"],
    select_events=["total"],
)

## Display the Profiler Report
We will fetch the profiler report from the S3 bucket where it was stored and display it.

In [None]:
# The rule output sits under "<estimator output path><job name>/rule-output"
rule_output_path = "".join(
    (
        estimator.output_path,
        estimator.latest_training_job.job_name,
        "/rule-output",
    )
)
print(f"You will find the profiler report at {rule_output_path}")

In [None]:
# List everything stored under the rule output path
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Copy the rule output into the current working directory
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
import os

# Collect the configuration names of the job's rules that mention "Profiler";
# the first one is the autogenerated folder name of the profiler report.
profiler_rule_names = [
    summary["RuleConfigurationName"]
    for summary in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in summary["RuleConfigurationName"]
]
profiler_report_name = profiler_rule_names[0]

In [None]:
import IPython

# Render the profiler report HTML file found inside the report folder
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")