In [8]:
# !pip3 install sagemaker-experiments -q
# !pip3 install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [2]:
from sagemaker.session import Session
import boto3
from dotenv import load_dotenv
import os

# Pull key/value pairs from a local .env file into the process environment
# so that SAGEMAKER_ROLE can be read just below.
load_dotenv()

# Role passed to the HuggingFace estimator later in the notebook.
# os.getenv returns None when SAGEMAKER_ROLE is not set.
role = os.getenv("SAGEMAKER_ROLE")

# boto3 session pinned to us-west-2, plus the low-level SageMaker client
# built from it (used for the Experiment create/load/delete calls).
sess = boto3.Session(region_name="us-west-2")
sm = sess.client(service_name="sagemaker")

# Region resolved by a default SageMaker SDK session.
# NOTE(review): this comes from the ambient AWS configuration, not from
# `sess`, so it is not guaranteed to be us-west-2 -- confirm they agree.
region = Session().boto_session.region_name

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/bobrandt/Library/Application Support/sagemaker/config.yaml


#### Create Experiment in SageMaker

In [3]:
from smexperiments.experiment import Experiment

# Name of the SageMaker experiment that groups every run started from this notebook.
experiment_name = "sector-classification-experiments-v0"

# Load-or-create: Experiment.create fails when an experiment with this name
# already exists, which made the cell impossible to re-run (e.g. after a
# kernel restart). Try to load it first and only create it when SageMaker
# reports that it does not exist.
try:
    sector_class_experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_boto_client=sm,
    )
except sm.exceptions.ResourceNotFound:
    sector_class_experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Sector / Subsector Classification experiments - v0",
        sagemaker_boto_client=sm,
    )

# Keep using the name as reported back by the experiment object.
experiment_name = sector_class_experiment.experiment_name
print(experiment_name)

sector-classification-experiments-v0


#### S3 URIs for the train and test data

In [4]:
import os
# Bucket that holds the datasets used by the training job.
s3_data_root_folder = "s3://team-orange-datasets"

# S3 URIs always use "/" as the separator. os.path.join uses the separator of
# the local operating system (a backslash on Windows), so the URIs are joined
# explicitly instead.
train_input_path = "/".join([s3_data_root_folder, "subsector-classification", "train"])
test_input_path = "/".join([s3_data_root_folder, "subsector-classification", "test"])

### Train HuggingFace Model via SageMaker Training Job

In [5]:
import time
# TODO: ideally these training jobs would run in parallel (e.g. kicked off horizontally via Lambda);
# not sure whether SageMaker allows multiple training instances at the same time, though.

# source: https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-script-mode/pytorch-sagemaker-huggingface/huggingface_text_classification.ipynb
# Helpful:
# - https://stackoverflow.com/questions/76821347/sagemaker-experiment-tracking-duplication
# - https://www.youtube.com/watch?v=kK1ohrpJFC0

from sagemaker.huggingface import HuggingFace
from sagemaker.experiments import Run

import itertools

# Container versions and instance shape shared by every training job below.
# Memory error when train_batch_size = 32 for bert-base-uncased, hence the
# batch size of 8 in the hyperparameters.
transformers_version = "4.26"
pytorch_version = "1.13"
python_version = "py39"
instance_type = "ml.g4dn.xlarge"
instance_count = 1

# Hyperparameter grid. The trailing comments show the wider sweeps.
# hyperparameter tuning: https://medium.com/distributed-computing-with-ray/hyperparameter-optimization-for-transformers-a-guide-c4e32c6c989b
model_list = ["bert-base-uncased"]  # ["distilbert-base-uncased", "bert-base-uncased"]
epoch_list = [2]  # [2, 3, 4]
learning_rate_list = [5e-5]  # [3e-5, 5e-5]
weight_decay_list = [0]  # [0, 0.3]

# One SageMaker session for the whole sweep instead of a new one per run.
# NOTE(review): this is a default session (region from the ambient AWS
# config), whereas the experiment was created through the us-west-2 client
# `sm` -- confirm both point at the same region.
sagemaker_session = Session()

# itertools.product walks the grid in the same order as four nested loops
# (the last list varies fastest).
for model, epoch, learning_rate, weight_decay in itertools.product(
    model_list, epoch_list, learning_rate_list, weight_decay_list
):
    # Hyperparameters which are passed into the training job.
    hyperparameters = {
        "epochs": epoch,
        "train_batch_size": 8,
        "model_name": model,
        "weight_decay": weight_decay,
        "learning_rate": learning_rate,
    }

    # Strip "." from BOTH numeric parts of the run name. Previously only the
    # weight decay was cleaned, so a learning rate that renders with a decimal
    # point (e.g. 0.0001) would have put a dot into the run name.
    # The timestamp suffix keeps names unique across re-runs.
    lr_tag = str(learning_rate).replace(".", "")
    wd_tag = str(weight_decay).replace(".", "")
    run_name = f"{model}-epoch-{epoch}-lr-{lr_tag}-wd-{wd_tag}-{int(time.time())}"

    with Run(
        experiment_name=experiment_name,
        sagemaker_session=sagemaker_session,
        run_name=run_name,
    ) as run:

        # HuggingFace container in SageMaker
        huggingface_estimator = HuggingFace(
            entry_point="train.py",
            source_dir="./scripts",  # reads requirements.txt file in scripts directory
            instance_type=instance_type,
            instance_count=instance_count,
            role=role,
            transformers_version=transformers_version,
            pytorch_version=pytorch_version,
            py_version=python_version,
            hyperparameters=hyperparameters,
            enable_sagemaker_metrics=True,
            # Share the session with the Run instead of letting the
            # estimator build its own default one.
            sagemaker_session=sagemaker_session,
        )

        # Starts the training job in SageMaker; wait=True keeps the cell
        # blocked until the job ends, so the grid is trained sequentially.
        huggingface_estimator.fit(
            inputs={"train": train_input_path, "test": test_input_path},
            wait=True,
        )

        # Log the instance configuration, the input datasets and the
        # resulting model artifact in SageMaker Experiments.
        run.log_parameter(name="instance_type", value=instance_type)
        run.log_parameter(name="instance_count", value=instance_count)
        run.log_artifact(name="train", value=train_input_path, is_output=False)
        run.log_artifact(name="test", value=test_input_path, is_output=False)
        run.log_artifact(
            name="SageMaker.ModelArtifact",
            value=huggingface_estimator.model_data,
        )

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-06-11-19-08-39-219


2024-06-11 19:08:41 Starting - Starting the training job...
2024-06-11 19:08:57 Starting - Preparing the instances for training...
2024-06-11 19:09:28 Downloading - Downloading input data...
2024-06-11 19:09:53 Downloading - Downloading the training image..................
2024-06-11 19:13:09 Training - Training image download completed. Training in progress..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-06-11 19:13:21,224 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-06-11 19:13:21,245 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-06-11 19:13:21,255 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-06-11 19:13:21,260 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-06-11 19:13:22,411 sagemaker-training-toolkit INFO     Installing dependencies 

In [7]:
# Clean-up: remove the experiment from SageMaker (per the original note, this
# cannot be done from the console).
# Adapted, with a few modifications, from:
# https://docs.aws.amazon.com/sagemaker/latest/dg/experiments-cleanup.html
# WARNING: the name below is the same experiment created earlier in this
# notebook, so running this cell removes it.
exp = Experiment.load(
    sagemaker_boto_client=sm,
    experiment_name="sector-classification-experiments-v0",
)
exp.delete_all(action="--force")