In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
!pip install -U sagemaker



In [4]:
!pip install s3fs



In [5]:
!pip install loguru



In [6]:
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import os
from datetime import datetime
from dotenv import load_dotenv

# --- Configuration ---
# Load settings from a .env file into the process environment so that the
# os.getenv() lookups in the following cells (SAGEMAKER_IAM_ROLE, S3_BUCKET,
# WANDB_API_KEY) can find them. The call's return value is displayed as the cell output.
load_dotenv()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


True

In [7]:
def _require_env(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here avoids
            building "s3://None/..." URIs or passing ``role=None`` further down.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# IAM role passed to the SageMaker estimator.
iam_role = _require_env("SAGEMAKER_IAM_ROLE")
# S3 bucket that the dataset, base-model and output URIs are built from.
s3_bucket = _require_env("S3_BUCKET")

In [8]:
# S3 layout: every location is rooted at the configured bucket.
base_s3_uri = "s3://{}".format(s3_bucket)

# Base model input, training dataset input, and destination for the job output.
base_model_s3_uri = base_s3_uri + "/self-corrective-llm-not-trained"
dataset_s3_uri = base_s3_uri + "/dataset/training_data"
output_s3_uri = base_s3_uri + "/trained_model/output"

In [9]:
# --- Hyperparameters ---
# Passed to the estimator as-is; grouped below by concern.
hyperparameters = dict(
    # Core parameters
    epochs=2,
    learning_rate=3e-4,
    alpha=0.6,
    pos_weight=5.0,
    # Batching and memory
    train_batch_size=2,
    eval_batch_size=2,
    gradient_accumulation_steps=8,
    # LoRA parameters
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # Optimizer and scheduler
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Logging and saving
    logging_steps=5,
    eval_steps=100,
    save_steps=100,
)

# --- W&B Configuration ---
# Timestamped run name so that successive launches get distinct names.
run_name = f"self-corrective-run-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
wandb_api_key = os.getenv("WANDB_API_KEY")

# Environment variables handed to the estimator. Every value must be a string,
# so the API key is only added when it is actually set (os.getenv returns None otherwise).
environment = {
    # Set the project name for W&B
    "WANDB_PROJECT": "Self-Corrective-LLM-Finetuning",
    # Set the specific name for this run
    "WANDB_RUN_NAME": run_name,
}
if wandb_api_key:
    # NOTE(review): the key travels as a plain training-job environment variable and is
    # presumably visible to anyone who can describe the job; consider a secrets manager.
    environment["WANDB_API_KEY"] = wandb_api_key
else:
    print("WANDB_API_KEY is not set; launching without W&B credentials.")

# --- SageMaker Estimator ---
# Settings are collected by concern first, then the estimator is built in a single call.
estimator_kwargs = {
    # Training code: entry script, the directory containing it, and extra project code.
    "entry_point": "train.py",
    "source_dir": "../../scripts",
    "dependencies": ["../../src"],
    # Compute resources for the job.
    "instance_type": "ml.g5.12xlarge",
    "instance_count": 1,
    "volume_size": 100,
    "distribution": {"torch_distributed": {"enabled": True}},
    # Framework versions requested for the training job.
    "transformers_version": "4.49.0",
    "pytorch_version": "2.5.1",
    "py_version": "py311",
    # Job configuration defined in the earlier cells.
    "role": iam_role,
    "hyperparameters": hyperparameters,
    "environment": environment,
    "output_path": output_s3_uri,
    "sagemaker_session": sagemaker.Session(),
}
# The input channels (dataset and base model) are not set here; they are passed to fit().
huggingface_estimator = HuggingFace(**estimator_kwargs)

In [10]:
# --- Run Training ---
# fit() is called with its default (blocking) behaviour: the cell streams the job logs
# and only continues once the training job has ended, so the message below reports
# completion rather than launch.
huggingface_estimator.fit({
    "dataset": dataset_s3_uri,
    "model": base_model_s3_uri
})

print(f"SageMaker training job finished: {huggingface_estimator.latest_training_job.name}")


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-08-16-11-29-50-421


2025-08-16 11:29:52 Starting - Starting the training job
2025-08-16 11:29:52 Pending - Training job waiting for capacity.........
2025-08-16 11:31:11 Pending - Preparing the instances for training...
2025-08-16 11:31:46 Downloading - Downloading input data.........
2025-08-16 11:33:06 Downloading - Downloading the training image............
2025-08-16 11:35:23 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 550.163.01[0m
[34mCurrent installed NVIDIA driver version is 570.172.08[0m
[34mSkipping CUDA compat setup as newer NVIDIA driver is installed[0m
[34m2025-08-16 11:36:14,908 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-08-16 11:36:14,946 sagemaker-training-toolkit INFO     No Neurons detecte



Training seconds: 9057
Billable seconds: 9057
SageMaker training job started.
