In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
!pip install -U sagemaker

Collecting botocore<1.41.0,>=1.40.16 (from boto3<2.0,>=1.39.5->sagemaker)
  Using cached botocore-1.40.16-py3-none-any.whl.metadata (5.7 kB)
Using cached botocore-1.40.16-py3-none-any.whl (14.0 MB)
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.37.1
    Uninstalling botocore-1.37.1:
      Successfully uninstalled botocore-1.37.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.21.1 requires botocore<1.37.2,>=1.37.0, but you have botocore 1.40.16 which is incompatible.
sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.40.16


In [4]:
!pip install s3fs

Collecting botocore<1.37.2,>=1.37.0 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Using cached botocore-1.37.1-py3-none-any.whl.metadata (5.7 kB)
Using cached botocore-1.37.1-py3-none-any.whl (13.4 MB)
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.40.16
    Uninstalling botocore-1.40.16:
      Successfully uninstalled botocore-1.40.16
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.
s3transfer 0.13.1 requires botocore<2.0a.0,>=1.37.4, but you have botocore 1.37.1 which is incompatible.
boto3 1.40.16 requires botocore<1.41.0,>=1.40.16, but you have botocore 1.37.1 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.37.1


In [5]:
!pip install loguru



In [6]:
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import os
from datetime import datetime
from dotenv import load_dotenv

# --- Configuration ---
# Load settings from a .env file into the process environment (python-dotenv).
# Later cells read SAGEMAKER_IAM_ROLE, S3_BUCKET and WANDB_API_KEY via os.getenv.
# As the last expression of the cell, the call's return value is displayed.
load_dotenv()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


True

In [7]:
# IAM role for SageMaker, read from the environment (.env); it is passed to the
# estimator as `role`.
iam_role = os.getenv("SAGEMAKER_IAM_ROLE")

# S3 bucket for data and model artifacts, read from the environment (.env).
# Fail fast when it is missing: os.getenv would return None and the S3 URIs
# built from it would silently become "s3://None/...".
s3_bucket = os.getenv("S3_BUCKET")
if not s3_bucket:
    raise ValueError(
        "S3_BUCKET is not set; define it in the environment or in the .env file."
    )

In [8]:
# S3 locations used by the training job, all rooted at the configured bucket
base_s3_uri = "s3://{}".format(s3_bucket)
# Location passed to fit() as the "model" channel
base_model_s3_uri = "/".join([base_s3_uri, "self-corrective-llm-not-trained"])
# Location passed to fit() as the "dataset" channel
dataset_s3_uri = "/".join([base_s3_uri, "dataset/training_data"])
# Destination handed to the estimator as output_path
output_s3_uri = "/".join([base_s3_uri, "trained_model/output"])

In [9]:
# --- Hyperparameters ---
# Forwarded to the estimator via `hyperparameters=`; insertion order is kept.
hyperparameters = dict(
    # Core parameters
    epochs=2,
    learning_rate=2e-4,
    alpha=0.4,
    # NOTE(review): the value deliberately carries literal double quotes around
    # the list text -- confirm the training script expects that form.
    correction_weights="\"[1.0, 8.0, 3.0, 1.0]\"",
    max_sequence_length=1000,
    # Batching and memory
    train_batch_size=2,
    eval_batch_size=2,
    gradient_accumulation_steps=8,
    # LoRA parameters
    lora_r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    # Optimizer and scheduler
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Logging and saving
    logging_steps=5,
    eval_steps=100,
    save_steps=100,
)

# --- W&B Configuration ---
# Timestamped run name so that successive launches get distinct names.
run_name = f"self-corrective-run-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
wandb_api_key = os.getenv("WANDB_API_KEY")

# Environment variables handed to the estimator via `environment=`.
environment = {
    # Set the project name for W&B
    "WANDB_PROJECT": "Self-Corrective-LLM-Finetuning",
    # Set the specific name for this run
    "WANDB_RUN_NAME": run_name,
}

# os.getenv returns None when the variable is unset; only forward the key when
# it actually has a value so the mapping never contains a non-string entry.
if wandb_api_key:
    environment["WANDB_API_KEY"] = wandb_api_key
else:
    print("WANDB_API_KEY is not set; the job will be launched without W&B credentials.")

# --- SageMaker Estimator ---
huggingface_estimator = HuggingFace(
    # Code to run: script directory, entry script, extra project sources
    source_dir="../../scripts",
    entry_point="train.py",
    dependencies=["../../src"],
    # Framework / interpreter versions
    py_version="py311",
    pytorch_version="2.5.1",
    transformers_version="4.49.0",
    # Compute resources
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    volume_size=100,
    distribution={"torch_distributed": {"enabled": True}},
    # Job configuration built in the cells above
    role=iam_role,
    sagemaker_session=sagemaker.Session(),
    hyperparameters=hyperparameters,
    environment=environment,
    output_path=output_s3_uri,
)

In [10]:
# --- Run Training ---
# The two input channels are the dataset and the base model in S3.
# fit() is called with its default arguments: it streams the job logs into the
# cell output and only returns once the training job has ended, so the message
# below is printed after the job, not when it starts.
huggingface_estimator.fit({
    "dataset": dataset_s3_uri,
    "model": base_model_s3_uri
})

print("SageMaker training job finished.")


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-08-23-12-49-35-478


2025-08-23 12:49:37 Starting - Starting the training job
2025-08-23 12:49:37 Pending - Training job waiting for capacity......
2025-08-23 12:50:41 Downloading - Downloading input data.........
2025-08-23 12:52:02 Downloading - Downloading the training image...............
2025-08-23 12:54:23 Training - Training image download completed. Training in progress.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 550.163.01[0m
[34mCurrent installed NVIDIA driver version is 570.172.08[0m
[34mSkipping CUDA compat setup as newer NVIDIA driver is installed[0m
[34m2025-08-23 12:55:12,102 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-08-23 12:55:12,141 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-23 12:55:12,150 sage