In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [None]:
# Name of the S3 bucket used by this project.
s3_bucket = "chess-project-data"

# IAM role ARN, as returned by sagemaker.get_execution_role().
iam_role_arn = get_execution_role()

Using IAM Role: arn:aws:iam::686821851789:role/service-role/AmazonSageMaker-ExecutionRole-20250709T174464
Using S3 Bucket: chess-project-data


In [None]:
# Create the SageMaker session with the project bucket as its default bucket.
sagemaker_session = sagemaker.session.Session(default_bucket=s3_bucket)

# S3 prefix for the training job's output. It is derived from `s3_bucket`
# rather than repeating the bucket name, so changing the bucket in one place
# cannot leave this path pointing at the old one.
s3_output_path = f"s3://{s3_bucket}/training-output/"

print("SageMaker session created.")
print(f"Model will be saved to: {s3_output_path}")

SageMaker session created.
Model will be saved to: s3://chess-project-data/training-output/


In [None]:
# Metric definitions passed to the estimator (metric_definitions=METRIC_DEFS):
# each entry names a metric and gives the regex used to pull its value out of
# the training log lines. Every regex exposes exactly one capture group.
#
# The shared number pattern accepts an optional exponent. Keras' progress
# output switches to scientific notation for small values (e.g.
# "loss: 1.2345e-04"); a pattern without the exponent would capture only the
# mantissa ("1.2345") and record a value that is off by orders of magnitude.
_METRIC_NUMBER = r"([0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)"

METRIC_DEFS = [
    {"Name": "train_loss", "Regex": r" - loss:\s*" + _METRIC_NUMBER},
    {"Name": "train_mae",  "Regex": r" - mae:\s*" + _METRIC_NUMBER},
    {"Name": "val_loss",   "Regex": r" - val_loss:\s*" + _METRIC_NUMBER},
    {"Name": "val_mae",    "Regex": r" - val_mae:\s*" + _METRIC_NUMBER},
    # Optional: learning rate, only matched if the training script logs " - lr: <value>".
    # NOTE(review): confirm the exact key the script prints (e.g. "lr" vs "learning_rate").
    {"Name": "lr",         "Regex": r" - lr:\s*" + _METRIC_NUMBER},
]

In [None]:
# Configure the GPU training job (the job itself is launched by estimator.fit()).
print("\n--- Starting GPU Model Training ---")

# Keyword arguments for the SageMaker TensorFlow estimator, gathered in one
# mapping so the job configuration can be read at a glance.
# NOTE(review): the `sagemaker_session` created earlier in this notebook is not
# passed to the estimator here - confirm that is intended.
estimator_settings = dict(
    entry_point='ai_train.py',          # training script to execute
    source_dir='./dnn_train',           # local directory containing the entry-point script
    role=iam_role_arn,
    instance_count=1,
    instance_type='ml.g4dn.2xlarge',    # NVIDIA T4 GPU (16GB) 8vCPU (32GB)
    framework_version='2.18',
    py_version='py310',
    output_path=s3_output_path,
    metric_definitions=METRIC_DEFS,
    max_run=48 * 60 * 60,               # runtime cap in seconds: 172800 s = 48 h
)
estimator = TensorFlow(**estimator_settings)

print("Estimator configured successfully.")


--- Starting GPU Model Training ---
Estimator configured successfully.


In [None]:
# Launch the training job on AWS. fit() is called without any input channels.
print("Starting training job... ")
estimator.fit()

# Report completion, then the artifact location exposed by the estimator.
print("\n✅ Training job complete!")
model_artifacts_uri = estimator.model_data
print(f"Model artifacts saved in S3 at: {model_artifacts_uri}")

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2025-08-12-03-44-00-259


Starting training job... 
2025-08-12 03:44:01 Starting - Starting the training job...
2025-08-12 03:44:15 Starting - Preparing the instances for training..