# Training Unified Roof Analysis Model on SageMaker

This notebook demonstrates training our unified roof analysis model using two datasets:

1. RID (Roof Information Dataset)
   - Provides detailed roof segmentation
   - Has ridge, valley, and eave lines
   - Includes depth information

2. Roofline-Extraction Dataset
   - Focuses on 3D building reconstruction
   - Has ridge, hip, and valley lines
   - Includes depth maps

## Steps:
1. Set up SageMaker environment
2. Prepare and verify datasets
3. Run quick test training
4. Start full training
5. Monitor progress

## 1. Set Up SageMaker Environment

In [None]:
import os
import json
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
import boto3
from pathlib import Path

# Storage layout: a single bucket holds the training data and the model
# artifacts; `prefix` namespaces everything this notebook writes.
bucket = 'sagemaker-us-east-2-575108929659'
prefix = 'unified-roof-model'

# SageMaker session and the execution role handed to the estimators below
sagemaker_session = sagemaker.Session()
role = get_execution_role()

print(f"Using bucket: {bucket}")

## 2. Prepare and Verify Datasets

Our datasets are already in S3 at these locations:
- RID: s3://sagemaker-us-east-2-575108929659/roof-data/RID/
- Roofline: s3://sagemaker-us-east-2-575108929659/roof-data/Roofline/

In [None]:
# S3 location of each dataset, keyed by the short dataset name
dataset_paths = dict(
    rid=f's3://{bucket}/roof-data/RID',
    roofline=f's3://{bucket}/roof-data/Roofline',
)

# Client used by the existence checks that follow
s3 = boto3.client('s3')

def check_s3_path(s3_path, s3_client=None):
    """Return True if ``s3_path`` names an existing object or a non-empty "folder".

    A path without a trailing slash is matched either as an exact object key
    or as a directory-style prefix (``<path>/...``). A plain prefix match is
    not enough: ``roof-data/RID`` would otherwise be reported as present when
    only ``roof-data/RID_backup/...`` exists.

    Args:
        s3_path: Location such as ``s3://bucket/some/prefix``. The ``s3://``
            scheme is optional. A bucket-only path checks whether the bucket
            contains any object at all.
        s3_client: Optional object exposing ``list_objects_v2``. Defaults to
            the notebook-level ``s3`` client.

    Returns:
        bool: True when at least one matching object exists.

    Raises:
        ValueError: If no bucket name can be extracted from ``s3_path``.
    """
    client = s3 if s3_client is None else s3_client

    scheme = 's3://'
    location = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    bucket_name, _, key_prefix = location.partition('/')
    if not bucket_name:
        raise ValueError(f"No bucket name found in S3 path: {s3_path!r}")

    def _first_key(prefix_value):
        # Fetch at most one key; existence is all we need.
        response = client.list_objects_v2(
            Bucket=bucket_name,
            Prefix=prefix_value,
            MaxKeys=1
        )
        contents = response.get('Contents') or []
        return contents[0]['Key'] if contents else None

    # Bucket-only paths and explicit "folder/" prefixes need a single lookup.
    if not key_prefix or key_prefix.endswith('/'):
        return _first_key(key_prefix) is not None

    # Keys are listed in lexicographic order, so an object whose key equals
    # the prefix exactly is always the first result.
    if _first_key(key_prefix) == key_prefix:
        return True

    # Otherwise require a real directory boundary after the prefix.
    return _first_key(key_prefix + '/') is not None

# Report the status of every dataset location before failing, so that all
# missing inputs are visible in a single run.
missing_datasets = []
for dataset, path in dataset_paths.items():
    exists = check_s3_path(path)
    print(f"{dataset}: {'✓' if exists else '✗'} ({path})")
    if not exists:
        missing_datasets.append(dataset)

# Stop the notebook here instead of letting "Run All" go on to launch
# training jobs whose input channels point at nothing.
if missing_datasets:
    raise FileNotFoundError(
        f"Missing dataset(s) in S3: {', '.join(missing_datasets)}"
    )

## 3. Quick Test Training

First, we'll run a short test job (2 epochs per dataset, with test mode enabled) to verify that the whole pipeline works before launching the full training run.

In [None]:
# Hyperparameters for the smoke test. They are handed to the entry-point
# script (sagemaker_train.py) by SageMaker.
test_hyperparameters = {
    'epochs-per-dataset': 2,  # Just 2 epochs for testing
    'batch-size': 4,  # Reduced batch size
    'gradient-accumulation-steps': 4,  # Accumulate gradients
    'learning-rate': 0.001,
    'num-workers': 4,
    'num-classes': 12,
    'datasets': 'rid,roofline',
    # NOTE(review): hyperparameters reach the script as command-line strings,
    # so this arrives as the text "True" — confirm sagemaker_train.py parses
    # it as a boolean (argparse `type=bool` treats any non-empty string as True).
    'test-run': True  # Enable test mode
}

# Create PyTorch estimator for test
test_estimator = PyTorch(
    entry_point='sagemaker_train.py',
    source_dir='/home/sagemaker-user/Outlinefeature/roof-training/src',
    role=role,
    framework_version='2.0.1',
    py_version='py310',
    instance_count=1,
    instance_type='ml.g5.2xlarge',  # NVIDIA A10G GPU instance
    hyperparameters=test_hyperparameters,
    # Test artifacts and code go under their own prefixes, separate from the full run
    output_path=f's3://{bucket}/{prefix}/test_output',
    code_location=f's3://{bucket}/{prefix}/test_code',
    # Raw strings keep the regex `\.` intact without relying on Python's
    # deprecated handling of unknown escape sequences in normal literals.
    metric_definitions=[
        {'Name': 'train:loss', 'Regex': r'Training Loss: ([0-9\.]+)'},
        {'Name': 'val:loss', 'Regex': r'Validation Loss: ([0-9\.]+)'},
        {'Name': 'segments:loss', 'Regex': r'segments: ([0-9\.]+)'},
        {'Name': 'lines:loss', 'Regex': r'lines: ([0-9\.]+)'},
        {'Name': 'depth:loss', 'Regex': r'depth: ([0-9\.]+)'}
    ],
    enable_sagemaker_metrics=True
)

# Start test training; each key is an input channel backed by the given S3 location
test_estimator.fit({
    'rid': dataset_paths['rid'],
    'roofline': dataset_paths['roofline']
})

## 4. Full Training

If the test run succeeds, we'll start the full training run: the same configuration, but with 20 epochs per dataset and test mode disabled.

In [None]:
# Hyperparameters for the full run. They are handed to the entry-point
# script (sagemaker_train.py) by SageMaker.
hyperparameters = {
    'epochs-per-dataset': 20,  # Full training
    'batch-size': 4,  # Reduced batch size
    'gradient-accumulation-steps': 4,  # Accumulate gradients
    'learning-rate': 0.001,
    'num-workers': 4,
    'num-classes': 12,
    'datasets': 'rid,roofline',
    # NOTE(review): hyperparameters reach the script as command-line strings,
    # so this arrives as the text "False" — confirm sagemaker_train.py parses
    # it as a boolean; argparse `type=bool` would read "False" as True and
    # silently keep test mode on.
    'test-run': False  # Disable test mode
}

# Create PyTorch estimator for full training
estimator = PyTorch(
    entry_point='sagemaker_train.py',
    source_dir='/home/sagemaker-user/Outlinefeature/roof-training/src',
    role=role,
    framework_version='2.0.1',
    py_version='py310',
    instance_count=1,
    instance_type='ml.g5.2xlarge',  # NVIDIA A10G GPU instance
    hyperparameters=hyperparameters,
    # Full-run artifacts and code live under their own prefixes
    output_path=f's3://{bucket}/{prefix}/output',
    code_location=f's3://{bucket}/{prefix}/code',
    # Raw strings keep the regex `\.` intact without relying on Python's
    # deprecated handling of unknown escape sequences in normal literals.
    metric_definitions=[
        {'Name': 'train:loss', 'Regex': r'Training Loss: ([0-9\.]+)'},
        {'Name': 'val:loss', 'Regex': r'Validation Loss: ([0-9\.]+)'},
        {'Name': 'segments:loss', 'Regex': r'segments: ([0-9\.]+)'},
        {'Name': 'lines:loss', 'Regex': r'lines: ([0-9\.]+)'},
        {'Name': 'depth:loss', 'Regex': r'depth: ([0-9\.]+)'}
    ],
    enable_sagemaker_metrics=True
)

# Start full training; each key is an input channel backed by the given S3 location
estimator.fit({
    'rid': dataset_paths['rid'],
    'roofline': dataset_paths['roofline']
})

## 5. Monitor Training Progress

In [None]:
# Identify the full-training job and the region the session is bound to
region = sagemaker_session.boto_region_name
training_job_name = estimator.latest_training_job.name

# Console deep link that graphs train:loss for this job (60 s period, Average).
# In the URL fragment, `*2f` encodes "/" and `*3a` encodes ":".
# NOTE(review): the link targets namespace AWS/SageMaker with metric
# "TrainingJobMetrics" — confirm this matches where the job's metrics are
# actually published in CloudWatch.
cloudwatch_url = f'https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#metricsV2:graph=~(metrics~(~(~\'AWS*2fSageMaker~\'TrainingJobMetrics~\'TrainingJobName~\'{training_job_name}~\'metric~\'train*3aloss))~view~\'timeSeries~stacked~false~region~\'{region}~stat~\'Average~period~60)'

print(f'Training job name: {training_job_name}')
print(f'\nView metrics at: {cloudwatch_url}')