In [None]:
# Download every object under s3://lxeml/CH_Test/ into the notebook's current
# working directory (--recursive copies the whole prefix, not a single file).
# NOTE(review): the pipeline cell below writes its outputs under this same
# prefix and trains with source_dir=".", so re-running this cell may also pull
# earlier outputs into the directory used as the training source — confirm.
!aws s3 cp s3://lxeml/CH_Test/ . --recursive

In [21]:
import os
import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from sagemaker.workflow.functions import Join

# S3 location shared by the pipeline steps: inputs are read from, and outputs
# are written back under, s3://<bucket>/<prefix>/ (note the trailing slash).
bucket = "lxeml"
prefix = "CH_Test"
base_path = "s3://{}/{}/".format(bucket, prefix)

# Execution role handed to the processor, the estimator and pipeline.upsert,
# plus the SDK session the Pipeline object is bound to.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# -----------------------------
# Step 1: Preprocessing
# -----------------------------
# S3 URIs are plain strings that always use "/" as their separator, so they are
# built with string formatting instead of os.path.join (which inserts the host
# OS separator and would produce backslashes on Windows). Stripping the slash
# first keeps the result correct whether or not base_path ends with one.
s3_root = base_path.rstrip("/")

# Single-instance scikit-learn processing job that runs the preprocessing script.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    base_job_name="ch-test-preprocess-train",
)

preprocess_step = ProcessingStep(
    name="PreprocessTrainData",
    processor=sklearn_processor,
    code="preprocess_train_file_sagemaker.py",
    inputs=[
        # Each input file gets its own container directory so the script can
        # address them independently.
        ProcessingInput(
            source=f"{s3_root}/TRAIN.csv",
            destination="/opt/ml/processing/input/train",
        ),
        ProcessingInput(
            source=f"{s3_root}/list_of_categories.csv",
            destination="/opt/ml/processing/input/categories",
        ),
    ],
    outputs=[
        # "train_data" is the key the training step uses to look up this
        # output's S3 URI, so it must stay in sync with that reference.
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/output",
            destination=f"{s3_root}/preprocess_train",
        )
    ],
)

# -----------------------------
# Step 2: Training
# -----------------------------
# PyTorch training job on a single GPU instance. The model output location is
# an S3 URI, so it is built with "/" explicitly rather than os.path.join (which
# uses the host OS separator); rstrip tolerates base_path with or without a
# trailing slash.
pytorch_estimator = PyTorch(
    entry_point="train.py",
    # NOTE(review): "." is the notebook's working directory, which is also
    # where the first cell downloads the S3 prefix. Data files sitting there
    # may end up in the uploaded source bundle — consider a dedicated code
    # folder; confirm against the actual directory layout.
    source_dir=".",
    role=role,
    framework_version="1.7.1",
    py_version="py3",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Forwarded to the training job; how train.py consumes them is not
    # visible from this notebook.
    hyperparameters={
        "num_epochs": 3,
        "learning_rate": 1e-5,
    },
    output_path=f"{base_path.rstrip('/')}/results",
)

training_step = TrainingStep(
    name="TrainModel",
    estimator=pytorch_estimator,
    inputs={
        # The "train" channel reads from wherever the preprocessing step wrote
        # its "train_data" output, resolved from that step's properties.
        "train": TrainingInput(
            s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri
        )
    },
)

# -----------------------------
# Build and Run Pipeline
# -----------------------------
# Polling budget for execution.wait(). Spelled out (rather than relying on the
# SDK's implicit defaults, believed to be 30 s x 60 polls, i.e. about 30
# minutes — verify for the installed SDK version) so a longer GPU training run
# does not make wait() give up while the pipeline is still running.
# 30 s x 240 polls allows up to two hours.
WAIT_DELAY_SECONDS = 30
WAIT_MAX_ATTEMPTS = 240

pipeline = Pipeline(
    name="CHTestTrainingPipeline",
    steps=[preprocess_step, training_step],
    sagemaker_session=sagemaker_session,
)

# Create the pipeline definition, or update it if one with this name exists.
pipeline.upsert(role_arn=role)
print("✅ Pipeline definition uploaded/updated.")

# Start one execution and block until it finishes. wait() is expected to raise
# if the execution fails or the polling budget is exhausted, so the message
# below is only reached on success.
execution = pipeline.start()
execution.wait(delay=WAIT_DELAY_SECONDS, max_attempts=WAIT_MAX_ATTEMPTS)
print("✅ Pipeline execution complete.")

✅ Pipeline definition uploaded/updated.
✅ Pipeline execution complete.
