In [None]:
!pip install -U "sagemaker"
!pip install -U "boto3"

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
# Role returned by the SageMaker SDK helper; it is passed as `role=` to the
# ScriptProcessor created further down in this notebook.
role = get_execution_role()

In [None]:
!mkdir docker
%%writefile docker/Dockerfile

FROM openjdk:8-jre-slim

RUN apt-get update
RUN apt-get install -y python3 python3-setuptools python3-pip python-dev python3-dev

RUN pip3 install pandas pyspark==3.2.0 delta-spark
ENV PYTHONUNBUFFERED=TRUE

ENTRYPOINT ["python3"]

In [None]:
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.Session().region_name
ecr_repository = 'sagemaker-processing-container'
tag = ':latest'
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)

# Create ECR repository and push docker image
!docker build -t $ecr_repository docker
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $processing_repository_uri
!docker push $processing_repository_uri

processing_repository_uri

In [None]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# Run the script with python3 inside the image that was built and pushed earlier in
# this notebook. The URI comes from `processing_repository_uri` rather than a literal,
# so the processor always points at the current account and region.
script_processor = ScriptProcessor(
    command=['python3'],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
    # NOTE(review): 'local' presumably selects SageMaker local mode (container runs on
    # this host via docker) - confirm before switching to a managed instance type.
    instance_type='local',
)

In [None]:
input_location = "s3://aws-ml-blog/artifacts/delta-lake-bring-your-own-container/delta-table/california-housing/"

# Declare the S3 prefix as the job's single input, with its destination path in the container.
housing_input = ProcessingInput(
    source=input_location,
    destination="/opt/ml/processing/input/",
)

# Launch the processing job that executes deltaprocess.py with that input.
script_processor.run(code='deltaprocess.py', inputs=[housing_input])