# UDACITY: Processing Job Demo

In [2]:
%%writefile xgboost_process_script.py
# Write the following code within this cell to a Python script named 'xgboost_process_script.py' (the entry point)

# Execute this cell first to write this script to your local directory. 

import pandas

# This method filters out the column at index 1, which is the crime data. 

def filter_crime_data(input_data_path):
    """Load a CSV file and return it without the column at index 1.

    The surrounding script describes the column at positional index 1 as
    the crime data.

    Args:
        input_data_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The loaded data with the column at positional
        index 1 removed.
    """
    with open(input_data_path, 'r') as f:
        df = pandas.read_csv(f)
    # DataFrame.drop returns a new frame and leaves `df` untouched, so the
    # result has to be returned; discarding it would hand back the
    # unfiltered data.
    return df.drop(df.columns[[1]], axis=1)

# The main method takes in data at '/opt/ml/processing/input/data/train.csv' 
# and outputs it as a csv to '/opt/ml/processing/output/data_processed'

if __name__ == "__main__":
    # Paths inside the processing container: the CSV to read and the file
    # the result is written to.
    input_csv = '/opt/ml/processing/input/data/train.csv'
    output_csv = '/opt/ml/processing/output/data_processed'

    # Pass the input through filter_crime_data and save the result as CSV.
    # The notebook registers '/opt/ml/processing/output' as the job's
    # ProcessingOutput, which is how the file ends up in S3.
    # NOTE(review): to_csv also writes the DataFrame index by default --
    # confirm that the extra leading column is intended.
    filter_crime_data(input_csv).to_csv(output_csv)



Writing xgboost_process_script.py


In [3]:
import boto3

from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Execution role passed to the processor below.
role = get_execution_role()

# Processor definition: framework (scikit-learn) version and the compute
# resources the job should run on.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

# You will need to replace 'source' below with the location of the dataset you want to process.
# Input: S3 path of the data and the local path it is made available at.
train_input = ProcessingInput(
    source='s3://sagemaker-studio-774397897175-mcs3venir6c/Boston-Housing-XGBoost/train.csv',
    destination='/opt/ml/processing/input/data/',
)

# Output: local path whose contents are collected as the job's output.
processed_output = ProcessingOutput(source='/opt/ml/processing/output')

# Run the job with 'xgboost_process_script.py' as the entry-point script.
sklearn_processor.run(
    code='xgboost_process_script.py',
    inputs=[train_input],
    outputs=[processed_output],
)



Job Name:  sagemaker-scikit-learn-2023-01-05-06-22-56-594
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-studio-774397897175-mcs3venir6c/Boston-Housing-XGBoost/train.csv', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-774397897175/sagemaker-scikit-learn-2023-01-05-06-22-56-594/input/code/xgboost_process_script.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-774397897175/sagemaker-scikit-learn-2023-01-05-06-22-56-594/output/output-1', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOf