In [4]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import boto3
import sagemaker

%matplotlib inline
pd.options.display.max_columns = 500

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
data_dir = 'input_data'
data_prefix = 'santander_project/data'

In [5]:
# upload train and test data to S3
train_path = 's3://{}/{}/train.csv'.format(bucket, data_prefix)
test_path = 's3://{}/{}/test.csv'.format(bucket, data_prefix)

In [36]:
#Instantiate sklearn processor
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(framework_version='0.23.1',
                                     role=role,
                                     instance_type='ml.c4.xlarge',
                                     instance_count=1)

ValueError: scikit-learn version 0.23.1 is not supported. Supported versions are ['0.20.0']

In [34]:
#Run imputation processing script
from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(code='processing_scripts/imputation.py',
                      inputs=[ProcessingInput(
                        source=train_path,
                        destination='/opt/ml/processing/input/train'), ProcessingInput(
                        source=test_path,
                        destination='/opt/ml/processing/input/test'),],
                      outputs=[ProcessingOutput(output_name='train_imputed',
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='test_imputed',
                                                source='/opt/ml/processing/test')]
                     )

Parameter 'session' will be renamed to 'sagemaker_session' in SageMaker Python SDK v2.



Job Name:  sagemaker-scikit-learn-2020-07-13-23-49-50-445
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-278383315865/santander_data/train.csv', 'LocalPath': '/opt/ml/processing/input/train', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-278383315865/santander_data/test.csv', 'LocalPath': '/opt/ml/processing/input/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-278383315865/sagemaker-scikit-learn-2020-07-13-23-49-50-445/input/code/process.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', '

UnexpectedStatusException: Error for Processing job sagemaker-scikit-learn-2020-07-13-23-49-50-445: Failed. Reason: AlgorithmError: See job logs for more information

In [None]:
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    if output['OutputName'] == 'train_imputed':
        preprocessed_training_data = output['S3Output']['S3Uri']
    if output['OutputName'] == 'test_imputed':
        preprocessed_test_data = output['S3Output']['S3Uri']

In [None]:
sagemaker_session.upload_data(path=os.path.join(preprocessed_training_data + '/train_imputed.csv'), bucket=bucket, key_prefix=data_prefix)
sagemaker_session.upload_data(path=os.path.join(preprocessed_test_data + '/test_imputed.csv'), bucket=bucket, key_prefix=data_prefix)

In [None]:
pd.read_csv(preprocessed_training_data + '/train_imputed.csv').to_csv(os.path.join(data_dir, 'train_imputed.csv'))
pd.read_csv(preprocessed_test_data + '/test_imputed.csv').to_csv(os.path.join(data_dir, 'test_imputed.csv'))