In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import boto3
import sagemaker

%matplotlib inline
pd.options.display.max_columns = 500

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

In [2]:
data_dir = 'input_data'
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col='ID')

chunksize = 5000
test_df = pd.DataFrame()

for chunk in pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col='ID', chunksize=chunksize):
    test_df = pd.concat([test_df, chunk])

MemoryError: Unable to allocate 1.30 GiB for an array with shape (4991, 35000) and data type float64

In [None]:
!git push

In [None]:
prefix = 'santander_data'

# upload all data to S3
train_path = sagemaker_session.upload_data(path=os.path.join(data_dir, 'train.csv'), bucket=bucket, key_prefix=prefix)
test_path = sagemaker_session.upload_data(path=os.path.join(data_dir, 'test.csv'), bucket=bucket, key_prefix=prefix)

In [None]:
from sagemaker.sklearn.estimator import SKLearnProcessor

sklearn_processor = SKLearnProcessor(framework_version='0.23.1',
                                     role=role,
                                     instance_type='ml.p2.xlarge',
                                     instance_count=1)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(code='preprocessing.py',
                      inputs=[ProcessingInput(
                        source=train_path,
                        destination='/opt/ml/processing/train'),ProcessingInput(
                        source=test_path,
                        destination='/opt/ml/processing/test'),],
                      outputs=[ProcessingOutput(output_name='train_data',
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='test_data',
                                                source='/opt/ml/processing/test')],
                      arguments=[x]
                     )

In [None]:
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    if output['OutputName'] == 'train_data':
        preprocessed_training_data = output['S3Output']['S3Uri']
    if output['OutputName'] == 'test_data':
        preprocessed_test_data = output['S3Output']['S3Uri']

In [None]:
training_features = pd.read_csv(preprocessed_training_data + '/train_features.csv', nrows=10)
print('Training features shape: {}'.format(training_features.shape))
training_features.head(n=10)