# Introduction to Data Formats and S3

In [None]:
import pandas as pd
import numpy as np

import boto3
import sagemaker.amazon.common as smac

In [None]:
# Seed NumPy's global random state so the random sample data is reproducible across runs
np.random.seed(5)

## Sample DataSet
### Three features x1,x2,x3 and a target variable y

In [None]:
n = 10  # number of rows in the sample dataset

# The draw order (x1, x2, x3, y) is kept fixed so a seeded run yields the same values
x1 = np.random.random_sample(size=n)               # n floating point numbers in [0, 1)
x2 = np.random.randint(low=100, high=200, size=n)  # n integers in [100, 200)
x3 = 10 * np.random.random_sample(size=n)          # n floating point numbers in [0, 10)
y = np.random.randint(low=0, high=2, size=n)       # response variable: 0 or 1

In [None]:
y

In [None]:
# Combine the three features and the target into one DataFrame, one column per array
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))

In [None]:
df

In [None]:
# Write the DataFrame as CSV to the SageMaker Notebook Instance.
# It is saved in the folder where this ipynb was created.
# Any data created inside sample-notebooks will be lost when you stop and start the instance.
# To preserve data, store it outside of the sample-notebooks folder.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('demo_file.csv',index=False)

In [None]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# A file name is referred to as a key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to S3 bucket `bucket` under object key `key`.

    The file is opened in binary mode and streamed to S3 using the default
    boto3 session. Returns whatever ``upload_fileobj`` returns.
    """
    # Open the local file first so a missing file fails before any AWS call is made
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [None]:
# http://boto3.readthedocs.io/en/latest/guide/s3.html
def download_from_s3(filename, bucket, key):
    """Download the S3 object `key` in bucket `bucket` to the local path `filename`.

    The transfer is delegated to boto3's managed ``Object.download_file``
    instead of opening `filename` with ``'wb'`` up front. Opening the file
    first created (or truncated) the destination before the object was known
    to be retrievable, so a failed download (missing key, denied access, ...)
    left an empty file behind or wiped an existing one.

    Returns whatever ``download_file`` returns.
    """
    s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
    # NOTE(review): boto3's transfer manager is understood to write to a
    # temporary file and rename it into place on success — confirm against
    # the boto3/s3transfer version in use.
    return s3_object.download_file(filename)

In [None]:
# Upload the local CSV to bucket 'chandra-ml-sagemaker' under key 'data_format/demo_file.csv'
# NOTE(review): the bucket name is hard-coded — presumably it must be a bucket your credentials can write to
write_to_s3('demo_file.csv', 'chandra-ml-sagemaker', 'data_format/demo_file.csv')

In [None]:
# Download the same S3 object to a differently named local file, 'demo_file_from_s3.csv'
# NOTE(review): the bucket name is hard-coded — presumably it must be a bucket your credentials can read
download_from_s3('demo_file_from_s3.csv','chandra-ml-sagemaker','data_format/demo_file.csv')

In [None]:
# Let's Try the Protobuf RecordIO Format
# We will use SageMaker SDK write_numpy_to_dense_tensor() method
# Data Types: Int32, Float32, Float64
# https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py

In [None]:
df.head()

In [None]:
# X must be a NumPy array holding the feature columns.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and taking .values gives the same ndarray on old and new pandas.
X = df[['x1', 'x2', 'x3']].values

In [None]:
X

In [None]:
type(X)

In [None]:
# The response/target variable needs to be a vector.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0, so use .values.
# Selecting with a list of columns keeps the result two-dimensional (one column),
# exactly as as_matrix(columns=['y']) did.
y = df[['y']].values

In [None]:
# Right now it is an array of dimensions 10x1
y.shape

In [None]:
y

In [None]:
# Flatten to a single-dimension array of 10 elements,
# which is the vector form wanted for the target variable
y = y.ravel()

In [None]:
y

In [None]:
def write_recordio_file(filename, x, y=None):
    """Write array `x`, with optional labels `y`, to `filename`.

    The file is opened in binary write mode and the serialization is done by
    the SageMaker SDK's ``write_numpy_to_dense_tensor``.
    """
    with open(filename, 'wb') as sink:
        smac.write_numpy_to_dense_tensor(sink, x, y)

In [None]:
def read_recordio_file(filename, recordsToPrint=10):
    """Print the leading records stored in `filename`.

    The file is opened in binary read mode and parsed with the SageMaker
    SDK's ``read_records``. For each of the first `recordsToPrint` records,
    a ``record: <index>`` line is printed followed by the record itself.
    """
    with open(filename, 'rb') as source:
        for index, rec in enumerate(smac.read_records(source)):
            # Stop as soon as the requested number of records has been shown
            if not index < recordsToPrint:
                break
            print("record: {}".format(index))
            print(rec)

In [None]:
# Serialize the features X and the labels y to the local file 'demo_file.recordio'
write_recordio_file('demo_file.recordio',X,y)

In [None]:
df.head(3)

In [None]:
# Print the first 3 records of the file so they can be compared with the first 3 DataFrame rows
read_recordio_file('demo_file.recordio',3)

In [None]:
# Upload the local recordio file to bucket 'chandra-ml-sagemaker' under key 'data_format/demo_file.recordio'
# NOTE(review): the bucket name is hard-coded — presumably it must be a bucket your credentials can write to
write_to_s3('demo_file.recordio', 'chandra-ml-sagemaker', 'data_format/demo_file.recordio')

In [None]:
# Download the same S3 object to a differently named local file, 'demo_file_from_s3.recordio'
# NOTE(review): the bucket name is hard-coded — presumably it must be a bucket your credentials can read
download_from_s3('demo_file_from_s3.recordio','chandra-ml-sagemaker','data_format/demo_file.recordio')