In [1]:
# Import dependencies

import pandas as pd
import numpy as np
import boto3
import sagemaker.amazon.common as smac

# Seed NumPy's global random generator so the random sample data is reproducible
np.random.seed(5)

# Name of the S3 bucket used by the upload/download cells below
s3_bucket_name = 'slavac-ml-sagemaker'

# Execute the shared helper notebook in this kernel.
# NOTE(review): write_to_s3, download_from_s3, write_recordio_file and
# read_recordio_file are called below but not defined in this notebook;
# presumably they come from helper.ipynb — confirm.
%run ../../helper.ipynb

### Sample dataset with 3 features

In [2]:
# Build a small synthetic dataset: three feature columns and a binary response.
# Re-seed here (same seed value as the setup cell) so this cell produces the same
# data every time it is executed, not only on the first run after the seed was set.
np.random.seed(5)

n = 10  # number of samples

x1 = np.random.random_sample(n)       # n floats drawn from [0, 1)
x2 = np.random.randint(100, 200, n)   # n integers drawn from [100, 200)
x3 = np.random.random_sample(n) * 10  # n floats drawn from [0, 10)
y = np.random.randint(0, 2, n)        # binary response variable: 0 or 1
y

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [3]:
# Combine the three features and the response into one table, one column each
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))
df.head(3)

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1


In [4]:
# Persist the frame as a local CSV file; the row index is left out of the file
df.to_csv(path_or_buf='demo_file.csv', index=False)

In [5]:
# Upload the local CSV file to the S3 bucket.
# NOTE(review): write_to_s3 is not defined in this notebook; the arguments look
# like (local file, bucket name, object key) — confirm against the helper notebook.
write_to_s3(
    'demo_file.csv',
    s3_bucket_name,
    'data_format/demo_file.csv',
)

In [6]:
# Download the CSV file from the S3 bucket.
# NOTE(review): download_from_s3 is not defined in this notebook; the arguments
# look like (local destination, bucket name, object key) — confirm against the
# helper notebook.
download_from_s3(
    'demo_file_from_s3.csv',
    s3_bucket_name,
    'data_format/demo_file.csv',
)

### RecordIO format (used for large datasets)

Supported tensor data types: Int32, Float32, Float64

In [7]:
# The RecordIO writer below is given NumPy arrays, so pull them out of the frame:
# a 2-D feature matrix X ...
X = df.loc[:, ['x1', 'x2', 'x3']].to_numpy()

# ... and a label vector y. Selecting with a one-element list gives a
# single-column 2-D array, which ravel() flattens to one dimension.
y = df.loc[:, ['y']].to_numpy().ravel()
y

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [10]:
# Write the feature matrix X and the label vector y to a local recordio file
write_recordio_file(
    'demo_file.recordio',
    X,
    y,
)

# NOTE(review): this preview repeats the one displayed right after the frame was
# created and is unrelated to the write above — consider removing it.
df.head(3)

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1


In [11]:
# Read data back from the local recordio file; the cell output lists each
# record's features and label
read_recordio_file('demo_file.recordio')

record: 0
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.22199317108973948
      values: 153.0
      values: 2.0415474783059215
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 0
    }
  }
}

record: 1
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.8707323061773764
      values: 180.0
      values: 1.1909535747826039
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 0
    }
  }
}

record: 2
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.20671915533942642
      values: 127.0
      values: 8.779030712603621
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}

record: 3
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.9186109079379216
      values: 144.0
      values: 5.236752895998791
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}

record: 

In [12]:
# Upload the local recordio file to the S3 bucket.
# NOTE(review): write_to_s3 is not defined in this notebook; the arguments look
# like (local file, bucket name, object key) — confirm against the helper notebook.
write_to_s3(
    'demo_file.recordio',
    s3_bucket_name,
    'data_format/demo_file.recordio',
)

In [13]:
# Download the recordio file from the S3 bucket.
# NOTE(review): download_from_s3 is not defined in this notebook; the arguments
# look like (local destination, bucket name, object key) — confirm against the
# helper notebook.
download_from_s3(
    'demo_file_from_s3.recordio',
    s3_bucket_name,
    'data_format/demo_file.recordio',
)