## Ingest CSV files into SageMaker


In [1]:
import pandas as pd
import boto3
import sagemaker
import matplotlib.pyplot as plt
import os
import numpy as np

# Locate the S3 bucket we created manually

In [2]:
# List the objects under the aisles/ prefix of the projectads508 bucket.
!aws s3 ls s3://projectads508/aisles/

2022-04-17 17:50:32          0 
2022-04-17 17:50:46       2603 aisles.csv


In [3]:
# List the objects under the departments/ prefix of the projectads508 bucket.
!aws s3 ls s3://projectads508/departments/

2022-04-17 17:51:07          0 
2022-04-17 17:51:19        270 departments.csv


In [4]:
# List the objects under the prior/ prefix of the projectads508 bucket.
# NOTE(review): the recorded run printed nothing for this prefix, so
# order_products__prior.csv appears to be missing from the bucket -- confirm
# the upload before relying on it downstream.
!aws s3 ls s3://projectads508/prior/

In [5]:
# List the objects under the train/ prefix of the projectads508 bucket.
!aws s3 ls s3://projectads508/train/

2022-04-17 17:51:56          0 
2022-04-17 17:52:06   24680147 order_products__train.csv


In [6]:
# List the objects under the orders/ prefix of the projectads508 bucket.
!aws s3 ls s3://projectads508/orders/

2022-04-17 17:52:58          0 
2022-04-17 17:53:08  108968645 orders.csv


In [7]:
# List the objects under the products/ prefix of the projectads508 bucket.
!aws s3 ls s3://projectads508/products/

2022-04-17 17:54:35          0 
2022-04-17 17:55:00    2166953 products.csv


In [8]:

# SageMaker session, its default bucket, and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region comes from the ambient boto3 configuration; the account id is read
# from the STS caller identity.
region = boto3.Session().region_name
caller_identity = boto3.client("sts").get_caller_identity()
account_id = caller_identity.get("Account")

# Low-level SageMaker API client bound to the same region.
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

# Set S3 Source Location (Public S3 Bucket)

In [9]:
# Root of the source bucket that holds the CSV files listed above.
s3_public_path_csv = "s3://projectads508"
# Persist the value with IPython's %store so it can be restored with
# `%store -r` in another notebook or kernel.
%store s3_public_path_csv

Stored 's3_public_path_csv' (str)


# Set S3 Destination Location (Our Private S3 Bucket)

In [10]:
# Destination prefix inside the SageMaker session's default bucket.
s3_private_path_csv = f"s3://{bucket}/projectads508"
print(s3_private_path_csv)

s3://sagemaker-us-east-1-254716743917/projectads508


# Copy Data From the Public S3 Bucket to our Private S3 Bucket in this Account


In [11]:
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "aisles/aisles.csv"
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "departments/departments.csv"
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "prior/order_products__prior.csv"
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "train/order_products__train.csv"
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "orders/orders.csv"
!aws s3 cp --recursive $s3_public_path_csv/ $s3_private_path_csv/ --exclude "*" --include "products/products.csv"

copy: s3://projectads508/aisles/aisles.csv to s3://sagemaker-us-east-1-254716743917/projectads508/aisles/aisles.csv
copy: s3://projectads508/departments/departments.csv to s3://sagemaker-us-east-1-254716743917/projectads508/departments/departments.csv
copy: s3://projectads508/train/order_products__train.csv to s3://sagemaker-us-east-1-254716743917/projectads508/train/order_products__train.csv
copy: s3://projectads508/orders/orders.csv to s3://sagemaker-us-east-1-254716743917/projectads508/orders/orders.csv
copy: s3://projectads508/products/products.csv to s3://sagemaker-us-east-1-254716743917/projectads508/products/products.csv


In [12]:
# Echo the destination prefix again before listing its contents below.
print(s3_private_path_csv)

s3://sagemaker-us-east-1-254716743917/projectads508


In [13]:
!aws s3 ls $s3_private_path_csv/aisles/

2022-04-17 21:59:24       2603 aisles.csv


In [14]:
!aws s3 ls $s3_private_path_csv/departments/

2022-04-17 21:59:25        270 departments.csv


In [15]:
!aws s3 ls $s3_private_path_csv/prior/

In [16]:
!aws s3 ls $s3_private_path_csv/train/

2022-04-17 21:59:27   24680147 order_products__train.csv


In [17]:
!aws s3 ls $s3_private_path_csv/orders/

2022-04-17 21:59:28  108968645 orders.csv


In [18]:
!aws s3 ls $s3_private_path_csv/products/

2022-04-17 21:59:32    2166953 products.csv


In [19]:
# `IPython.display` is the public import path; importing these names from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render a link to the projectads508/ prefix in the S3 console.
# - The bucket name is taken from `bucket` (sess.default_bucket()) rather than
#   being rebuilt as "sagemaker-<region>-<account>", so the link matches the
#   destination actually used for the copy.
# - target="_blank" opens a new tab; the previous value "blank" names a
#   window called "blank" instead.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/projectads508/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            bucket, region
        )
    )
)