In [1]:
# Python Version:
#!python --version

# Pip:
#!pip install --disable-pip-version-check -q pip --upgrade > /dev/null
#!pip install --disable-pip-version-check -q wrapt --upgrade > /dev/null

# AWS CLI:
#!pip install --disable-pip-version-check -q awscli==1.18.216 boto3==1.16.56 botocore==1.19.56

# AWS SageMaker:
#!pip install --disable-pip-version-check -q sagemaker==2.29.0
#!pip install --disable-pip-version-check -q smdebug==1.0.1
#!pip install --disable-pip-version-check -q sagemaker-experiments==0.1.26

# AWS Redshift:
#!pip install --disable-pip-version-check -q SQLAlchemy==1.3.22
#!pip install --disable-pip-version-check -q psycopg2-binary==2.9.1

# Zip:
#!conda install -y zip

# Pip List:
#!pip list

In [2]:
import boto3
import os
import pandas as pd
import sagemaker
import time
from time import gmtime, strftime

In [3]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

# Getting Our Role Name of LabRole:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

# Checking permissions of our role:
admin = False
post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]
for post_policy in post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        setup_iam_roles_passed = True
        print("[OK]")
        break
    else:
        print("*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************")

Role name: LabRole
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
[OK]


In [4]:
# Start with Creation of the S3 Bucket

s3 = boto3.Session().client(service_name="s3", region_name=region)

In [5]:
setup_s3_bucket_passed = False

In [6]:
print("Default bucket: {}".format(bucket))

Default bucket: sagemaker-us-east-1-574641942871


In [7]:
%%bash

aws s3 ls s3://${bucket}/

2022-03-24 20:34:35 sagemaker-us-east-1-574641942871


In [8]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'VFQAZM7NE9SKV0JN', 'HostId': 'oWxAgQqesUkMbMgLqEempKfKCrX9RICSN1twxVePdgGKfhecV/zFAmlTYfej7M1WwWXkE7tW0II=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'oWxAgQqesUkMbMgLqEempKfKCrX9RICSN1twxVePdgGKfhecV/zFAmlTYfej7M1WwWXkE7tW0II=', 'x-amz-request-id': 'VFQAZM7NE9SKV0JN', 'date': 'Fri, 25 Mar 2022 16:58:59 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [9]:
s3_bucket = pd.DataFrame.from_dict(s3.list_objects(Bucket=bucket)['Contents'])

# delete top 2 rows & reset index
s3_bucket = s3_bucket.iloc[2:,:].reset_index(drop=True)
s3_bucket.head(5)

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass,Owner
0,Raw Data/Other_data/Vehicle_Population.xlsx,2022-03-25 16:36:09+00:00,"""4f3900c67ade80b1aeda1e442e3f5fc8""",4505892,STANDARD,{'DisplayName': 'awslabsc0w3919396t1646224181'...
1,Raw Data/Other_data/populations.xlsx,2022-03-25 16:05:58+00:00,"""0457be93f472b053b1fc72c001455aea""",497497,STANDARD,{'DisplayName': 'awslabsc0w3919396t1646224181'...
2,Raw Data/Other_data/zipcodes.pdf,2022-03-25 16:05:58+00:00,"""1f11a19340f38d422f4dcd1c188f1ab9""",215196,STANDARD,{'DisplayName': 'awslabsc0w3919396t1646224181'...
3,Raw Data/SDGE/.DS_Store,2022-03-24 20:55:51+00:00,"""360262756c06c80a898c6a100913ba15""",6148,STANDARD,{'DisplayName': 'awslabsc0w3919396t1646224181'...
4,Raw Data/SDGE/Electric/SDGE-ELEC-2012-Q1.csv,2022-03-24 20:55:55+00:00,"""0070e4176e266ce0618f23246cf009ea""",45132,STANDARD,{'DisplayName': 'awslabsc0w3919396t1646224181'...


In [10]:
# Base directories
root = '/root/'
raw = 'Raw Data'
clean = 'Clean_Data'

extensions = ['.csv', '.pdf', '.xlsx']

In [None]:
# create class dir
raw_dir = root + raw
os.mkdir(raw_dir)
os.chdir(raw_dir)

for index, row in s3_bucket.iterrows():
    file_dir = row['Key']
    
    # print every 10th index
    if index % 25 == 0:
        print(f'Processing {index}/{s3_bucket.shape[0]}')
        
    # dir_comprehension
    dir_comp = []

    dir_comp.append('/root')
    
    folder_dir = file_dir.split('/')

    for dir_ in folder_dir:
        # slowly rebuild folder directory 
        dir_comp.append(dir_)

        # The following checks if path exists
        current_dir_path = os.path.join(*dir_comp)
        
        
        # only create paths for non extension files
        if any(ext in current_dir_path for ext in extensions) != True:
            if os.path.isdir(current_dir_path) == False:
                os.mkdir(current_dir_path)
                os.chdir(current_dir_path)
        else:            
            # remove file_name.ext from current_dir_path by the following:
            # break down path into list 
            dir_path = current_dir_path.split('/')
            
            # get last element file_name.ext
            file_name = dir_path.pop()
            
            # rebuild list back to path
            dir_path = '/' + os.path.join(*dir_path)
            
            if (dir_path== os.getcwd()) != True:
                # aws needs root to be encased in '/'
                os.chdir(dir_path)
                
            # checks if file is one of the accepted extensions
            if any(ext in current_dir_path for ext in extensions) == True:
                s3.download_file(Bucket=bucket, Filename=file_name, Key=file_dir)  
                
    # complete
print('Ingestion is complete!')

Processing 0/998
Processing 10/998
Processing 20/998
Processing 30/998
Processing 40/998
Processing 50/998
Processing 60/998
Processing 70/998
Processing 80/998
Processing 90/998
Processing 100/998
Processing 110/998
Processing 120/998
Processing 130/998
Processing 140/998
Processing 150/998
Processing 160/998
Processing 170/998
Processing 180/998
Processing 190/998
Processing 200/998
Processing 210/998
Processing 220/998
Processing 230/998
Processing 240/998
Processing 250/998
Processing 260/998
Processing 270/998
Processing 280/998
Processing 290/998
Processing 300/998
Processing 310/998
Processing 320/998
Processing 330/998
Processing 340/998
Processing 350/998
Processing 360/998
Processing 370/998
Processing 380/998
Processing 390/998
Processing 400/998
Processing 410/998
Processing 420/998
Processing 430/998
Processing 440/998
Processing 450/998
Processing 460/998
Processing 470/998
Processing 480/998
Processing 490/998
Processing 500/998
Processing 510/998
Processing 520/998
Proc