In [2]:
import boto3
import os
import pandas as pd
import sagemaker
import time
from time import gmtime, strftime

In [3]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

# Getting Our Role Name of LabRole:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

# Checking permissions of our role:
admin = False
post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]
for post_policy in post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        setup_iam_roles_passed = True
        print("[OK]")
        break
    else:
        print("*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************")

Role name: LabRole
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
[OK]


In [4]:
# Start with Creation of the S3 Bucket

s3 = boto3.Session().client(service_name="s3", region_name=region)

setup_s3_bucket_passed = False

print("Default bucket: {}".format(bucket))

Default bucket: sagemaker-us-east-1-380520067514


In [5]:
%%bash

aws s3 ls s3://${bucket}/

2022-04-07 19:46:45 sagemaker-studio-380520067514-grja9c0au5
2022-04-07 20:01:59 sagemaker-us-east-1-380520067514


In [6]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'J3YY3Y2D16T021XA', 'HostId': 'OWU74t9dulzBdKW/t/NoAYI/CgJJp8NUIuOoHkDjTLeQge9gpjuS/4JTaXh3MFGAzvK0u69THGw=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'OWU74t9dulzBdKW/t/NoAYI/CgJJp8NUIuOoHkDjTLeQge9gpjuS/4JTaXh3MFGAzvK0u69THGw=', 'x-amz-request-id': 'J3YY3Y2D16T021XA', 'date': 'Mon, 11 Apr 2022 03:46:23 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [7]:
break

SyntaxError: 'break' outside loop (<ipython-input-7-6aaf1f276005>, line 4)

Please upload FOUNDATION FOLDER into Default Bucket.

In [8]:
s3_bucket = pd.DataFrame.from_dict(s3.list_objects(Bucket=bucket)['Contents'])

# delete top 2 rows & reset index
s3_bucket = s3_bucket.iloc[2:,:].reset_index(drop=True)
s3_bucket.head(3)

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass,Owner
0,Foundation/DataPrep/Instructions.txt,2022-04-11 00:58:20+00:00,"""263c481225c43eb861352ae0f153393c""",251,STANDARD,{'DisplayName': 'awslabsc0w3147735t1637051260'...
1,Foundation/DataPrep/Population_DataPrep.ipynb,2022-04-11 00:58:21+00:00,"""3a612416274730993f3f24c20ef7e77e""",33635,STANDARD,{'DisplayName': 'awslabsc0w3147735t1637051260'...
2,Foundation/DataPrep/SDGE_DataPrep.ipynb,2022-04-11 00:58:20+00:00,"""4699f162d659bd826f006127e41a1d5e""",12969,STANDARD,{'DisplayName': 'awslabsc0w3147735t1637051260'...


In [None]:
# Base directories
root = '/root/'
raw = 'Raw Data'
clean = 'Clean_Data'

extensions = ['.csv', '.pdf', '.xlsx', '.py', '.ipynb']

# create class dir
clean_dir = root + clean
os.mkdir(clean_dir)
os.chdir(clean_dir)

for index, row in s3_bucket.iterrows():
    file_dir = row['Key']
    
    # print every 10th index
    if index % 50 == 0:
        print(f'Processing {index}/{s3_bucket.shape[0]}')
        
    # dir_comprehension
    dir_comp = []

    dir_comp.append('/root')
    
    folder_dir = file_dir.split('/')

    for dir_ in folder_dir:
        # slowly rebuild folder directory 
        dir_comp.append(dir_)

        # The following checks if path exists
        current_dir_path = os.path.join(*dir_comp)
        
        
        # only create paths for non extension files
        if any(ext in current_dir_path for ext in extensions) != True:
            if os.path.isdir(current_dir_path) == False:
                os.mkdir(current_dir_path)
                os.chdir(current_dir_path)
        else:            
            # remove file_name.ext from current_dir_path by the following:
            # break down path into list 
            dir_path = current_dir_path.split('/')
            
            # get last element file_name.ext
            file_name = dir_path.pop()
            
            # rebuild list back to path
            dir_path = '/' + os.path.join(*dir_path)
            
            if (dir_path== os.getcwd()) != True:
                # aws needs root to be encased in '/'
                os.chdir(dir_path)
                
            # checks if file is one of the accepted extensions
            if any(ext in current_dir_path for ext in extensions) == True:
                s3.download_file(Bucket=bucket, Filename=file_name, Key=file_dir)  
                
    # complete
print('Ingestion is complete!')

Processing 0/947
Processing 50/947


# Data Processing

In [None]:
%run /root/Foundation/DataPrep/Population_DataPrep.ipynb
%run /root/Foundation/DataPrep/SDGE_DataPrep.ipynb
%run /root/Foundation/DataPrep/Weather_DataPrep.ipynb
%run /root/Foundation/DataPrep/Vehicle_DataPrep.ipynb

print('!!'*30)
print('All Data is ready to rock!')