In [79]:
import boto3
import botocore
from botocore.exceptions import ClientError
import os, time, json
from datetime import date
from dotenv import load_dotenv

import s3, iam, lf, glue, lambda_fn, sns, eventbridge as event

from ads.utils import red

In [41]:
# boto3 handles used throughout the notebook, grouped by purpose.

# Identity and access
sts_client = boto3.client('sts')
iam_client = boto3.client('iam')

# Storage, catalog and governance
s3_client = boto3.client('s3')
glue_client = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')
rds_client = boto3.client('rds')
dynamodb_client = boto3.client('dynamodb')

# Networking: the only handles created with an explicit region_name
ec2_client = boto3.client('ec2', region_name='us-east-1')
ec2_resource = boto3.resource('ec2', region_name='us-east-1')

# Orchestration and compute
events_client = boto3.client('events')
lambda_client = boto3.client('lambda')
sfn_client = boto3.client('stepfunctions')

In [42]:
# Load variables from the local .env file into the process environment.
load_dotenv(".env")

# Account settings; a missing variable raises KeyError here rather than later.
ACCOUNT_ID = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION = os.environ['AWS_DEFAULT_REGION']

# Networking identifiers (hard-coded for this environment).
VPC_ID = 'vpc-03617a8a518caa526'
SUBNET_IDS = [
    'subnet-0980ad10eb313405b',
    'subnet-0de97821ddb8236f7',
    'subnet-0a160fbe0fcafe373',
    'subnet-0ca765b361e4cb186',
    'subnet-0a972b05a5b162feb',
]
SUBNET_ID = SUBNET_IDS[0]   # first subnet is used wherever a single subnet is needed
SECURITY_GROUP_ID = 'sg-07f4ccd7a5be677ea'

### [Knowledge Amplifier: Manage AWS Glue Jobs with Step Functions](https://www.youtube.com/watch?v=KpLm3DFb1h0&t=353s)

<div style="text-align:center"><img src="./state_diagram.png" width="400" height="400" alt="Step Functions state machine diagram"></div>

#### Create IAM Role

- Create the AWS Glue service role named by `GLUE_ROLE_NAME`.
- Attach the AWS managed policies `AWSGlueServiceRole`, `CloudWatchFullAccess`, and `AmazonS3FullAccess` to the role.

In [43]:
# Every pipeline role shares the same suffix; only the service prefix differs.
_ROLE_SUFFIX = 'pipeline-role'

GLUE_ROLE_NAME = f'glue-{_ROLE_SUFFIX}'   # assumed by AWS Glue
SFN_ROLE_NAME = f'sfn-{_ROLE_SUFFIX}'     # assumed by Step Functions
LFN_ROLE_NAME = f'lfn-{_ROLE_SUFFIX}'     # assumed by Lambda

In [44]:
# AWS managed policies all live under the same ARN prefix.
_AWS_MANAGED_POLICY_PREFIX = "arn:aws:iam::aws:policy"

aws_glue_service_policy_arn = f"{_AWS_MANAGED_POLICY_PREFIX}/service-role/AWSGlueServiceRole"
cloud_watch_full_access_arn = f"{_AWS_MANAGED_POLICY_PREFIX}/CloudWatchFullAccess"
amazon_s3_full_access_arn = f"{_AWS_MANAGED_POLICY_PREFIX}/AmazonS3FullAccess"

# Broader policies, intentionally left disabled:
# admin_access_policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
# power_user_access_policy_arn = "arn:aws:iam::aws:policy/PowerUserAccess"

##### Glue Role

In [None]:
# Trust policy: allows the AWS Glue service principal to assume the role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "glue.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Create the role, then keep only its ARN for the cells that follow.
glue_role_response = iam_client.create_role(
    RoleName=GLUE_ROLE_NAME,
    Description="Glue Service Role",
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
)
GLUE_ROLE_ARN = glue_role_response['Role']['Arn']

In [None]:
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=GLUE_ROLE_NAME,
    PolicyArn=aws_glue_service_policy_arn
)
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=GLUE_ROLE_NAME,
    PolicyArn=cloud_watch_full_access_arn
)
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=GLUE_ROLE_NAME,
    PolicyArn=amazon_s3_full_access_arn
)

##### SFN Role

In [None]:
# Trust policy: allows the Step Functions service principal to assume the role.
stepfunctions_trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "states.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Create the role and keep its ARN; it is passed as roleArn when the state
# machine is created. The description previously read "Glue Service Role",
# a copy-paste leftover from the Glue role cell.
SFN_ROLE_ARN = iam_client.create_role(
    RoleName=SFN_ROLE_NAME,
    AssumeRolePolicyDocument=json.dumps(stepfunctions_trust_policy),
    Description="Step Functions Service Role"
)['Role']['Arn']

# The state machine calls Glue (startCrawler, getCrawler, startJobRun), so the
# role gets the Glue managed policy.
iam_client.attach_role_policy(
    RoleName=SFN_ROLE_NAME,
    PolicyArn=aws_glue_service_policy_arn
)


In [None]:

# Give the Step Functions role the CloudWatch full-access managed policy.
iam_client.attach_role_policy(
    PolicyArn=cloud_watch_full_access_arn,
    RoleName=SFN_ROLE_NAME,
)

##### Lambda Role: NOT NEEDED IN THIS PROJECT YET!

In [None]:
# Trust policy: allows the Lambda service principal to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "lambda.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Create the role (no description is set) and keep its ARN.
lfn_role_response = iam_client.create_role(
    RoleName=LFN_ROLE_NAME,
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
)
LFN_ROLE_ARN = lfn_role_response['Role']['Arn']

In [None]:
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=LFN_ROLE_NAME,
    PolicyArn=aws_glue_service_policy_arn
)
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=LFN_ROLE_NAME,
    PolicyArn=cloud_watch_full_access_arn
)
# Attach AWS managed policy with the role
iam_client.attach_role_policy(
    RoleName=LFN_ROLE_NAME,
    PolicyArn=amazon_s3_full_access_arn
)

In [None]:

# #### Create IAM Role Policy (Glue, S3, Logs Permissions)
# policy_document = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {   # StartCrawler permission
#             "Effect": "Allow",
#             "Action": [
#                 "glue:StartCrawler"
#             ],
#             "Resource": f"arn:aws:glue:region:account-id:crawler/*"
#             # "Resource": f"arn:aws:glue:region:account-id:crawler/{crawler-name}"
#         },
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "glue:StartJobRun",
#                 "glue:GetJob",
#                 "glue:GetJobRun"
#             ],
#             "Resource": f"arn:aws:glue:region:account-id:job/*"
#         },
#         {   # s3 full access
#             "Effect": "Allow",
#             "Action": [
#                 "s3:*",
#                 "s3-object-lambda:*"
#             ],
#             "Resource": "*"
#         },
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "logs:*"
#             ],
#             "Resource": "*"
#         }
#     ]
# }

# policy_name = "s3_logs_policies"

# # Attach the inline policy to the IAM role
# iam_client.put_role_policy(
#     RoleName=LFN_ROLE_NAME,
#     PolicyName=policy_name,
#     PolicyDocument=json.dumps(policy_document)
# )
# print(f"Policy {policy_name} attached to role {LFN_ROLE_NAME}")


#### Create S3 Bucket and Folders

In [77]:
# Both buckets share the project prefix.
_BUCKET_PREFIX = "httx"

S3_BUCKET_DATALAKE = f"{_BUCKET_PREFIX}-datalake-bkt"          # raw and processed data
S3_BUCKET_GLUE_ASSETS = f"{_BUCKET_PREFIX}-glue-assets-bkt"    # Glue scripts, temp files, Spark logs

In [80]:
# Folder prefixes to create inside each bucket.
# The former `acl`, `enable_versioning` and `enable_encryption` variables were
# never passed to anything; they were removed because `acl = 'public-read'`
# wrongly suggested the buckets are created public.
folders1 = ['raw/customers/', 'processed/customers/']   # data lake bucket
folders2 = ['temporary', 'sparkHistoryLogs']            # Glue assets bucket

# create_s3_bucket receives only the bucket name and its folder list.
s3.create_s3_bucket(S3_BUCKET_DATALAKE, folders1)
s3.create_s3_bucket(S3_BUCKET_GLUE_ASSETS, folders2)

Bucket 'httx-datalake-bkt' created successfully
Folder 'raw/customers/' created in bucket 'httx-datalake-bkt'
Folder 'processed/customers/' created in bucket 'httx-datalake-bkt'
Bucket 'httx-glue-assets-bkt' created successfully
Folder 'temporary/' created in bucket 'httx-glue-assets-bkt'
Folder 'sparkHistoryLogs/' created in bucket 'httx-glue-assets-bkt'


In [53]:
# Seed the raw zone of the data lake with the local customers file.
s3_client.upload_file(
    Filename='./customers.csv',
    Bucket=S3_BUCKET_DATALAKE,
    Key='raw/customers/customers.csv',
)

#### Create Glue Catalog Database

In [54]:
# Name of the Glue Data Catalog database used by the crawler and the Lake Formation grants.
CATALOG_DB_NAME = 'httx-catalog-db'

In [None]:
# S3 location recorded on the catalog database: the root of the data lake bucket.
DATALAKE_LOCATION_URI = f"s3://{S3_BUCKET_DATALAKE}"

# Database definition handed to Glue.
database_input = {
    'Name': CATALOG_DB_NAME,
    'Description': '',
    'LocationUri': DATALAKE_LOCATION_URI,
}

create_database_response = glue_client.create_database(
    CatalogId=ACCOUNT_ID,
    DatabaseInput=database_input,
)
print(create_database_response)

- Grant the `CREATE_TABLE` and `DROP` Lake Formation permissions on the Data Catalog database to the Glue role (`GLUE_ROLE_NAME`).

In [56]:
# The Lake Formation principal is the Glue service role.
lf_principle = GLUE_ROLE_ARN

# Grant CREATE_TABLE and DROP on the catalog database, without grant option.
response = lakeformation_client.grant_permissions(
    Principal={'DataLakePrincipalIdentifier': lf_principle},
    Resource={'Database': {'Name': CATALOG_DB_NAME}},
    Permissions=['CREATE_TABLE', 'DROP'],
    PermissionsWithGrantOption=[],
)

In [57]:
# lf.grant_table_level_permissions(GLUE_ROLE_ARN, CATALOG_DB_NAME, 'employees', ['DROP'])

#### **Crawler**: Catalog Data from `raw/customers` as a table by the name `raw_customers`

In [None]:
S3_RAW_CRAWLER_NAME = "httx-s3crawler"

# The crawler scans only the raw customers prefix of the data lake bucket.
raw_customers_path = f"s3://{S3_BUCKET_DATALAKE}/raw/customers"
S3_RAW_CRAWLER_TARGET = {'S3Targets': [{'Path': raw_customers_path}]}

# Tables the crawler discovers are registered with a 'raw_' prefix.
glue.create_glue_crawler(
    S3_RAW_CRAWLER_NAME,
    GLUE_ROLE_ARN,
    CATALOG_DB_NAME,
    S3_RAW_CRAWLER_TARGET,
    table_prefix='raw_',
)

In [59]:
# glue_client.start_crawler(Name=S3_RAW_CRAWLER_NAME)

In [None]:
# Resolve the ARN of the IAM user that should be able to drop the crawled table.
iam_user = iam_client.get_user(UserName='AMominNJ')['User']
AMominNJ_arn = iam_user['Arn']

# NOTE(review): `raw_customers` is produced by the crawler, which this notebook
# only starts from the state machine further down. Confirm the table exists
# before running this cell.
lf.grant_table_level_permissions(
    AMominNJ_arn,
    CATALOG_DB_NAME,
    'raw_customers',
    ['DROP'],
)

#### **Job**: Transform data from `raw/customers` and load it into `processed/customers`

In [61]:
# Both paths live in the Glue assets bucket.
_glue_assets_uri = f"s3://{S3_BUCKET_GLUE_ASSETS}"

TEM_DIR = f"{_glue_assets_uri}/temporary/"                       # passed to the Glue job as its temp dir
SPARK_EVENT_LOG_PATH = f"{_glue_assets_uri}/sparkHistoryLogs/"   # passed to the Glue job as its Spark log path

In [62]:
# Local Glue script and the S3 key it is stored under in the Glue assets bucket.
# NOTE(review): the key prefix is spelled `glues_scripts`, unlike the local
# `glue_scripts` directory; kept as-is because the job script path is derived
# from `object_path`.
lcl_file_path = './glue_scripts/jb1_s3csv_s3parquet.py'
object_path = "glues_scripts/jb1_s3csv_s3parquet.py"

s3_client.upload_file(
    Filename=lcl_file_path,
    Bucket=S3_BUCKET_GLUE_ASSETS,
    Key=object_path,
)

# s3.upload_file_to_s3(S3_BUCKET_GLUE_ASSETS, file_name1, object_name1)

In [None]:
JOB_NAME = 'jb1_s3csv_s3parquet'

# Full S3 URI of the script uploaded in the previous cell.
JOB_SCRIPT_PATH = f"s3://{S3_BUCKET_GLUE_ASSETS}/{object_path}"

glue.create_glue_job(
    JOB_NAME,
    JOB_SCRIPT_PATH,
    GLUE_ROLE_ARN,
    TEM_DIR,
    SPARK_EVENT_LOG_PATH,
)

In [64]:
# glue.start_glue_job(JOB_NAME)

In [65]:
# ! aws logs tail --follow /aws-glue/jobs --filter-pattern "SUCCEEDED"

#### Step Functions (SFN)

In [66]:
# Name of the Step Functions state machine that orchestrates the crawler and the Glue job.
SMN_NAME = 'glue_pipeline_smn'

In [67]:
# Amazon States Language definition of the pipeline:
#   StartCrawler -> GetCrawler -> Choice -> (Wait -> GetCrawler)* -> Glue StartJobRun
#
# Two fixes compared with the previous definition:
#   * The Choice state used to loop only while the crawler state was "RUNNING",
#     so the job started as soon as the crawler reported "STOPPING". It now
#     proceeds only once the crawler is back to "READY" and polls otherwise.
#   * The job name was a hard-coded literal; it now uses JOB_NAME so the state
#     machine always targets the job created by this notebook.
state_machine_definition = {
  "Comment": "A description of my state machine",
  "StartAt": "StartCrawler",
  "States": {
    # Kick off the raw-data crawler.
    "StartCrawler": {
      "Type": "Task",
      "Parameters": {
        "Name": S3_RAW_CRAWLER_NAME
      },
      "Resource": "arn:aws:states:::aws-sdk:glue:startCrawler",
      "Next": "GetCrawler"
    },
    # Read the crawler's current state; the output carries $.Crawler.State.
    "GetCrawler": {
      "Type": "Task",
      "Parameters": {
        "Name": S3_RAW_CRAWLER_NAME
      },
      "Resource": "arn:aws:states:::aws-sdk:glue:getCrawler",
      "Next": "Choice"
    },
    # Move on only when the crawler has fully finished; otherwise keep polling.
    "Choice": {
      "Type": "Choice",
      "Choices": [
        {
          "Variable": "$.Crawler.State",
          "StringEquals": "READY",
          "Next": "Glue StartJobRun"
        }
      ],
      "Default": "Wait"
    },
    # Polling interval between GetCrawler calls.
    "Wait": {
      "Type": "Wait",
      "Seconds": 5,
      "Next": "GetCrawler"
    },
    # Run the Glue job through the .sync integration, which waits for the run to finish.
    "Glue StartJobRun": {
      "Type": "Task",
      "Resource": "arn:aws:states:::glue:startJobRun.sync",
      "Parameters": {
        "JobName": JOB_NAME
      },
      "End": True
    }
  }
}

In [68]:
# Register the state machine, then keep only its ARN for start and delete calls.
create_state_machine_response = sfn_client.create_state_machine(
    roleArn=SFN_ROLE_ARN,
    name=SMN_NAME,
    definition=json.dumps(state_machine_definition),
)
SFN_ARN = create_state_machine_response['stateMachineArn']

In [None]:
# Execution names must be unique per state machine, so reusing the fixed state
# machine name made every run after the first collide. Suffix the name with the
# current epoch second to make this cell re-runnable.
execution_name = f"{SMN_NAME}-{int(time.time())}"

sfn_client.start_execution(
    stateMachineArn=SFN_ARN,
    name=execution_name,
)

#### Delete Resources

In [None]:
# Teardown: remove the Glue Data Catalog database created earlier in this notebook.
glue_client.delete_database(CatalogId=ACCOUNT_ID,Name=CATALOG_DB_NAME)

In [None]:
# Use a dedicated name for the boto3 resource. Binding it to `s3` replaced the
# `s3` helper module imported at the top of the notebook, so re-running the
# bucket-creation cell after this one failed on `s3.create_s3_bucket`.
s3_resource = boto3.resource('s3')

for bucket_name in (S3_BUCKET_DATALAKE, S3_BUCKET_GLUE_ASSETS):
    bucket = s3_resource.Bucket(bucket_name)

    # A bucket must be empty before it can be deleted.
    bucket.objects.all().delete()

    # Delete all object versions (if versioning is enabled)
    # bucket.object_versions.all().delete()

    # Finally, delete the bucket itself.
    bucket.delete()

In [None]:
# Teardown: remove the raw-data crawler created earlier in this notebook.
glue_client.delete_crawler(Name=S3_RAW_CRAWLER_NAME)
# NOTE(review): S3_PROCESSED_CRAWLER_NAME is not defined anywhere in the visible notebook.
# glue_client.delete_crawler(Name=S3_PROCESSED_CRAWLER_NAME)

In [None]:
# Teardown: remove the Glue job created earlier in this notebook.
glue_client.delete_job(JobName=JOB_NAME)

In [None]:
# Look the state machine up by name instead of hard-coding its ARN. This keeps
# the account ID out of the notebook, follows whatever region the client uses,
# and still works after a kernel restart when SFN_ARN is no longer defined.
state_machine_pages = sfn_client.get_paginator('list_state_machines').paginate()
state_machine_arn = next(
    (
        machine['stateMachineArn']
        for page in state_machine_pages
        for machine in page['stateMachines']
        if machine['name'] == SMN_NAME
    ),
    None,
)

if state_machine_arn is None:
    print(f"State machine '{SMN_NAME}' not found; nothing to delete")
else:
    sfn_client.delete_state_machine(stateMachineArn=state_machine_arn)

In [None]:
## DELETE IAM ROLES AT THE END, AFTER DELETING ALL OTHER RESOURCES.
iam.delete_iam_role(GLUE_ROLE_NAME)
iam.delete_iam_role(SFN_ROLE_NAME)
# The Lambda role is created in the "Lambda Role" section too; it was previously
# left behind, which makes the next run's create_role call collide.
# NOTE(review): confirm how iam.delete_iam_role behaves if that section was skipped.
iam.delete_iam_role(LFN_ROLE_NAME)