###### AWS Tutorials – Building Glue ETL Pipeline

In [None]:
import boto3
import botocore
from botocore.exceptions import ClientError
from dotenv import load_dotenv
import os, time, json, shutil, subprocess, zipfile
from datetime import date
from pathlib import Path

from misc import load_from_yaml, save_to_yaml
import s3, iam, lf, glue, lambdafn, rds, dynamodb as ddb, eventbridge as event

load_dotenv(".env")


In [None]:
# Account and network settings are read from the environment (loaded from `.env`).
ACCOUNT_ID        = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION            = os.environ['AWS_DEFAULT_REGION']
VPC_ID            = os.environ['AWS_DEFAULT_VPC']
SECURITY_GROUP_ID = os.environ['AWS_DEFAULT_SG_ID']
# AWS_DEFAULT_SUBNET_IDS is a colon-separated list of subnet IDs.
SUBNET_IDS        = os.environ["AWS_DEFAULT_SUBNET_IDS"].split(":")
SUBNET_ID         = SUBNET_IDS[0]  # first subnet of the list
print(SUBNET_ID)

In [132]:
# Identity / access
sts_client = boto3.client('sts')
iam_client = boto3.client('iam')
lakeformation_client = boto3.client('lakeformation')

# Storage / databases
s3_client = boto3.client('s3')
rds_client = boto3.client('rds')
dynamodb_client = boto3.client('dynamodb')

# Networking (region is pinned explicitly for EC2)
ec2_client = boto3.client('ec2', region_name='us-east-1')
ec2_resource = boto3.resource('ec2', region_name='us-east-1')

# ETL / orchestration
glue_client = boto3.client('glue')
databrew_client = boto3.client('databrew')
events_client = boto3.client('events')
lambda_client = boto3.client('lambda')


<div style="text-align: center"><img src="./pipeline_creation_methods.png" width="500" height="300"></div>

#### Create IAM Role

- Create the AWS Glue role named by `GLUE_ROLE_NAME`.
- Attach the AWS managed policies listed in `policy_arns` (Glue service role, CloudWatch, S3 and DynamoDB full access) to the role.

In [158]:
GLUE_ROLE_NAME = 'glue-pipeline-role'
DATABREW_ROLE_NAME = 'databrew-pipeline-role'
LFN_ROLE_NAME = 'lfn-pipeline-role'

In [85]:
policy_arns = [
    "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole",
    "arn:aws:iam::aws:policy/CloudWatchFullAccess",
    "arn:aws:iam::aws:policy/AmazonS3FullAccess",
    "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess",
    # "arn:aws:iam::aws:policy/AdministratorAccess",
    # "arn:aws:iam::aws:policy/PowerUserAccess"
]

In [86]:
# Trust policy: allows the AWS Glue service to assume the role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "glue.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Create the role and keep only its ARN.
glue_role = iam_client.create_role(
    Description="Glue Service Role",
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
    RoleName=GLUE_ROLE_NAME,
)
GLUE_ROLE_ARN = glue_role['Role']['Arn']

In [None]:
# Attach each AWS managed policy to the Glue role.
# A plain loop is used because the calls are made only for their side effect.
for parn in policy_arns:
    iam_client.attach_role_policy(RoleName=GLUE_ROLE_NAME, PolicyArn=parn)

In [None]:
# Trust policy: allows the AWS Glue DataBrew service to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "databrew.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Create the role, then record its ARN.
databrew_role = iam_client.create_role(
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
    RoleName=DATABREW_ROLE_NAME,
)
DATABREW_ROLE_ARN = databrew_role['Role']['Arn']

# Attach the AWS managed DataBrew policy to the new role.
policy_arn = 'arn:aws:iam::aws:policy/AwsGlueDataBrewFullAccessPolicy'
iam_client.attach_role_policy(PolicyArn=policy_arn, RoleName=DATABREW_ROLE_NAME)

In [None]:
print(DATABREW_ROLE_ARN)

In [88]:
# Trust policy: allows the AWS Lambda service to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "lambda.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Create the role, then record its ARN.
lfn_role = iam_client.create_role(
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
    RoleName=LFN_ROLE_NAME,
)
LFN_ROLE_ARN = lfn_role['Role']['Arn']


In [None]:
# Attach each AWS managed policy to the Lambda role.
# A plain loop is used because the calls are made only for their side effect.
for parn in policy_arns:
    iam_client.attach_role_policy(RoleName=LFN_ROLE_NAME, PolicyArn=parn)

In [90]:
# #### Create IAM Role Policy (SQS, S3, Logs Permissions)
# policy_document = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "s3:*",
#                 "s3-object-lambda:*"
#             ],
#             "Resource": "*"
#         },
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "logs:*"
#             ],
#             "Resource": "*"
#         }
#     ]
# }

# policy_name = "s3_logs_policies"

# # Attach the inline policy to the IAM role
# iam_client.put_role_policy(
#     RoleName=LFN_ROLE_NAME,
#     PolicyName=policy_name,
#     PolicyDocument=json.dumps(policy_document)
# )
# print(f"Policy {policy_name} attached to role {LFN_ROLE_NAME}")

#### Create S3 Bucket and Folders

In [91]:
S3_BUCKET_DATALAKE = "httx-datalake-bkt"
S3_BUCKET_GLUE_ASSETS = "httx-glue-assets-bkt"

In [None]:
folders1 = ['raw/employees', 'cleansed/employees']
folders2 = ['temporary', 'sparkHistoryLogs']

s3.create_s3_bucket(S3_BUCKET_DATALAKE, folders1)
s3.create_s3_bucket(S3_BUCKET_GLUE_ASSETS, folders2)

In [None]:
# Print every object key in the Glue assets bucket.
# A single list_objects_v2 call returns at most 1,000 keys, so iterate over
# all result pages with a paginator instead of reading only the first page.
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=S3_BUCKET_GLUE_ASSETS):
    # 'Contents' is absent when a page (or the whole bucket) is empty.
    for obj in page.get('Contents', []):
        print(f'Object: {obj["Key"]}')

#### Create RDS Databases & Their Resources

In [100]:
DB_NAME = 'EmployeeDB'
DB_USERNAME = os.environ['USERNAME']
DB_PASSWORD = os.environ['PASSWORD']
SUBNET_GROUP_NAME = 'httx-rds-subnet-group'

In [None]:
## Create the RDS subnet group
response = rds_client.create_db_subnet_group(
    DBSubnetGroupName=SUBNET_GROUP_NAME,
    DBSubnetGroupDescription='Subnet group for RDS instance',
    SubnetIds=SUBNET_IDS
)
print(response)

In [102]:
def _rds_instance_spec(identifier, engine, engine_version, port, db_name=DB_NAME):
    """Return the keyword arguments describing one RDS instance.

    Only the identifier, engine, engine version, port and database name vary
    between the instances; every other setting is the same for all of them.
    Fresh list objects are built on each call so the specs share no state.
    """
    return {
        'db_instance_identifier': identifier,
        'db_name': db_name,
        'db_username': DB_USERNAME,
        'db_password': DB_PASSWORD,
        'engine': engine,
        'port': port,
        'engine_version': engine_version,
        'db_instance_class': 'db.t3.micro',
        'allocated_storage': 20,
        'availability_zone': 'us-east-1a',
        'tags': [{'Key': 'Project', 'Value': 'glue-rds-Crawler'}],
        'security_group_ids': [SECURITY_GROUP_ID],
        'db_subnet_group_name': SUBNET_GROUP_NAME,
    }


# Index 0: MySQL, 1: PostgreSQL, 2: SQL Server Express (created with an empty db_name).
instances = [
    _rds_instance_spec('httx-rds-mysql', 'mysql', '8.0.32', 3306),
    _rds_instance_spec('httx-rds-postgresql', 'postgres', '14.13', 5432),
    _rds_instance_spec('httx-rds-mssql', 'sqlserver-ex', '15.00.4153.1.v1', 1433, db_name=''),
]

In [None]:
rds.create_rds_instance(**instances[0])   # 'httx-rds-mysql'

In [None]:
# Look up the MySQL instance (first entry of `instances`).
mysql_instance_id = instances[0]['db_instance_identifier']
response = rds_client.describe_db_instances(DBInstanceIdentifier=mysql_instance_id)
db_instances = response['DBInstances']

if not db_instances:
    print("No RDS instance found.")
else:
    instance = db_instances[0]
    status = instance['DBInstanceStatus']

    # The endpoint address is only read once the instance reports 'available'.
    if status != 'available':
        print(f"RDS instance is in {status} state, NO ENDPOINT AVAILABLE YET!!")
    else:
        mysql_endpoint = instance['Endpoint']['Address']
        print(f"RDS Endpoint: {mysql_endpoint}")

-   `Gateway` endpoints serve as a target for a route in your route table for traffic destined for the service.

In [105]:
# VPC endpoint parameters (the target service is S3 in us-east-1)
VPC_ENDPOINT_TAG = 'rds-glue-vpc-endpoint'
VPC_ENDPOINT_SERVICE_NAME = 'com.amazonaws.us-east-1.s3'
SECURITY_GROUP_IDS = [SECURITY_GROUP_ID]  # Defined here but not passed to create_vpc_endpoint below
ROUTE_TABLE_IDS = ['rtb-0ec4311296ec952f8']  # NOTE(review): hard-coded route table; confirm it belongs to VPC_ID

# Create a Gateway endpoint attached to the route table(s) above and keep its ID
VPC_ENDPOINT_ID = ec2_client.create_vpc_endpoint(
    VpcEndpointType='Gateway',
    VpcId=VPC_ID,
    ServiceName=VPC_ENDPOINT_SERVICE_NAME,
    RouteTableIds=ROUTE_TABLE_IDS,
    # Subnet and security-group arguments are intentionally left out:
    # SubnetIds=sg_id,
    # SecurityGroupIds=security_group_ids,
    PrivateDnsEnabled=False  # Private DNS is explicitly disabled for this endpoint
)['VpcEndpoint']['VpcEndpointId']

In [None]:
ec2_client.create_tags(Resources=[VPC_ENDPOINT_ID],Tags=[{'Key': 'Name', 'Value': 'rds_vpc_endpoint'}])

###### Load sql data from Local Machine to RDS Instance

-   Load into MySQL (TESTED):

    -   `$ mysql -h <rds-endpoint> -P <port> -u <username> -p <dbname>` -> Connect via the command line if needed (`-P` is the port, `-u` the user, `-p` prompts for the password)
    -   `$ mysql -h {mysql_endpoint} -P {mysql_port} -u httxadmin -p'{DB_PASSWORD}' interview_questions < /Users/am/mydocs/Software_Development/Web_Development/aws/aws_rds/interview_questions.sql`

In [None]:
# ! mysql -h {mysql_endpoint} -P {instances[0]['port']} -u {DB_USERNAME} -p'{DB_PASSWORD}' {DB_NAME} < ./mysql_employees.sql

#### Create Glue Catalog Database

In [107]:
CATALOG_DB_NAME = 'httx-catalog-db'

In [None]:
## Example usage
DATALAKE_LOCATION_URI = f"s3://{S3_BUCKET_DATALAKE}"

create_database_response = glue_client.create_database(
    CatalogId=ACCOUNT_ID,
    DatabaseInput={
        'Name': CATALOG_DB_NAME,
        'Description': 'A Multi-purpose Database',
        'LocationUri': DATALAKE_LOCATION_URI,
    }
)
print(create_database_response)

- Grant the `CREATE_TABLE` and `DROP` permissions on the catalog database (`CATALOG_DB_NAME`) to the Glue role (`GLUE_ROLE_ARN`).

In [162]:
# The Lake Formation principal is the Glue role (identified by its ARN).
lf_principle = GLUE_ROLE_ARN

# Grant the 'CREATE_TABLE' and 'DROP' permissions on the catalog database,
# without the option to grant them onwards.
response = lakeformation_client.grant_permissions(
    Principal={'DataLakePrincipalIdentifier': lf_principle},
    Resource={'Database': {'Name': CATALOG_DB_NAME}},
    Permissions=['CREATE_TABLE', 'DROP'],
    PermissionsWithGrantOption=[],
)

In [None]:
# The Lake Formation principal is the DataBrew role (identified by its ARN).
lf_principle = DATABREW_ROLE_ARN

# Grant 'ALL' on every table of the catalog database (table wildcard),
# without the option to grant it onwards.
all_tables_in_catalog_db = {
    'Table': {
        'DatabaseName': f"{CATALOG_DB_NAME}",
        'TableWildcard': {},
    }
}
response = lakeformation_client.grant_permissions(
    Principal={'DataLakePrincipalIdentifier': lf_principle},
    Resource=all_tables_in_catalog_db,
    Permissions=['ALL'],
    PermissionsWithGrantOption=[],
)

In [110]:
# lf.grant_table_level_permissions(GLUE_ROLE_ARN, CATALOG_DB_NAME, 'employees', ['DROP'])

In [111]:
# glue_client.update_database(
#     CatalogId=ACCOUNT_ID,
#     Name=CATALOG_DB_NAME,
#     DatabaseInput={
#         'Name': CATALOG_DB_NAME,
#         'UseOnlyIamAccessControl': False
#     }
# )
# lf.register_s3_path_as_data_lake_location(LFDB_LOCATION_URI)

#### Create Glue Resources

**Crawler-0** (Sources): Wait for the RDS instance to reach the AVAILABLE state.

In [None]:
MYSQL_CONNECTION_NAME = 'mysql_connection'
MYSQL_CRAWLER_NAME = "httx-mysqlcrawler"
mysql_endpoint = rds.get_rds_endpoint(instances[0]['db_instance_identifier'])
mysql_connection_url = f"jdbc:mysql://{mysql_endpoint}:{instances[0]['port']}/{instances[0]['db_name']}"
RDS_CRAWLER_TARGET_PATH = f"{instances[0]['db_name']}/%"
SOURCE_TABLE_PREFIX = "src_"

In [125]:
TEM_DIR = f"s3://{S3_BUCKET_GLUE_ASSETS}/temporary/"
SPARK_EVENT_LOG_PATH = f"s3://{S3_BUCKET_GLUE_ASSETS}/sparkHistoryLogs/"

In [None]:
glue.create_glue_connection(
    MYSQL_CONNECTION_NAME, 
    mysql_connection_url, 
    DB_USERNAME, 
    DB_PASSWORD, 
    SECURITY_GROUP_ID, 
    SUBNET_ID, 
    REGION
)

In [None]:
glue.create_glue_jdbc_crawler(
    MYSQL_CRAWLER_NAME, 
    MYSQL_CONNECTION_NAME, 
    GLUE_ROLE_ARN, 
    CATALOG_DB_NAME, 
    RDS_CRAWLER_TARGET_PATH, 
    table_prefix=SOURCE_TABLE_PREFIX
)

In [None]:
glue_client.start_crawler(Name=MYSQL_CRAWLER_NAME)

In [None]:
# lf.grant_table_level_permissions(
#     GLUE_ROLE_ARN, 
#     CATALOG_DB_NAME, 
#     f"{SOURCE_TABLE_PREFIX}{DB_NAME}_employee", 
#     ['SELECT']
# )

In [None]:
# inline_policy_doc = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "s3:GetObject",
#                 "s3:PutObject",
#                 "s3:DeleteObject"
#             ],
#             "Resource": [
#                 f"arn:aws:s3:::{S3_BUCKET_DATALAKE}/*",
#                 f"arn:aws:s3:::{S3_BUCKET_GLUE_ASSETS}/*"
#             ]
#         },
#     ]
# }
# iam.put_inline_role_policy(role_name=GLUE_ROLE_NAME, policy_name='s3_get_put_del', policy_document=inline_policy_doc)


### [Event Based AWS Glue ETL Pipeline](https://www.youtube.com/watch?v=04BbCLDlvII&list=PLO95rE9ahzRsdzmZ_ZT-3uOn1Nh2eEpWB&index=18&t=21s) || `SUCCESS`

<div style="text-align: center"><img src="./pipeline_architecture.png" width="500" height="300"></div>

In [None]:
file_name1 = './scripts/jb1_rds_s3_v2.py'           # The local file you want to upload
object_name1 = f"glues_cripts/jb1_rds_s3_v2.py"     # The name to save the file as in the S3 bucket
s3.upload_file_to_s3(S3_BUCKET_GLUE_ASSETS, file_name1, object_name1)

file_name2 = './scripts/jb2_s3_s3_v2.py'            # The local file you want to upload
object_name2 = f"glues_cripts/jb2_s3_s3_v2.py"      # The name to save the file as in the S3 bucket
s3.upload_file_to_s3(S3_BUCKET_GLUE_ASSETS, file_name2, object_name2)

#### Create Glue Resources

**Job 1**: Transform data from RDS (MySQL) and load it into the `raw/employees` folder.

In [None]:
JOB_NAME1 = 'jb1_rds_s3'
JOB1_SCRIPT_LOCATION = f"s3://{S3_BUCKET_GLUE_ASSETS}/glues_cripts/jb1_rds_s3_v2.py"
TARGET = f"s3://{S3_BUCKET_DATALAKE}/raw/employees"
# create_glue_job(JOB_NAME1, JOB1_SCRIPT_LOCATION, GLUE_ROLE_ARN, TEM_DIR, SPARK_EVENT_LOG_PATH)

In [None]:
# glue.start_glue_job(JOB_NAME1)

##### Parameterization of the Job [`PASSED`]

In [None]:
# Default arguments of job 1: Glue switches first, then the custom
# parameters (catalog database, source table, target location).
DEFAULT_ARGS = {
    '--class': 'GlueApp',
    '--job-language': 'python',
    '--job-bookmark-option': 'job-bookmark-enable',
    '--enable-glue-datacatalog': 'true',
    '--enable-metrics': 'true',
    '--enable-continuous-cloudwatch-log': 'true',
    '--enable-spark-ui': 'true',
    '--spark-event-logs-path': SPARK_EVENT_LOG_PATH,
    '--TempDir': TEM_DIR,
    # Add S3 path containing zip file of external library your job depends on.
    '--extra-py-files': '',
    '--catalog_db_name': CATALOG_DB_NAME,
    '--table_name': f"{SOURCE_TABLE_PREFIX}{DB_NAME}_employee",
    '--target': TARGET,
}

# Full job definition, kept in one place so it can be inspected before creation.
job1_definition = dict(
    Name=JOB_NAME1,
    Role=GLUE_ROLE_ARN,
    ExecutionProperty={'MaxConcurrentRuns': 1},
    Command={
        'Name': 'glueetl',
        'ScriptLocation': JOB1_SCRIPT_LOCATION,
        'PythonVersion': '3',
    },
    DefaultArguments=DEFAULT_ARGS,
    MaxRetries=0,
    Timeout=5,  # in minutes, max is 2,880 min (48 Hours)
    GlueVersion='4.0',
    NumberOfWorkers=2,
    WorkerType='G.1X',  # can be 'Standard', 'G.1X', or 'G.2X'
    # ExecutionClass='STANDARD',  # can be 'STANDARD' or 'FLEX'
    # MaxCapacity=10.0,  # maximum capacity for the Glue job
)

# Create the Glue job
response = glue_client.create_job(**job1_definition)
print(response)

In [None]:
glue_client.start_job_run(
    JobName=JOB_NAME1,
    Arguments={
        '--catalog_db_name': CATALOG_DB_NAME,
        '--table_name': f"{SOURCE_TABLE_PREFIX}{DB_NAME}_employee",
        '--target': TARGET,
    }
)

-   `NOTES`:
    - If the job fails with <b style="color:red">LAUNCH ERROR</b> | `File --spark-event-logs-path does not exist. Please refer logs for details.`
    - THEN: set the `--spark-event-logs-path` job parameter through the AWS Console.

**Crawler 1**: Catalog data from `raw/employees` as a table named `raw_employees`.

In [66]:
def create_glue_s3_crawler(crawler_name, role_arn, db_name, target_path, table_prefix='',
                           description='Crawler for generated Sales schema'):
    """Create a Glue crawler that catalogs a single S3 path.

    Uses the module-level ``glue_client``. Failures are reported on stdout
    instead of being raised, so a notebook run continues past them.

    Args:
        crawler_name: Name of the crawler to create.
        role_arn: IAM role the crawler runs as (ARN or role name).
        db_name: Glue Data Catalog database that receives the crawled tables.
        target_path: S3 path to crawl.
        table_prefix: Prefix added to the names of the tables the crawler creates.
        description: Crawler description. The default keeps the text that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``create_crawler`` response on success, or ``None`` if creation failed.
    """
    try:
        response = glue_client.create_crawler(
            Name=crawler_name,
            Role=role_arn,  # or glue_role_name
            DatabaseName=db_name,
            Description=description,
            Targets={
                'S3Targets': [
                    {
                        'Path': target_path
                    },
                ]
            },
            TablePrefix=table_prefix,
            # Keep the catalog in sync with the data: update changed tables
            # and drop tables whose data has disappeared.
            SchemaChangePolicy={
                'UpdateBehavior': 'UPDATE_IN_DATABASE',
                'DeleteBehavior': 'DELETE_FROM_DATABASE'
            },
            RecrawlPolicy={
                'RecrawlBehavior': 'CRAWL_EVERYTHING'
            },
            #,Configuration='{ "Version": 1.0, "CrawlerOutput": { "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" } } }'
        )
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"Error creating Glue crawler {crawler_name}: {str(e)}")
        return None

    print(f"Successfully created Glue crawler: {crawler_name}")
    return response

In [None]:
S3_RAW_CRAWLER_NAME = "httx-s3_raw_crawler"
S3_CRAWLER_TARGET_PATH = f"s3://{S3_BUCKET_DATALAKE}/{'raw/employees'}"
glue.create_glue_s3_crawler(
    S3_RAW_CRAWLER_NAME, 
    GLUE_ROLE_ARN, 
    CATALOG_DB_NAME, 
    S3_CRAWLER_TARGET_PATH, 
    table_prefix="raw_"
)

In [None]:
glue_client.start_crawler(Name=S3_RAW_CRAWLER_NAME)

**Job 2**: Transform data from `raw/employees` and load it into the `cleansed/employees` folder.

In [69]:
JOB_NAME2 = 'jb2_s3_s3'
JOB2_SCRIPT_LOCATION = f"s3://{S3_BUCKET_GLUE_ASSETS}/glues_cripts/jb2_s3_s3_v2.py"
JOB2_TARGET = f"s3://{S3_BUCKET_DATALAKE}/cleansed/employees"
# glue.create_glue_job(JOB_NAME2, JOB2_SCRIPT_LOCATION, GLUE_ROLE_ARN, TEM_DIR, SPARK_EVENT_LOG_PATH)

In [None]:
# glue.start_glue_job(JOB_NAME2)

##### Parameterization of the Job [`PASSED`]

In [None]:
# Default arguments of job 2: Glue switches first, then the custom
# parameters (catalog database, source table, target location).
DEFAULT_ARGS = {
    '--class': 'GlueApp',
    '--job-language': 'python',
    '--job-bookmark-option': 'job-bookmark-enable',
    '--enable-glue-datacatalog': 'true',
    '--enable-metrics': 'true',
    '--enable-continuous-cloudwatch-log': 'true',
    '--enable-spark-ui': 'true',
    '--spark-event-logs-path': SPARK_EVENT_LOG_PATH,
    '--TempDir': TEM_DIR,
    # Add S3 path containing zip file of external library your job depends on.
    '--extra-py-files': '',
    '--catalog_db_name': CATALOG_DB_NAME,
    '--table_name': "raw_employees",
    '--target': JOB2_TARGET,
}

# Full job definition, kept in one place so it can be inspected before creation.
job2_definition = dict(
    Name=JOB_NAME2,
    Role=GLUE_ROLE_ARN,
    ExecutionProperty={'MaxConcurrentRuns': 1},
    Command={
        'Name': 'glueetl',
        'ScriptLocation': JOB2_SCRIPT_LOCATION,
        'PythonVersion': '3',
    },
    DefaultArguments=DEFAULT_ARGS,
    MaxRetries=0,
    Timeout=5,  # in minutes, max is 2,880 min (48 Hours)
    GlueVersion='4.0',
    NumberOfWorkers=2,
    WorkerType='G.1X',  # can be 'Standard', 'G.1X', or 'G.2X'
    # ExecutionClass='STANDARD',  # can be 'STANDARD' or 'FLEX'
    # MaxCapacity=10.0,  # maximum capacity for the Glue job
)

# Create the Glue job
response = glue_client.create_job(**job2_definition)
print(response)

In [None]:
glue_client.start_job_run(
    JobName=JOB_NAME2,
    Arguments={
        '--catalog_db_name': CATALOG_DB_NAME,
        '--table_name': f"raw_employees",
        '--target': JOB2_TARGET,
    }
)

**Crawler 2**: Catalog data from `cleansed/employees` as a table named `cleansed_employees`.

In [None]:
S3_CLEANSED_CRAWLER_NAME = "httx-s3_clensed_crawler"
S3_CRAWLER_TARGET_PATH = f"s3://{S3_BUCKET_DATALAKE}/{'cleansed/employees'}"
glue.create_glue_s3_crawler(
    S3_CLEANSED_CRAWLER_NAME, 
    GLUE_ROLE_ARN, 
    CATALOG_DB_NAME, 
    S3_CRAWLER_TARGET_PATH, 
    "cleansed_"
)

In [None]:
# glue_client.start_crawler(Name=S3_CLEANSED_CRAWLER_NAME)

#### Create DynamoDB

In [None]:
# Example usage
config_table_name = 'pipelineconfig'

config_key_schema = [
    {
        'AttributeName': 'source',
        'KeyType': 'HASH'  # Partition key
    }
]

config_attribute_definitions = [
    {
        'AttributeName': 'source',
        'AttributeType': 'S'  # String
    }
]

# Create tables
ddb.create_dynamodb_table(config_table_name, config_key_schema, config_attribute_definitions)

# Wait for tables to become active
ddb.wait_for_table_active(config_table_name)


In [None]:
# Put items into tables
pipelineconfig_items = [
    {
        'source': {'S': JOB_NAME1},
        'target': {'S': S3_RAW_CRAWLER_NAME},
        'targettype': {'S': 'crawler'},
    },
    {
        'source': {'S': S3_RAW_CRAWLER_NAME},
        'target': {'S': JOB_NAME2},
        'targettype': {'S': 'job'},
    },
    {
        'source': {'S': JOB_NAME2},
        'target': {'S': S3_CLEANSED_CRAWLER_NAME},
        'targettype': {'S': 'crawler'},
    },
    {
        'source': {'S': S3_CLEANSED_CRAWLER_NAME},
        'target': {'S': 'None'},
        'targettype': {'S': 'None'},
    },
]

for pipelineconfig_item in pipelineconfig_items:
    ddb.put_item_into_table(config_table_name, pipelineconfig_item)

#### Create Lambda Function

In [None]:
# lambdafn.create_lambda_package("./lambdas", "./")

In [None]:
LFN_NAME = "glue_lambda_handler"
zip_file = "./package.zip"  # Change this to the actual zip file path

# Read the deployment package into memory.
zipped_code = Path(zip_file).read_bytes()

# Create the Lambda function and keep its ARN for the EventBridge targets.
lambda_function = lambda_client.create_function(
    FunctionName=LFN_NAME,
    Role=LFN_ROLE_ARN,
    Runtime='python3.9',
    Handler='etl_handler.lambda_handler',
    Timeout=120,
    Code={'ZipFile': zipped_code},
    Environment={'Variables': {'foo': 'BAR'}},
)
LAMBDA_ARN = lambda_function['FunctionArn']

```json
{
    "version": "0",
    "id": "5cb429f8-e404-131a-41b8-3ba1e94e550e",
    "detail-type": "Glue Crawler State Change",
    "source": "aws.glue",
    "account": "381492255899",
    "time": "2024-12-11T22:39:29Z",
    "region": "us-east-1",
    "resources": [],
    "detail": {
        "tablesCreated": "0",
        "warningMessage": "N/A",
        "partitionsUpdated": "0",
        "tablesUpdated": "1",
        "message": "Crawler Succeeded",
        "partitionsDeleted": "0",
        "accountId": "381492255899",
        "runningTime (sec)": "32",
        "tablesDeleted": "0",
        "crawlerName": "httx-s3_clensed_crawler",
        "completionDate": "2024-12-11T22:39:29Z",
        "state": "Succeeded",
        "partitionsCreated": "0",
        "cloudWatchLogLink": "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws-glue/crawlers;stream=httx-s3_clensed_crawler"
    }
}
```

In [None]:
# payload = {
#     "detail-type": "Glue Crawler State Change",
#     "detail": {"crawlerName": "httx-s3_clensed_crawler",}
# }

# response = lambda_client.invoke(
#     FunctionName=LFN_NAME,
#     InvocationType='RequestResponse',  # 'RequestResponse' for synchronous execution
#     Payload=json.dumps(payload)
# )

# # Read the response
# response_payload = json.loads(response['Payload'].read())
# print("Response:")
# print(json.dumps(response_payload, indent=4))

In [None]:
lambdafn.print_latest_lambda_logs(LFN_NAME)

#### Create Event Objects

In [None]:
JOB1_RULE_NAME = 'httx-job1-handler-rule'
JOB2_RULE_NAME = 'httx-job2-handler-rule'
S3_RAW_CRAWLER1_RULE_NAME = 'httx-crawler1-rule'
S3_CLEANSED_CRAWLER2_RULE_NAME = 'httx-crawler2-rule'

In [None]:
# Match only successful runs of job 1.
job1_rule_event_pattern = {
    "source": ["aws.glue"],
    "detail-type": ["Glue Job State Change"],  # Event Type
    "detail": {
        "jobName": [JOB_NAME1],
        "state": ["SUCCEEDED"],  # MUST BE UPPERCASED
    },
}

# Create the EventBridge rule and keep its ARN.
job1_rule = events_client.put_rule(
    Name=JOB1_RULE_NAME,
    Description='Rule to capture AWS Glue job state changes',
    State='ENABLED',
    EventPattern=json.dumps(job1_rule_event_pattern),
)
JOB1_RULE_ARN = job1_rule['RuleArn']

# Attach the Lambda function as a target to the EventBridge Rule
job1_rule_target = {'Id': f"{JOB1_RULE_NAME}_target", 'Arn': LAMBDA_ARN}
events_client.put_targets(Rule=JOB1_RULE_NAME, Targets=[job1_rule_target])

In [None]:
# Match only successful runs of job 2.
job2_rule_event_pattern = {
    "source": ["aws.glue"],
    "detail-type": ["Glue Job State Change"],  # Event Type
    "detail": {
        "jobName": [JOB_NAME2],
        "state": ["SUCCEEDED"],  # MUST BE UPPERCASED
    },
}

# Create the EventBridge rule and keep its ARN.
job2_rule = events_client.put_rule(
    Name=JOB2_RULE_NAME,
    Description='Rule to capture AWS Glue job state changes',
    State='ENABLED',
    EventPattern=json.dumps(job2_rule_event_pattern),
)
JOB2_RULE_ARN = job2_rule['RuleArn']

# Attach the Lambda function as a target to the EventBridge Rule
job2_rule_target = {'Id': f"{JOB2_RULE_NAME}_target", 'Arn': LAMBDA_ARN}
events_client.put_targets(Rule=JOB2_RULE_NAME, Targets=[job2_rule_target])

In [None]:
# Match only successful runs of the raw-data crawler.
crawler1_rule_event_pattern = {
    "source": ["aws.glue"],
    "detail-type": ["Glue Crawler State Change"],
    "detail": {
        "crawlerName": [S3_RAW_CRAWLER_NAME],
        "state": ["Succeeded"],
    },
}

# Create EventBridge Rule to catch Glue Crawler State Change events
crawler1_rule = events_client.put_rule(
    Name=S3_RAW_CRAWLER1_RULE_NAME,
    Description='Rule to capture AWS Glue Crawler state changes',
    State='ENABLED',
    EventPattern=json.dumps(crawler1_rule_event_pattern),
)
S3_RAW_CRAWLER1_RULE_ARN = crawler1_rule['RuleArn']

# Attach the Lambda function as a target to the EventBridge Rule
crawler1_rule_target = {'Id': f"{S3_RAW_CRAWLER1_RULE_NAME}_target", 'Arn': LAMBDA_ARN}
events_client.put_targets(Rule=S3_RAW_CRAWLER1_RULE_NAME, Targets=[crawler1_rule_target])


In [None]:
# Match only successful runs of the cleansed-data crawler.
crawler2_rule_event_pattern = {
    "source": ["aws.glue"],
    "detail-type": ["Glue Crawler State Change"],
    "detail": {
        "crawlerName": [S3_CLEANSED_CRAWLER_NAME],
        "state": ["Succeeded"],
    },
}

# Create EventBridge Rule to catch Glue Crawler State Change events
crawler2_rule = events_client.put_rule(
    Name=S3_CLEANSED_CRAWLER2_RULE_NAME,
    Description='Rule to capture AWS Glue Crawler state changes',
    State='ENABLED',
    EventPattern=json.dumps(crawler2_rule_event_pattern),
)
S3_CLEANSED_CRAWLER2_RULE_ARN = crawler2_rule['RuleArn']

# Attach the Lambda function as a target to the EventBridge Rule
crawler2_rule_target = {'Id': f"{S3_CLEANSED_CRAWLER2_RULE_NAME}_target", 'Arn': LAMBDA_ARN}
events_client.put_targets(Rule=S3_CLEANSED_CRAWLER2_RULE_NAME, Targets=[crawler2_rule_target])


##### Add Lambda Permissions for Event Rules

In [None]:

# LAMBDA_ARN = 'arn:aws:lambda:us-east-1:381492255899:function:glue_lambda_handler'
# The function name is the last ':'-separated field of the function ARN.
lambda_function_name = LAMBDA_ARN.split(":")[-1]

# One (rule name, rule ARN) pair per EventBridge rule that targets the Lambda.
rule_permissions = [
    (JOB1_RULE_NAME, JOB1_RULE_ARN),
    (JOB2_RULE_NAME, JOB2_RULE_ARN),
    (S3_RAW_CRAWLER1_RULE_NAME, S3_RAW_CRAWLER1_RULE_ARN),
    (S3_CLEANSED_CRAWLER2_RULE_NAME, S3_CLEANSED_CRAWLER2_RULE_ARN),
]

# Grant EventBridge permission to invoke the Lambda function, once per rule;
# SourceArn limits each statement to that single rule.
for rule_name, rule_arn in rule_permissions:
    lambda_client.add_permission(
        FunctionName=lambda_function_name,
        StatementId=f"{rule_name}-invoke-permission",
        Action="lambda:InvokeFunction",
        Principal="events.amazonaws.com",
        SourceArn=rule_arn,
    )

In [None]:
# # Optional: Add permissions for EventBridge to invoke the Glue Crawler
# policy = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#             "Effect": "Allow",
#             "Action": "glue:StartCrawler",
#             "Resource": f'arn:aws:glue:{ACCOUNT_ID}:crawler/{crawler_name}'
#         }
#     ]
# }

# iam_client.put_role_policy(
#     RoleName=role_arn.split('/')[-1],  # Extract role name from ARN
#     PolicyName='EventBridgeTriggerGlueCrawlerPolicy',
#     PolicyDocument=json.dumps(policy)
# )

# print("EventBridge rule and targets created successfully.")


### [Step Functions Based AWS Glue ETL Pipeline](https://www.youtube.com/watch?v=xXMyqT2hDWk&list=PLO95rE9ahzRsdzmZ_ZT-3uOn1Nh2eEpWB&index=6&t=23s)

### Delete Resources

In [None]:
glue_client.delete_database(CatalogId=ACCOUNT_ID,Name=CATALOG_DB_NAME)

In [None]:
# Use a dedicated name for the boto3 resource: binding it to `s3` would shadow
# the project-local `s3` helper module imported at the top of the notebook and
# break later calls such as `s3.create_s3_bucket(...)`.
s3_resource = boto3.resource('s3')
bucket1 = s3_resource.Bucket(S3_BUCKET_DATALAKE)
bucket2 = s3_resource.Bucket(S3_BUCKET_GLUE_ASSETS)

# Delete all objects first; the buckets themselves are deleted only afterwards
bucket1.objects.all().delete()
bucket2.objects.all().delete()

# Delete all object versions (if versioning is enabled)
# bucket1.object_versions.all().delete()
# bucket2.object_versions.all().delete()

# Finally, delete the buckets
bucket1.delete()
bucket2.delete()

In [None]:
rds_client.delete_db_subnet_group(DBSubnetGroupName=SUBNET_GROUP_NAME)
ec2_client.delete_vpc_endpoints(VpcEndpointIds=[VPC_ENDPOINT_ID])

In [None]:
rds.delete_rds_instance(instances[0]['db_instance_identifier'])

In [None]:
glue_client.delete_connection(ConnectionName=MYSQL_CONNECTION_NAME)
glue_client.delete_crawler(Name=MYSQL_CRAWLER_NAME)
glue_client.delete_crawler(Name=S3_RAW_CRAWLER_NAME)
glue_client.delete_crawler(Name=S3_CLEANSED_CRAWLER_NAME)

In [None]:
glue_client.delete_job(JobName=JOB_NAME1)
glue_client.delete_job(JobName=JOB_NAME2)

In [None]:
dynamodb_client.delete_table(TableName=config_table_name)

In [None]:
lambda_client.delete_function(FunctionName=LFN_NAME)

In [None]:
# List all rules whose name starts with the given prefix
rules = events_client.list_rules(NamePrefix="httx")['Rules']

for rule in rules:
    rule_name = rule['Name']

    # A rule's targets must be removed before the rule itself can be deleted.
    targets = events_client.list_targets_by_rule(Rule=rule_name)['Targets']
    target_ids = [target['Id'] for target in targets]

    # remove_targets requires at least one Id, so skip rules without targets.
    if target_ids:
        events_client.remove_targets(Rule=rule_name, Ids=target_ids)

    events_client.delete_rule(Name=rule_name)

In [None]:
databrew_client.delete_project(Name=DATABREW_PROJECT_NAME)
databrew_client.delete_dataset(Name=DATASET_NAME)

In [None]:
## DELETE IAM ROLES AT THE END, AFTER DELETING ALL OTHER RESOURCES.
# Every call passes the role *name*, matching how the roles were created
# (the DataBrew call previously passed the role ARN instead).
iam.delete_iam_role(GLUE_ROLE_NAME)
iam.delete_iam_role(LFN_ROLE_NAME)
iam.delete_iam_role(DATABREW_ROLE_NAME)