In [None]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json
from datetime import date

from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds, glue

# boto3.setup_default_session(profile_name="AMominNJ")

In [None]:
# Deployment settings come from the environment; a missing variable raises KeyError here.
ACCOUNT_ID = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION = os.environ['AWS_DEFAULT_REGION']
VPC_ID = os.environ['AWS_DEFAULT_VPC']
SECURITY_GROUP_ID = os.environ['AWS_DEFAULT_SG_ID']

# AWS_DEFAULT_SUBNET_IDS is split on ":" into a list of subnet ids.
SUBNET_IDS = os.environ["AWS_DEFAULT_SUBNET_IDS"].split(":")
SUBNET_ID = SUBNET_IDS[0]  # first entry, used where a single subnet is required
print(SUBNET_IDS)

In [None]:
# boto3 clients/resources shared by the cells below.
# Only the EC2 client/resource pass REGION explicitly; the others use boto3's
# default region resolution.
sts_client           = boto3.client('sts')
rds_client           = boto3.client('rds')
iam_client           = boto3.client('iam')
s3_client            = boto3.client('s3')
glue_client          = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')
ec2_client           = boto3.client('ec2', region_name=REGION)
ec2_resource         = boto3.resource('ec2', region_name=REGION)

In [None]:
# vpc_cidr_block = '172.0.0.0/16'

In [None]:
# Names of the S3, Glue Data Catalog and IAM resources used throughout the notebook.
bucket_name = 'httx-datalake-bkt'
datalake_folder_name = "S3-Datalake"

CATALOG_DB_NAME = 'httx-catalog-db'
catalog_db_name = CATALOG_DB_NAME  # lowercase alias; several cells use this spelling

glue_role_name = "httx-glue-role"
S3_CRAWLER_NAME = "httx-s3crawler"
RDS_MYSQL_CRAWLER_NAME = "mysql-crawler"

### [Introduction to Data Catalog in AWS Lake Formation](https://www.youtube.com/watch?v=7U8G3DxTSaU&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=2)

#### Create S3 Bucket and Folders

In [None]:
# Create the data-lake bucket via the project-local `s3` helper module.
s3.create_s3_bucket(bucket_name)

In [None]:
# Bucket options (both disabled).
# NOTE(review): neither flag is read by any cell visible in this notebook.
enable_versioning = False
enable_encryption = False

# Key prefixes ("folders") used inside the bucket.
customers_folder = "customers"
employees_folder = 'employees'
scripts_folder = 'scripts'
athena_folder = 'athena'
libraries_folder = "software_libraries"

# Folders to create.
# NOTE(review): no visible cell consumes this list; confirm where it is used.
folders = [
    datalake_folder_name,
    customers_folder,
    scripts_folder,
    athena_folder,
    libraries_folder,
    employees_folder,
]


In [None]:
# Upload the local customers CSV into the customers folder of the bucket.
file_name1 = './customers.csv'  # local file to upload (relative to the notebook's working directory)
object_name1 = f"{customers_folder}/customers.csv"  # destination key inside the S3 bucket

# Upload the file using the project-local `s3` helper module
s3.upload_file_to_s3(bucket_name, file_name1, object_name1)

#### Create Glue Catalog Database

In [None]:
## Example usage
catalog_id = ACCOUNT_ID
datalake_location_uri = f"s3://{bucket_name}/{datalake_folder_name}"

create_database_response = glue_client.create_database(
    CatalogId=catalog_id,
    DatabaseInput={
        'Name': CATALOG_DB_NAME,
        'Description': 'This is a Glue Catalog database',
        'LocationUri': datalake_location_uri,
    }
)
print(create_database_response)

In [None]:
# glue_client.update_database(
#     CatalogId=catalog_id,
#     Name=catalog_db_name,
#     DatabaseInput={
#         'Name': catalog_db_name,
#         'UseOnlyIamAccessControl': False
#     }
# )
# lf.register_s3_path_as_data_lake_location(lfdb_location_uri)

### [Lake Formation Data Access Control](https://www.youtube.com/watch?v=PYw4wtyhxbI&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=3)

### [Glue Crawler (RDS)](https://www.youtube.com/watch?v=nVtvzFUUmZ8&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=4) || `SUCCESS`

- [PART 3 - How to pull data from RDS through AWS Glue | RDS with AWS Glue](https://www.youtube.com/watch?v=4m0tFvhnZE8&t=365s)

#### Create a Role for AWS Glue Service

- Create aws glue role by the name of `glue_role_name`.
- Attach an AWS managed policy to the role. `AWSGlueServiceRole` is the intended Glue service policy; the cells below currently attach `PowerUserAccess` instead.

In [None]:
# Trust policy: lets the AWS Glue service assume this role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "glue.amazonaws.com"},
            "Action": "sts:AssumeRole",
        },
    ],
}

# Create the role and keep its ARN for the crawler / job / Lake Formation cells.
create_role_response = iam_client.create_role(
    RoleName=glue_role_name,
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
    Description="Glue Service Role",
)
GLUE_ROLE_ARN = create_role_response['Role']['Arn']

In [None]:
# Candidate AWS managed policies for the Glue role; only the uncommented one is attached below.
# aws_glue_service_policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
# admin_access_policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
# NOTE(review): PowerUserAccess is much broader than AWSGlueServiceRole — confirm this is intended.
power_user_access_policy_arn = "arn:aws:iam::aws:policy/PowerUserAccess"

In [None]:
# Attach AWS managed policy with the role
response = iam_client.attach_role_policy(
    RoleName=glue_role_name,
    PolicyArn=power_user_access_policy_arn
)

- Grant `CREATE_TABLE` permission to `glue_role_name` on data catalog DB.

In [None]:
# Arn for glue_role_name
lf_principle = GLUE_ROLE_ARN

# Grant 'CREATE_TABLE' LF Permission to `glue_role_name` Role
response = lakeformation_client.grant_permissions(
    Principal={
        'DataLakePrincipalIdentifier': lf_principle
    },
    Resource={
        'Database': {
            'Name': CATALOG_DB_NAME
        }
    },
    Permissions=['CREATE_TABLE'],
    PermissionsWithGrantOption=[]
)

#### Start RDS

-   Create and connect to an RDS instance
-   Create table and insert data using SQL script

In [None]:
# rds.get_rds_instance_parameters('mysqlrds1')

In [None]:
# Settings for the RDS instances and the Glue JDBC connections built on them.
db_instance_identifier_mysqlrds1 = 'httx-rds-mysql'
mysql_db_name = 'interview_questions'

# Database credentials are read from the environment (never hard-code them).
DB_PASSWORD = os.environ['PASSWORD']
DB_USERNAME = os.environ['USERNAME']

mysql_endpoint = ''                     # Filled in once the MySQL instance is 'available'
postgresql_endpoint = ''                # Filled in once the PostgreSQL instance is 'available'
mysql_port = '3306'

# Reuse the environment-driven network ids instead of account-specific literals,
# so the VPC endpoint and Glue connection use the same VPC / security group
# as the RDS instances.
vpc_id = VPC_ID
security_group_id = SECURITY_GROUP_ID

SUBNET_GROUP_NAME = 'httx-rds-subnet-group'

In [None]:
# Create the RDS subnet group
response = rds_client.create_db_subnet_group(
    DBSubnetGroupName=SUBNET_GROUP_NAME,
    DBSubnetGroupDescription='Subnet group for RDS instance',
    SubnetIds=SUBNET_IDS
)

In [None]:
# Keyword-argument sets for rds.create_rds_instance(**...):
#   index 0 -> PostgreSQL instance, index 1 -> MySQL instance.
rds_instances = [
    dict(
        db_instance_identifier='httx-rds-postgresql',
        db_name='interview_questions',
        db_username=DB_USERNAME,
        db_password=DB_PASSWORD,
        port=5432,
        engine='postgres',
        engine_version='14.13',
        db_instance_class='db.t3.micro',
        allocated_storage=20,
        availability_zone='us-east-1a',
        tags=[{'Key': 'Project', 'Value': 'glue-rds-Crawler'}],
        security_group_ids=[SECURITY_GROUP_ID],
        db_subnet_group_name=SUBNET_GROUP_NAME,
    ),
    dict(
        db_instance_identifier=db_instance_identifier_mysqlrds1,
        db_name=mysql_db_name,
        db_username=DB_USERNAME,
        db_password=DB_PASSWORD,
        engine='mysql',
        port=3306,
        engine_version='8.0.32',
        db_instance_class='db.t3.micro',
        allocated_storage=20,
        availability_zone='us-east-1a',
        tags=[{'Key': 'Project', 'Value': 'glue-rds-Crawler'}],
        security_group_ids=[SECURITY_GROUP_ID],
        db_subnet_group_name=SUBNET_GROUP_NAME,
    ),
]

In [None]:
# rds.create_rds_instance(**rds_instances[0])   # 'httx-rds-postgresql'
# rds.create_rds_instance(**rds_instances[1])   # 'httx-rds-mysql'

In [None]:
# Describe the RDS instance
response = rds_client.describe_db_instances(
    DBInstanceIdentifier=db_instance_identifier_mysqlrds1
)

# Extract the instance details
db_instances = response['DBInstances']
if db_instances:
    instance = db_instances[0]
    status = instance['DBInstanceStatus']
    
    if status == 'available':
        mysql_endpoint = instance['Endpoint']['Address']
        print(f"RDS Endpoint: {mysql_endpoint}")
    else:
        print(f"RDS instance is in {status} state, NO ENDPOINT AVAILABLE YET!!")
else:
    print("No RDS instance found.")

In [None]:
# Never echo the password: cell outputs are saved inside the .ipynb file.
# Show the username and only whether a password was loaded.
print(DB_USERNAME, "<password set>" if DB_PASSWORD else "<password EMPTY>")

In [None]:
# ! mysql -h {mysql_endpoint} -P {mysql_port} -u {DB_USERNAME} -p'{DB_PASSWORD}' interview_questions < /Users/am/mydocs/Software_Development/Web_Development/aws/aws_rds/mysql_interview_questions.sql
# ! PGPASSWORD={DB_PASSWORD} psql -h {postgresql_endpoint} -p {rds_instances[0]['port']} -U {DB_USERNAME} -d interview_questions < /Users/am/mydocs/Software_Development/Web_Development/aws/aws_rds/psql_interview_questions.sql

-   `Gateway` endpoints serve as a target for a route in your route table for traffic destined for the service.

In [None]:
# VPC Endpoint parameters
vpc_endpoint_tag = 'rds-vpc-endpoint'
service_name = f"com.amazonaws.{REGION}.s3"
SECURITY_GROUP_IDS = [SECURITY_GROUP_ID]  # Security group(s) associated with the endpoint
ROUTE_TABLE_IDS = ['rtb-0ec4311296ec952f8']

# Create an Interface Endpoint
vpc_endpoint_id = ec2_client.create_vpc_endpoint(
    VpcEndpointType='Gateway',
    VpcId=vpc_id,
    ServiceName=service_name,
    RouteTableIds=ROUTE_TABLE_IDS,
    # SubnetIds=sg_id,
    # SecurityGroupIds=SECURITY_GROUP_IDS,
    PrivateDnsEnabled=False  # Enable private DNS to resolve service names within the VPC
)['VpcEndpoint']['VpcEndpointId']

In [None]:
# Give the new VPC endpoint a Name tag.
ec2_client.create_tags(Resources=[vpc_endpoint_id],Tags=[{'Key': 'Name', 'Value': vpc_endpoint_tag}])

#### Create JDBC connection for Glue Crawler.

In [None]:
# glue_client.delete_connection?
# glue_client.get_connection?

- MySQL Connection

In [None]:
glue_mysql_connection_name = "glue-mysql-connection"

# Fail fast: `mysql_endpoint` stays '' until the describe cell has seen the
# instance in the 'available' state; an empty host would produce a JDBC URL
# that can never connect.
if not mysql_endpoint:
    raise ValueError(
        "mysql_endpoint is empty; re-run the describe_db_instances cell once "
        "the RDS instance is 'available'."
    )

# JDBC connection properties (port comes from `mysql_port`, not a second literal)
connection_properties = {
    'JDBC_CONNECTION_URL': f"jdbc:mysql://{mysql_endpoint}:{mysql_port}/{mysql_db_name}",
    'USERNAME': DB_USERNAME,
    'PASSWORD': DB_PASSWORD,
    # 'JDBC_DRIVER_S3_PATH': 's3://httx-data-lake-01/software_libraries/mysql-connector-j-8.0.32.jar',
    # 'JDBC_DRIVER_CLASS_NAME': 'com.mysql.cj.jdbc.Driver'
}

# Network placement Glue uses to reach the database
physical_connection_requirements = {
    'SecurityGroupIdList': [security_group_id],
    'SubnetId': SUBNET_IDS[0]
    # 'AvailabilityZone': 'us-east-1a'
}

response = glue_client.create_connection(
    ConnectionInput={
        "Name": glue_mysql_connection_name,
        "ConnectionType": "JDBC",
        "ConnectionProperties": connection_properties,
        "PhysicalConnectionRequirements": physical_connection_requirements
    },
    # Same project tag as the RDS instances (replaces the {'string': 'string'}
    # placeholder copied from the API documentation).
    Tags={'Project': 'glue-rds-Crawler'}
)

- **Test the Connection**:
    -   <b style="color:red">NOTES</b>: For some unknown reason, a connection created with the SDK (Boto3) does not work until you make an arbitrary edit to the connection from the AWS Console.
    -   <b style="color:red">InvalidInputException</b>: At least one security group must open all ingress ports.To limit traffic, the source security group in your inbound rule can be restricted to the same security group
    -   <b style="color:red">InvalidInputException</b>: VPC S3 endpoint validation failed for SubnetId: subnet-0980ad10eb313405b. VPC: vpc-03617a8a518caa526. Reason: Could not find S3 endpoint or NAT gateway for subnetId: subnet-0980ad10eb313405b in Vpc vpc-03617a8a518caa526

- Postgresql Connection

In [None]:
POSTGRESQL_CONNECTION_NAME = "glue-psql-connection"

# Resolve the endpoint from the instance identifier with a single lookup.
# The original nested the call and fed the returned endpoint back in as an identifier.
postgresql_endpoint = rds.get_rds_endpoint(rds_instances[0]['db_instance_identifier'])

# PostgreSQL needs the `jdbc:postgresql://` scheme; `jdbc:sqlserver://` is for SQL Server.
postgresql_connection_url = f"jdbc:postgresql://{postgresql_endpoint}:{rds_instances[0]['port']}/{rds_instances[0]['db_name']}"

In [None]:
# Create the Glue JDBC connection for PostgreSQL through the project-local `glue` helper,
# passing the connection name, JDBC URL, credentials and network placement positionally.
glue.create_glue_connection(
    POSTGRESQL_CONNECTION_NAME, 
    postgresql_connection_url, 
    DB_USERNAME, 
    DB_PASSWORD, 
    SECURITY_GROUP_ID, 
    SUBNET_ID, 
    REGION
)

In [None]:
# connection_test_response = glue_client.start_connection_test(ConnectionName=POSTGRESQL_CONNECTION_NAME)
# print(connection_test_response.get('Status', 'UNKNOWN'))


In [None]:
# glue_mysql_connection_name = "glue-mysql-connection"
# response = glue_client.get_connection(Name=glue_mysql_connection_name)
# print(response)

#### Create Glue Crawler.

In [None]:
RDS_MYSQL_CRAWLER_NAME = "mysql-crawler"

# Crawler that catalogs every table of the MySQL database (path "<db>/%")
# through the Glue JDBC connection, writing into the catalog database.
create_crawler_response1 = glue_client.create_crawler(
    Name=RDS_MYSQL_CRAWLER_NAME,
    Role=GLUE_ROLE_ARN,
    DatabaseName=catalog_db_name,
    Description='Crawler for generated customer schema',
    Targets={
        'JdbcTargets': [
            {
                'ConnectionName': glue_mysql_connection_name,
                'Path': f"{mysql_db_name}/%",
                'Exclusions': [],  # Optional: specify any patterns to exclude
            }
        ],
    },
    TablePrefix="",
    SchemaChangePolicy={
        'UpdateBehavior': 'UPDATE_IN_DATABASE',
        'DeleteBehavior': 'DELETE_FROM_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'
    }
)
# Print this call's result; the original printed the stale `response` from an earlier cell.
print(create_crawler_response1)

In [None]:
# Start the MySQL crawler created above and show the API response.
run_crawler_response1 = glue_client.start_crawler(Name=RDS_MYSQL_CRAWLER_NAME)
print(run_crawler_response1)

- PostgreSQL Crawler

In [None]:
POSTGRESQL_CRAWLER_NAME = "httx-postgresqlcrawler"
glue.create_glue_jdbc_crawler(
    POSTGRESQL_CRAWLER_NAME, 
    POSTGRESQL_CONNECTION_NAME, 
    GLUE_ROLE_ARN, 
    CATALOG_DB_NAME, 
    f"{rds_instances[0]['db_name']}/%",
    table_prefix=''
)

In [None]:
# Start the PostgreSQL crawler.
# NOTE(review): this reuses `run_crawler_response1`, overwriting the MySQL crawler's response.
run_crawler_response1 = glue_client.start_crawler(Name=POSTGRESQL_CRAWLER_NAME)
print(run_crawler_response1)

#### Delete All Resources

In [None]:
# lakeformation_client.deregister_resource(ResourceArn=f'arn:aws:s3:::{bucket_name}')
# Delete the Glue Data Catalog database created earlier in this notebook.
glue_client.delete_database(CatalogId=catalog_id,Name=catalog_db_name)

In [None]:
# Use a distinct name for the boto3 resource: rebinding `s3` would shadow the
# imported project-local `s3` helper module and break later calls such as
# s3.upload_file_to_s3(...).
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(bucket_name)

# Delete all objects in the bucket
bucket.objects.all().delete()

# Delete all object versions (if versioning is enabled)
# bucket.object_versions.all().delete()

# Finally, delete the (now empty) bucket
bucket.delete()


In [None]:
# Delete the RDS subnet group.
# NOTE(review): this likely fails while a DB instance still uses the group; the instance
# deletion is in the following cell — confirm the intended order.
rds_client.delete_db_subnet_group(DBSubnetGroupName=SUBNET_GROUP_NAME)

In [None]:
# Delete the MySQL RDS instance via the project-local `rds` helper
# (the PostgreSQL deletion is left commented out).
rds.delete_rds_instance(db_instance_identifier_mysqlrds1)
# rds.delete_rds_instance(rds_instances[0]['db_instance_identifier'])

In [None]:
## Delete the VPC Endpoint
response = ec2_client.delete_vpc_endpoints(VpcEndpointIds=[vpc_endpoint_id])

In [None]:
# Delete the MySQL Glue connection (crawler deletion is left commented out).
response = glue_client.delete_connection(ConnectionName=glue_mysql_connection_name)
# response = glue_client.delete_crawler(Name=rds_crawler_name)

In [None]:
## DELETE IAM ROLE AT THE END AFTER DELETING ALL OTHER RESOURCES.
iam.delete_iam_role(glue_role_name)

### [Glue Data Catalog Revisited](https://www.youtube.com/watch?v=WUojHQTyTaY&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=5)

### [Glue Job and Glue Studio](https://www.youtube.com/watch?v=QQb_HOmn3MU&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=6)

- [Create Developer Endpoint](https://aws-dojo.com/ws8/labs/developer-endpoint/)
- [AWS Tutorials - Interactively Develop Glue Job using Jupyter Notebook](https://www.youtube.com/watch?v=n4PVC5O_tJo)

In [None]:
# Key prefixes used by the Glue job (job data, temp files, Spark event logs).
EMPLOYEES_DIR = 'employees'
TEM_DIR = 'temporary'
SPARK_HISTORY_LOGS_DIR = 'sparkHistoryLogs'

# An S3 "folder" is a zero-byte object whose key ends with "/"; without the
# trailing slash these calls would create plain objects named like the folders.
s3_client.put_object(Bucket=bucket_name, Key=f"{EMPLOYEES_DIR}/")
s3_client.put_object(Bucket=bucket_name, Key=f"{TEM_DIR}/")
s3_client.put_object(Bucket=bucket_name, Key=f"{SPARK_HISTORY_LOGS_DIR}/")

In [None]:
# Upload the Glue ETL script so the job can load it from S3.
file_name1 = './glue_etl_employee.py'               # local script to upload
object_name1 = "glueScripts/glue_etl_employee.py"   # destination key in the bucket
s3.upload_file_to_s3(bucket_name, file_name1, object_name1)

In [None]:
# Inline policy giving the Glue role access to the data-lake bucket.
inline_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            # Object-level actions apply to keys inside the bucket ("/*").
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject"
            ],
            "Resource": [
                f"arn:aws:s3:::{bucket_name}/*"
            ]
        },
        {
            # Bucket-level statement: s3:GetObject is an object action and grants
            # nothing on a bucket ARN; listing the bucket requires s3:ListBucket.
            "Effect": "Allow",
            "Action": [
                "s3:ListBucket"
            ],
            "Resource": [
                f"arn:aws:s3:::{bucket_name}"
            ]
        }
    ]
}
iam.put_inline_role_policy(role_name=glue_role_name, policy_name='s3_get_put_del', policy_document=inline_policy_doc)

# Let the Glue role SELECT from the crawled employee table via Lake Formation.
lf.grant_table_level_permissions(GLUE_ROLE_ARN, catalog_db_name, 'interview_questions_employee', ['SELECT'])

In [None]:
# Define the Glue ETL job: script location, temp directory and Spark event-log location in S3.
GLUE_JOB_NAME = 'glue_etl_employee_job'
GLUE_SCRIPT_PATH = f"s3://{bucket_name}/glueScripts/glue_etl_employee.py"
TEM_DIR_PATH = f"s3://{bucket_name}/{TEM_DIR}/"
SPARK_EVENT_LOG_PATH = f"s3://{bucket_name}/{SPARK_HISTORY_LOGS_DIR}/"

# Pass the full S3 temp path; the original passed the bare folder name `TEM_DIR`,
# leaving `TEM_DIR_PATH` unused.
# NOTE(review): assumes the helper's 4th argument is the temp-dir S3 URI — confirm against glue.create_glue_job.
glue.create_glue_job(GLUE_JOB_NAME, GLUE_SCRIPT_PATH, GLUE_ROLE_ARN, TEM_DIR_PATH, SPARK_EVENT_LOG_PATH)

In [None]:
# glue.start_glue_job(GLUE_JOB_NAME)

In [None]:
# glue_client.delete_job(JobName=GLUE_JOB_NAME)

### [Glue Workflow](https://www.youtube.com/watch?v=QX8stvTQ57o&list=PL8RIJKpVAN1euv-WAoggrdI_wX3zeb9uR&index=8)

-   [AWS Tutorials - Using AWS Glue Workflow](https://www.youtube.com/watch?v=Z3jV3o7NyC8&t=753s)
-   [LAB](https://aws-dojo.com/ws29/labs/)

### Glue Advanced Topics

### Delete Resources