### Glue RDS Crawlers

In [None]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json
from datetime import date

from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds

In [None]:
# Account id and region are read from the environment.
# The account id is mandatory (KeyError if unset); the region is optional (None if unset).
ACCOUNT_ID = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION = os.environ.get('AWS_DEFAULT_REGION')
# boto3.setup_default_session(profile_name="AMominNJ")

In [None]:
# Service handles shared by the rest of the notebook.
# These are created without an explicit region argument.
sts_client = boto3.client('sts')
rds_client = boto3.client('rds')
iam_client = boto3.client('iam')
s3_client = boto3.client('s3')
glue_client = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')

# The EC2 client and resource are pinned to us-east-1.
ec2_client = boto3.client('ec2', region_name='us-east-1')
ec2_resource = boto3.resource('ec2', region_name='us-east-1')

In [None]:
# Names of the resources this notebook creates or refers to.
BUCKET_NAME = 'httx-datalake-bkt'
DATALAKE_FOLDER_NAME = "S3-Datalake"
CATALOG_DB_NAME = 'httx-catalog-db'
GLUE_ROLE_NAME = "httx-glue-role"
S3_CRAWLER_NAME = "httx-s3crawler"
RDS_CRAWLER_NAME = "httx-rdscrawler"

#### Create S3 Bucket and Folders

In [None]:
# Settings handed to the project's s3.create_s3_bucket helper.
acl = 'private'              # canned ACL, e.g. 'private' or 'public-read'
enable_versioning = False    # versioning flag passed to the helper
enable_encryption = False    # server-side encryption flag passed to the helper

# Folder names passed to the helper alongside the bucket settings.
customers_folder = "customers"
employees_folder = 'employees'
scripts_folder = 'scripts'
athena_folder = 'athena'
libraries_folder = "software_libraries"

folders = [
    DATALAKE_FOLDER_NAME,
    customers_folder,
    scripts_folder,
    athena_folder,
    libraries_folder,
    employees_folder,
]

s3.create_s3_bucket(BUCKET_NAME, REGION, acl, enable_versioning, enable_encryption, folders)

In [None]:
# Upload the local customers CSV under the customers folder of the bucket.
file_name1 = './customers.csv'                       # local source file
object_name1 = f"{customers_folder}/customers.csv"   # destination key inside the bucket

s3.upload_file_to_s3(BUCKET_NAME, file_name1, object_name1)

#### Create Glue Catalog Database

In [None]:
# Create the Glue Data Catalog database.
# The account id is used as the catalog id, and the database location is the
# data lake folder of the bucket.
CATALOG_ID = ACCOUNT_ID
datalake_location_uri = f"s3://{BUCKET_NAME}/{DATALAKE_FOLDER_NAME}"

catalog_database_input = {
    'Name': CATALOG_DB_NAME,
    'Description': 'This is a Glue Catalog database',
    'LocationUri': datalake_location_uri,
}
create_database_response = glue_client.create_database(
    CatalogId=CATALOG_ID,
    DatabaseInput=catalog_database_input,
)
print(create_database_response)

In [None]:
# Optional (disabled): switch the database away from IAM-only access control
# and register the data lake location with Lake Formation.
# glue_client.update_database(
#     CatalogId=CATALOG_ID,
#     Name=CATALOG_DB_NAME,
#     DatabaseInput={
#         'Name': CATALOG_DB_NAME,
#         'UseOnlyIamAccessControl': False
#     }
# )
# NOTE(review): presumably the helper expects the S3 location URI - confirm against lf module.
# lf.register_s3_path_as_data_lake_location(datalake_location_uri)

#### Create a Role for AWS Glue Service

- Create the AWS Glue service role named by `GLUE_ROLE_NAME`.
- Attach the AWS managed Glue service policy (`AWSGlueServiceRole`) to the role.

In [None]:
# Trust policy that allows the Glue service to assume the role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "glue.amazonaws.com"},
            "Action": "sts:AssumeRole",
        },
    ],
}

# Create the role; the response (including the role ARN) is reused by later cells.
create_role_response = iam_client.create_role(
    RoleName=GLUE_ROLE_NAME,
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
    Description="Glue Service Role",
)

In [None]:
# ARNs of AWS managed policies that can be attached to the Glue role.
AWS_GLUE_SERVICE_POLICY_ARN = (
    "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
)
ADMIN_ACCESS_POLICY_ARN = (
    "arn:aws:iam::aws:policy/AdministratorAccess"
)
POWER_USER_ACCESS_POLICY_ARN = (
    "arn:aws:iam::aws:policy/PowerUserAccess"
)

In [None]:
# Attach the AWS managed Glue service policy (AWSGlueServiceRole) to the role.
# The role is assumed by the Glue service, so it gets the Glue-scoped policy
# rather than the much broader PowerUserAccess.
response = iam_client.attach_role_policy(
    RoleName=GLUE_ROLE_NAME,
    PolicyArn=AWS_GLUE_SERVICE_POLICY_ARN
)


- Grant the `CREATE_TABLE` permission on the Data Catalog database (`CATALOG_DB_NAME`) to the `GLUE_ROLE_NAME` role.

In [None]:
# ARN of the Glue role created earlier; it is the Lake Formation principal.
lf_principle = create_role_response['Role']['Arn']

# Give that principal CREATE_TABLE on the catalog database, with no grant option.
response = lakeformation_client.grant_permissions(
    Principal={'DataLakePrincipalIdentifier': lf_principle},
    Resource={'Database': {'Name': CATALOG_DB_NAME}},
    Permissions=['CREATE_TABLE'],
    PermissionsWithGrantOption=[],
)

#### Start RDS (MySQL)

-   Create and connect to an RDS instance
-   Create table and insert data using SQL script

In [None]:
# rds.get_rds_instance_parameters('mysqlrds1')

In [None]:
# Database credentials are read from the environment (KeyError if either is unset).
# NOTE(review): USERNAME is often preset by the OS/shell to the login name -
# confirm it really holds the database user.
DB_PASSWORD = os.environ["PASSWORD"]
DB_USERNAME = os.environ["USERNAME"]

# Ids of the existing VPC and security group used by later cells.
VPC_ID = "vpc-03617a8a518caa526"
SECURITY_GROUP_ID = "sg-07f4ccd7a5be677ea"
SUBNET_GROUP_NAME = VPC_ID  # the subnet group name is the same string as the VPC id

In [None]:
# Identity of the MySQL RDS instance and the database inside it.
DB_INSTANCE_IDENTIFIER_MYSQLRDS1 = "mysqlrds1"
MYSQL_DB_NAME = "interview_questions"
MYSQL_PORT = "3306"

# Starts empty; the describe_db_instances cell fills it in once the instance is available.
mysql_db_endpoint = ""

In [None]:
# Disabled: create the RDS instance through the project's rds helper.
# rds.create_rds_instance(DB_INSTANCE_IDENTIFIER_MYSQLRDS1, MYSQL_DB_NAME, SECURITY_GROUP_ID, SUBNET_GROUP_NAME, DB_USERNAME, DB_PASSWORD)

In [None]:
# Look up the RDS instance and record its endpoint address once it is available.
#
# When a specific DBInstanceIdentifier is requested and does not exist, the
# call raises a ClientError with code 'DBInstanceNotFound' instead of
# returning an empty list, so that case is translated into an empty result.
# Any other error is re-raised unchanged.
try:
    response = rds_client.describe_db_instances(
        DBInstanceIdentifier=DB_INSTANCE_IDENTIFIER_MYSQLRDS1
    )
except ClientError as err:
    if err.response['Error']['Code'] != 'DBInstanceNotFound':
        raise
    db_instances = []
else:
    db_instances = response['DBInstances']

if db_instances:
    instance = db_instances[0]
    status = instance['DBInstanceStatus']

    if status == 'available':
        # The endpoint is only read once the instance reports 'available'.
        mysql_db_endpoint = instance['Endpoint']['Address']
        print(f"RDS Endpoint is Available Now: \n\t{mysql_db_endpoint}")
    else:
        print(f"RDS instance is in {status} state, NO ENDPOINT AVAILABLE!!!!")
        print("Please wait until the Database is available.")
else:
    print("No RDS instance found.")

In [None]:
# Disabled: load the SQL script into the database with the mysql CLI.
# NOTE(review): the password is passed on the command line and the script path is machine-specific.
# ! mysql -h {mysql_db_endpoint} -P {MYSQL_PORT} -u httxadmin -p'{DB_PASSWORD}' interview_questions < /Users/am/mydocs/Software_Development/Web_Development/aws/rds/interview_questions.sql

-   `Gateway` endpoints serve as a target for a route in your route table for traffic destined for the service.

In [None]:
# VPC Endpoint parameters
VPC_ENDPOINT_TAG = 'rs-glue-vpc-endpoint'
VPC_ENDPOINT_SERVICE_NAME = 'com.amazonaws.us-east-1.s3'
SUBNET_IDS = ['subnet-0980ad10eb313405b', 'subnet-0de97821ddb8236f7', 'subnet-0a160fbe0fcafe373', 'subnet-0ca765b361e4cb186', 'subnet-0a972b05a5b162feb']  # Subnet ids; not passed to the Gateway endpoint call below
SECURITY_GROUP_IDS = [SECURITY_GROUP_ID]  # Security group ids; not passed to the Gateway endpoint call below
ROUTE_TABLE_IDS = ['rtb-0ec4311296ec952f8']

# Create a Gateway endpoint for S3 in the VPC, attached to the route table(s)
# above, and keep only the new endpoint's id.
vpc_endpoint_id = ec2_client.create_vpc_endpoint(
    VpcEndpointType='Gateway',
    VpcId=VPC_ID,
    ServiceName=VPC_ENDPOINT_SERVICE_NAME,
    RouteTableIds=ROUTE_TABLE_IDS,
    # SubnetIds=SUBNET_IDS,
    # SecurityGroupIds=SECURITY_GROUP_IDS,
    PrivateDnsEnabled=False  # Private DNS is left disabled for this endpoint
)['VpcEndpoint']['VpcEndpointId']

In [None]:
# Give the VPC endpoint a Name tag. The resource list must hold the endpoint
# id stored in the variable, not the literal text 'vpc_endpoint_id'.
ec2_client.create_tags(
    Resources=[vpc_endpoint_id],
    Tags=[{'Key': 'Name', 'Value': VPC_ENDPOINT_TAG}],
)

#### Create JDBC connection for Glue Crawler.

In [None]:
# glue_client.delete_connection?
# glue_client.get_connection?

In [None]:
GLUE_MYSQL_CONNECTION_NAME = "glue-mysql-connection"

# The JDBC URL has to point at the instance's real endpoint address; the
# instance identifier followed by ".rds.amazonaws.com" is not that address.
# mysql_db_endpoint stays empty until the instance is reported as available.
if not mysql_db_endpoint:
    raise RuntimeError(
        "mysql_db_endpoint is empty: wait until the RDS instance is available "
        "and re-run the describe_db_instances cell before creating the connection."
    )

# Construct the connection properties
connection_properties = {
    'JDBC_CONNECTION_URL': f'jdbc:mysql://{mysql_db_endpoint}:{MYSQL_PORT}/{MYSQL_DB_NAME}',
    'USERNAME': DB_USERNAME,
    'PASSWORD': DB_PASSWORD,
    # 'JDBC_DRIVER_S3_PATH': 's3://httx-data-lake-01/software_libraries/mysql-connector-j-8.0.32.jar',
    # 'JDBC_DRIVER_CLASS_NAME': 'com.mysql.cj.jdbc.Driver'
}

# Construct the physical connection requirements.
# SubnetId takes a subnet id (not a VPC id); the first configured subnet is used.
physical_connection_requirements = {
    'SecurityGroupIdList': [SECURITY_GROUP_ID],
    'SubnetId': SUBNET_IDS[0],
}

response = glue_client.create_connection(
    ConnectionInput={
        "Name": GLUE_MYSQL_CONNECTION_NAME,
        "ConnectionType": "JDBC",
        "ConnectionProperties": connection_properties,
        "PhysicalConnectionRequirements": physical_connection_requirements,
    },
)

- **Test the Connection**:
    -   <b style="color:red">InvalidInputException</b>: At least one security group must open all ingress ports.To limit traffic, the source security group in your inbound rule can be restricted to the same security group
    -   <b style="color:red">InvalidInputException</b>: VPC S3 endpoint validation failed for SubnetId: subnet-0980ad10eb313405b. VPC: vpc-03617a8a518caa526. Reason: Could not find S3 endpoint or NAT gateway for subnetId: subnet-0980ad10eb313405b in Vpc vpc-03617a8a518caa526

In [None]:
# glue_mysql_connection_name = "glue-mysql-connection"
# response = glue_client.get_connection(Name=glue_mysql_connection_name)
# print(response)

#### Create Glue Crawler.

In [None]:
# Create the JDBC crawler that reads every table of the MySQL database through
# the Glue connection and writes the discovered tables into the catalog database.
role_arn = create_role_response['Role']['Arn']
create_crawler_response1 = glue_client.create_crawler(
    Name=RDS_CRAWLER_NAME,
    Role=role_arn,
    DatabaseName=CATALOG_DB_NAME,
    Description='Crawler for generated customer schema',
    Targets={
        'JdbcTargets': [
            {
                'ConnectionName': GLUE_MYSQL_CONNECTION_NAME,
                'Path': f"{MYSQL_DB_NAME}/%",  # '%' matches every table in the database
                'Exclusions': [],  # Optional: specify any patterns to exclude
            }
        ],
    },
    SchemaChangePolicy={
        'UpdateBehavior': 'UPDATE_IN_DATABASE',
        'DeleteBehavior': 'DELETE_FROM_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'
    }
)
# Print this call's own response rather than a response left over from an earlier cell.
print(create_crawler_response1)

In [None]:
# Disabled: start the RDS crawler.
# run_crawler_response1 = glue_client.start_crawler(Name=RDS_CRAWLER_NAME)
# print(run_crawler_response1)

<b style="color:red">InvalidInputException</b>: An error occurred (InvalidInputException) when calling the CreateCrawler operation: Please provide both the driver s3 path and driver classname.


#### Delete All Resources

In [None]:
# Disabled: empty and delete the S3 bucket.
# The resource is named s3_resource so it does not shadow the imported s3 module.
# s3_resource = boto3.resource('s3')
# bucket = s3_resource.Bucket(BUCKET_NAME)

# # Delete all objects in the bucket
# bucket.objects.all().delete()

# # Delete all object versions (if versioning is enabled)
# bucket.object_versions.all().delete()

# # Finally, delete the bucket
# bucket.delete()


In [None]:
# Disabled: delete the Glue role through the project's iam helper.
# iam.delete_iam_role(GLUE_ROLE_NAME)

In [None]:
# Disabled: delete the RDS instance through the project's rds helper.
# rds.delete_rds_instance(DB_INSTANCE_IDENTIFIER_MYSQLRDS1)

In [None]:
# # Delete the VPC Endpoint
# response = ec2_client.delete_vpc_endpoints(
#     VpcEndpointIds=[vpc_endpoint_id]
# )

In [None]:
# Disabled: delete the Glue connection and the RDS crawler.
# response = glue_client.delete_connection(ConnectionName=GLUE_MYSQL_CONNECTION_NAME)
# response = glue_client.delete_crawler(Name=RDS_CRAWLER_NAME)

In [None]:
# Disabled: deregister the bucket from Lake Formation and delete the catalog database.
# lakeformation_client.deregister_resource(ResourceArn=f'arn:aws:s3:::{BUCKET_NAME}')
# glue_client.delete_database(CatalogId=CATALOG_ID,Name=CATALOG_DB_NAME)