-   [Building AWS Glue Job using PySpark](https://www.youtube.com/watch?v=vT9vu3NMsk4&t=1848s)
-   [Building AWS Glue Job using PySpark - Part:1(of 2)](https://aws-dojo.com/workshoplists/workshoplist8/)
    -   [lab](https://aws-dojo.com/ws8/labs/)
-   [Building AWS Glue Job using PySpark - Part:2(of 2)](https://aws-dojo.com/workshoplists/workshoplist9/)
    -   [lab](https://aws-dojo.com/ws9/labs/)

![build_glue_job1](build_glue_job1.png)

In [61]:
import boto3
import botocore
from botocore.exceptions import ClientError
import os, time, json
from datetime import date

from misc import load_from_yaml, save_to_yaml
import s3, iam, lf, glue

In [62]:
# Make the named local AWS profile the default session; the clients created
# below are built from that default session.
boto3.setup_default_session(profile_name="AMominNJ")
# One low-level client per AWS service used in this notebook.
sts_client = boto3.client('sts')
iam_client = boto3.client('iam')
lambda_client = boto3.client('lambda')
glue_client = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')

In [None]:
# Account and region come from environment variables; os.environ[...] raises
# KeyError if either variable is not set.
# NOTE(review): the same two variables are read again into lowercase
# `account_id` / `region` in a later cell — confirm both spellings are needed.
ACCOUNT_ID = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION = os.environ['AWS_DEFAULT_REGION']
# Hard-coded network identifiers (account-specific).
VPC_ID = 'vpc-03617a8a518caa526'
SUBNET_IDS = ['subnet-0980ad10eb313405b', 'subnet-0de97821ddb8236f7', 'subnet-0a160fbe0fcafe373', 'subnet-0ca765b361e4cb186', 'subnet-0a972b05a5b162feb']
# Default subnet: the first entry of SUBNET_IDS.
SUBNET_ID = SUBNET_IDS[0]
SECURITY_GROUP_ID = 'sg-07f4ccd7a5be677ea'

In [63]:
# Load the resource bookkeeping file through the project's misc helper.
# NOTE(review): presumably returns the parsed YAML content — confirm in misc.load_from_yaml.
CONFIG_PATH = 'resources_info.yml'
CONFIG = load_from_yaml(CONFIG_PATH)

# Account id and region are read from the environment (KeyError if unset).
account_id = os.environ['AWS_ACCOUNT_ID_ROOT']
# The catalog id used in Glue calls below is set to the account id.
catalog_id = account_id
region = os.environ['AWS_DEFAULT_REGION']

In [76]:
# Names of the AWS resources created and referenced by the cells below.
bucket_name = 'httx-datalake-bkt'
datalake_folder_name = "S3-Datalake"
catalog_db_name = 'httx-catalog-db'
glue_role_name = "httx-glue-role"
s3_crawler_name = "httx-s3crawler"
rds_crawler_name = "httx-rdscrawler"

#### Create IAM Roles and Policies

-   Create Glue Service role

In [77]:
# Trust (assume-role) policy: allows the Glue service principal to assume the role.
glue_trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}
description = "Allows Glue to call AWS services on your behalf."
try:
    create_glue_role_response = iam_client.create_role(
        RoleName=glue_role_name,
        AssumeRolePolicyDocument=json.dumps(glue_trust_policy),
        Description=description,
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    # The cell was re-run and the role is already there. Fetch it instead so
    # that later cells can still read create_glue_role_response['Role']['Arn']
    # (get_role returns the same {'Role': {...}} structure as create_role).
    create_glue_role_response = iam_client.get_role(RoleName=glue_role_name)

In [None]:
# NOTE(review): PowerUserAccess is a very broad AWS managed policy to attach to
# a service role — confirm this is intentional and not just for the tutorial.
glue_policy_arn = 'arn:aws:iam::aws:policy/PowerUserAccess'
# Attach it to the Glue role through the project's iam helper module.
iam.attach_policy_to_role(glue_role_name, glue_policy_arn)

In [92]:

# ARN of the AWS managed AWSGlueServiceRole policy.
AWSGlueServiceRole_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"

# Attach the AWS managed policy to the Glue role
response = iam_client.attach_role_policy(
    RoleName=glue_role_name,
    PolicyArn=AWSGlueServiceRole_arn
)

In [89]:
# Disabled: optional inline policy giving the Glue role read access to the
# bucket and its customers data. The resource ARNs need to be f-strings so
# that {bucket_name} is interpolated.
# policy_document = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#             "Effect": "Allow",
#             "Action": [
#                 "s3:GetObject",
#                 "s3:ListBucket"
#             ],
#             "Resource": [
#                 f"arn:aws:s3:::{bucket_name}",
#                 f"arn:aws:s3:::{bucket_name}/data/customers/*"
#             ]
#         }
#     ]
# }
# iam.put_inline_role_policy(glue_role_name, 'httx-inline-s3-policy', policy_document=policy_document)

#### S3

-   Create bucket with nested folders
    -   `S3-Datalake/{customers,sales}/` --> Datalake
    -   `script/`                 --> to keep glue job scripts

In [None]:
# Settings for the bucket created through the project's s3 helper module.
region = os.getenv('AWS_DEFAULT_REGION')  # AWS region, e.g. 'us-west-1'
# NOTE(review): `acl` is assigned here but is not passed to create_s3_bucket below.
acl = 'private'                           # ACL, e.g. 'private' or 'public-read'
enable_versioning = False                 # bucket versioning switch
enable_encryption = False                 # server-side encryption switch

# Folder keys to create inside the bucket.
folders = [
    datalake_folder_name,
    'script',
    f'{datalake_folder_name}/customers',
    f'{datalake_folder_name}/sales',
    'processed_data',
]

s3.create_s3_bucket(bucket_name, folders, region, enable_versioning, enable_encryption)


In [None]:
# Upload the local customers CSV into the data lake's customers folder.
file_name1 = './customers.csv'  # The local file you want to upload
folder_name1 = f'{datalake_folder_name}/customers'
object_name1 = f"{folder_name1}/customers.csv"  # The object key to save the file under in the S3 bucket

# Upload the file through the project's s3 helper module
s3.upload_file_to_s3(bucket_name, file_name1, object_name1)

In [None]:
# Upload the local sales CSV into the data lake's sales folder.
file_name2 = './sales.csv'  # The local file you want to upload
folder_name2 = f'{datalake_folder_name}/sales'
object2 = f"{folder_name2}/sales.csv"  # The object key to save the file under in the S3 bucket

# Upload the file through the project's s3 helper module
s3.upload_file_to_s3(bucket_name, file_name2, object2)

#### [Configure Data Lake: Lake Formation](https://aws-dojo.com/ws8/labs/configure-data-lake/)

- Create a Lake Formation Data Catalog database
- Register S3 bucket location with the Catalog Database
    -   A database is used to organize data catalog tables in the data lake.

<div style="text-align:center" ><img src="./datalake.png" width="800" height="400" /></div>

In [83]:
# Create the Glue Data Catalog database and point it at the data lake folder.
data_lake_location_uri = f"s3://{bucket_name}/{datalake_folder_name}"
# Role ARN taken from the role-creation cell's response, so that cell must
# have run in this session.
glue_role_arn = create_glue_role_response['Role']['Arn']
# Alternative that does not depend on that response:
# role_arn=f"arn:aws:iam::{account_id}:role/{glue_role_name}"

create_database_response = glue_client.create_database(
    CatalogId=catalog_id,
    DatabaseInput={
        'Name': catalog_db_name,
        'Description': 'This is a Glue Catalog database',
        'LocationUri': data_lake_location_uri,
    }
)

In [24]:
# lf.register_s3_path_as_data_lake_location(data_lake_location_uri)

#### Configure and Run Crawler

-   Grant database-level permissions to the Glue service role so that the crawler can create catalog tables in the database.
-   Create a crawler
-   Run the crawler

-   All data in the data lake should be catalogued.
-   The creation of catalogs is automated using crawlers in AWS Glue.
-   The crawler uses role-based authorization to create the catalog in the data lake database.
    -   You created the IAM role `httx-glue-role` (`glue_role_name`) in an earlier cell; the crawler will use it to create the data catalog in the database.
-   You need to assign database permissions to this role.
    -   After the permission configuration, you will create and run the crawler to catalog the data.

- Grant `CREATE_TABLE` and `ALTER` permissions (or `ALL`) to `glue_role` on the data catalog database.

In [84]:
# Grant database-level Lake Formation permissions on the catalog database to
# the Glue role. The ARN comes from the role-creation cell's response.
glue_role_arn = create_glue_role_response['Role']['Arn']
# Alternative that does not depend on that response:
# role_arn=f"arn:aws:iam::{account_id}:role/{glue_role_name}"
permissions = ["CREATE_TABLE", "ALTER"] # alternative: ['ALL']
grant_permissions_response = lakeformation_client.grant_permissions(
    Principal={
        'DataLakePrincipalIdentifier': glue_role_arn
    },
    Resource={
        'Database': {
            'Name': catalog_db_name
        }
    },
    Permissions=permissions,
    PermissionsWithGrantOption=permissions  # Also lets the role grant these permissions to others. NOTE(review): passed unconditionally — confirm the grant option is wanted
)

In [None]:
# S3 prefix the crawler scans: the data lake folder of the bucket.
targets = f"s3://{bucket_name}/{datalake_folder_name}/"

# Create the crawler that writes table definitions into the catalog database.
create_crawler_response = glue_client.create_crawler(
    Name=s3_crawler_name,
    Role=glue_role_arn,  # or glue_role_name
    DatabaseName=catalog_db_name,
    Description='Crawler for generated Sales schema',
    Targets={
        'S3Targets': [
            {
                # Reuse `targets` instead of repeating the same f-string.
                'Path': targets
            },
        ]
    },
    SchemaChangePolicy={
        'UpdateBehavior': 'UPDATE_IN_DATABASE',
        'DeleteBehavior': 'DELETE_FROM_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'
    },
    # Optional, currently disabled:
    # Configuration='{ "Version": 1.0, "CrawlerOutput": { "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" } } }'
)
print(create_crawler_response)

In [None]:
# run_crawler_response = glue_client.start_crawler(Name=s3_crawler_name)
# print(run_crawler_response)

#### [Role Permission to the Catalog](https://aws-dojo.com/ws8/labs/role-permission-catalog/)

- Grant `SELECT` permission to `glue_role` on data catalog Tables (`sales` and `customers`).

You need to assign Table level permission to the role.

In [17]:
def grant_permissions(principal, database_name, table_name, permissions, grant_option=False):
    """
    Grant Lake Formation permissions on one catalog table to a principal.

    Best-effort: any exception raised while granting is caught and printed,
    and the function then returns None instead of propagating the error.

    :param principal: ARN of the IAM role or user receiving the permissions.
    :param database_name: Name of the catalog database holding the table.
    :param table_name: Name of the table inside that database.
    :param permissions: List of permissions to grant (e.g., ['SELECT', 'ALTER']).
    :param grant_option: When True, the same permissions are also passed as
        grantable permissions; otherwise an empty list is passed (default False).
    :return: The response of the grant call on success, None on failure.
    """
    grantee = {'DataLakePrincipalIdentifier': principal}
    table_resource = {
        'Table': {
            'DatabaseName': database_name,
            'Name': table_name,
        }
    }

    outcome = None
    try:
        # Grantable permissions mirror `permissions` only when requested.
        if grant_option:
            grantable = permissions
        else:
            grantable = []

        reply = lakeformation_client.grant_permissions(
            Principal=grantee,
            Resource=table_resource,
            Permissions=permissions,
            PermissionsWithGrantOption=grantable,
        )
        print(f"Permissions granted successfully to {principal} for table {table_name} in database {database_name}")
        outcome = reply
    except Exception as err:
        print(f"Error granting permissions: {err}")

    return outcome

In [None]:
# Grant table-level SELECT on the crawled tables to the Glue role.
# The ARN comes from the role-creation cell's response.
glue_role_arn = create_glue_role_response['Role']['Arn']
permissions = ['SELECT']

# Attach "SELECT" (table level) permission on certain tables to Glue Service Role.
# These calls go through the project's `lf` module, not the grant_permissions
# function defined in the notebook cell above.
lf.grant_permissions(glue_role_arn, catalog_db_name, 'sales', permissions)
lf.grant_permissions(glue_role_arn, catalog_db_name, 'customers', permissions)

#### [Create Developer Endpoint](https://aws-dojo.com/ws8/labs/developer-endpoint/)

- [AWS Tutorials - Interactively Develop Glue Job using Jupyter Notebook](https://www.youtube.com/watch?v=n4PVC5O_tJo)

In [30]:
# Customer managed policy allowing iam:GetRole / iam:PassRole, attached to the
# Glue role below.
cmp_name1 = 'assume_role_for_glue_jobs' # Customer managed policy
cmp_doc1 = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "iam:GetRole",
                "iam:PassRole"
            ],
            # Matches roles whose name starts with "<glue_role_name>-".
            # NOTE(review): this pattern does not match glue_role_name itself — confirm intent.
            "Resource": f"arn:aws:iam::{account_id}:role/{glue_role_name}-*"
        }
    ]
}

# Create the policy in the account ...
create_policy_response1 = iam_client.create_policy(
    PolicyName=cmp_name1,
    PolicyDocument=json.dumps(cmp_doc1),
    Description='assume_role_for_glue_jobs'
)
# ... and attach it to the Glue role using the ARN returned by create_policy.
attach_role_policy_response = iam_client.attach_role_policy(
    RoleName=glue_role_name,
    PolicyArn=create_policy_response1["Policy"]["Arn"]
)

#### Delete all Resources

In [None]:
# Disabled teardown steps; uncomment to remove the resources created above.
# NOTE(review): glue_policy_arn is the AWS managed PowerUserAccess ARN, and AWS
# managed policies can only be detached, not deleted. Presumably the customer
# managed policy (create_policy_response1["Policy"]["Arn"]) is the one to
# delete — confirm against iam.delete_iam_policy.
# iam.delete_iam_policy(glue_policy_arn)
# iam.delete_iam_role(glue_role_name)
# s3.delete_s3_bucket(bucket_name)
## lakeformation_client.deregister_resource(ResourceArn=f'arn:aws:s3:::{bucket_name}/{datalake_folder_name}')
# glue_client.delete_database(CatalogId=catalog_id,Name=catalog_db_name)
# response = glue_client.delete_crawler(Name=s3_crawler_name)