### [AWS Data Engineering](https://www.youtube.com/playlist?list=PLs7fR_1uLDaarvArh3bk_Q45MV5hKvaMG)

In [None]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json
from datetime import date

from misc import load_from_yaml, save_to_yaml
import iam, s3, lf
# Secret is taken from the environment instead of being written into the
# notebook; a missing PASSWORD variable raises KeyError right here.
PASSWORD = os.environ['PASSWORD']


In [None]:
# AWS account ID, read from the AWS_ACCOUNT_ID_ROOT environment variable
# (KeyError if unset). It is interpolated into the IAM / CloudWatch Logs ARNs
# built further down and passed as the Glue catalog ID.
account_id = os.environ['AWS_ACCOUNT_ID_ROOT']
# boto3.setup_default_session(profile_name="AMominNJ")

In [None]:
# One low-level boto3 client per AWS service used in this notebook, all built
# from whichever default session is active when this cell runs.
(sts_client,
 iam_client,
 s3_client,
 glue_client,
 lakeformation_client) = map(
    boto3.client, ('sts', 'iam', 's3', 'glue', 'lakeformation')
)

In [None]:
# Names of the AWS resources this notebook creates or refers to.
bucket_name = 'httx-datalake-01'          # S3 bucket holding the data lake
datalake_folder_name = "S3-Datalake"      # key prefix ("folder") inside the bucket
catalog_db_name = 'httx-catalog-db'       # Glue / Lake Formation catalog database
glue_role_name = "httx-crawler-role"      # IAM role handed to the crawler
s3_crawler_name = "httx-s3crawler"        # Glue crawler name

#### [AWS Lake Formation initial setup](https://www.youtube.com/watch?v=pOm4UUwLRmQ&list=PLs7fR_1uLDaarvArh3bk_Q45MV5hKvaMG&index=1&t=3s)

- Groups:
  - Admins
    - `AMominNJ` User
  - Developers
    - `dev1` User

- Set `AMominNJ` user as LF Administrator
- Grant the LF Administrator, `AMominNJ`, Database Creator permissions (`Catalog Permission`, `Grantable Permission`)
- Unselect `Use only IAM access control for new databases` from LF Settings
- Unselect `Use only IAM access control for new tables in new databases` from LF Settings

In [None]:
# IAM group for developers, and the user that will be placed in it.
dev_group, dev_user_1 = 'developers', 'dev1'

In [None]:
# Create the developers group and echo back the name IAM reports for it.
response = iam_client.create_group(GroupName=dev_group)
created_group_name = response['Group']['GroupName']

print(f"Group created: {created_group_name}")

In [None]:
# AWS-managed policy ARN; the call below attaches it to the `developers` group.
AWSGlueConsoleFullAccess_arn = "arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess"
iam_client.attach_group_policy(PolicyArn=AWSGlueConsoleFullAccess_arn, GroupName=dev_group)

In [None]:
# iam.create_iam_user('dev1', 'password')
# iam.delete_iam_user('dev1')

In [None]:
# Look up the developer user and show its name and ARN on separate lines.
# The user is not created in this notebook (the helper call is commented out),
# so it is expected to exist already.
response = iam_client.get_user(UserName=dev_user_1)
user_info = response['User']
print(user_info['UserName'], user_info['Arn'], sep='\n')

In [None]:
# Put the developer user into the developers group.
response = iam_client.add_user_to_group(UserName=dev_user_1, GroupName=dev_group)

#### [AWS Lake Formation database creation and access](https://www.youtube.com/watch?v=Xtc75FeS21A&list=PLs7fR_1uLDaarvArh3bk_Q45MV5hKvaMG&index=2)

- Create Lake Formation Data Catalog Database, `catalog_db_name` as `AMominNJ` admin user
- Grant LF permissions to `dev_user_1` user on Catalog DB as `AMominNJ` admin user

In [None]:
# Create the data-lake bucket through the project-local `s3` helper module.
# NOTE(review): the helper is not visible here — presumably it wraps the S3
# CreateBucket call; confirm how it behaves when the bucket already exists.
s3.create_s3_bucket(bucket_name)

In [None]:

# Location recorded for the catalog database: the data-lake folder in the bucket.
# (Bucket-root alternative kept for reference:)
# lfdb_location_uri = f"s3://{bucket_name}"
lfdb_location_uri = f"s3://{bucket_name}/{datalake_folder_name}"

create_db_options = dict(
    catalog_id=account_id,
    location_uri=lfdb_location_uri,
    description="Data landing zone for sources",
)
lf.create_glue_database(catalog_db_name, **create_db_options)
# lf.register_s3_path_as_data_lake_location(lfdb_location_uri)

- Grant LF permissions to `dev_user_1` user on Catalog DB.

In [None]:
lf_principle_arn = f"arn:aws:iam::{account_id}:user/{dev_user_1}"
data_lake_location_arn = f"arn:aws:s3:::{bucket_name}/{datalake_folder_name}/"
dev_principal = {'DataLakePrincipalIdentifier': lf_principle_arn}  # User: dev1

# A Lake Formation grant targets ONE resource per GrantPermissions call, and
# each resource type has its own permission vocabulary: ALTER / DROP /
# CREATE_TABLE are database permissions, while a data location only accepts
# DATA_LOCATION_ACCESS. The grants are therefore issued as two separate calls.

# 1) Database-level permissions for `dev1` on the catalog database.
db_grant_response = lakeformation_client.grant_permissions(
    Principal=dev_principal,
    Resource={
        'Database': {
            'Name': catalog_db_name
        }
    },
    Permissions=['ALTER', 'DROP', 'CREATE_TABLE'],
    PermissionsWithGrantOption=[]
)

# 2) Data-location access for `dev1` on the data-lake folder.
# NOTE(review): Lake Formation only accepts this grant for a location that has
# been registered with it; the registration helper call is commented out in the
# database-creation cell — confirm the location is registered before running.
response = lakeformation_client.grant_permissions(
    Principal=dev_principal,
    Resource={
        'DataLocation': {
            'ResourceArn': data_lake_location_arn
        }
    },
    Permissions=['DATA_LOCATION_ACCESS'],
    PermissionsWithGrantOption=[]
)

#### [AWS Lake Formation access control model](https://www.youtube.com/watch?v=LQvARekJ1fM&list=PLs7fR_1uLDaarvArh3bk_Q45MV5hKvaMG&index=3)

- Provide `dev_group` group full access to S3 bucket (`bucket_name`)

In [None]:
# Name and human-readable description of the customer-managed S3 policy.
policy_name, description = 's3_full_access', f"Full access to the {bucket_name}"

In [None]:
# IAM policy document for the customer-managed S3 policy.
# It has two Allow statements: the first enumerates bucket- and object-level
# S3 actions and is scoped to `bucket_name` and its objects; the second lists
# account-level S3 actions and applies to all resources ("*").
cmp_doc1 = {
    "Version": "2012-10-17",
    "Statement": [
        # Statement 1: actions limited to this bucket and the objects in it.
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "s3:PauseReplication",
                "s3:PutAnalyticsConfiguration",
                "s3:GetObjectVersionTagging",
                "s3:CreateBucket",
                "s3:ReplicateObject",
                "s3:GetObjectAcl",
                "s3:GetBucketObjectLockConfiguration",
                "s3:DeleteBucketWebsite",
                "s3:GetIntelligentTieringConfiguration",
                "s3:PutLifecycleConfiguration",
                "s3:GetObjectVersionAcl",
                "s3:PutObjectTagging",
                "s3:DeleteObject",
                "s3:DeleteObjectTagging",
                "s3:GetBucketPolicyStatus",
                "s3:GetObjectRetention",
                "s3:GetBucketWebsite",
                "s3:PutReplicationConfiguration",
                "s3:GetObjectAttributes",
                "s3:DeleteObjectVersionTagging",
                "s3:PutObjectLegalHold",
                "s3:InitiateReplication",
                "s3:GetObjectLegalHold",
                "s3:GetBucketNotification",
                "s3:PutBucketCORS",
                "s3:GetReplicationConfiguration",
                "s3:ListMultipartUploadParts",
                "s3:PutObject",
                "s3:GetObject",
                "s3:PutBucketNotification",
                "s3:PutBucketLogging",
                "s3:GetAnalyticsConfiguration",
                "s3:PutBucketObjectLockConfiguration",
                "s3:GetObjectVersionForReplication",
                "s3:GetLifecycleConfiguration",
                "s3:GetInventoryConfiguration",
                "s3:GetBucketTagging",
                "s3:PutAccelerateConfiguration",
                "s3:DeleteObjectVersion",
                "s3:GetBucketLogging",
                "s3:ListBucketVersions",
                "s3:ReplicateTags",
                "s3:RestoreObject",
                "s3:ListBucket",
                "s3:GetAccelerateConfiguration",
                "s3:GetObjectVersionAttributes",
                "s3:GetBucketPolicy",
                "s3:PutEncryptionConfiguration",
                "s3:GetEncryptionConfiguration",
                "s3:GetObjectVersionTorrent",
                "s3:AbortMultipartUpload",
                "s3:PutBucketTagging",
                "s3:GetBucketRequestPayment",
                "s3:GetObjectTagging",
                "s3:GetMetricsConfiguration",
                "s3:GetBucketOwnershipControls",
                "s3:DeleteBucket",
                "s3:PutBucketVersioning",
                "s3:GetBucketPublicAccessBlock",
                "s3:ListBucketMultipartUploads",
                "s3:PutIntelligentTieringConfiguration",
                "s3:PutMetricsConfiguration",
                "s3:PutBucketOwnershipControls",
                "s3:PutObjectVersionTagging",
                "s3:GetBucketVersioning",
                "s3:GetBucketAcl",
                "s3:PutInventoryConfiguration",
                "s3:GetObjectTorrent",
                "s3:PutBucketWebsite",
                "s3:PutBucketRequestPayment",
                "s3:PutObjectRetention",
                "s3:GetBucketCORS",
                "s3:GetBucketLocation",
                "s3:ReplicateDelete",
                "s3:GetObjectVersion"
            ],
            # Both ARNs are listed: the "/*" form covers the objects, the bare
            # form covers the bucket itself.
            "Resource": [
                f"arn:aws:s3:::{bucket_name}/*",
                f"arn:aws:s3:::{bucket_name}"
            ]
        },
        # Statement 2: account-level actions, allowed on every resource.
        {
            "Sid": "VisualEditor1",
            "Effect": "Allow",
            "Action": [
                "s3:ListAccessPointsForObjectLambda",
                "s3:GetAccessPoint",
                "s3:ListAccessPoints",
                "s3:CreateStorageLensGroup",
                "s3:ListJobs",
                "s3:PutStorageLensConfiguration",
                "s3:ListMultiRegionAccessPoints",
                "s3:ListStorageLensGroups",
                "s3:ListStorageLensConfigurations",
                "s3:GetAccountPublicAccessBlock",
                "s3:ListAllMyBuckets",
                "s3:ListAccessGrantsInstances",
                "s3:CreateJob"
            ],
            "Resource": "*"
        }
    ]
}

# Register `cmp_doc1` as a customer-managed policy, then attach it to the
# developers group. `create_policy_response1` is kept because later cells read
# the policy ARN from it.
create_policy_response1 = iam_client.create_policy(
    Description=description,
    PolicyDocument=json.dumps(cmp_doc1),
    PolicyName=policy_name,
)
s3_policy_arn = create_policy_response1["Policy"]["Arn"]
attach_group_policy_response = iam_client.attach_group_policy(
    PolicyArn=s3_policy_arn,
    GroupName=dev_group,
)
# print(attach_group_policy_response)

#### [AWS Glue Permissions for different type of users](https://www.youtube.com/watch?v=D0d8XLaoi5c&list=PLs7fR_1uLDaarvArh3bk_Q45MV5hKvaMG&index=4)

In [None]:
filepath1 = "./customers.csv"
key1 = f"{datalake_folder_name}/customers.csv"

# S3 has no real folders: a "folder" placeholder is a zero-byte object whose key
# ends with "/". Without the trailing slash this call would create a stray
# empty object named exactly like the folder, sitting under the crawled prefix.
s3_client.put_object(Bucket=bucket_name, Key=f"{datalake_folder_name}/")

# Upload the sample dataset into the data-lake folder.
s3_client.upload_file(filepath1, bucket_name, key1)

- Create `glue_role_name` role and attach `AWSGlueServiceRole` policy to the Role.

In [None]:
# Trust policy: the Glue service is the only principal allowed to assume the role.
glue_trust_statement = {
    "Effect": "Allow",
    "Principal": {"Service": "glue.amazonaws.com"},
    "Action": "sts:AssumeRole",
}
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [glue_trust_statement],
}

# Create the role; `create_role_response` is read by later cells for the role ARN.
create_role_response = iam_client.create_role(
    Description="Glue Service Role Plus Customer Managed Policy",
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
    RoleName=glue_role_name,
)

# Attach the AWS-managed Glue service policy to the new role.
AWSGlueServiceRole_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
response = iam_client.attach_role_policy(PolicyArn=AWSGlueServiceRole_arn, RoleName=glue_role_name)



- Provide `glue_role_name` role full access to S3 bucket (`bucket_name`)

In [None]:

# Attach the customer-managed S3 policy (created earlier in this notebook) to
# the Glue role as well.
s3_policy_arn_for_role = create_policy_response1["Policy"]["Arn"]
response = iam_client.attach_role_policy(PolicyArn=s3_policy_arn_for_role, RoleName=glue_role_name)

-   Create a customer managed policy that allows `iam:GetRole` and `iam:PassRole`, and attach it to the group (`dev_group`)

In [None]:
cmp_name2 = 'assume_role_for_glue_jobs' # Customer managed policy

# Lets members of the developers group look up the Glue role and pass it to
# Glue when they create a crawler. The resource must match the role this
# notebook actually creates (`glue_role_name`); the previous hard-coded pattern
# `httx-glue-dev-landing-*` did not match it, so PassRole was never allowed.
cmp_doc2 = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "iam:GetRole",
                "iam:PassRole"
            ],
            "Resource": f"arn:aws:iam::{account_id}:role/{glue_role_name}"
        }
    ]
}

# Create the policy and attach it to the developers group.
create_policy_response2 = iam_client.create_policy(
    PolicyName=cmp_name2,
    PolicyDocument=json.dumps(cmp_doc2),
    Description='assume_role_for_glue_jobs'
)
attach_group_policy_response = iam_client.attach_group_policy(
    GroupName=dev_group,
    PolicyArn=create_policy_response2["Policy"]["Arn"]
)

- Create Glue Crawler as developer user (`dev_user_1`)

In [None]:
# Act as the developer user. Clients keep the credentials of the session they
# were created from, so changing the default session does NOT switch the
# already-built `glue_client` to `dev1`. Build a dedicated session from the
# `dev1` profile and take a Glue client from it instead; this also leaves the
# default (admin) session untouched for the rest of the notebook.
dev_session = boto3.Session(profile_name=dev_user_1)
dev_glue_client = dev_session.client('glue')

glue_role_arn = create_role_response['Role']['Arn']
create_crawler_response1 = dev_glue_client.create_crawler(
    Name=s3_crawler_name,
    Role=glue_role_arn,
    DatabaseName=catalog_db_name,
    Description='Crawler for generated customer schema',
    Targets={
        'S3Targets': [
            {
                'Path': f"s3://{bucket_name}/{datalake_folder_name}",
                'Exclusions': []
            },
        ]
    },
    SchemaChangePolicy={
        'UpdateBehavior': 'UPDATE_IN_DATABASE',
        'DeleteBehavior': 'DELETE_FROM_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'
    },
    #,Configuration='{ "Version": 1.0, "CrawlerOutput": { "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" } } }'
)
# Show the result of THIS call (previously a stale `response` from an earlier
# cell was printed).
print(create_crawler_response1)

-   Create a customer managed policy (`cloudwatchlog_glue`) and attach it to the group (`dev_group`)

In [None]:
cmp_name3 = 'cloudwatchlog_glue' # Customer managed policy

# Read access to Glue's CloudWatch log groups for the developers group.
# Glue log group names start with a slash (e.g. `/aws-glue/crawlers`), so the
# ARN pattern in the first statement must contain `log-group:/aws-glue/*`.
# Without the leading slash the pattern matches nothing and
# logs:DescribeLogStreams is denied.
# NOTE: the region is hard-coded to us-east-1.
cmp_doc3 = {
    "Version": "2012-10-17",
    "Statement": [
        # Statement 1: per-log-group read actions, limited to Glue log groups.
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "logs:ListTagsLogGroup",
                "logs:GetDataProtectionPolicy",
                "logs:ListAnomalies",
                "logs:GetDelivery",
                "logs:GetLogRecord",
                "logs:ListLogAnomalyDetectors",
                "logs:DescribeLogStreams",
                "logs:DescribeSubscriptionFilters",
                "logs:StartQuery",
                "logs:Unmask",
                "logs:GetDeliveryDestinationPolicy",
                "logs:DescribeMetricFilters",
                "logs:GetDeliveryDestination",
                "logs:GetLogAnomalyDetector",
                "logs:ListTagsForResource",
                "logs:GetDeliverySource",
                "logs:GetQueryResults",
                "logs:StartLiveTail",
                "logs:GetLogEvents",
                "logs:FilterLogEvents",
                "logs:GetLogGroupFields"
            ],
            "Resource": f"arn:aws:logs:us-east-1:{account_id}:log-group:/aws-glue/*"
        },
        # Statement 2: account-wide describe/list actions, allowed on "*".
        {
            "Sid": "VisualEditor1",
            "Effect": "Allow",
            "Action": [
                "logs:DescribeQueries",
                "logs:DescribeLogGroups",
                "logs:DescribeAccountPolicies",
                "logs:DescribeDeliverySources",
                "logs:StopQuery",
                "logs:TestMetricFilter",
                "logs:GetLogDelivery",
                "logs:ListLogDeliveries",
                "logs:DescribeDeliveryDestinations",
                "logs:DescribeExportTasks",
                "logs:StopLiveTail",
                "logs:DescribeDeliveries",
                "logs:DescribeQueryDefinitions",
                "logs:DescribeResourcePolicies",
                "logs:DescribeDestinations"
            ],
            "Resource": "*"
        }
    ]
}

# Create the policy and attach it to the developers group.
create_policy_response3 = iam_client.create_policy(
    PolicyName=cmp_name3,
    PolicyDocument=json.dumps(cmp_doc3),
    Description='Access for developer group to read Glue Cloudwatch logs.'
)
attach_group_policy_response = iam_client.attach_group_policy(
    GroupName=dev_group,
    PolicyArn=create_policy_response3["Policy"]["Arn"]
)

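<b style="color:red">Error</b>: `User: arn:aws:iam::381492255899:user/dev1 is not authorized to perform: logs:DescribeLogStreams on resource: arn:aws:logs:us-east-1:381492255899:log-group:/aws-glue/crawlers:log-stream: because no identity-based policy allows the logs:DescribeLogStreams action`

Note that the log group name starts with a slash (`/aws-glue/crawlers`), so the `Resource` ARN pattern in the CloudWatch Logs policy must include that leading slash in order to match.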

- Grant LF permissions (`ALL`) to `glue_role_name` Role on Catalog DB.

In [None]:
lf_principle = create_role_response['Role']['Arn'] # glue_role_arn
data_lake_location_arn = f"arn:aws:s3:::{bucket_name}/{datalake_folder_name}/"
glue_role_principal = {'DataLakePrincipalIdentifier': lf_principle}

# A Lake Formation grant targets ONE resource per GrantPermissions call, and a
# data location only accepts DATA_LOCATION_ACCESS (not ALL). The database grant
# and the data-location grant are therefore issued as two separate calls.

# 1) Full database permissions for the Glue role on the catalog database.
db_grant_response = lakeformation_client.grant_permissions(
    Principal=glue_role_principal,
    Resource={
        'Database': {
            'Name': catalog_db_name
        }
    },
    Permissions=['ALL'],
    PermissionsWithGrantOption=[]
)

# 2) Data-location access for the Glue role on the data-lake folder.
# NOTE(review): Lake Formation only accepts this grant for a location that has
# been registered with it; the registration helper call is commented out in the
# database-creation cell — confirm the location is registered before running.
response = lakeformation_client.grant_permissions(
    Principal=glue_role_principal,
    Resource={
        'DataLocation': {
            'ResourceArn': data_lake_location_arn
        }
    },
    Permissions=['DATA_LOCATION_ACCESS'],
    PermissionsWithGrantOption=[]
)

- Grant Table level permission (`ALL`) to `dev_user_1`

In [None]:
# Give the developer user ALL permissions on every table of the catalog
# database (table wildcard), without the right to re-grant them.
lf_principle = f"arn:aws:iam::{account_id}:user/{dev_user_1}" # user_arn
all_tables_in_db = {
    'Table': {
        'DatabaseName': catalog_db_name,
        'TableWildcard': {},
    }
}
response = lakeformation_client.grant_permissions(
    Principal={'DataLakePrincipalIdentifier': lf_principle},
    Resource=all_tables_in_db,
    Permissions=['ALL'],
    PermissionsWithGrantOption=[],
)

In [None]:
# boto3.setup_default_session(profile_name="dev1")
# run_crawler_response = glue_client.start_crawler(Name=s3_crawler_name)
# print(run_crawler_response)

#### Delete Resources

In [None]:
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(bucket_name)

# # Delete all objects in the bucket
# bucket.objects.all().delete()

# # Delete all object versions (if versioning is enabled)
# bucket.object_versions.all().delete()

# # Finally, delete the bucket
# bucket.delete()

# s3_client.delete_object(Bucket=bucket_name, Key=key1)         # delete csv file
# s3_client.delete_object(Bucket=bucket_name, Key="test-dataset")    # delete folder
# s3_client.delete_bucket(Bucket=bucket_name)

In [None]:
# iam.delete_iam_role(glue_role_name)
# iam.delete_iam_policy(create_policy_response1["Policy"]["Arn"])
# iam.delete_iam_policy(create_policy_response2["Policy"]["Arn"])
# iam.delete_iam_policy(create_policy_response3["Policy"]["Arn"])

In [None]:
# boto3.setup_default_session(profile_name="AMominNJ")

# lakeformation_client.deregister_resource(ResourceArn=f'arn:aws:s3:::{bucket_name}/data')

# response = iam_client.detach_group_policy(GroupName='developers',PolicyArn=create_policy_response3["Policy"]["Arn"])
# response = iam_client.delete_policy(PolicyArn=create_policy_response1["Policy"]["Arn"])
# response = iam_client.delete_policy(PolicyArn=create_policy_response2["Policy"]["Arn"])
# response = iam_client.delete_policy(PolicyArn=create_policy_response3["Policy"]["Arn"])
# iam_client.delete_role(RoleName=glue_role_name)


In [None]:
# lf_principle = create_role_response['Role']['Arn'] # `httx-glue-dev-landing-test-dataset` role ARN
# data_lake_location_arn = f"arn:aws:s3:::{bucket_name}/{datalake_folder_name}/"
# lf_principle = f"arn:aws:iam::{account_id}:user/dev1"

# response = lakeformation_client.revoke_permissions(
#     Principal={
#         'DataLakePrincipalIdentifier': lf_principle
#     },
#     Resource={
#         'Database': {
#             'Name': 'httx-landing-data'
#         },
#         'DataLocation': {
#             'ResourceArn': data_lake_location_arn
#         }
#     },
#     Permissions=['ALL'],
#     PermissionsWithGrantOption=[]
# )


# response = lakeformation_client.revoke_permissions(
#     Principal={
#         'DataLakePrincipalIdentifier': lf_principle
#     },
#     Resource={
#         'Database': {    # Grant `dev1` user database access
#             'Name': 'httx-landing-data'
#         },
#         'DataLocation': { # Grant `dev1` user data location access
#             'ResourceArn': data_lake_location_arn
#         },
#         'Table': {
#             'DatabaseName': 'httx-landing-data',
#             'TableWildcard': {}
#         }
#     },
#     Permissions=['ALL'],
#     PermissionsWithGrantOption=[]
# )

In [None]:
# glue_client.delete_database(CatalogId=account_id,Name=catalog_db_name)
# glue_client.delete_crawler(Name=s3_crawler_name)