In [1]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json, io, zipfile
from datetime import date
from dotenv import load_dotenv


from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds, vpc, ec2, redshift

load_dotenv(".env")
# boto3.setup_default_session(profile_name="AMominNJ")

False

In [2]:
# Deployment settings are read from environment variables (load_dotenv(".env") runs in
# the first cell). A missing variable raises KeyError here instead of failing later
# inside an AWS call.
ACCOUNT_ID              = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION                  = os.environ['AWS_DEFAULT_REGION']
VPC_ID                  = os.environ['AWS_DEFAULT_VPC']
SECURITY_GROUP_ID       = os.environ['AWS_DEFAULT_SG_ID']
AWS_DEFAULT_ROUTE_TABLE = os.environ['AWS_DEFAULT_ROUTE_TABLE']
# AWS_DEFAULT_SUBNET_IDS holds the subnet ids joined with ':'.
SUBNET_IDS              = os.environ["AWS_DEFAULT_SUBNET_IDS"].split(":")
# First subnet; used by the Glue connection cell, which takes a single SubnetId.
SUBNET_ID               = SUBNET_IDS[0]
print(SUBNET_IDS)

['subnet-0a972b05a5b162feb', 'subnet-0ca765b361e4cb186', 'subnet-0de97821ddb8236f7', 'subnet-0a160fbe0fcafe373', 'subnet-0980ad10eb313405b']


In [4]:
# boto3 handles used by the cells below.
# Only the EC2 client/resource are given region_name=REGION explicitly; the other
# clients are created without an explicit region argument.
ec2_client           = boto3.client('ec2', region_name=REGION)
ec2_resource         = boto3.resource('ec2', region_name=REGION)

sts_client           = boto3.client('sts')
iam_client           = boto3.client('iam')
s3_client            = boto3.client('s3')
rds_client           = boto3.client('rds')
redshift_client      = boto3.client('redshift')
glue_client          = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')

### [AWS Tutorials: Using Amazon Redshift in AWS based Data Lake](https://www.youtube.com/watch?v=Co8UpEYlZYA&list=PLO95rE9ahzRuUGYApNciILstNNJlvuc6g&index=1&t=1078s)


-   [lab](https://aws-dojo.com/ws30/labs/)

In [3]:
# Resource names shared by the lab cells below.
BUCKET_NAME        = "httx-redshift-bkt"     # S3 bucket
CATALOG_DB_NAME    = "httx-catalog-db"       # Glue Data Catalog database
GLUE_ROLE_NAME     = "httx-glue-role"        # IAM role assumed by Glue
REDSHIFT_ROLE_NAME = "httx-redshift-role"    # IAM role assumed by Redshift
RS_CRAWLER_NAME    = "httx-rscrawler"        # Glue crawler

#### Screenshots

<div style="text-align:center" ><img src="./images/screenshot.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 1.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 2.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 3.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 4.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 5.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 6.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 7.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 8.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 9.png" width="1000px" height="500px" /></div>
<div style="text-align:center" ><img src="./images/screenshot 10.png" width="1000px" height="500px" /></div>

#### Create IAM Role (for AWS Glue Service)

- Create the AWS Glue service role named by `GLUE_ROLE_NAME`.
- Assign Power User Access Policy (`PowerUserAccess`) to the role.

In [6]:
# Trust policy: allows the Glue service principal to assume the role.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        dict(
            Effect="Allow",
            Principal={"Service": "glue.amazonaws.com"},
            Action="sts:AssumeRole",
        )
    ],
}

# Create the role and keep only its ARN; later cells pass it to Glue and Lake Formation.
GLUE_ROLE_ARN = iam_client.create_role(**{
    "RoleName": GLUE_ROLE_NAME,
    "AssumeRolePolicyDocument": json.dumps(assume_role_policy_doc),
    "Description": "Glue Service Role",
})['Role']['Arn']

In [15]:
# Alternatives that were considered (kept for reference):
# aws_glue_service_policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
# admin_access_policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
# AWS managed policy attached to the Glue role in the next cell.
power_user_access_policy_arn = 'arn:aws:iam::aws:policy/PowerUserAccess'

In [17]:
# Attach the PowerUserAccess managed policy to the Glue role.
response = iam_client.attach_role_policy(**{
    "RoleName": GLUE_ROLE_NAME,
    "PolicyArn": power_user_access_policy_arn,
})

In [None]:
# Trust policy: allows the Redshift service principal to assume the role.
# Note: this rebinds `assume_role_policy_doc`, which the Glue role cell also defines.
assume_role_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "redshift.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}
# Create the role and keep only its ARN.
# The description previously read "Glue Service Role" (copied from the Glue cell).
REDSHIFT_ROLE_ARN = iam_client.create_role(
    RoleName=REDSHIFT_ROLE_NAME,
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
    Description="Redshift Service Role"
)['Role']['Arn']

In [None]:
# Alternatives that were considered (kept for reference):
# aws_glue_service_policy_arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
# admin_access_policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
# AWS managed policies attached to the Redshift role in the next cell.
amazon_s3_read_only_access = 'arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
amazon_redshift_all_commands_fullaccess = 'arn:aws:iam::aws:policy/AmazonRedshiftAllCommandsFullAccess'

In [None]:
# Attach both AWS managed policies to the Redshift role, one call per policy.
# `response` ends up holding the result of the second (S3 read-only) attachment.
response = iam_client.attach_role_policy(**{
    "RoleName": REDSHIFT_ROLE_NAME,
    "PolicyArn": amazon_redshift_all_commands_fullaccess,
})
response = iam_client.attach_role_policy(**{
    "RoleName": REDSHIFT_ROLE_NAME,
    "PolicyArn": amazon_s3_read_only_access,
})

#### Create S3 Bucket and Folders

In [None]:
# Folder names handed to the project's s3.create_s3_bucket helper together with the bucket name.
output, scripts, tmp = 'output', 'scripts', 'tmp'

s3.create_s3_bucket(BUCKET_NAME, [output, scripts, tmp])

#### Launch Redshift Cluster

In [7]:
# Cluster settings written with boto3 `create_cluster` CamelCase keys; the commented-out
# keys list further options that are not set.
# NOTE(review): this dict is overwritten by the snake_case dict further down in the same
# cell, so as written it has no effect on later cells.
REDSHIFT_CLUSTER_PARAMS={
    "DBName": 'httx-redshift-db',
    "ClusterIdentifier": 'httx-redshift-cluster',
    "ClusterType": 'single-node',
    "NodeType": 'dc2.large',
    "MasterUsername": os.environ['USERNAME'],
    "MasterUserPassword": os.environ['PASSWORD'],
    # "ClusterSecurityGroups": ['string',],
    # "VpcSecurityGroupIds": ['string',],
    # "ClusterSubnetGroupName": 'string',
    # "AvailabilityZone": 'string',
    "PreferredMaintenanceWindow": 'Mon:03:00-Mon:04:00',
    # "ClusterParameterGroupName": 'string',
    "AutomatedSnapshotRetentionPeriod": 7,
    "ManualSnapshotRetentionPeriod": 7,
    "Port": 5439,
    # "ClusterVersion": 'string',
    # "AllowVersionUpgrade": True,
    "NumberOfNodes": 1,
    "PubliclyAccessible": True,
    "Encrypted": False,
    # "HsmClientCertificateIdentifier": 'string',
    # "HsmConfigurationIdentifier": 'string',
    # "ElasticIp": 'string',
    "Tags": [
        {'Key': 'Name', 'Value': 'httx-rs-cluster'},
        {'Key': 'Environment', 'Value': 'Dev'}
    ],
    # "KmsKeyId": 'string',
    # "EnhancedVpcRouting": True,
    # "AdditionalInfo": 'string',
    # "IamRoles": ['string',],
    # "MaintenanceTrackName": 'string',
    # "SnapshotScheduleIdentifier": 'httx-snapshot-schedule',
    # "AvailabilityZoneRelocation": True,
    # "AquaConfigurationStatus": 'enabled',            # |'disabled'|'auto',
    # "DefaultIamRoleArn": 'string',
    # "LoadSampleData": 'string',
    # "ManageMasterPassword": True,
    # "MasterPasswordSecretKmsKeyId": 'string',
    # "IpAddressType": 'string',
    # "MultiAZ": True,
    # "RedshiftIdcApplicationArn": 'string'
}

# Effective value of REDSHIFT_CLUSTER_PARAMS: the same settings with snake_case keys,
# unpacked as keyword arguments into redshift.create_redshift_cluster in a later cell.
# NOTE(review): the recorded run of that call raised TypeError on 'cluster_identifier',
# so these key names do not match the helper's parameters — confirm its signature.
# NOTE(review): cluster_type is 'single-node' while number_of_nodes is 2, and the
# snapshot retention periods are 1 here versus 7 above — confirm which values are intended.
REDSHIFT_CLUSTER_PARAMS={
    "db_name": 'httx-redshift-db',
    "cluster_identifier": 'httx-redshift-cluster',
    "cluster_type": 'single-node',
    "node_type": 'dc2.large',
    "master_username": os.environ['USERNAME'],
    "master_user_password": os.environ['PASSWORD'],
    "preferred_maintenance_window": 'Mon:03:00-Mon:04:00',
    "automated_snapshot_retention_period": 1,
    "manual_snapshot_retention_period": 1,
    "port": 5439,
    "number_of_nodes": 2,
    "publicly_accessible": True,
    "encrypted": False,
    "tags": [
        {'Key': 'Name', 'Value': 'httx-rs-cluster'},
        {'Key': 'Environment', 'Value': 'Dev'}
    ]
}

-   **ClusterSecurityGroups (list)**:

    -   A list of security groups to be associated with this cluster.
    -   Default: The default cluster security group for Amazon Redshift.

-   **VpcSecurityGroupIds (list)**:

    -   A list of Virtual Private Cloud (VPC) security groups to be associated with the cluster.
    -   Default: The default VPC security group is associated with the cluster.

-   **ClusterSubnetGroupName (string)**:

    -   The name of a cluster subnet group to be associated with this cluster.
    -   If this parameter is not provided the resulting cluster will be deployed outside virtual private cloud (VPC).

-   **AvailabilityZone (string)**:

    -   The EC2 Availability Zone (AZ) in which you want Amazon Redshift to provision the cluster. For example, if you have several EC2 instances running in a specific Availability Zone, then you might want the cluster to be provisioned in the same zone in order to decrease network latency.
    -   Default: A random, system-chosen Availability Zone in the region that is specified by the endpoint.
    -   Constraint: The specified Availability Zone must be in the same region as the current endpoint.


In [53]:
# NOT WORKING!
def create_redshift_cluster_v3(
    db_name="httx-redshift-db",  # Default database name
    cluster_identifier="httx-redshift-cluster",  # Unique cluster identifier
    cluster_type="multi-node",  # Cluster type: 'single-node' or 'multi-node'
    node_type="dc2.large",  # Default node type
    master_username=None,  # None -> os.environ['USERNAME'], read at call time
    master_user_password=None,  # None -> os.environ['PASSWORD'], read at call time
    vpc_security_group_ids=None,  # List of VPC security group IDs (None/empty -> not sent)
    preferred_maintenance_window='Mon:09:00-Mon:10:00',  # Maintenance window (e.g., "Mon:09:00-Mon:10:00")
    automated_snapshot_retention_period=1,  # Retain automated snapshots (days)
    manual_snapshot_retention_period=1,  # Retain manual snapshots (days)
    port=5439,  # Port for database connections
    allow_version_upgrade=True,  # Allow automatic version upgrades
    number_of_nodes=2,  # Number of nodes (used only for multi-node clusters)
    publicly_accessible=False,  # Whether the cluster is publicly accessible
    encrypted=False,  # Encrypt data at rest
    tags=None,  # List of tags (e.g., [{'Key': 'Name', 'Value': 'MyCluster'}]); None/empty -> not sent
    enhanced_vpc_routing=False,  # Enable enhanced VPC routing
    availability_zone_relocation=False,  # Allow relocation across AZs
    aqua_configuration_status="auto",  # AQUA settings: 'enabled', 'disabled', 'auto'
    manage_master_password=False,  # Whether AWS Secrets Manager manages the master password
    ip_address_type="ipv4",  # IP address type ('ipv4' or others)
    multi_az=False,  # Multi-AZ deployment
    ):
    """Create a Redshift cluster through boto3 ``create_cluster``.

    Each keyword maps onto the CreateCluster request field of the same name in
    CamelCase. Request fields not exposed here (ClusterSecurityGroups,
    ClusterSubnetGroupName, AvailabilityZone, ClusterParameterGroupName,
    ClusterVersion, Hsm*, ElasticIp, KmsKeyId, AdditionalInfo, IamRoles,
    MaintenanceTrackName, SnapshotScheduleIdentifier, DefaultIamRoleArn,
    LoadSampleData, MasterPasswordSecretKmsKeyId, RedshiftIdcApplicationArn)
    are not sent.

    Args:
        master_username / master_user_password: When ``None`` they are read from
            the ``USERNAME`` / ``PASSWORD`` environment variables at call time,
            so the password is not captured in the function's defaults.
        vpc_security_group_ids / tags: ``None`` or an empty list leaves the
            field out of the request.
        number_of_nodes: Sent as given for ``cluster_type == "multi-node"``;
            otherwise ``NumberOfNodes`` is sent as 1.

    Returns:
        The ``create_cluster`` response dict, or ``None`` if the call raised
        (the error is printed).

    Raises:
        KeyError: If a credential is ``None`` and its environment variable is unset.
    """
    # NOTE(review): the recorded run with default arguments failed with
    # "UnsupportedOperation ... Dual Stack is not supported". Which request field
    # triggers it is not established in this notebook — TODO confirm before relying
    # on the defaults.
    if master_username is None:
        master_username = os.environ['USERNAME']
    if master_user_password is None:
        master_user_password = os.environ['PASSWORD']

    request = {
        "DBName": db_name,
        "ClusterIdentifier": cluster_identifier,
        "ClusterType": cluster_type,
        "NodeType": node_type,
        "MasterUsername": master_username,
        "MasterUserPassword": master_user_password,
        "PreferredMaintenanceWindow": preferred_maintenance_window,
        "AutomatedSnapshotRetentionPeriod": automated_snapshot_retention_period,
        "ManualSnapshotRetentionPeriod": manual_snapshot_retention_period,
        "Port": port,
        "AllowVersionUpgrade": allow_version_upgrade,
        "NumberOfNodes": number_of_nodes if cluster_type == "multi-node" else 1,
        "PubliclyAccessible": publicly_accessible,
        "Encrypted": encrypted,
        "EnhancedVpcRouting": enhanced_vpc_routing,
        "AvailabilityZoneRelocation": availability_zone_relocation,
        "AquaConfigurationStatus": aqua_configuration_status,
        "ManageMasterPassword": manage_master_password,
        "IpAddressType": ip_address_type,
        "MultiAZ": multi_az,
    }
    # Cluster security groups and VPC security groups cannot be combined, so only the
    # VPC variant is exposed. Empty lists are left out rather than sent as empty values.
    if vpc_security_group_ids:
        request["VpcSecurityGroupIds"] = list(vpc_security_group_ids)
    if tags:
        request["Tags"] = list(tags)

    try:
        # Initialize Redshift client
        redshift_client = boto3.client('redshift')
        return redshift_client.create_cluster(**request)
    except Exception as e:
        # Best-effort, as before: report the failure and signal it with None.
        print(f"Error creating Redshift cluster: {e}")
        return None


In [54]:
# Call with every default; the recorded output of this cell shows the request being
# rejected with UnsupportedOperation ("Dual Stack is not supported").
create_redshift_cluster_v3()

Error creating Redshift cluster: An error occurred (UnsupportedOperation) when calling the CreateCluster operation: Dual Stack is not supported


In [9]:
# NOTE(review): the recorded run raised TypeError (unexpected keyword 'cluster_identifier'),
# so the keys of REDSHIFT_CLUSTER_PARAMS do not match this project helper's parameters —
# confirm the helper's signature before re-running.
redshift.create_redshift_cluster(**REDSHIFT_CLUSTER_PARAMS)

TypeError: create_redshift_cluster() got an unexpected keyword argument 'cluster_identifier'

In [None]:
# Template call: the name and description are still the 'string' placeholders and must be
# replaced before running. Only SubnetIds is wired to a real value.
response = redshift_client.create_cluster_subnet_group(**{
    "ClusterSubnetGroupName": 'string',
    "Description": 'string',
    "SubnetIds": SUBNET_IDS,
})

In [None]:
# Template call: every value is still a 'string' placeholder and must be replaced before running.
response = redshift_client.create_cluster_security_group(**{
    "ClusterSecurityGroupName": 'string',
    "Description": 'string',
    "Tags": [{'Key': 'string', 'Value': 'string'}],
})

In [None]:
# Template call: the identifier, marker and tag filters are still placeholders and must be
# replaced (or removed) before running.
response = redshift_client.describe_clusters(**{
    "ClusterIdentifier": 'string',
    "MaxRecords": 123,
    "Marker": 'string',
    "TagKeys": ['string'],
    "TagValues": ['string'],
})

In [None]:
# # Associate the IAM role with the Redshift cluster (OPTIONAL)
# response = redshift_client.modify_cluster_iam_roles(
#     ClusterIdentifier=cluster_identifier,
#     AddIamRoles=[power_user_access_policy_arn]  # This adds the role to the cluster
# )

#### Create Private Link (VPC Endpoint)

-   `Gateway` endpoints serve as a target for a route in your route table for traffic destined for the service.

In [None]:
# VPC Endpoint parameters
# Build the S3 service name from REGION (the region ec2_client is created with) instead of
# hard-coding us-east-1, so the endpoint service matches the client's region.
SERVICE_NAME = f'com.amazonaws.{REGION}.s3'  # Replace the suffix for a different service
ROUTE_TABLE_IDS = [AWS_DEFAULT_ROUTE_TABLE]

# Create a Gateway endpoint associated with the route table(s) above and keep its id.
vpc_endpoint_id = ec2_client.create_vpc_endpoint(
    VpcEndpointType='Gateway',
    VpcId=VPC_ID,
    ServiceName=SERVICE_NAME,
    RouteTableIds=ROUTE_TABLE_IDS,
    # SubnetIds=SUBNET_IDS,
    # SecurityGroupIds=[SECURITY_GROUP_ID],
    PrivateDnsEnabled=False  # Private DNS is left disabled
)['VpcEndpoint']['VpcEndpointId']

In [41]:
# Name the endpoint created above. Pass the id held in `vpc_endpoint_id`; the original
# passed the literal string 'vpc_endpoint_id', which is not a resource id.
ec2_client.create_tags(Resources=[vpc_endpoint_id],Tags=[{'Key': 'Name', 'Value': 'rs-glue-vpc-endpoint'}])

In [50]:
# # Delete the VPC Endpoint
# response = ec2_client.delete_vpc_endpoints(
#     VpcEndpointIds=[vpc_endpoint_id]
# )

#### Create glue components

##### Create Glue Catalog Database 

In [None]:
## Example usage
# Use the notebook's BUCKET_NAME / CATALOG_DB_NAME constants; the lowercase names used
# previously are not defined anywhere in this notebook.
datalake_location_uri = f"s3://{BUCKET_NAME}" #/{datalake_folder_name}"

create_database_response = glue_client.create_database(
    CatalogId=ACCOUNT_ID,
    DatabaseInput={
        'Name': CATALOG_DB_NAME,
        'Description': 'This is a Glue Catalog database',
        'LocationUri': datalake_location_uri,
    }
)
print(create_database_response)

- Grant the `CREATE_TABLE` and `ALTER` Lake Formation permissions to the Glue role (`GLUE_ROLE_NAME`) on the Data Catalog database.

In [13]:
# ARN of the Glue role. GLUE_ROLE_ARN is set by the role-creation cell; the
# `create_role_response` name used previously is not defined in this notebook.
lf_principle = GLUE_ROLE_ARN

# Grant the 'CREATE_TABLE' and 'ALTER' LF permissions on the catalog database to the
# Glue role, without grant option.
response = lakeformation_client.grant_permissions(
    Principal={
        'DataLakePrincipalIdentifier': lf_principle
    },
    Resource={
        'Database': {
            'Name': CATALOG_DB_NAME
        }
    },
    Permissions=['CREATE_TABLE', 'ALTER'],
    PermissionsWithGrantOption=[]
)

In [None]:
# Clean-up cell: deletes the catalog database created above. Running it as part of a
# top-to-bottom run removes the database before the crawler cell uses it.
glue_client.delete_database(CatalogId=ACCOUNT_ID,Name=CATALOG_DB_NAME)

##### Create JDBC connection for Glue Crawler.

In [None]:
# glue_client.delete_connection?
# glue_client.get_connection?

In [23]:
# ?glue_client.create_connection

In [None]:
glue_rs_connection_name = "glue-rs-connection"
port = '5439'
# NOTE(review): the endpoint is hard-coded and names 'httx-rh-cluster', while
# REDSHIFT_CLUSTER_PARAMS uses 'httx-redshift-cluster' — confirm it is the intended cluster.
host_name = 'httx-rh-cluster.cd3k9pazjjel.us-east-1.redshift.amazonaws.com'

# Database name and credentials were previously undefined names in this notebook; take
# them from the cluster settings so the connection matches what the cluster is given.
rs_db_name = REDSHIFT_CLUSTER_PARAMS["db_name"]
rs_db_username = REDSHIFT_CLUSTER_PARAMS["master_username"]
rs_db_password = REDSHIFT_CLUSTER_PARAMS["master_user_password"]

# Construct the connection properties
jdbc_url = f"jdbc:redshift://{host_name}:{port}/{rs_db_name}"
connection_properties = {
    'USERNAME': rs_db_username,
    'PASSWORD': rs_db_password,
    'JDBC_CONNECTION_URL': jdbc_url,
    'JDBC_ENFORCE_SSL': 'false'  # set to 'true' if using SSL
}

# Construct the physical connection requirements
physical_connection_requirements = {
    'SecurityGroupIdList': [SECURITY_GROUP_ID],
    'SubnetId': SUBNET_ID  # Use subnet ID instead of VPC ID
}

response = glue_client.create_connection(
    ConnectionInput={
        "Name": glue_rs_connection_name,
        "ConnectionType": "JDBC",  # Use JDBC for Redshift
        "ConnectionProperties": connection_properties,
        "PhysicalConnectionRequirements": physical_connection_requirements
    },
    Tags={'Name': f"{glue_rs_connection_name}"}
)

In [29]:
# response = glue_client.get_connection(Name=glue_rh_connection_name)
# print(response)

- Test the Connection:
    -   <b style="color:red">FAILED</b>: For unknown reasons, a connection created through the SDK (Boto3) does not work until it has been edited (any edit will do) in the AWS Console.

In [None]:
# glue_mysql_connection_name = "glue-mysql-connection"
# response = glue_client.get_connection(Name=glue_rh_connection_name)
# print(response)

In [None]:
# glue_client.delete_connection(ConnectionName=glue_rh_connection_name)

##### Create Glue Crawler.

In [None]:
# Create the crawler that catalogs the Redshift tables reachable through the JDBC
# connection into the Glue catalog database.
# The crawl path previously used `rs_db_name`, which is not defined in this notebook;
# the database name is taken from the cluster settings instead.
create_crawler_response1 = glue_client.create_crawler(
    Name=RS_CRAWLER_NAME,
    Role=GLUE_ROLE_ARN,
    DatabaseName=CATALOG_DB_NAME,
    Description='Crawler for generated customer schema',
    Targets={
        'JdbcTargets': [
            {
                'ConnectionName': glue_rs_connection_name,
                'Path': f"{REDSHIFT_CLUSTER_PARAMS['db_name']}/%",
                'Exclusions': [],  # Optional: specify any patterns to exclude
            }
        ],
    },
    SchemaChangePolicy={
        'UpdateBehavior': 'UPDATE_IN_DATABASE',
        'DeleteBehavior': 'DELETE_FROM_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'
    }
)
print(create_crawler_response1)

In [60]:
# run_crawler_response1 = glue_client.start_crawler(Name=rds_crawler_name)
# print(run_crawler_response1)

In [62]:
# ?lakeformation_client.grant_permissions

- Grant Table level LF permission (`SELECT`) to `glue_role_name` on the tables just created on Catalog DB.

In [None]:
# Grant SELECT on every table of the catalog database (table wildcard) to the Glue role,
# without grant option.
response = lakeformation_client.grant_permissions(**{
    "Principal": {'DataLakePrincipalIdentifier': GLUE_ROLE_ARN},
    "Resource": {
        'Table': {'DatabaseName': f"{CATALOG_DB_NAME}", 'TableWildcard': {}},
    },
    "Permissions": ['SELECT'],
    "PermissionsWithGrantOption": [],
})

#### Delete All Resources

In [None]:
# s3_resources = boto3.resource('s3')
# bucket = s3_resources.Bucket(bucket_name)

# # Delete all objects in the bucket
# bucket.objects.all().delete()

# # Delete all object versions (if versioning is enabled)
# bucket.object_versions.all().delete()

# # Finally, delete the bucket
# bucket.delete()


In [None]:
## Delete the Redshift cluster without a final snapshot
# response = redshift_client.delete_cluster(
#     ClusterIdentifier=cluster_identifier,
#     SkipFinalClusterSnapshot=True  # Set to False if you want to take a final snapshot before deletion
# )
# print(response)

In [None]:
# rds.delete_rds_instance(db_instance_identifier_mysqlrds1)

In [53]:
# response = glue_client.delete_connection(ConnectionName=glue_mysql_connection_name)
# response = glue_client.delete_crawler(Name=rds_crawler_name)

In [None]:
# iam.delete_iam_role(glue_role_name)

### [AWS Tutorials - Working with Data API for Amazon Redshift](https://www.youtube.com/watch?v=LFrlMQbPehA&list=PLO95rE9ahzRuUGYApNciILstNNJlvuc6g&index=5&t=13s)

In [None]:
# Clients used by the Data API lab functions below.
# `redshift_client` and `iam_client` are rebound here (an earlier cell also creates them).
iam_client = boto3.client('iam')
redshift_client = boto3.client('redshift')
sagemaker_client = boto3.client('sagemaker')
secrets_manager_client = boto3.client('secretsmanager')

def create_redshift_cluster(cluster_identifier="dojoredshift",
                            master_username='awsuser',
                            master_user_password='Password1!',
                            node_type='dc2.large'):
    """
    Creates a single-node Amazon Redshift cluster.

    Args:
        cluster_identifier: Identifier of the cluster to create.
        master_username: Master user name.
        master_user_password: Master user password.
        node_type: Redshift node type.

    Returns:
        The cluster identifier that was requested.

    The defaults reproduce the values the lab hard-coded, so a call without
    arguments behaves as before.
    """
    # SECURITY: the default password is a hard-coded lab credential. Pass a real one
    # explicitly (e.g. from an environment variable) for anything beyond the tutorial.
    redshift_client.create_cluster(
        ClusterIdentifier=cluster_identifier,
        NodeType=node_type,
        MasterUsername=master_username,
        MasterUserPassword=master_user_password,
        ClusterType='single-node'
    )
    print(f"Creating Redshift cluster: {cluster_identifier}")
    return cluster_identifier

def wait_for_redshift(cluster_identifier, poll_interval=30, timeout=None):
    """
    Waits until the Redshift cluster is available.

    Polls ``describe_clusters`` and prints the status on every attempt.

    Args:
        cluster_identifier: Identifier of the cluster to poll.
        poll_interval: Seconds to sleep between polls (default 30, as before).
        timeout: Maximum number of seconds to wait; ``None`` (default) waits
            indefinitely, as before.

    Raises:
        TimeoutError: If ``timeout`` is set and the cluster is still not
            'available' when it elapses.
    """
    print(f"Waiting for Redshift cluster '{cluster_identifier}' to become available...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        response = redshift_client.describe_clusters(ClusterIdentifier=cluster_identifier)
        cluster_status = response['Clusters'][0]['ClusterStatus']
        if cluster_status == 'available':
            print(f"Redshift cluster '{cluster_identifier}' is available.")
            return
        print(f"Cluster status: {cluster_status}")
        # Give up only after reporting the last observed status.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Redshift cluster '{cluster_identifier}' not available after {timeout} seconds "
                f"(last status: {cluster_status})"
            )
        time.sleep(poll_interval)

def create_secret():
    """
    Creates a secret in AWS Secrets Manager for Redshift credentials.

    Returns:
        The ARN of the created secret.
    """
    secret_name = "dojosecret"
    # SECURITY: lab credentials are hard-coded; they mirror the values used by
    # create_redshift_cluster().
    # NOTE(review): "host" is a placeholder endpoint, not a real cluster address —
    # replace it with the cluster's endpoint.
    secret_value = {
        "username": "awsuser",
        "password": "Password1!",
        "engine": "redshift",
        "host": "dojoredshift.cluster-identifier.aws-region.redshift.amazonaws.com",
        "port": 5439,
        "dbClusterIdentifier": "dojoredshift"
    }
    # Serialize as JSON. str(dict) yields Python repr with single quotes, which is not
    # valid JSON and cannot be parsed as a key/value secret.
    response = secrets_manager_client.create_secret(
        Name=secret_name,
        SecretString=json.dumps(secret_value)
    )
    secret_arn = response['ARN']
    print(f"Created secret: {secret_name}, ARN: {secret_arn}")
    return secret_arn

def create_iam_role():
    """
    Creates an IAM role for SageMaker with the necessary permissions.

    The role trusts the SageMaker service principal and gets the AWS managed
    PowerUserAccess policy attached.

    Returns:
        The name of the created role.
    """
    role_name = "dojosagemakerrole"
    assume_role_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "sagemaker.amazonaws.com"},
                "Action": "sts:AssumeRole"
            }
        ]
    }
    # The trust policy must be a JSON document. str(dict) yields Python repr with single
    # quotes, which is not valid JSON; the other role-creation cells use json.dumps too.
    iam_client.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy)
    )
    iam_client.attach_role_policy(
        RoleName=role_name,
        PolicyArn="arn:aws:iam::aws:policy/PowerUserAccess"
    )
    print(f"Created IAM role: {role_name}")
    return role_name

def create_sagemaker_notebook(role_name):
    """
    Creates a SageMaker notebook instance.

    The role ARN is assembled from the caller's account id (looked up through STS)
    and ``role_name``.

    Returns:
        The name of the notebook instance that was requested.
    """
    notebook_name = "dojodataapinotebook"
    request = {
        "NotebookInstanceName": notebook_name,
        "InstanceType": 'ml.t2.medium',
        "RoleArn": f"arn:aws:iam::{boto3.client('sts').get_caller_identity()['Account']}:role/{role_name}",
    }
    sagemaker_client.create_notebook_instance(**request)
    print(f"Creating SageMaker notebook: {notebook_name}")
    return notebook_name

def main():
    """Run the lab end to end: cluster, secret, then IAM role and SageMaker notebook."""
    # Step 2: launch the Redshift cluster and block until it is available.
    cluster_id = create_redshift_cluster()
    wait_for_redshift(cluster_id)

    # Step 4: store the Redshift credentials in Secrets Manager (the ARN is not used further).
    create_secret()

    # Step 5: create the SageMaker role, then the notebook instance that uses it.
    sagemaker_role = create_iam_role()
    create_sagemaker_notebook(sagemaker_role)


### [AWS Tutorials: Using Lambda UDF with Amazon Redshift](https://www.youtube.com/watch?v=HqpwL7et4eQ&list=PLO95rE9ahzRuUGYApNciILstNNJlvuc6g&index=4&t=119s)

-   [lab](https://aws-dojo.com/excercises/excercise31/)

Summary of the tutorial as a nested bulleted outline:

- **Introduction**
  - AWS Lambda can now be used to create a user-defined function (UDF) in Amazon Redshift.
  - UDFs can be utilized in both the `SELECT` and `WHERE` clauses of SQL queries.
  - The business logic for the UDF is defined in a Lambda function, which allows the use of programming languages like Python or Node.js.
  
- **Steps to Create UDF with Lambda in Redshift**
  - **IAM Role Creation**
    - Create an IAM role for the Redshift cluster.
    - This IAM role should have permission to invoke the Lambda function.
    - Redshift cluster needs permission to invoke Lambda for executing the query.
  
  - **Defining the External Function**
    - Use the `CREATE EXTERNAL FUNCTION` command in Redshift.
    - In the UDF, the Lambda function is referenced to perform the business logic.
    - When the UDF is called in a query, it invokes the Lambda function to return the result.

- **Lambda Function Structure**
  - **Output Format**
    - The result returned by the Lambda function must be a dictionary with four fields:
      - `success`: Indicates if the Lambda function execution was successful (`true` or `false`).
      - `error_message`: Contains an error description if the Lambda function fails.
      - `result`: The actual result, which can be an array of records.
      - `number_of_records`: Optional but recommended to indicate how many records are being returned.
  
  - **Return Example**
    - Pass an array of arguments, multiply values, and append the result to an array.
    - Return a dictionary that includes:
      - `success` status.
      - `result`: Array of the calculated values.
      - `number_of_records`: Optional, but can be added.

- **Creating the Lambda Function**
  - **Lambda Creation**
    - Create the Lambda function like any other Lambda in AWS.
    - Ensure proper formatting when returning results as a dictionary.
  
  - **Returning the Output**
    - Populate the `success` field based on exception handling.
    - If `success` is `false`, include an `error_message`.
    - Append results in an array and return it.

- **Using the UDF in Queries**
  - Example SQL query using the UDF:
    - Call the UDF within the `SELECT` clause.
    - The UDF will invoke the Lambda function to compute the result.
  
- **Lambda Invocation Frequency**
  - **Invocation Examples**
    - Example 1: Lambda is invoked for each row if the parameters are dynamic (e.g., columns in the table).
    - Example 2: Lambda is invoked only once if parameters are fixed values.
  
  - **Invocation in SELECT Clause**
    - If used in the `SELECT` clause, Lambda function processes multiple rows in one call, looping through arguments.
    - Ensure that Lambda execution does not exceed time limits when handling multiple rows.

- **Considerations**
  - Be cautious of Lambda's concurrency limits and costs.
  - Use Lambda functions efficiently to avoid unnecessary invocations.
  - Test queries to optimize Lambda usage and minimize invocation count.

- **Practical Example: Creating a UDF with Lambda**
  - Create a Redshift cluster and associate it with an IAM role.
  - Create tables and insert data for query testing.
  - Create a Lambda function in Python 3.8 to process quantity and price.
  - Define a UDF in Redshift that uses this Lambda function.
  - Query the data using the UDF, passing parameters like `quantity_order` and `price_each`.

### [AWS Tutorials - Continuous S3 data ingestion to Amazon Redshift (Copy Job)](https://www.youtube.com/watch?v=2reIpdRYscM&list=PLO95rE9ahzRuUGYApNciILstNNJlvuc6g&index=2)

### [AWS Tutorials: Amazon Redshift Federated Query with RDS PostgreSQL](https://www.youtube.com/watch?v=vJXZwkch2WY&list=PLO95rE9ahzRuUGYApNciILstNNJlvuc6g&index=3)
-   [lab](https://aws-dojo.com/ws37/labs/#google_vignette)