In [None]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json, io, zipfile, requests
from datetime import date
from dotenv import load_dotenv


from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds, vpc, ec2

from ec2 import ALL_IN_ONE_INBOUND_RULES, ALL_IN_ONE_OUTBOUND_RULES

# Load variables from a local .env file into the process environment
# (presumably the AWS_* settings read in the configuration cell -- confirm).
load_dotenv(".env")
# Alternative kept for reference: use a named AWS profile for the default session.
# boto3.setup_default_session(profile_name="AMominNJ")

In [None]:
# Deployment settings come from environment variables; a missing variable
# raises KeyError here rather than failing later inside an AWS call.
ACCOUNT_ID        = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION            = os.environ['AWS_DEFAULT_REGION']
VPC_ID            = os.environ['AWS_DEFAULT_VPC']
SECURITY_GROUP_ID = os.environ['AWS_DEFAULT_SG_ID']
# AWS_DEFAULT_SUBNET_IDS is a colon-separated list of subnet IDs.
SUBNET_IDS        = os.environ["AWS_DEFAULT_SUBNET_IDS"].split(":")
SUBNET_ID         = SUBNET_IDS[0]
print(SUBNET_IDS)

In [7]:
# One boto3 client per AWS service used by this notebook. No region_name is
# passed here, so boto3's default region resolution applies.
sts_client           = boto3.client('sts')
rds_client           = boto3.client('rds')
iam_client           = boto3.client('iam')
s3_client            = boto3.client('s3')
glue_client          = boto3.client('glue')
lakeformation_client = boto3.client('lakeformation')
stepfunctions_client = boto3.client('stepfunctions')
apigateway_client    = boto3.client('apigateway')
lsn_client           = boto3.client('lambda')  # AWS Lambda client
events_client        = boto3.client('events')

In [8]:
# EC2 client and resource are created explicitly in REGION.
ec2_client   = boto3.client('ec2', region_name=REGION)
ec2_resource = boto3.resource('ec2', region_name=REGION)
# 'kafka' is the boto3 service name for Amazon MSK; the MSK helper functions
# in this notebook all go through this client.
msk_client   = boto3.client('kafka')

- [Boto3 Docs: AWS MSK](https://boto3.amazonaws.com/v1/documentation/api/1.35.9/reference/services/kafka.html)

### [AWS: Get Started with AWS MSK](https://www.youtube.com/watch?v=5WaIgJwYpS8&list=PLhr1KZpdzukd2EuSB1F9zoWMTwinTkqVn&index=1)

![](./useful_commands.png)

### [AWS: Perform Common Operations on an Amazon MSK Cluster](https://www.youtube.com/watch?v=AUx5x_jrX6I&list=PLhr1KZpdzukd2EuSB1F9zoWMTwinTkqVn&index=2) || [`NOT TESTED YET`]

- **Performing common operations on an Amazon MSK cluster**:  
    - Launching and expanding clusters.  
    - Configuring auto-scaling and security settings.  
    - Updating cluster configurations.  

- **Creating an Amazon MSK Cluster** 
  - **Quick Create Method**: Creates a cluster with best-practice settings.  
    - Default settings include:  
      - Apache Kafka version.  
      - Broker type.  
      - Amazon Elastic Block Store (EBS) storage volume.  

- **Modifying Cluster Properties**  
  - Editing Broker Storage
  - Configuring Auto Scaling for Storage 

- **Expanding the Cluster**  
  - Adding Brokers 
  - Rebalancing the Cluster

- **Updating Security Settings**  
  - Current setting: IAM role-based authentication.  
  - Enable SASL/SCRAM authentication:  
    - Select the checkbox and save changes.  
    - Confirm changes in the Properties tab.  

- **Creating and Applying Cluster Configurations**  
  - `Creating Configuration`
    - Name the configuration.  
    - Add properties 
    - Save the configuration.  
  - `Applying Configuration`
    - Select the new configuration
    - Confirm changes in the Properties tab.  
    - Revision tracking allows future updates.

In [None]:
# 1. Creating an Amazon MSK Cluster - Quick Create Method
def create_msk_cluster(cluster_name, kafka_version, broker_instance_type, ebs_storage,
                       client_subnets=None, number_of_broker_nodes=3):
    """Start creation of a provisioned MSK cluster and return its ARN.

    Args:
        cluster_name: Name for the new cluster.
        kafka_version: Apache Kafka version string, e.g. "2.8.1".
        broker_instance_type: Broker instance type, e.g. "kafka.m5.large".
        ebs_storage: EBS volume size per broker, in GiB.
        client_subnets: Subnet IDs the brokers are placed in. When omitted,
            the first three entries of the notebook-level ``SUBNET_IDS`` are
            used instead of placeholder IDs.
        number_of_broker_nodes: Total number of brokers (default 3).
            NOTE(review): MSK expects this to be a multiple of the number of
            client subnets -- confirm against the subnets you pass.

    Returns:
        The ``ClusterArn`` from the create_cluster response. The call only
        initiates creation; the cluster is not usable until it is ACTIVE.
    """
    if client_subnets is None:
        # MSK accepts at most three client subnets, while the configured
        # list may hold more, so only the first three are taken.
        client_subnets = SUBNET_IDS[:3]

    response = msk_client.create_cluster(
        ClusterName=cluster_name,
        KafkaVersion=kafka_version,
        NumberOfBrokerNodes=number_of_broker_nodes,
        BrokerNodeGroupInfo={
            'InstanceType': broker_instance_type,
            'ClientSubnets': list(client_subnets),
            'StorageInfo': {
                'EbsStorageInfo': {
                    'VolumeSize': ebs_storage
                }
            }
        }
    )
    print("Cluster creation initiated:", response)
    return response['ClusterArn']

# 2. Modifying Cluster Properties
def update_broker_storage(cluster_arn, new_storage):
    """Request a new EBS volume size for every broker in the cluster.

    Args:
        cluster_arn: ARN of the MSK cluster to update.
        new_storage: Target EBS volume size per broker, in GiB.

    Returns:
        None. The update_broker_storage response is printed.
    """
    # The current cluster version must accompany every update request.
    current_version = msk_client.describe_cluster(
        ClusterArn=cluster_arn
    )['ClusterInfo']['CurrentVersion']

    response = msk_client.update_broker_storage(
        ClusterArn=cluster_arn,
        CurrentVersion=current_version,
        # KafkaBrokerNodeId is a string field. MSK resizes all brokers
        # together, so one "All" entry replaces the per-broker list that was
        # built from numeric BrokerId values returned by list_nodes.
        TargetBrokerEBSVolumeInfo=[
            {
                'KafkaBrokerNodeId': 'All',
                'VolumeSizeGB': new_storage
            }
        ]
    )
    print("Broker storage updated:", response)

def configure_auto_scaling(cluster_arn, max_storage, target_utilization, autoscaling_client=None):
    """Enable broker-storage auto scaling for an MSK cluster.

    The previous body called ``update_broker_count`` with parameters that
    operation does not accept and ignored both sizing arguments. MSK storage
    auto scaling is configured through Application Auto Scaling instead:
    the cluster is registered as a scalable target, then a target-tracking
    policy is attached to it.

    Args:
        cluster_arn: ARN of the MSK cluster.
        max_storage: Maximum EBS volume size per broker, in GiB.
        target_utilization: Storage utilization percentage the policy tracks.
        autoscaling_client: Optional Application Auto Scaling client. When
            omitted, one is created with ``boto3.client``.

    Returns:
        None. The put_scaling_policy response is printed.
    """
    if autoscaling_client is None:
        autoscaling_client = boto3.client('application-autoscaling')

    # Both calls must identify the same scalable target.
    scalable_target = {
        'ServiceNamespace': 'kafka',
        'ResourceId': cluster_arn,
        'ScalableDimension': 'kafka:broker-storage:VolumeSize',
    }

    autoscaling_client.register_scalable_target(
        MinCapacity=1,  # the AWS MSK auto scaling examples register with min capacity 1
        MaxCapacity=max_storage,
        **scalable_target
    )
    response = autoscaling_client.put_scaling_policy(
        PolicyName='msk-broker-storage-autoscaling',
        PolicyType='TargetTrackingScaling',
        TargetTrackingScalingPolicyConfiguration={
            'TargetValue': float(target_utilization),
            'PredefinedMetricSpecification': {
                'PredefinedMetricType': 'KafkaBrokerStorageUtilization'
            },
            # Broker storage cannot be reduced, so scale-in is disabled.
            'DisableScaleIn': True
        },
        **scalable_target
    )
    print("Auto-scaling configured:", response)

# 3. Expanding the Cluster
def add_brokers(cluster_arn, additional_brokers):
    """Expand the cluster by adding brokers.

    Args:
        cluster_arn: ARN of the MSK cluster.
        additional_brokers: Either the number of brokers to add (int) or a
            sized collection with one entry per broker to add. Only the count
            is used.

    Returns:
        None. The update_broker_count response is printed.

    Raises:
        ValueError: If fewer than one broker would be added.
    """
    extra = additional_brokers if isinstance(additional_brokers, int) else len(additional_brokers)
    if extra < 1:
        raise ValueError("additional_brokers must add at least one broker")

    # One describe call supplies both the version token and the current size.
    cluster_info = msk_client.describe_cluster(ClusterArn=cluster_arn)['ClusterInfo']

    response = msk_client.update_broker_count(
        ClusterArn=cluster_arn,
        CurrentVersion=cluster_info['CurrentVersion'],
        # The API takes the new TOTAL broker count, not the number added.
        TargetNumberOfBrokerNodes=cluster_info['NumberOfBrokerNodes'] + extra
    )
    print("Brokers added:", response)

def rebalance_cluster():
    """Tell the user that partition rebalancing is not automated here.

    No AWS call is made; the function only prints a reminder.
    """
    # TODO: implement logic to rebalance cluster partitions.
    notice = "Rebalancing the cluster is a manual process using Kafka tools."
    print(notice)

# 4. Updating Security Settings
def update_security_settings(cluster_arn):
    """Enable SASL/SCRAM client authentication on the cluster.

    Args:
        cluster_arn: ARN of the MSK cluster.

    Returns:
        None. The update_security response is printed.
    """
    current_version = msk_client.describe_cluster(
        ClusterArn=cluster_arn
    )['ClusterInfo']['CurrentVersion']

    response = msk_client.update_security(
        ClusterArn=cluster_arn,
        CurrentVersion=current_version,
        # update_security has no SaslScramEnabled parameter; the SCRAM flag
        # is nested under ClientAuthentication.Sasl.Scram.
        # NOTE(review): only the SCRAM setting is sent -- confirm whether
        # already-enabled mechanisms (e.g. IAM) must be restated to stay on.
        ClientAuthentication={
            'Sasl': {
                'Scram': {'Enabled': True}
            }
        }
    )
    print("Security settings updated:", response)

# 5. Creating and Applying Cluster Configurations
def create_cluster_configuration(configuration_name, log_retention_hours, auto_create_topics,
                                 kafka_versions=None):
    """Create an MSK configuration and return its ARN.

    Args:
        configuration_name: Name of the new configuration.
        log_retention_hours: Value written for ``log.retention.hours``.
        auto_create_topics: Value written for ``auto.create.topics.enable``.
            Python booleans are written as ``true``/``false``.
        kafka_versions: Kafka versions the configuration applies to.
            Defaults to ``['2.8.1']``.

    Returns:
        The ``Arn`` from the create_configuration response.
    """
    if isinstance(auto_create_topics, bool):
        # str(True) is "True"; Kafka properties use lowercase booleans.
        auto_create_topics = str(auto_create_topics).lower()

    server_properties = (
        f"log.retention.hours={log_retention_hours}\n"
        f"auto.create.topics.enable={auto_create_topics}"
    )

    response = msk_client.create_configuration(
        Name=configuration_name,
        KafkaVersions=['2.8.1'] if kafka_versions is None else list(kafka_versions),
        # ServerProperties is passed as bytes.
        ServerProperties=server_properties.encode()
    )
    print("Cluster configuration created:", response)
    return response['Arn']

def apply_cluster_configuration(cluster_arn, config_arn, revision=1):
    """Apply a revision of an MSK configuration to a cluster.

    Args:
        cluster_arn: ARN of the MSK cluster.
        config_arn: ARN of the configuration to apply.
        revision: Configuration revision to apply. Defaults to 1, the value
            that was previously hard-coded, so later revisions can be
            applied without editing this function.

    Returns:
        None. The update_cluster_configuration response is printed.
    """
    current_version = msk_client.describe_cluster(
        ClusterArn=cluster_arn
    )['ClusterInfo']['CurrentVersion']

    response = msk_client.update_cluster_configuration(
        ClusterArn=cluster_arn,
        CurrentVersion=current_version,
        ConfigurationInfo={
            'Arn': config_arn,
            'Revision': revision
        }
    )
    print("Cluster configuration applied:", response)

# Main workflow
def _wait_for_cluster_active(cluster_arn, poll_seconds=30, timeout_seconds=3600):
    """Block until describe_cluster reports the cluster state as ACTIVE.

    Raises:
        RuntimeError: If the cluster reaches the FAILED or DELETING state.
        TimeoutError: If the cluster is not ACTIVE within ``timeout_seconds``.
    """
    deadline = time.monotonic() + timeout_seconds
    while True:
        state = msk_client.describe_cluster(ClusterArn=cluster_arn)['ClusterInfo']['State']
        if state == 'ACTIVE':
            return
        if state in ('FAILED', 'DELETING'):
            raise RuntimeError(f"Cluster {cluster_arn} entered state {state}")
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Cluster {cluster_arn} still in state {state} after {timeout_seconds}s")
        time.sleep(poll_seconds)


def main():
    """Run the end-to-end MSK walkthrough: create, resize, expand, secure, configure.

    MSK rejects update requests while a cluster is not ACTIVE, so the
    workflow polls the cluster state after each mutating step instead of
    sleeping for a fixed 15 minutes once after creation.
    """
    cluster_name = "MyMSKCluster"
    kafka_version = "2.8.1"
    broker_instance_type = "kafka.m5.large"
    ebs_storage = 100

    # Step 1: Create MSK cluster
    cluster_arn = create_msk_cluster(cluster_name, kafka_version, broker_instance_type, ebs_storage)
    _wait_for_cluster_active(cluster_arn)

    # Step 2: Modify cluster properties
    update_broker_storage(cluster_arn, 500)
    _wait_for_cluster_active(cluster_arn)
    configure_auto_scaling(cluster_arn, 1000, 60)
    _wait_for_cluster_active(cluster_arn)

    # Step 3: Expand the cluster
    add_brokers(cluster_arn, ["broker-4", "broker-5", "broker-6"])
    _wait_for_cluster_active(cluster_arn)
    rebalance_cluster()

    # Step 4: Update security settings
    update_security_settings(cluster_arn)
    _wait_for_cluster_active(cluster_arn)

    # Step 5: Create and apply configuration
    config_arn = create_cluster_configuration("MyClusterConfig", 72, True)
    apply_cluster_configuration(cluster_arn, config_arn)

### [Get Started with MSK Serverless](https://www.youtube.com/watch?v=Ask0ajHnDgc&list=PLhr1KZpdzukd2EuSB1F9zoWMTwinTkqVn&index=3)

##### Introduction to MSK Serverless
- **MSK Serverless Overview**:
  - Automatically provisions and scales compute and storage resources.
  - Enables on-demand Apache Kafka usage with pay-per-data-streamed-and-retained pricing.
  - Integrates with existing Kafka applications.
  - Reduces operational overhead by eliminating cluster infrastructure management.
  - Cost-effective for highly variable workloads with on-demand streaming capacity.
- **Performance**:
  - Sustains up to 200 MB/s write capacity per cluster.
  - Supports up to 400 MB/s read capacity per cluster.
  - Scaling through partitions:
    - Each partition: 5 MB/s write capacity and 10 MB/s read capacity.
- **Pricing**:
  - Fixed cost: $0.75 per hour.
  - Throughput-based pricing for data and partitions.
  - Storage costs vary based on throughput and retention plans.
- **Key Features**:
  - Auto partition placement.
  - Full Apache Kafka compatibility.
  - High availability:
    - Partitions distributed across three availability zones.
  - Security:
    - Private and secure connectivity via Amazon MSK VPC.
    - IAM role-based authentication.

---

##### Creating an MSK Serverless Cluster
- **Cluster Creation**:
  - Navigate to the Amazon MSK console and select the "Custom Create" method.
  - Provide:
    - Cluster name.
    - Default throughput and storage settings.
  - Networking information:
    - Create up to five VPC configurations (one configuration for the example).
    - Choose a VPC and provide subnets for at least two availability zones.
  - Security settings:
    - Retain default security group.
    - Use IAM role-based authentication.
  - Optional: Add custom tags.
  - Review settings and create the cluster.
  - Cluster creation time: ~5 minutes.

---

##### Using MSK Serverless with Producers and Consumers
- **Integrated Development Environment (IDE)**:
  - Example IDE: AWS Cloud9 (other IDEs supported).
  - Set up producers and consumers in separate windows:
    - Left window: Produce data to the cluster.
    - Right window: Consume data from the cluster in real-time.
- **Provisioned Cluster**:
  - Set environment variable `BS` (bootstrap string) with the private endpoint.
  - Use the `echo` command to confirm the variable is set.
  - Create a Kafka topic:
    - Copy and modify the topic creation command from the MSK Developer Guide.
    - Run the command (ignore errors for existing topics).
    - List existing topics.
  - Start Kafka console producer and consumer:
    - Copy and run respective commands from the MSK Developer Guide.
    - Verify messages are transmitted between producer and consumer.
- **Serverless Cluster**:
  - Update the `BS` variable with the serverless cluster's bootstrap endpoint.
  - Confirm variable changes with the `echo` command.
  - Create a Kafka topic (same process as the provisioned cluster).
  - Start Kafka console producer and consumer:
    - Use the same commands with the updated `BS` variable.
    - Verify messages sent from the producer appear in the consumer.

---

##### Key Demonstration Steps
- Demonstrate data flow on:
  - A provisioned cluster.
  - A serverless cluster (with modified bootstrap broker endpoint).
- Validate successful operation of Kafka producers and consumers:
  - Text entered in the producer window appears in the consumer window.

### [Introduction to Amazon MSK Connect](https://www.youtube.com/watch?v=KtECJViknCM&list=PLhr1KZpdzukd2EuSB1F9zoWMTwinTkqVn&index=4)

##### Introduction to Amazon MSK Connect
- **What is Amazon MSK Connect?**  
  - Fully managed Kafka Connect service for Amazon MSK.  
  - Deploy source and sink connectors to move data in and out of Amazon MSK.  
  - Create no-code data integration pipelines for various data sources and targets:  
    - Databases (RDBMS like MySQL, Oracle, or NoSQL like MongoDB).  
    - Key-value stores.  
    - Search indexes.  
    - File systems (e.g., Amazon S3).  
- **Features**:  
  - **Single Message Transforms (SMTs)**: Transform messages as they flow through the MSK Connect framework.  
  - **AWS Glue Schema Registry (GSR)**: Manage schemas for serialized messages.  
  - **IAM Integration**: Secure and manage access with AWS Identity and Access Management.  

---

##### Benefits of Amazon MSK Connect
- **Traditional Kafka Connect Challenges**:  
  - On-premises Kafka Connect requires extensive infrastructure management (hardware, OS, encryption, etc.).  
  - Kafka Connect on Amazon EC2 reduces hardware management but still requires infrastructure deployment and configuration.  
- **MSK Connect Advantages**:  
  - Abstracts the complexity of infrastructure management (hardware and Kafka Connect deployment).  
  - Focus on use cases, not maintenance.  
  - Fully managed and serverless:
    - No provisioning or maintaining Kafka Connect clusters.  
    - Pay only for resources used.  
  - Scalable throughput:  
    - Add MSK Connect Units (MCUs) or auto-scaling policies.  

---

##### Use Cases for MSK Connect
- **Data Migration**:  
  - Migrate data from relational databases (e.g., MySQL, Oracle) or NoSQL databases (e.g., MongoDB) to Amazon S3 for:  
    - Compliance.  
    - Data analysis with tools like Spark Streaming, Kinesis Data Analytics, or AWS Lambda.  
  - Use **Change Data Capture (CDC)** to stream updates, new record creation, and deletions.  
  - Backup data to and restore from S3 for disaster recovery.  
- **Streaming SaaS/Enterprise Data**:  
  - Stream data from applications like Salesforce or Zendesk to data stores (e.g., Snowflake, MongoDB, Redshift) for analytics.  
- **Kafka Migration**:  
  - Migrate Kafka workloads and connectors from other Kafka platforms to Amazon MSK for cost efficiency and manageability.  

---

##### Example Architecture: Streaming Aurora MySQL to Amazon S3 Using CDC
- **Pipeline Overview**:  
  - Two connectors configured and deployed:  
    1. **Source Connector** (Amazon Aurora MySQL):  
       - Uses a Debezium connector to convert MySQL transaction logs into CDC events.  
       - Streams CDC events through the MSK cluster.  
       - Serializes messages as JSON and stores schemas in the Glue Schema Registry.  
    2. **Sink Connector** (Amazon S3):  
       - Retrieves schemas from the Glue Schema Registry.  
       - Deserializes records streamed through the MSK cluster.  
       - Batches records per partition and stores them in S3 buckets.  
- **Key Characteristics**:  
  - No-code and serverless integration.  
  - Fully managed Kafka Connect clusters.  
  - Fully compatible with Kafka Connect for easy migration of connectors.  

---

##### Additional Features
- **Flexibility and Scalability**:  
  - Seamlessly scale or auto-scale connectors for workload spikes.  
  - Fully compatible with Kafka Connect for leveraging existing connectors.  
- **Cost Efficiency**:  
  - Pay only for the resources used.  

---

##### Resources
- Learn more through the additional documentation and links provided in the video description.  