In [None]:
import boto3, botocore
from botocore.exceptions import ClientError
import os, time, json, io, zipfile
from datetime import date
from dotenv import load_dotenv


from misc import load_from_yaml, save_to_yaml
import iam, s3, lf, rds, vpc, ec2, redshift

# Load key/value pairs from the local ".env" file into the process
# environment; later cells read their settings from os.environ.
load_dotenv(dotenv_path=".env")
# boto3.setup_default_session(profile_name="AMominNJ")

False

In [None]:
# Deployment settings, all read from the process environment.
# Indexing os.environ directly means a missing variable raises KeyError
# in this cell instead of surfacing later inside an AWS call.
ACCOUNT_ID        = os.environ['AWS_ACCOUNT_ID_ROOT']
REGION            = os.environ['AWS_DEFAULT_REGION']
VPC_ID            = os.environ['AWS_DEFAULT_VPC']
SECURITY_GROUP_ID = os.environ['AWS_DEFAULT_SG_ID']
# AWS_DEFAULT_SUBNET_IDS is split on ":" into a list of subnet IDs.
SUBNET_IDS        = os.environ["AWS_DEFAULT_SUBNET_IDS"].split(":")
# First entry of the list; used wherever a single subnet is needed.
SUBNET_ID         = SUBNET_IDS[0]
print(SUBNET_IDS)

In [None]:
# boto3 clients created without an explicit region; they use the region
# resolved by boto3's default session.
iam_client           = boto3.client("iam")
sts_client           = boto3.client("sts")
s3_client            = boto3.client("s3")
rds_client           = boto3.client("rds")
redshift_client      = boto3.client("redshift")
glue_client          = boto3.client("glue")
lakeformation_client = boto3.client("lakeformation")

# EC2 handles are pinned explicitly to REGION: a low-level client and the
# higher-level resource interface.
ec2_client           = boto3.client("ec2", region_name=REGION)
ec2_resource         = boto3.resource("ec2", region_name=REGION)

#### [What is Amazon Redshift | How to configure and connect to Redshift](https://www.youtube.com/watch?v=tSHCf1koYk8)

##### Notes: Configuring and Starting AWS Redshift


- **What is Amazon Redshift?**
  - A fully managed data warehousing service in the cloud.
  - Redshift automatically scales based on the compute demand.
  - Supports columnar storage and massively parallel processing for high-performance query execution.
  - AWS Redshift Serverless allows automatic scaling and cost-efficient operation without manual infrastructure management.

- **Key Advantages of Redshift:**
  - Speed and performance in executing queries.
  - Automatic backups and scaling.
  - Columnar storage for efficient data access.
  - Supports materialized views for improving query performance by pre-computing results.
  - Offers Massively Parallel Processing (MPP) for distributed query execution across multiple nodes.
  - Comparisons with other data warehouse solutions:
    - Google BigQuery, Snowflake, Microsoft Azure Synapse Analytics.
  - Compression mechanisms and storage on S3.
  
- **Steps to Create and Configure Redshift Cluster:**
  - **1. Create Cluster Subnet Group:**
    - Navigate to **Clusters** and create a new cluster.
    - Create a **Subnet Group** to choose the VPC and subnets for the Redshift cluster.
    - Choose your **VPC** (e.g., default VPC) and select the desired subnets (e.g., `ap-south-1a`, `ap-south-1b`).
    - Create the subnet group.

  - **2. Create IAM Role for S3 Access (Optional):**
    - Navigate to **IAM** and create a role for Redshift.
    - Attach the policy `AmazonRedshiftAllCommandsFullAccess`.
    - If needed, add additional policies like `AmazonS3ReadOnlyAccess` to allow S3 bucket access.
    - Attach the role to the Redshift cluster.

  - **3. Configure Cluster Settings:**
    - Navigate back to the Redshift **Clusters** dashboard.
    - Enter a **Cluster Identifier** (e.g., `myydredshift`).
    - Choose the **Node Type** (e.g., `dc2.large`) and set the number of nodes.
      - **Leader Node** coordinates query execution.
      - **Compute Nodes** store the actual data.
    - Set **Database Configuration** (username, password, etc.).

  - **4. Set Security and Networking:**
    - Configure a **Security Group** for Redshift and open necessary ports (e.g., port `5439` for Redshift).
    - Assign the cluster to a **VPC** and subnet.
    - Optionally enable **Enhanced VPC Routing** for internal traffic within the VPC.

  - **5. Enable Backups and Maintenance:**
    - Set up **automated backups** and define snapshot retention policies (e.g., retain backups for 7 days).
    - Optionally configure **high availability** and replication to other regions.

  - **6. Finalizing Cluster Setup:**
    - Click **Create Cluster** to launch the Redshift cluster.
    - Wait for the cluster creation process to complete (this may take a few minutes).

- **Connecting to Redshift:**
  - Use the **Query Editor V2** or another SQL client that supports Redshift's JDBC/ODBC drivers (e.g., **SQL Workbench/J**) to connect to the cluster.
  - Authenticate using database credentials (e.g., `username: redshift_admin`).
  - Load sample data or import data from S3 or a local file.
  - Execute queries and analyze data in the Redshift Query Editor.

- **Additional Redshift Operations:**
  - Resize the cluster by adjusting node configurations via the **Actions** menu.
  - View and manage existing databases, schemas, and tables from the Query Editor interface.
  - Monitor cluster performance and query history.

Let me know if you need further refinement on any step!

##### Deployment of Redshift Cluster

In [None]:
# Fixed resource names shared by the deployment cells, one per line.
bucket_name = "httx-datalake-bkt"
datalake_folder_name = "S3-Datalake"
catalog_db_name = "httx-catalog-db"
glue_role_name = "httx-glue-role"
s3_crawler_name = "httx-s3crawler"
rds_crawler_name = "httx-rdscrawler"

In [None]:
# Identifier used for the Redshift cluster.
cluster_identifier = "httx-redshift-cluster-1"

# Admin database credentials come from the environment, never from literals
# in the notebook. A missing variable raises KeyError here.
# NOTE(review): USERNAME is a generic name that some operating systems set
# to the logged-in account; confirm the value read here is the intended
# database user.
admin_db_username = os.environ["USERNAME"]
admin_db_password = os.environ["PASSWORD"]


#### [What is Amazon Redshift Serverless | How to configure and connect to Redshift Serverless](https://www.youtube.com/watch?v=mEEbqIdQf7w&list=PLneBjIzDLECkmd-0-gcegDghBdC0hRukz&index=2)

#### [How to load data from S3 to Redshift](https://www.youtube.com/watch?v=AcVNDbSy9L8&list=PLneBjIzDLECkmd-0-gcegDghBdC0hRukz&index=3)

##### Notes: load data from S3 to Redshift

- **Setting up the environment**
  - Create an S3 bucket
    - Name the bucket (e.g., `redshift-demo`)
    - Enable necessary options (e.g., block public access, versioning)
    - Upload the data file (e.g., `customer.csv`) to the S3 bucket
  - Prepare the data
    - The dataset includes columns like `index`, `ID`, `first name`, `last name`, `company`, `city`, `country`, `phone numbers`, and `email`
    - Ensure the data is in a supported format (e.g., CSV)

- **Setting up IAM role**
  - Create an IAM role for Redshift
    - Use case: Redshift
    - Attach appropriate policies (e.g., S3 read-only access)
    - Optionally, limit access to specific S3 buckets
  - Attach the IAM role to Redshift (either Serverless or Cluster)

- **Connecting to Redshift**
  - Use the Redshift Query Editor v2
  - Create a connection
    - Provide credentials (e.g., username: `RsAdmin`, password: set during namespace setup)
  - Select the database (e.g., `dev`) and schema (e.g., `public`)

- **Loading data into Redshift**
  - Choose to load data from the S3 bucket
    - Select the S3 bucket and file (e.g., `customer.csv`)
    - Specify file format (e.g., CSV) and delimiter
    - Configure additional options (e.g., headers, conversion parameters)
  - Create or choose the table
    - Create a new table (e.g., `customers`) if it doesn't exist
    - Map columns (e.g., `index` as primary key)
  - Run the `COPY` command
    - Specify table, schema, and data format (e.g., CSV)
    - Use Redshift's `COPY` command to load data from the S3 bucket to the Redshift table

- **Verifying the data load**
  - Query the loaded data
    - Verify that the data has been loaded into the table
    - Run queries to inspect and filter data (e.g., `SELECT * FROM customers WHERE country = 'USA'`)