In [1]:
import pandas
import configparser
import psycopg2
import boto3
import json

In [2]:
%load_ext sql

#### Load the config file and parse the arguments

In [3]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

# DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
# DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
# DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("CLUSTER","CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("CLUSTER","DB_NAME")
DWH_DB_USER            = config.get("CLUSTER","DB_USER")
DWH_DB_PASSWORD        = config.get("CLUSTER","DB_PASSWORD")
DWH_PORT               = config.get("CLUSTER","DB_PORT")

DWH_IAM_ROLE_NAME      = config.get("CLUSTER", "DB_ROLE")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pandas.DataFrame({"Param":
                  ["DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_DB,project3
1,DWH_DB_USER,praveen
2,DWH_DB_PASSWORD,Praveen0
3,DWH_PORT,5439
4,DWH_IAM_ROLE_NAME,rolePraveen


### Creating EC2, S3 services and IAM, Redshift Clients

In [7]:
ec2 = boto3.resource('ec2',
                    region_name='us-west-2',
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

s3 = boto3.resource('s3',
                    region_name='us-west-2',
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

iam = boto3.client('iam',
                    region_name='us-west-2',
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

redshift = boto3.client('redshift',
                    region_name='us-west-2',
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

### Creating IAM role

In [15]:
# Need only to run once, once a IAM role is created, it stays in the cloud.
def create_IAM(roleName):
    try:
        print('1.1 Creating a new IAM Role')
        dwhRole =iam.create_role(
            Path='/',
            RoleName=roleName,
            Description = "Allows Redshift clusters to call AWS services on your behalf.",
            AssumeRolePolicyDocument=json.dumps(
                {'Statement': [{'Action': 'sts:AssumeRole',
                   'Effect': 'Allow',
                   'Principal': {'Service': 'redshift.amazonaws.com'}}],
                 'Version': '2012-10-17'})  )   

    except Exception as e:
        print(e)

    print("1.2 Attaching Policy")

    iam.attach_role_policy(RoleName=roleName,
                           PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                          )['ResponseMetadata']['HTTPStatusCode']

    print("1.3 Get the IAM role ARN")
    roleArn = iam.get_role(RoleName=roleName)['Role']['Arn']
    print(roleArn)
    return roleArn

In [18]:
ARN = create_IAM(DWH_IAM_ROLE_NAME)

1.1 Creating a new IAM Role
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::136173015886:role/rolePraveen


In [4]:
ARN = 'arn:aws:iam::136173015886:role/rolePraveen'

##### Fill the above ARN in the config file

### Create Cluster

In [6]:
def create_cluster(roleArn):
    try:
        response = redshift.create_cluster(        
            # Hardware
            ClusterType = 'multi-node',
            NodeType = 'dc2.large',
            NumberOfNodes= 4,
            
            # Identifiers and Credentials
            DBName=DWH_DB,
            ClusterIdentifier = 'dwhCluster',
            MasterUsername = DWH_DB_USER,
            MasterUserPassword = DWH_DB_PASSWORD,
            
            # Giving ARN role to access S3
            IamRoles = [roleArn]
        )
    except Exception as e:
        print(e)


In [7]:
create_cluster(ARN)

### Check Cluster Status

In [5]:
def check_cluster_status(redshift, cluster_identifier):
    ''' Check the status of the cluster'''
    cluster_props = redshift.describe_clusters(ClusterIdentifier = cluster_identifier)['Clusters'][0]
    status = cluster_props['ClusterStatus']
    print(status)

In [8]:
check_cluster_status(redshift, DWH_CLUSTER_IDENTIFIER)

available


##### Getting the Cluster Endpoint

In [9]:
global cluster_props
cluster_props = redshift.describe_clusters(ClusterIdentifier = DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
DWH_ENDPOINT = cluster_props['Endpoint']['Address']
DWH_ROLE_ARN = cluster_props['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.cr55pbcjn1ju.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::136173015886:role/rolePraveen


### Open a TCP port to access the Cluster Endpoint

In [20]:
def open_tcp_port():
    try:
        vpc = ec2.Vpc(id=cluster_props['VpcId'])
        defaultSg = list(vpc.security_groups.all())[0]
        print(defaultSg)

        defaultSg.authorize_ingress(
            GroupName= defaultSg.group_name ,  
            CidrIp='0.0.0.0/0',  
            IpProtocol='TCP',  
            FromPort=5439,
            ToPort=5439
        )
    except Exception as e:
        print(e)

In [21]:
open_tcp_port()

ec2.SecurityGroup(id='sg-992447c7')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


In [None]:
### Connect to Cluster and 

In [10]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://praveen:Praveen0@dwhcluster.cr55pbcjn1ju.us-west-2.redshift.amazonaws.com:5439/project3


'Connected: praveen@project3'

In [11]:
import psycopg2
from sql_queries import create_table_queries, drop_table_queries

In [15]:
def create_tables(cur, conn):
    for query in create_table_queries:
        cur.execute(query)
        conn.commit()

In [12]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()


OperationalError: could not translate host name "project3.clt9wxumiczk.us-west-2.redshift.amazonaws.com " to address: Name or service not known


In [None]:
drop_tables(cur, conn)
create_tables(cur, conn)

### Delete Cluster

In [13]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster(ClusterIdentifier = DWH_CLUSTER_IDENTIFIER, SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'MasterUsername': 'praveen',
  'DBName': 'project3',
  'Endpoint': {'Address': 'dwhcluster.cr55pbcjn1ju.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2020, 4, 26, 13, 28, 17, 452000, tzinfo=tzlocal()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-992447c7',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-d85f32a0',
  'AvailabilityZone': 'us-west-2d',
  'PreferredMaintenanceWindow': 'sat:09:00-sat:09:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpcRouting': False,


In [14]:
check_cluster_status(redshift, DWH_CLUSTER_IDENTIFIER)

deleting
