# 1. IMPORT the necessary libraries

In [1]:
import pandas as pd
import boto3
import json

# NOTE: AWS secret and Access Key are not available in the dwh.cfg for security reasons.
- To run this code
    - Create a new IAM user in your AWS account
    - provide it AdministratorAccess
    - Take note of the Access Key and secret and enter it in the dwh.cfg 
    
# 2. LOAD DWH Params from a dwh.cfg file

In [2]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


# 3. Create clients from IAM, EC2, S3 and Redshift

In [3]:
import boto3

ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

# 4. Check the available sources in the S3 bucket

we can already check if some of the datasets are available in the S3 bucket. Running some of the procedures in the 02-prepare-finstart-data notebook and 03-prepare-additional-datasets notebook are required if some of the datasets are missing.


## 4.1. companies.csv dataset
Companies.csv holds a list of all S&P 500 companies and must be located under: stock_dfs/sp500_companies.csv.
in case the csv is missing, run the procedures in notebook: 02-prepare-finstart-data.

Ofcourse the S&P500 companies list is dynamic and therefore it can be useful to run the procedure in 02-prepare-finstart-data.


In [4]:
DbBucket =  s3.Bucket("cda-dend-capstone")
for obj in DbBucket.objects.filter(Prefix="stock_dfs"):
    obj.key
    
if str(obj.key) == ('stock_dfs/sp500_companies.csv'):
    print('resource is available: ' + str(obj))
else:
    print('sp500_companies.csv is not available, run ... ')

resource is available: s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/sp500_companies.csv')


## 4.2. Tickers information of the S&P 500 companies

each company's ticker information is retrieved from yahoo and is stored in one CSV per company. the next statement lists the ticker per company in a CSV format if available. Missing tickers or updating information can be simply accomplished by running the located in notebook: 02-prepare-finstart-data.

Note: I suggest to run this notebook until the cluster is up and running. After the the custer is running than you can decide if you want to run the next notebook: 02-prepare-finstart-data. There is no real reason why you must finish the cluster first, but I just prefer a step by step approach.

In [5]:
for obj in DbBucket.objects.filter(Prefix="stock_dfs/csv"):
    print(obj)

s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/A.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/AAL.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/AAP.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/AAPL.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ABBV.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ABC.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ABMD.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ABT.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ACN.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ADBE.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ADI.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', key='stock_dfs/csv/ADM.csv')
s3.ObjectSummary(bucket_name='cda-dend-capstone', 

# 5. IAM Role
- Create an IAM Role that makes Redshift able to access the S3 DBBucket

In [6]:
from botocore.exceptions import ClientError

#1.1 Create the role, 
try:
    print("1.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    
    
print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::983440384925:role/dwhRole


# 6. Redshift Cluster
- Create a RedShift Cluster

In [7]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

# 7. Describe the cluster and show the details
- Run the block of code several times until the cluster status becomes available

In [9]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,creating
3,MasterUsername,dwhuser
4,DBName,dwh
5,VpcId,vpc-6358f11b
6,NumberOfNodes,4


# 8. Get the cluster endpoint and role ARN

In [None]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
endpoint = DWH_ENDPOINT
print("DWH_ENDPOINT :: ", endpoint)
print("DWH_ROLE_ARN :: ", roleArn)

# 9. Open the incoming TCP port to access the cluster endpoint

In [None]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

# 10. Load the extension SQL and connect to db (postgres)

In [None]:
%load_ext sql

In [None]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

# Delete the Cluster
It's good to clean up resources that you don't need anymore. Delete the cluster with care, there is no way back.
The command is commented, it's to make sure that you don't delete the cluster by accident

In [None]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
#redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

# Check the status of the cluster

In [None]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)