In [61]:
import os
import boto3
import ipaddress

# Every client in this notebook targets the same region, so name it once.
AWS_REGION = "ap-southeast-1"

ec2_client = boto3.client("ec2", region_name=AWS_REGION)
emr_client = boto3.client("emr", region_name=AWS_REGION)
iam_client = boto3.client("iam", region_name=AWS_REGION)

In [62]:
### check whether an IP address falls inside a CIDR block ###
def check_ip_in_cidr(ip_address, cidr_block):
    """Return True when ``ip_address`` belongs to the network ``cidr_block``.

    Both arguments are parsed with the ``ipaddress`` module, so a malformed
    address or network raises ``ValueError``.
    """
    return ipaddress.ip_address(ip_address) in ipaddress.ip_network(cidr_block)

# Example usage
ip_address = '115.66.123.164'
cidr_block = '10.0.0.0/16'

# Print one of two fixed sentences depending on the membership result.
print(
    "IP address is within the CIDR block."
    if check_ip_in_cidr(ip_address, cidr_block)
    else "IP address is not within the CIDR block."
)

IP address is not within the CIDR block.


In [63]:
### get key-pair ###
def create_key_pair(name, key_dir="/var/ssh"):
    """Create an EC2 key pair and save its private key as an owner-read-only file.

    Args:
        name: Name of the key pair to create; also used as the file stem.
        key_dir: Directory that receives ``<name>.pem``. Defaults to ``/var/ssh``.

    Returns:
        The path of the written private key file.

    Raises:
        Whatever ``ec2_client.create_key_pair`` raises (for example when the
        key pair already exists), and ``OSError`` if the file cannot be written.
    """
    # Create the key pair first: if this fails, nothing on disk is touched.
    key_pair = ec2_client.create_key_pair(KeyName=name)
    private_key = key_pair["KeyMaterial"]
    key_path = os.path.join(key_dir, f"{name}.pem")

    # A leftover read-only file from an earlier run would make the open below
    # fail after the key material has already been issued, so make it writable.
    if os.path.exists(key_path):
        os.chmod(key_path, 0o600)

    # O_TRUNC discards any previous content so a shorter key cannot be followed
    # by stale bytes from an older, longer file.
    fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o400)
    with os.fdopen(fd, "w") as handle:
        handle.write(private_key)

    # The mode passed to os.open only applies when the file is created, so
    # enforce 400 explicitly for files that already existed.
    os.chmod(key_path, 0o400)
    return key_path
key_pair_name = "ec2-key-pair"

# Best-effort creation: any failure (for example, the key pair already
# existing) is printed instead of stopping the notebook.
try:
    create_key_pair(key_pair_name)
except Exception as error:
    print(error)

An error occurred (InvalidKeyPair.Duplicate) when calling the CreateKeyPair operation: The keypair already exists


In [64]:
### get default vpc ###
response = ec2_client.describe_vpcs(Filters=[{'Name': 'isDefault', 'Values': ['true']}])
default_vpcs = response['Vpcs']

if not default_vpcs:
    print("No default VPC found.")
else:
    # Take the first VPC returned by the isDefault filter.
    default_vpc = default_vpcs[0]
    default_vpc_id = default_vpc['VpcId']
    print("Default VPC ID:", default_vpc_id)

    # Narrow the subnets to the ones flagged default-for-az inside that VPC.
    subnet_filters = [
        {'Name': 'vpc-id', 'Values': [default_vpc_id]},
        {'Name': 'default-for-az', 'Values': ['true']},
    ]
    response = ec2_client.describe_subnets(Filters=subnet_filters)
    default_subnets = response['Subnets']

    if not default_subnets:
        print("No default subnet found for the default VPC.")
    else:
        default_subnet = default_subnets[0]
        default_subnet_id = default_subnet['SubnetId']
        print("Default Subnet ID:", default_subnet_id)

Default VPC ID: vpc-4658c621
Default Subnet ID: subnet-b5e7abfc


In [94]:
# ec2-key-pair: /tmp/aws_ec2_key.pem
### Create the Instance Profile ###
# instance_profile_name = 'emr-instance-profile'
# instance_profile_response = iam_client.create_instance_profile(
#     InstanceProfileName=instance_profile_name
# )

# # Get the ARN of the Instance Profile
# instance_profile_arn = instance_profile_response['InstanceProfile']['Arn']

# # Add the EMR permissions to the Instance Profile
# iam_client.add_role_to_instance_profile(
#     InstanceProfileName=instance_profile_name,
#     RoleName='EMR_DefaultRole'
# )

### look up the ARN of the existing instance profile ###
instance_profile_name = 'emr-instance-profile'
response = iam_client.get_instance_profile(
    InstanceProfileName=instance_profile_name,
)
# The ARN is nested under the 'InstanceProfile' key of the response.
instance_profile_arn = response['InstanceProfile']['Arn']
print("Instance profile ARN:", instance_profile_arn)

Instance profile ARN: arn:aws:iam::852288348919:instance-profile/emr-instance-profile


In [66]:
### create emr cluster ###
# One on-demand master node plus two on-demand core nodes, with Hadoop and Spark installed.
response = emr_client.run_job_flow(
    Name='emr_sg',
    ReleaseLabel='emr-6.5.0',
    Instances={
        'InstanceGroups': [
            {
                'Name': 'Master Instance Group',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
            {
                'Name': 'Core Instance Group',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'CORE',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 2,
            }
        ],
        # Key pair created (or already present) in the key-pair cell above.
        'Ec2KeyName': key_pair_name,
        # Keep the cluster up after steps finish; it is shut down by the terminate cell.
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
        # Use the subnet discovered by the default-VPC cell instead of a
        # hard-coded ID, so the notebook is not tied to a single account.
        'Ec2SubnetId': default_subnet_id,
    },
    Applications=[
        {'Name': 'Hadoop'},
        {'Name': 'Spark'},
    ],
    JobFlowRole=instance_profile_arn,
    ServiceRole='EMR_DefaultRole'
)

# The returned job flow ID identifies the cluster in every later cell.
cluster_id = response['JobFlowId']
print('Cluster ID: ', cluster_id)

Cluster ID:  j-1V6LEV0IWI13X


In [67]:
### Check the cluster status ###
response = emr_client.describe_cluster(ClusterId=cluster_id)
cluster_status = response['Cluster']['Status']['State']
print(f"Cluster Status: {cluster_status}")
print(f"Cluster Defaitls: {response}")

# Check if there are any errors
if cluster_status == 'TERMINATED_WITH_ERRORS':
    # StateChangeReason can come back as an empty dict, so read 'Message'
    # defensively instead of indexing it directly.
    state_change_reason = response['Cluster']['Status'].get('StateChangeReason', {})
    step_details = state_change_reason.get('Message', 'no message provided')
    print(f"Cluster Error: {step_details}")

# check running or waiting cluster
emr_client.list_clusters(ClusterStates=['RUNNING', "WAITING"])

Cluster Status: STARTING
Cluster Defaitls: {'Cluster': {'Id': 'j-1V6LEV0IWI13X', 'Name': 'emr_sg', 'Status': {'State': 'STARTING', 'StateChangeReason': {}, 'Timeline': {'CreationDateTime': datetime.datetime(2023, 6, 19, 12, 38, 4, 611000, tzinfo=tzlocal())}}, 'Ec2InstanceAttributes': {'Ec2KeyName': 'ec2-key-pair', 'Ec2SubnetId': 'subnet-b5e7abfc', 'RequestedEc2SubnetIds': ['subnet-b5e7abfc'], 'RequestedEc2AvailabilityZones': [], 'IamInstanceProfile': 'arn:aws:iam::852288348919:instance-profile/emr-instance-profile', 'EmrManagedMasterSecurityGroup': 'sg-0b8000bcbeeab5697', 'EmrManagedSlaveSecurityGroup': 'sg-00b99738b04b80963'}, 'InstanceCollectionType': 'INSTANCE_GROUP', 'ReleaseLabel': 'emr-6.5.0', 'AutoTerminate': False, 'TerminationProtected': False, 'VisibleToAllUsers': True, 'Applications': [{'Name': 'Hadoop', 'Version': '3.2.1'}, {'Name': 'Spark', 'Version': '3.1.2'}], 'Tags': [], 'ServiceRole': 'EMR_DefaultRole', 'NormalizedInstanceHours': 0, 'Configurations': [], 'ScaleDownBeha

{'Clusters': [],
 'ResponseMetadata': {'RequestId': 'd60851a0-4125-4ed8-8789-050145458900',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd60851a0-4125-4ed8-8789-050145458900',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '15',
   'date': 'Mon, 19 Jun 2023 04:38:07 GMT'},
  'RetryAttempts': 0}}

In [69]:
### get master node DNS name ###
# The status cell above can still report STARTING, and the master DNS name may
# not be available that early. Block until the cluster is up so this cell also
# works under Restart & Run All. The waiter raises if the cluster terminates instead.
emr_client.get_waiter('cluster_running').wait(ClusterId=cluster_id)

response = emr_client.describe_cluster(ClusterId=cluster_id)
master_public_dns = response['Cluster']['MasterPublicDnsName']
print(master_public_dns)

ec2-52-76-7-72.ap-southeast-1.compute.amazonaws.com


In [68]:
### allow ssh ###
# NOTE(review): 0.0.0.0/0 opens port 22 to every IPv4 source — confirm this is intended.
# Any failure (for example, the rule already existing) is printed, not raised.
try:
    response = emr_client.describe_cluster(ClusterId=cluster_id)

    # The rule is added to the master node's EMR-managed security group.
    ec2_attributes = response['Cluster']['Ec2InstanceAttributes']
    security_group_id = ec2_attributes['EmrManagedMasterSecurityGroup']

    # Inbound TCP on port 22 only.
    ssh_rule = {
        'IpProtocol': 'tcp',
        'FromPort': 22,
        'ToPort': 22,
        'IpRanges': [{'CidrIp': '0.0.0.0/0'}],
    }
    response = ec2_client.authorize_security_group_ingress(
        GroupId=security_group_id,
        IpPermissions=[ssh_rule],
    )
except Exception as error:
    print(error)

An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 22, to port: 22, ALLOW" already exists


In [86]:
### create elastic ip ###

# create elastic ip
# response = ec2_client.allocate_address(Domain='vpc')
# elastic_ip = response['PublicIp']
# tag_key = 'name'
# tag_value = 'emr_elastic_ip'
# # Add tags to the Elastic IP address
# ec2_client.create_tags(
#     Resources=[response['AllocationId']],
#     Tags=[{'Key': tag_key, 'Value': tag_value}]
# )
# print("Elastic IP address created and tagged:", elastic_ip)
# Reuse the Elastic IP that was allocated earlier (see the commented-out code above).
elastic_ip = '13.251.215.161'
print("Elastic IP address created and tagged:", elastic_ip)

# associate_address needs the allocation ID, not the public IP itself.
response = ec2_client.describe_addresses(PublicIps=[elastic_ip])
allocation_id = response['Addresses'][0]['AllocationId']
print("Allocation ID:", allocation_id)

### get emr instance id ###
response = emr_client.list_instances(ClusterId=cluster_id)
instances = response['Instances']

# Start from None so a missing match is detected here, instead of raising a
# NameError below or silently reusing an instance_id left over in the kernel.
instance_id = None
for instance in instances:
    if instance.get('PublicDnsName') == master_public_dns:
        instance_id = instance['Ec2InstanceId']
        break
if instance_id is None:
    raise RuntimeError(
        f"No instance in cluster {cluster_id} has public DNS name {master_public_dns!r}"
    )
print("EC2 instance ID:", instance_id)

# Left as the last expression so the notebook displays the association response.
ec2_client.associate_address(
    AllocationId=allocation_id,
    InstanceId=instance_id
)

Elastic IP address created and tagged: 13.251.215.161
Allocation ID: eipalloc-066a0d310d6c7462d
EC2 instance ID: i-0716d42fbe3e9a28f


{'AssociationId': 'eipassoc-02f1cb7e7fa3e6f67',
 'ResponseMetadata': {'RequestId': '18a7c4e4-7f4b-4875-bfbc-2170717306f5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '18a7c4e4-7f4b-4875-bfbc-2170717306f5',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '295',
   'date': 'Mon, 19 Jun 2023 05:00:32 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

In [None]:
# ssh -i /var/ssh/ec2-key-pair.pem hadoop@13.251.215.161

In [95]:
### Terminate the EMR cluster ###
response = emr_client.terminate_job_flows(JobFlowIds=[cluster_id])
# terminate_job_flows only submits the request; wait until the cluster has
# actually reached TERMINATED so the message below is true when it prints.
# The waiter raises if the cluster ends in TERMINATED_WITH_ERRORS instead.
emr_client.get_waiter('cluster_terminated').wait(ClusterId=cluster_id)
print(f"EMR cluster {cluster_id} terminated.")

EMR cluster j-1V6LEV0IWI13X terminated.
