# AWS ParallelCluster Programmatic Configuration

Investigate the `pcluster configure` arguments so that the cluster configuration can be generated programmatically.

In [2]:
import pcluster
import pcluster.configure.easyconfig as easyconfig
import requests
import random

from pcluster.cluster_model import ClusterModel
from pcluster.config.hit_converter import HitConverter
from pcluster.config.pcluster_config import PclusterConfig
from pcluster.config.validators import HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES, HEAD_NODE_UNSUPPORTED_MESSAGE
from pcluster.configure.networking import (
    NetworkConfiguration,
    PublicPrivateNetworkConfig,
    automate_subnet_creation,
    automate_vpc_with_subnet_creation,
)
from pcluster.configure.utils import get_regions, get_resource_tag, handle_client_exception, prompt, prompt_iterable
from pcluster.utils import (
    error,
    get_default_instance_type,
    get_region,
    get_supported_az_for_multi_instance_types,
    get_supported_az_for_one_instance_type,
    get_supported_compute_instance_types,
    get_supported_instance_types,
    get_supported_os_for_scheduler,
    get_supported_schedulers,
)
from pcluster.configure.easyconfig import ClusterConfigureHelper

import json
import logging
import os
import sys
from collections import OrderedDict
import boto3
import tempfile
import string

from datetime import datetime
from jinja2 import Environment, BaseLoader


## Create A Project ID

Create a random project ID for the state, buckets, etc.

In [3]:
# Date stamp (YYYYMMDD) used as the prefix of the project ID.
# It is pinned to a fixed value so the prefix does not change between runs;
# set PINNED_DATE to None to fall back to today's date.
# NOTE(review): presumably pinned to match resources created on that day - confirm.
PINNED_DATE = '20210127'
today = PINNED_DATE or datetime.today().strftime('%Y%m%d')

# Random suffix of N lowercase letters / digits.
N = 6
random_string = ''.join(random.choices(string.ascii_lowercase + string.digits, k=N))

# Project ID of the form '<date>-<random suffix>'.
ID = '{today}-{random_string}'.format(today=today, random_string=random_string)

In [14]:
# Every value below is passed in and supplied through cookiecutter.
# The terraform output is collected as well.
# pcluster is used to create the subnets because it is very picky about subnets.

# Queue name -> instance types offered by that queue, in declaration order.
# The same instance types also make up the flat list of compute resources.
_QUEUE_INSTANCE_TYPES = [
    ('dev', ['t3a.medium', 't3a.large', 't3a.2xlarge']),
    ('cpu', ['m4.large', 'm4.xlarge', 'm4.2xlarge']),
    ('gpu', ['g4dn.xlarge', 'g4dn.2xlarge', 'g4dn.4xlarge']),
]

# EFS file system name -> performance mode.
_EFS_PERFORMANCE_MODES = [
    ('apps', 'generalPurpose'),
    ('scratch', 'maxIO'),
]

# Keys shared by the terraform recipe and terraform output sections.
_TERRAFORM_STAGES = ('terraform_state', 'pcluster_resources', 'pcluster_apps')

CONFIG = {
    'id': ID,
    'hosted_zone_id': '',
    '_copy_without_render': [
        'files/installation/deploy_jupyterhub/jupyterhub_config.py',
    ],
    'project': 'slurm-cluster',
    'stage': 'development',
    'vpc_id': 'vpc-63c3bb0b',
    'aws_region': 'eu-west-2',
    'custom_ami_id': '',
    'cloudformation_stack': 'parallelcluster-{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}',
    # These all get read in from terraform
    's3_installation_bucket': "{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}-installation",
    's3_user_data_bucket': "{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}-user-data",
    's3_admin_bucket': "{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}-admin",
    'tags': {
        'Name': '{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}',
        'Project': '{{cookiecutter.project}}',
        'Stage': '{{cookiecutter.stage}}',
    },
    # Terraform recipe vars
    'terraform_recipes': {
        # Bootstrap the state
        'terraform_state': 'terraform-state',
        # Supply the resources, build the custom AMI
        'pcluster_resources': 'pcluster-resources',
        # After install all the apps - easybuild, modules, etc
        'pcluster_apps': 'pcluster-apps',
    },
    # Terraform state vars.
    # These end up looking different because the module tacks on some names.
    "terraform_state": {
        "s3_bucket": "{{cookiecutter.project}}-{{cookiecutter.id}}",
        "s3_bucket_full_name": "{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}-terraform-state",
        "dynamo_db_table": "{{cookiecutter.project}}-{{cookiecutter.id}}-{{cookiecutter.stage}}-terraform-state-lock",
    },
    # Store the terraform output: one initially empty dict per stage
    'terraform_output': {stage_key: {} for stage_key in _TERRAFORM_STAGES},
    # PCluster vars
    'pcluster': {
        'master_instance_type': 't3a.2xlarge',
        'scheduler': 'slurm',
        'base_os': 'alinux2',
        'pcluster_version': '2.10.1',
        'key_pair': '',
        'min_cluster_size': 0,
        'max_cluster_size': 100,
        'head_node_instance_type': 't3a.2xlarge',
        # This doesn't matter because we're using queues
        # But still used
        'compute_node_instance_type': 't3a.2xlarge',
        'master_subnet_id': '',
        'compute_subnet_id': '',
        # One compute resource per instance type, each scaling from 0 to 100
        'compute_resources': [
            {
                'instance_type': instance_type,
                'min_count': 0,
                'max_count': 100,
            }
            for _, instance_types in _QUEUE_INSTANCE_TYPES
            for instance_type in instance_types
        ],
        # One queue per group of instance types, EFA disabled everywhere
        'queues': [
            {
                'name': queue_name,
                'enable_efa': False,
                'enable_efa_gdr': False,
                'compute_resource_instance_types': list(instance_types),
            }
            for queue_name, instance_types in _QUEUE_INSTANCE_TYPES
        ],
        'queue_settings': '',
        'efs_resources': [
            {
                "name": efs_name,
                "efs_id": False,
                "performance_mode": performance_mode,
            }
            for efs_name, performance_mode in _EFS_PERFORMANCE_MODES
        ],
    }
}


BASE_DIR = '/home/jovyan/etcembly/'
os.makedirs(BASE_DIR, exist_ok=True)

# The same CONFIG is written to the project directory and to each
# terraform-module cookiecutter template, in this order.
COOKIECUTTER_JSON_TARGETS = (
    os.path.join(BASE_DIR, 'cookiecutter.json'),
    '/home/jovyan/app/aws_parallelcluster_wrapper/_cookiecutter_templates/terraform-modules/terraform-state/cookiecutter.json',
    '/home/jovyan/app/aws_parallelcluster_wrapper/_cookiecutter_templates/terraform-modules/pcluster-resources/cookiecutter.json',
    '/home/jovyan/app/aws_parallelcluster_wrapper/_cookiecutter_templates/terraform-modules/pcluster-apps/cookiecutter.json',
)

for target_path in COOKIECUTTER_JSON_TARGETS:
    with open(target_path, 'w') as outfile:
        json.dump(CONFIG, outfile, indent=4)
      

Then run

```bash
aws_parallelcluster_wrapper  apply-terraform-state  --config ~/etcembly/cookiecutter.json --outdir ~/etcembly/project/slurm-cluster-development/ 
aws_parallelcluster_wrapper deploy-pcluster-resources  --config ~/etcembly/cookiecutter.json --outdir ~/etcembly/project/slurm-cluster-development/ --apply
aws_parallelcluster_wrapper create-pcluster  --config ~/etcembly/cookiecutter.json --outdir ~/etcembly/project/slurm-cluster-development/ --apply
```

## PConfig Cluster Section

### Region

Make sure a valid region is set: export the region from `CONFIG` as the default AWS region.

In [132]:
# Export the configured region (e.g. 'eu-west-2', London) as the default AWS region.
configured_region = CONFIG['aws_region']
os.environ["AWS_DEFAULT_REGION"] = configured_region

## CloudFormation stuff

In [85]:
import boto3

In [104]:
# Look up the AWS account ID of the credentials currently in use.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]
account_id

'858286506743'

In [88]:
# CloudFormation client used by the stack-inspection cells below.
client = boto3.client('cloudformation')

In [90]:
# Display the raw response of the CloudFormation list_exports call.
client.list_exports()

{'Exports': [],
 'ResponseMetadata': {'RequestId': '7108a5c7-b22a-4a32-bd7f-5b2d092370d6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7108a5c7-b22a-4a32-bd7f-5b2d092370d6',
   'content-type': 'text/xml',
   'content-length': '272',
   'date': 'Wed, 27 Jan 2021 12:19:23 GMT'},
  'RetryAttempts': 0}}

In [112]:
# Describe the ParallelCluster stack, looked up by its name.
stack_name = 'parallelcluster-slurm-cluster-20210127-development'
stacks = client.describe_stacks(StackName=stack_name)

In [116]:
# Number of stacks returned by describe_stacks.
len(stacks['Stacks'])

1

In [124]:
# Fields available on the first stack description.
stacks['Stacks'][0].keys()

dict_keys(['StackId', 'StackName', 'Description', 'Parameters', 'CreationTime', 'RollbackConfiguration', 'StackStatus', 'DisableRollback', 'NotificationARNs', 'Capabilities', 'Outputs', 'Tags', 'EnableTerminationProtection', 'DriftInformation'])

In [122]:
# Print the key of every stack parameter whose key mentions 'Master'.
for parameter in stacks['Stacks'][0]['Parameters']:
    key = parameter['ParameterKey']
    if 'Master' not in key:
        continue
    print('ParameterKey: {}'.format(key))
    

ParameterKey: MasterSubnetId
ParameterKey: MasterRootVolumeSize
ParameterKey: MasterInstanceType


In [125]:
# Print every output record of the first stack in the response.
stack_outputs = stacks['Stacks'][0]['Outputs']
for output in stack_outputs:
    print(output)

{'OutputKey': 'ArtifactS3RootDirectory', 'OutputValue': 'parallelcluster-slurm-cluster-20210127-develop-28jog97ymlhz3z72', 'Description': 'Root directory in S3 bucket where cluster artifacts are stored'}
{'OutputKey': 'IsHITCluster', 'OutputValue': 'true'}
{'OutputKey': 'ClusterUser', 'OutputValue': 'ec2-user', 'Description': 'Username to login to head node'}
{'OutputKey': 'MasterPrivateIP', 'OutputValue': '172.31.97.236', 'Description': 'Private IP Address of the head node'}
{'OutputKey': 'ResourcesS3Bucket', 'OutputValue': 'parallelcluster-p4b4uzx457i1111h', 'Description': 'S3 user bucket where AWS ParallelCluster resources are stored'}
{'OutputKey': 'ClusterConfigMetadata', 'OutputValue': '{"sections": {"cluster": ["default"], "scaling": ["default"], "vpc": ["default"]}}'}
{'OutputKey': 'GangliaPrivateURL', 'OutputValue': 'http://172.31.97.236/ganglia/', 'Description': 'Private URL to access Ganglia (disabled by default)'}


In [15]:
cfn = boto3.client('cloudformation')
my_stack_name = 'parallelcluster-slurm-cluster-20210127-development'

# Resources that belong directly to the cluster stack.
top_level_resources = cfn.describe_stack_resources(StackName=my_stack_name)['StackResources']

# Only nested stacks can themselves be passed to describe_stack_resources.
# Other resource types (for example the AWS::Lambda::Function in this stack)
# have a PhysicalResourceId that is not a stack, so they are skipped instead
# of being sent to the API.
nested_stack_resources = [
    cfn.describe_stack_resources(StackName=resource['PhysicalResourceId'])['StackResources']
    for resource in top_level_resources
    if resource['ResourceType'] == 'AWS::CloudFormation::Stack'
]

# One list of resources per nested stack.
nested_stack_resources

ClientError: An error occurred (ValidationError) when calling the DescribeStackResources operation: Stack with id parallelcluster-slurm-cluster-20210127-development does not exist

In [131]:
# Resources that belong directly to the cluster stack named by my_stack_name.
stack_resources = cfn.describe_stack_resources(StackName=my_stack_name)['StackResources']

In [132]:
# Inspect the first resource record to see which fields are available.
stack_resources[0]

{'StackName': 'parallelcluster-slurm-cluster-20210127-development',
 'StackId': 'arn:aws:cloudformation:us-east-1:858286506743:stack/parallelcluster-slurm-cluster-20210127-development/872a7680-6096-11eb-8ea4-1266e578c117',
 'LogicalResourceId': 'CleanupResourcesFunction',
 'PhysicalResourceId': 'pcluster-CleanupResources-872a7680-6096-11eb-8ea4-1266e578c117',
 'ResourceType': 'AWS::Lambda::Function',
 'Timestamp': datetime.datetime(2021, 1, 27, 11, 55, 47, 332000, tzinfo=tzlocal()),
 'ResourceStatus': 'CREATE_COMPLETE',
 'DriftInformation': {'StackResourceDriftStatus': 'NOT_CHECKED'}}