### Playing with terraform

See Example 1 for the blurb. Here we try something a bit more serious: a Gromacs MD job.
    

In [1]:
import json
import os.path as op
import os
import subprocess

First a command to run terraform (assumed here to be using the Docker image):

In [2]:
def terraform(command):
    """
    Run a terraform sub-command inside the hashicorp/terraform Docker image.

    The current directory is mounted as the container's working directory and
    ~/.aws is mounted at /root/.aws. `command` is appended verbatim to the
    docker invocation and the whole line is run through the shell.

    Returns the subprocess.CompletedProcess, with stdout and stderr captured
    as bytes.
    """
    docker_prefix = 'docker run -i -v "$PWD":/wd -w /wd -v "$HOME"/.aws:/root/.aws hashicorp/terraform:light'
    command_line = ' '.join([docker_prefix, command])
    # shell=True is needed so that $PWD and $HOME are expanded.
    return subprocess.run(command_line, shell=True, capture_output=True)

Run 'terraform apply':

In [3]:
# Apply the current terraform configuration without prompting.
result = terraform('apply -no-color -auto-approve')
if result.returncode != 0:
    # Terraform (and docker itself) report errors on stderr, so show both
    # streams; stdout alone usually does not say why the run failed.
    print(result.stdout.decode())
    print(result.stderr.decode())

Load the terraform .tfstate file, and find out how to get various useful stuff out of it...

In [4]:
def get_tfstate():
    """
    Summarise 'terraform.tfstate' from the current directory.

    Returns a dict with four entries:
      'workers'     - attributes of every instance that has an 'ami'
                      attribute, keyed by the instance's 'JobId' tag
      'bucket_name' - 'bucket' attribute of the last instance that has one
      'region'      - 'region' attribute of that same instance
      'key_name'    - 'key_name' attribute of the last instance that has one
                      (None if there is none)
    """
    with open('terraform.tfstate') as state_file:
        state = json.load(state_file)

    # Flatten resources -> instances into one list of attribute dicts,
    # keeping the order in which they appear in the state file.
    all_attributes = [
        instance['attributes']
        for resource in state['resources']
        for instance in resource['instances']
    ]

    workers = {
        attributes['tags']['JobId']: attributes
        for attributes in all_attributes
        if 'ami' in attributes
    }

    # For the bucket and the key name the last match in the file wins.
    bucket = None
    key_name = None
    for attributes in all_attributes:
        if 'bucket' in attributes:
            bucket = attributes
        if 'key_name' in attributes:
            key_name = attributes['key_name']

    return {
        'workers': workers,
        'bucket_name': bucket['bucket'],
        'region': bucket['region'],
        'key_name': key_name,
    }

In [5]:
# Load the summary of terraform.tfstate (workers, bucket name, region and
# key name) for use in the cells below.
tfstate = get_tfstate()

Check existing workers (there may be none at this stage)

In [6]:
# One line per worker: job id, public IP and spot bid status.
workers = tfstate['workers']
for w in workers:
    print(w, *(workers[w][field] for field in ('public_ip', 'spot_bid_status')))

In [7]:
# Keep the bucket and key names as plain variables for the later cells,
# and show them together with the region.
bucket_name = tfstate['bucket_name']
key_name = tfstate['key_name']
print(bucket_name, key_name, tfstate['region'], sep='\n')

laughtongroup.charlie.xbow
terraform_ec2_key
eu-west-1


Parse a job file and extract the options

In [8]:
def parse_script(scriptfile):
    """
    Extract xbow parameters from a script file

    Only lines that start with '#XBOW ' are considered. Each such line must
    consist of exactly two whitespace-separated words, the second of the form
    '--key=value'. The value is everything after the first '=', so it may
    itself contain '=' characters.

    Returns a dict mapping each key to its value (a str), or to a list of
    values, in file order, when the key is given more than once.

    Raises ValueError if a '#XBOW ' line does not have the expected form.
    """
    collected = {}
    with open(scriptfile) as f:
        for line in f:
            if not line.startswith('#XBOW '):
                continue
            words = line.split()
            if len(words) != 2:
                raise ValueError('Error cannot parse {}'.format(line))
            paramdef = words[1]
            if not paramdef.startswith('--') or '=' not in paramdef:
                raise ValueError('Error cannot parse {}'.format(line))
            # Split on the first '=' only, so values such as 'A=B' survive.
            key, _, value = paramdef[2:].partition('=')
            collected.setdefault(key, []).append(value)
    # A key that appears once maps to the bare value rather than a list.
    return {
        key: values[0] if len(values) == 1 else values
        for key, values in collected.items()
    }

In [9]:
# The job script doubles as the job description: its '#XBOW --key=value'
# lines carry the options (see the recorded output below).
job_script = 'runjob.sh'
job_options = parse_script(job_script)
print(job_options)

{'instance_type': 'p2.xlarge', 'upload': 'bpti-md.tpr'}


In [10]:
def next_job_id(bucket_name):
    """
    The next job should have an id one greater than the largest so far

    Job ids are taken from the top-level prefixes that 'aws s3 ls' reports
    for the bucket (lines of the form 'PRE <id>/'). Prefixes that are not
    plain decimal numbers, and ordinary objects at the bucket root, are
    ignored.

    Returns 0 when the bucket holds no numbered prefixes.
    Raises RuntimeError if the 'aws s3 ls' command fails.
    """
    result = subprocess.run(['aws', 's3', 'ls', 's3://{}/'.format(bucket_name)], capture_output=True)
    if result.returncode != 0:
        raise RuntimeError('Error getting job ids from bucket')
    job_ids = []
    for line in result.stdout.decode().splitlines():
        words = line.split()
        # Require the exact 'PRE <name>/' shape: a substring test would also
        # match object listings whose file name merely contains 'PRE'.
        if len(words) != 2 or words[0] != 'PRE':
            continue
        prefix = words[1].rstrip('/')
        if prefix.isdecimal():
            job_ids.append(int(prefix))
    return max(job_ids) + 1 if job_ids else 0

In [11]:
# Pick the id for the new job from what is already in the bucket.
next_job = next_job_id(bucket_name)
print(next_job)

0


In [12]:
def create_instance_tf_file(instance_spec):
    """
    Write the terraform definition of one spot-instance worker.

    `instance_spec` must contain 'job_index', 'instance_type', 'xbow_bucket'
    and 'key_name'; a ValueError names the first one that is missing.
    ('xbow_bucket' is checked for but not substituted into the template.)

    The definition, plus outputs for the worker's public ip, spot request
    state and spot bid status, is written to 'worker_<job_index>.tf' in the
    current directory. Returns that file name.
    """
    required = ('job_index', 'instance_type', 'xbow_bucket', 'key_name')
    missing = [name for name in required if name not in instance_spec]
    if missing:
        raise ValueError('Error - instance specification missing required key {}'.format(missing[0]))

    # Doubled braces are literal braces in the rendered terraform; single
    # braces are str.format placeholders.
    worker_template = """resource "aws_spot_instance_request" "worker_{job_index}" {{
  ami           = data.aws_ami.base.id
  instance_type = "{instance_type}"
  key_name = "{key_name}"
  security_groups = ["allow_ssh"]
  iam_instance_profile = "EC2InstanceRole"

  depends_on = [aws_s3_bucket.xbow_bucket]

  tags = {{
    Name = "Worker-{job_index}"
    JobId = "{job_index}"
  }}
}}

output "worker_{job_index}_public_ip" {{
  value = aws_spot_instance_request.worker_{job_index}.public_ip
}}
output "worker_{job_index}_spot_request_state" {{
  value = aws_spot_instance_request.worker_{job_index}.spot_request_state
}}
output "worker_{job_index}_spot_bid_status" {{
  value = aws_spot_instance_request.worker_{job_index}.spot_bid_status
}}
    """
    tf_file = 'worker_{job_index}.tf'.format(**instance_spec)
    with open(tf_file, 'w') as handle:
        handle.write(worker_template.format(**instance_spec))
    return tf_file

In [13]:
# Describe the worker for this job and write its terraform file.
instance_spec = dict(
    job_index=next_job,
    instance_type=job_options['instance_type'],
    xbow_bucket=bucket_name,
    key_name=key_name,
)
tf_file = create_instance_tf_file(instance_spec)

In [14]:
# Apply again so that the newly written worker .tf file takes effect.
result = terraform('apply -no-color -auto-approve')
if result.returncode != 0:
    # Terraform (and docker itself) report errors on stderr, so show both
    # streams; stdout alone usually does not say why the run failed.
    print(result.stdout.decode())
    print(result.stderr.decode())

Check the state of the instance - if not ready yet don't worry, we can get on with uploading data to the s3 bucket:

In [15]:
# Re-read the state and show, per worker, the public IP and spot request
# state (the IP is None in the recorded run while the request is open).
tfstate = get_tfstate()
workers = tfstate['workers']
for w in workers:
    print(w, *(workers[w][field] for field in ('public_ip', 'spot_request_state')))

0 None open


Transferring files onto the instance via an intermediate s3 bucket (for resilience/backup)

In [16]:
class S3Stager(object):
    """
    A thing for moving files to and from instances via s3

    Every method shells out to the 'aws' command line tool (and, for sync(),
    to 'ssh') and returns the subprocess.CompletedProcess of the last command
    it ran, with stdout/stderr captured as bytes. Callers are expected to
    check its returncode.
    """
    def __init__(self, bucket_id, remote_ip, key_name, remote_dir):
        """
        bucket_id:  name of the s3 bucket (without the 's3://' prefix)
        remote_ip:  address of the instance; used as 'ubuntu@<remote_ip>'
        key_name:   passed to 'ssh -i' as the identity file
        remote_dir: directory on the instance, also used as the prefix
                    inside the bucket
        """
        self.bucket_uri = 's3://{}'.format(bucket_id)
        # S3 URIs always use '/', so join explicitly rather than with the
        # platform-dependent os.path.join.
        self.blob_base = '{}/{}'.format(self.bucket_uri, remote_dir)
        self.remote_ip = remote_ip
        self.key_name = key_name
        self.remote_dir = remote_dir

    def upload(self, filenames):
        """
        Upload one or more local files to the s3 bucket

        filenames may be a single name or a list of names. Stops at the first
        failing copy and returns its result; otherwise returns the result of
        the last copy. An empty list is a successful no-op.
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        targetdir = self.blob_base + '/'
        # Placeholder so that an empty list still returns something with a
        # returncode, instead of hitting an unbound local.
        result = subprocess.CompletedProcess(args=[], returncode=0, stdout=b'', stderr=b'')
        for filename in filenames:
            result = subprocess.run(['aws', 's3', 'cp', filename, targetdir], capture_output=True)
            if result.returncode != 0:
                break
        return result

    def _remote_sync(self, source, destination):
        """Run 'aws s3 sync source destination' on the instance, over ssh."""
        return subprocess.run(['ssh', '-i', self.key_name, '-o', 'StrictHostKeyChecking=no',
                               'ubuntu@{}'.format(self.remote_ip),
                               'aws', 's3', 'sync', source, destination], capture_output=True)

    def sync(self):
        """
        Synchronise all files between the s3 bucket and the instance

        First pulls bucket -> instance, then pushes instance -> bucket. If the
        pull fails its result is returned and the push is not attempted.
        """
        result = self._remote_sync(self.blob_base, self.remote_dir)
        if result.returncode != 0:
            return result
        return self._remote_sync(self.remote_dir, self.blob_base)

    def download(self, filenames):
        """
        Download files from the S3 bucket to the current directory

        filenames may be a single name/pattern or a list of them; only
        matching files are fetched.
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        # No shell is involved, so patterns must be passed unquoted: quote
        # characters would become part of the pattern and match nothing.
        # Exclude everything first, then re-include what was asked for.
        command = ['aws', 's3', 'sync', self.blob_base, '.', '--exclude', '*']
        for filename in filenames:
            command += ['--include', filename]
        return subprocess.run(command, capture_output=True)

    def ls(self):
        """
        List the contents of the s3 bucket
        """
        result = subprocess.run(['aws', 's3', 'ls', self.blob_base + '/'], capture_output=True)
        return result

    def purge(self):
        """
        Remove all files from the s3 bucket

        Only objects under this stager's own prefix are removed.
        """
        result = subprocess.run(['aws', 's3', 'rm', self.blob_base + '/', '--recursive'], capture_output=True)
        return result

        

Transfer files to the instance, via the s3 bucket:

In [36]:
# Workers are keyed by their JobId tag, which is written to the .tf file as
# a quoted string, so look the worker up by the string form of the job id.
next_job = str(next_job)
stager = S3Stager(bucket_name, workers[next_job]['public_ip'], key_name, next_job)
files_to_upload = job_options['upload']
if isinstance(files_to_upload, list):
    # Copy: appending to the list held in job_options would add the job
    # script again every time this cell is re-run.
    files_to_upload = list(files_to_upload)
else:
    files_to_upload = [files_to_upload]
files_to_upload.append(job_script)
result = stager.upload(files_to_upload)
if result.returncode != 0:
    print(result)

In [28]:
# Show what is now stored under this job's prefix in the bucket.
result = stager.ls()
print(result.stdout.decode())

2020-01-24 18:15:32     799108 bpti-md.tpr
2020-01-24 18:15:33        165 runjob.sh



If necessary, wait for confirmation that the instance is ready

In [29]:
# Refresh the terraform state, then re-read it to see whether the worker
# now has a public IP.
result = terraform('refresh -no-color')
if result.returncode != 0:
    # Terraform (and docker itself) report errors on stderr, so show both
    # streams; stdout alone usually does not say why the run failed.
    print(result.stdout.decode())
    print(result.stderr.decode())
tfstate = get_tfstate()
workers = tfstate['workers']
for w in workers:
    print(w, workers[w]['public_ip'], workers[w]['spot_request_state'])

0 52.208.232.28 active


Now files can be transferred from s3 to the new instance (note we have to recreate the stager object, to make sure it now has a valid ip address to connect to):

In [37]:
# Rebuild the stager so it picks up the worker's public IP from the freshly
# read state, then sync bucket and instance in both directions.
stager = S3Stager(bucket_name, workers[next_job]['public_ip'], key_name, next_job)
result = stager.sync()
if result.returncode != 0:
    # Printing the whole CompletedProcess shows args, stdout and stderr.
    print(result)

CompletedProcess(args=['ssh', '-i', 'terraform_ec2_key', '-o', 'StrictHostKeyChecking=no', 'ubuntu@52.208.232.28', 'aws', 's3', 'sync', '0', 's3://laughtongroup.charlie.xbow/0'], returncode=1, stdout=b'Completed 1 file(s) with ~0 file(s) remaining (calculating...)\r', stderr=b'upload failed: 0/dask-worker-space/global.lock to s3://laughtongroup.charlie.xbow/0/dask-worker-space/global.lock seek() takes 2 positional arguments but 3 were given\nupload failed: 0/dask-worker-space/purge.lock to s3://laughtongroup.charlie.xbow/0/dask-worker-space/purge.lock seek() takes 2 positional arguments but 3 were given\n')


Run a command on a remote instance:

In [22]:
def remote_run(public_ip, key_name, command):
    """
    Run `command` on a remote instance over ssh.

    Connects as 'ubuntu@<public_ip>' using `key_name` as the ssh identity
    file, with host key checking disabled. `command` is split on whitespace
    and the words are passed as the remote command.

    Returns the subprocess.CompletedProcess, with stdout and stderr captured
    as bytes.
    """
    ssh_prefix = ['ssh', '-i', key_name, '-o', 'StrictHostKeyChecking=no', 'ubuntu@{}'.format(public_ip)]
    return subprocess.run([*ssh_prefix, *command.split()], capture_output=True)

Submit the job:

In [38]:
# Queue the job script with tsp from inside the job's directory on the
# worker. The printed value is whatever tsp writes to stdout (the recorded
# run shows a number - presumably the tsp job id; confirm).
result = remote_run(workers[next_job]['public_ip'], key_name, 'cd {} &&  tsp sh {}'.format(next_job, job_script))
print(result.stdout.decode())

2



Check the job:

In [42]:
# With no arguments tsp prints its job table (see the recorded output).
result = remote_run(workers[next_job]['public_ip'], key_name, 'tsp')
print(result.stdout.decode())

ID   State      Output               E-Level  Times(r/u/s)   Command [run=0/1]
0    finished   /tmp/ts-out.b05bWj   1        25.23/6.54/0.65 sh runjob.sh
1    finished   /tmp/ts-out.ZpOBrj   127      1.53/1.39/0.10 sh runjob.sh
2    finished   /tmp/ts-out.LXpEl0   0        56.42/1.50/0.17 sh runjob.sh



In [41]:
# 'tsp -c' prints captured job output (the recorded run shows the job's
# log) - presumably for the most recent job; confirm against the tsp manual.
result = remote_run(workers[next_job]['public_ip'], key_name, 'tsp -c')
print(result.stdout.decode())

Succesfully installed gromacs 2019-cuda
Unable to find image 'claughton/gromacs:2019-cuda' locally
2019-cuda: Pulling from claughton/gromacs
18d680d61657: Pulling fs layer
0addb6fece63: Pulling fs layer
78e58219b215: Pulling fs layer
eb6959a66df2: Pulling fs layer
c6aa9245dd49: Pulling fs layer
f0233a859d9b: Pulling fs layer
b4c3e8ab5f01: Pulling fs layer
04f88a0781f7: Pulling fs layer
57f0a9a9301a: Pulling fs layer
f0233a859d9b: Waiting
04f88a0781f7: Waiting
57f0a9a9301a: Waiting
b4c3e8ab5f01: Waiting
eb6959a66df2: Waiting
c6aa9245dd49: Waiting
0addb6fece63: Verifying Checksum
0addb6fece63: Download complete
78e58219b215: Verifying Checksum
78e58219b215: Download complete
18d680d61657: Verifying Checksum
18d680d61657: Download complete
eb6959a66df2: Verifying Checksum
eb6959a66df2: Download complete
c6aa9245dd49: Verifying Checksum
c6aa9245dd49: Download complete
b4c3e8ab5f01: Verifying Checksum
b4c3e8ab5f01: Download complete
18d680d61657: Pull complete
0addb6fece63: Pull complete
78

In [43]:
# List the job directory on the worker to see which files the run produced.
result = remote_run(workers[next_job]['public_ip'], key_name, 'ls {}'.format(next_job))
print(result.stdout.decode())

bpti-md.cpt
bpti-md.edr
bpti-md.gro
bpti-md.log
bpti-md.tpr
dask-worker-space
runjob.sh



Re-sync the worker with the s3 bucket, so it's safe to destroy it

In [44]:
# Sync again so the files produced by the job are copied to the bucket.
result = stager.sync()
if result.returncode != 0:
    # Printing the whole CompletedProcess shows args, stdout and stderr.
    print(result)

CompletedProcess(args=['ssh', '-i', 'terraform_ec2_key', '-o', 'StrictHostKeyChecking=no', 'ubuntu@52.208.232.28', 'aws', 's3', 'sync', '0', 's3://laughtongroup.charlie.xbow/0'], returncode=1, stdout=b'Completed 0 Bytes/1.8 MiB (0 Bytes/s) with 5 file(s) remaining\rCompleted 0 Bytes/1.8 MiB (0 Bytes/s) with 4 file(s) remaining\rCompleted 1.5 KiB/1.8 MiB (67.9 KiB/s) with 4 file(s) remaining\rupload: 0/bpti-md.edr to s3://laughtongroup.charlie.xbow/0/bpti-md.edr\nCompleted 1.5 KiB/1.8 MiB (67.9 KiB/s) with 3 file(s) remaining\rCompleted 257.5 KiB/1.8 MiB (6.0 MiB/s) with 3 file(s) remaining\rCompleted 513.5 KiB/1.8 MiB (10.3 MiB/s) with 3 file(s) remaining\rCompleted 769.5 KiB/1.8 MiB (15.0 MiB/s) with 3 file(s) remaining\rCompleted 1.0 MiB/1.8 MiB (19.5 MiB/s) with 3 file(s) remaining  \rCompleted 1.3 MiB/1.8 MiB (23.9 MiB/s) with 3 file(s) remaining  \rCompleted 1.5 MiB/1.8 MiB (28.1 MiB/s) with 3 file(s) remaining  \rCompleted 1.5 MiB/1.8 MiB (20.5 MiB/s) with 3 file(s) remaining  \r

Remove the worker

In [45]:
# Delete the worker's .tf file; the 'terraform apply' in the next cell then
# runs without this worker in the configuration.
os.remove(tf_file)

In [46]:
# Apply once more now that the worker's .tf file has been deleted.
result = terraform('apply -no-color -auto-approve')
if result.returncode != 0:
    # Terraform (and docker itself) report errors on stderr, so show both
    # streams; stdout alone usually does not say why the run failed.
    print(result.stdout.decode())
    print(result.stderr.decode())

Download the results file from the s3 bucket, which can then be cleaned out

In [47]:
# Ask the stager to fetch files matching '*' from this job's prefix in the
# bucket into the current directory.
result = stager.download("*")
if result.returncode != 0:
    print(result)

In [48]:
# Recursively delete everything under this job's prefix in the bucket.
result = stager.purge()
if result.returncode != 0:
    print(result)