### xbow library proof of concept
This notebook illustrates how we might put together an xbow Python library to create and manage a cluster of instances that have a shared filesystem. In the walkthrough:

1. Some basic functions and classes are defined
2. A collection (list) of instances is created
3. A shared filesystem is created and attached to the instances
4. A gromacs test job is run on each instance, with selected files written to the shared filesystem
5. The filesystem and instances are terminated.

In [None]:
import boto, boto.ec2, boto.ec2.blockdevicemapping, boto.manage
import boto3
import paramiko
import time

A few parameters - you will need to set `key_name` and `pemfile` for certain.

In [None]:
# Region used for the EC2 connection and for building the EFS DNS name.
region = 'eu-west-1'
# Spot price handed to the spot request (passed through as a string).
price = '0.4'
# Instance type and machine image requested for every instance in the pool.
instance_type = 'm4.large'
image_id = 'ami-9d8421e4'
# Login name used for the SSH connections to the instances.
username = 'ubuntu'
# Size of the root volume in the block device mapping
# (presumably GiB - confirm against the boto BlockDeviceType docs).
disk_size = 20
# Whether the root volume is deleted together with its instance.
disk_delete_on_termination = True
# Security group names passed to the spot request.
security_groups = ['efs-walkthrough1-ec2-sg']
# Name of the EC2 key pair passed to the spot request.
key_name = 'calAWS'
# Path to the private key file (presumably the one matching key_name).
pemfile = '../calAWS.pem'

The cell below is based heavily on your existing xbow.py script, Chris. 

In [None]:
# Open the boto (v2) EC2 connection that the rest of the notebook passes around.
ec2 = boto.ec2.connect_to_region(region)
if ec2 is None:
    # connect_to_region() signals an unknown region name by returning None
    # instead of raising, so fail here rather than with an AttributeError later.
    raise ValueError('Unknown EC2 region: {!r}'.format(region))

# Block device mapping applied to every instance in the pool:
#   /dev/sda1 - gp2 root volume sized and cleaned up according to the
#               disk_size / disk_delete_on_termination parameters
#   /dev/sdb  - the first ephemeral (instance-store) device
bdm = boto.ec2.blockdevicemapping.BlockDeviceMapping()
bdm['/dev/sda1'] = boto.ec2.blockdevicemapping.BlockDeviceType(
    volume_type='gp2',
    size=disk_size,
    delete_on_termination=disk_delete_on_termination)
bdm['/dev/sdb'] = boto.ec2.blockdevicemapping.BlockDeviceType(
    ephemeral_name='ephemeral0')

def spot_instances_pool(ec2,
                        pool_name='DefaultPool',
                        count=1,
                        price=1.0,
                        image_id=None,
                        instance_type=None,
                        security_groups=None,
                        key_name=None,
                        block_device_map=None,
                        poll_interval=5):
    """Request a pool of spot instances and wait until they are usable.

    The function submits the spot requests, waits for them to leave the
    pending states, tags every launched instance with ``PoolName``, waits
    for the instances to leave the 'pending' state and finally waits for
    their system/instance status checks to finish initializing.

    Progress and problems are reported with ``print``; shortfalls (fewer
    fulfilled requests or running instances than ``count``) are reported
    but do not raise.

    Args:
        ec2: boto EC2 connection used for every API call.
        pool_name: value of the 'PoolName' tag added to each instance.
        count: number of instances to request.
        price: spot price passed to the request.
        image_id: machine image for the instances.
        instance_type: instance type for the instances.
        security_groups: security group names for the instances.
        key_name: name of the key pair for the instances.
        block_device_map: block device mapping for the instances.
        poll_interval: seconds to sleep between polls of the API.

    Returns:
        The list of instances that reached the 'running' state (possibly
        shorter than ``count``, and empty if nothing was fulfilled).
    """
    spot_reqs = ec2.request_spot_instances(price=price,
                                           image_id=image_id,
                                           count=count,
                                           instance_type=instance_type,
                                           security_groups=security_groups,
                                           key_name=key_name,
                                           block_device_map=block_device_map)
    request_ids = [req.id for req in spot_reqs]
    request_codes = [req.status.code for req in spot_reqs]

    # Wait for every pending state ('pending-evaluation' and
    # 'pending-fulfillment'), not just the first one, so that requests still
    # on their way to being fulfilled are not reported as failures.
    while any(str(code).startswith('pending') for code in request_codes):
        time.sleep(poll_interval)
        spot_reqs = ec2.get_all_spot_instance_requests(request_ids=request_ids)
        request_codes = [req.status.code for req in spot_reqs]

    if request_codes.count('fulfilled') != count:
        print('Error: not all spot requests were fulfilled')
    print(request_codes)

    fulfilled = [req for req in spot_reqs if req.status.code == 'fulfilled']
    if not fulfilled:
        # Never call the describe APIs with an empty id list: that means
        # "no filter" and would return (and tag) unrelated instances.
        return []
    request_ids = [req.id for req in fulfilled]
    instance_ids = [req.instance_id for req in fulfilled]

    # A fulfilled request may not have its instance id attached yet.
    while None in instance_ids:
        time.sleep(poll_interval)
        spot_reqs = ec2.get_all_spot_instance_requests(request_ids=request_ids)
        instance_ids = [req.instance_id for req in spot_reqs]
    print(instance_ids)

    # Short pause kept from the original implementation (presumably to let
    # the new instances become visible to the describe call).
    time.sleep(min(2, poll_interval))
    reservations = ec2.get_all_instances(instance_ids=instance_ids)
    # Instances can be spread over several reservations; collect all of them
    # instead of only looking at the first reservation.
    instances = [instance
                 for reservation in reservations
                 for instance in reservation.instances]
    for instance in instances:
        instance.add_tag('PoolName', pool_name)

    instance_states = [instance.state for instance in instances]
    while 'pending' in instance_states:
        time.sleep(poll_interval)
        for instance in instances:
            instance.update()
        instance_states = [instance.state for instance in instances]
    if instance_states.count('running') != count:
        print('Error - not all instances are running')
    print(instance_states)

    running_instances = [instance for instance in instances
                         if instance.state == 'running']
    if not running_instances:
        return []
    instance_ids = [instance.id for instance in running_instances]

    # Block until no status check is still 'initializing'.
    while True:
        all_statuses = ec2.get_all_instance_status(instance_ids=instance_ids)
        system_statuses = [s.system_status.status for s in all_statuses]
        instance_statuses = [s.instance_status.status for s in all_statuses]
        if ('initializing' not in system_statuses
                and 'initializing' not in instance_statuses):
            break
        time.sleep(poll_interval)
    return running_instances

Again, based in part on the code in xbow.py:

In [None]:
class ConnectedInstance(object):
    """An EC2 instance you can talk to over SSH.

    Wraps a boto instance together with the EC2 connection used to query
    it and a paramiko SSH client used to run commands on it.

    Attributes:
        status: 'Ready', 'Busy' (a command is running), 'Unwell' (not
            running or a status check is not 'ok') or 'Terminated'.
        state, system_status, instance_status: last values read from EC2
            (the two status values are None when EC2 reports none).
        output: combined stdout/stderr collected from the last command.
        exit_status: exit code of the last command, None while it runs,
            -1 if the command could not be started.
    """

    def __init__(self, instance, connection, username, key_filename):
        """Wrap `instance` and, if it is healthy, open the SSH connection.

        Args:
            instance: boto instance to wrap.
            connection: boto EC2 connection used for status queries and
                termination.
            username: login name for SSH.
            key_filename: path to the private key file for SSH.
        """
        self.instance = instance
        self.connection = connection
        self.username = username
        self.key_filename = key_filename
        self.pool_name = instance.tags.get('PoolName')
        # Defined up front so they can be read before any command was run.
        self.sshclient = None
        self.transport = None
        self.channel = None
        self.output = ''
        self.exit_status = None

        self.state = self.instance.state
        self._refresh_health()
        if self._is_healthy():
            self.status = 'Ready'
            self._connect()
        else:
            # The SSH connection is opened later, by exec_command(), once
            # the instance has become healthy.
            self.status = 'Unwell'

    def _refresh_health(self):
        """Re-read the system and instance status checks from EC2."""
        statuses = self.connection.get_all_instance_status(
            instance_ids=[self.instance.id])
        if statuses:
            self.system_status = statuses[0].system_status.status
            self.instance_status = statuses[0].instance_status.status
        else:
            # No status entry was returned for this instance.
            self.system_status = None
            self.instance_status = None

    def _is_healthy(self):
        """Return True if the instance is running and both checks are 'ok'."""
        return (self.state == 'running'
                and self.system_status == 'ok'
                and self.instance_status == 'ok')

    def _connect(self):
        """Open the SSH connection unless one already exists."""
        if getattr(self, 'sshclient', None) is not None:
            return
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(self.instance.ip_address, username=self.username,
                       key_filename=self.key_filename, timeout=10)
        self.sshclient = client
        self.transport = client.get_transport()

    def _drain_output(self):
        """Append everything currently readable on the channel to `output`."""
        while self.channel.recv_ready():
            self.output += self.channel.recv(1024)

    def update(self):
        """Update status info"""
        self.instance.update()
        self.state = self.instance.state
        # Compare the EC2 *state* here: `status` never holds 'shutting-down'.
        if self.state in ('terminated', 'shutting-down'):
            self.status = 'Terminated'
            self.instance_status = None
            self.system_status = None
            return

        self._refresh_health()
        if not self._is_healthy():
            self.status = 'Unwell'
        elif self.status in ('Unwell', 'Unknown'):
            self.status = 'Ready'

        if self.status == 'Busy':
            self._drain_output()
            if self.channel.exit_status_ready():
                self.status = 'Ready'
                self.exit_status = self.channel.recv_exit_status()

    def wait(self, timeout=None):
        """Wait until not busy.

        Args:
            timeout: maximum number of seconds to wait; None waits for as
                long as the command takes. If the timeout expires the
                method returns with the status still 'Busy'.
        """
        if self.status != 'Busy':
            return
        start_time = time.time()
        while not self.channel.exit_status_ready():
            if timeout is not None and time.time() - start_time > timeout:
                return
            # Keep reading while waiting so output is collected as it arrives.
            self._drain_output()
            time.sleep(5)

        self._drain_output()
        self.status = 'Ready'
        self.exit_status = self.channel.recv_exit_status()

    def terminate(self, force=False):
        """Terminate the instance.

        Args:
            force: terminate even if a command is still running.
        """
        # Value comparison: `is not` on a string literal tests identity.
        if force or self.status != 'Busy':
            self.connection.terminate_instances(instance_ids=[self.instance.id])
        else:
            print('Error - the instance is busy')

    def exec_command(self, script, block=True):
        """Send a command to the instance.

        Args:
            script: shell command line to execute.
            block: if True wait for the command to finish; if False return
                immediately (use wait() or update() to collect the result).

        If the instance is not ready the command is not run; `output`
        holds an error message and `exit_status` is -1.
        """
        self.update()
        if self.status == 'Unwell':
            self.output = 'Error - this instance is unwell'
            self.exit_status = -1
            return

        if self.status != 'Ready':
            self.output = 'Error - this instance is not ready'
            self.exit_status = -1
            return

        # The instance may have been unwell at construction time, in which
        # case no SSH connection was opened yet.
        self._connect()
        self.channel = self.sshclient.get_transport().open_session()
        self.channel.set_combine_stderr(True)
        self.channel.exec_command(script)
        self.status = 'Busy'
        self.exit_status = None
        self.output = ''
        if block:
            self.wait()
        

### OK, ready to go. Begin by creating a pool (list) of instances

In [None]:
# Request a two-instance spot pool tagged 'MDPool', built from the parameters
# set at the top of the notebook, and show the resulting instances.
instances = spot_instances_pool(
    ec2,
    pool_name='MDPool',
    count=2,
    price=price,
    image_id=image_id,
    instance_type=instance_type,
    security_groups=security_groups,
    key_name=key_name,
    block_device_map=bdm,
)
print(instances)

### Turn them into 'Connected Instances':

In [None]:
# Wrap every pool instance in a ConnectedInstance. The key file comes from the
# `pemfile` parameter so that changing it at the top of the notebook takes effect.
cis = [ConnectedInstance(inst, ec2, username, pemfile) for inst in instances]

In [None]:
# Show the first connected instance before and after refreshing its status.
print(cis[0].status)
print(cis[0].instance.state)
cis[0].update()
print(cis[0].status)
print('{} {} {} {}'.format(cis[0].system_status,
                           cis[0].instance_status,
                           cis[0].state,
                           cis[0].pool_name))

### Create the EFS filesystem:

In [None]:
# Create the EFS client in the same region as the EC2 instances: the mount
# command later builds the file system's DNS name from `region`, so the file
# system must not end up in whatever region boto3 is configured for by default.
efs_client = boto3.client('efs', region_name=region)
response = efs_client.create_file_system(CreationToken='MyTestFileSystem')
print(response)

In [None]:
# Keep the id of the new file system; the later cells use it to look up
# mount targets, build the mount DNS name and delete the file system.
FileSystemId = response['FileSystemId']
print FileSystemId

### Before we can mount the filesystem on the instances, there is a bit of work to do.
#### Firstly we need to know the subnet id in each availability zone in our region:

In [None]:
# boto3 EC2 client pinned to the notebook's region (boto3 would otherwise use
# its own configured default, which may differ from `region`).
ec2_client = boto3.client('ec2', region_name=region)
sn_all = ec2_client.describe_subnets()['Subnets']
# Map availability zone -> subnet id. If a zone has several subnets, the one
# listed last wins.
az_subnetid = {sn['AvailabilityZone']: sn['SubnetId'] for sn in sn_all}
for az in az_subnetid:
    print('{} {}'.format(az, az_subnetid[az]))

#### Next we need to know the security group ids associated with our security group names.

In [None]:
# Build a lookup from security group name to security group id and list it.
sg_all = ec2_client.describe_security_groups()['SecurityGroups']
sgnameid = {sg['GroupName']: sg['GroupId'] for sg in sg_all}
for sgname, sgid in sgnameid.items():
    print('{} {}'.format(sgname, sgid))

#### Now we can create the mount targets - one for each availability zone. The security group (efs-walkthrough1-mt-sg) allows them to have read-write access to the instance when they get mounted (later)

In [None]:
# One mount target per availability-zone subnet, each in the mount-target
# security group.
for az, subnet_id in az_subnetid.items():
    response2 = efs_client.create_mount_target(
        FileSystemId=response['FileSystemId'],
        SubnetId=subnet_id,
        SecurityGroups=[sgnameid['efs-walkthrough1-mt-sg']],
    )
    print(response2)

# Remember the ids of all mount targets of the file system for the clean-up.
MountTargetIds = []
for mt in efs_client.describe_mount_targets(FileSystemId=FileSystemId)['MountTargets']:
    MountTargetIds.append(mt['MountTargetId'])

#### Before we can mount the filesystem on the instances, we have to install the nfs client (it would be nice to update the ami we use so this is not necessary). Note how, by using the `block=False` option, all jobs run concurrently.

In [None]:
# Start the package installation and mount-point creation on every instance
# without blocking, then collect and show the output of each one.
setup_command = 'sudo apt-get update; sudo apt-get install nfs-common -y; mkdir ~/efs-mount-point'
for ci in cis:
    ci.exec_command(setup_command, block=False)
for ci in cis:
    ci.wait()
    print(ci.output)

#### Now we can mount the filesystem on each instance

In [None]:
# DNS name of the file system, derived from its id and the region.
dnsname = '{}.efs.{}.amazonaws.com'.format(FileSystemId, region)
# NFS mount of the file system root onto ~/efs-mount-point.
mount_command = (
    'sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 {}:/ ~/efs-mount-point'
).format(dnsname)

# Issue the mount on all instances first, then wait for each and show its output.
for c in cis:
    c.exec_command(mount_command, block=False)
for c in cis:
    c.wait()
    print(c.output)

#### We need to adjust the permissions on the shared filesystem

In [None]:
# Open up the mount point for group/other read-write access on every instance;
# start all commands first, then wait for each and show its output.
chmod_command = 'sudo chmod go+rw efs-mount-point'
for c in cis:
    c.exec_command(chmod_command, block=False)
for c in cis:
    c.wait()
    print(c.output)

#### Now we can try it out. First we copy a Gromacs .tpr file from the examples already on the instance to the shared filesystem. Note we only have to do this on one of our instances!

In [None]:
# Copy the example input onto the shared filesystem from the first instance.
# exec_command blocks by default, so the copy has finished before the listing runs.
cis[0].exec_command('cp tios/test/examples/bpti.tpr ~/efs-mount-point')
# List the shared directory from the second instance to confirm the file is visible there.
cis[1].exec_command('ls -l ~/efs-mount-point')
print cis[1].output

#### Now we can run the Gromacs job. A temporary directory is created where most of the stuff happens, but selected output files are written to the shared directory (with unique names)

In [None]:
# Start one mdrun per instance without blocking. Each job works in its own
# temporary directory and writes its log to the shared filesystem under a
# name made unique by the instance's index in `cis`.
for i, c in enumerate(cis):
    run_command = 'export TMPDIR=`mktemp -d`; cd $TMPDIR; source /usr/local/gromacs/2016.4/bin/GMXRC; gmx mdrun -s ~/efs-mount-point/bpti.tpr -g ~/efs-mount-point/gmx_logfile{}.log -nsteps 1000'.format(i)
    c.exec_command(run_command, block=False)

# Wait for every job to finish and show its output.
for c in cis:
    c.wait()
    print(c.output)

#### Check to see what we have now got in the shared filesystem:

In [None]:
# List the shared directory (blocking call) and show the result.
cis[0].exec_command('ls -l ~/efs-mount-point')
print cis[0].output

#### Now shut everything down

In [None]:
# Terminate the pool instances (skipped when there are none, so the API is
# never called with an empty id list).
instance_ids = [inst.id for inst in instances]
if instance_ids:
    ec2.terminate_instances(instance_ids=instance_ids)
time.sleep(2)

# Delete the mount targets, then wait until EFS no longer lists any of them:
# the file system cannot be deleted while mount targets still exist, and a
# fixed pause is not always long enough.
for MountTargetId in MountTargetIds:
    efs_client.delete_mount_target(MountTargetId=MountTargetId)
for _ in range(24):  # poll for up to about two minutes
    if not efs_client.describe_mount_targets(FileSystemId=FileSystemId)['MountTargets']:
        break
    time.sleep(5)
efs_client.delete_file_system(FileSystemId=FileSystemId)

#### If you get an error, you may need to try a second time to get the filesystem deleted, if it was still busy the first time:

In [None]:
# Retry of the file system deletion, for when the previous cell's attempt failed.
efs_client.delete_file_system(FileSystemId=FileSystemId)