Skip to content

Commit

Permalink
Merged in fix/gabi/MPC-6227_EKS_node_refuses_to_die (pull request elastic#763)
Browse files Browse the repository at this point in the history

Fix MPC-6227 EKS node remains after running stack_power.py

* Set use_default_tags in EKS module to True
* Added a profile option to two Python scripts to minimize user errors.
* Refactored stack_power to be more user-friendly.

Approved-by: Gideon Avida
Approved-by: Can Yildiz
  • Loading branch information
Gabi Davar committed Jul 5, 2022
1 parent dc79ea3 commit 60870e4
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 51 deletions.
5 changes: 3 additions & 2 deletions aws/README.md
Expand Up @@ -63,11 +63,12 @@ Install [Session Manager plugin for the AWS CLI](https://docs.aws.amazon.com/sys
### Using
```bash
STACK_NAME=gabi
python ./misc/list_ssm_instances_per_stack.py ${STACK_NAME}
PROFILE=dev_il
python ./misc/list_ssm_instances_per_stack.py ${STACK_NAME} --profile ${PROFILE}

# example output
mindw@GABI-PC /mnt/j/dev/src/engageli/devops/aws/misc
python3 list_ssm_instances_per_stack.py igor
python3 list_ssm_instances_per_stack.py igor --profile dev_il
Session manager enabled hosts for stack: igor
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Instance ID ┃ Name ┃ Agent Version ┃
Expand Down
1 change: 1 addition & 0 deletions aws/ams-cluster-v1-tf/eks.tf
Expand Up @@ -231,6 +231,7 @@ module "eks" {
}
public_ip = true
key_name = var.key_name
use_default_tags = true
tags = {
Role = "EKS"
# check if this can be set only on the asg? is it still required? do we go to karpenter instead of
Expand Down
2 changes: 1 addition & 1 deletion aws/ams-cluster-v1-tf/terraform.sh
Expand Up @@ -177,7 +177,7 @@ if [ "${STACK_PROFILE}" == "XYZ" ]; then
exit 1
fi

AWS_PROFILE=${STACK_PROFILE} ../misc/stack_power.py -s ${STACK_NAME} -r ${STACK_REGION} -o
../misc/stack_power.py -p ${STACK_PROFILE} -s ${STACK_NAME} -r ${STACK_REGION} -o

# in case the provider config has changed
[[ "${FAST_RUN}" == "false" ]] && terraform init --upgrade --reconfigure
Expand Down
9 changes: 8 additions & 1 deletion aws/misc/list_ssm_instances_per_stack.py 100644 → 100755
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import click
import boto3
import rich
Expand All @@ -7,12 +8,18 @@

@click.command()
@click.argument('stack_name')
def list_ssm_instances_per_stack(stack_name):
@click.option('-p', '--profile')
def list_ssm_instances_per_stack(stack_name: str, profile: str):
'''List all instances that support AWS Session Manager
Use awscli with SSM plugin https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html
to create a session: aws ssm start-session --target <instance_id>
'''
if profile:
boto3.setup_default_session(
profile_name=profile
)

ssm = boto3.client('ssm')
ec2 = boto3.resource('ec2')
filters = [
Expand Down
193 changes: 146 additions & 47 deletions aws/misc/stack_power.py
Expand Up @@ -9,31 +9,69 @@
from datetime import datetime
from pprint import pprint


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--region', type=str, default=None,
help='Override default region')
parser.add_argument('-s', '--stack-name', type=str, required=True,
help='Destination stack name')
parser.add_argument('-o', '--power-on', action='store_true',
help='By default the script will power off the stack')
parser.add_argument('--ems-asg-size', type=int, default=1,
help="Size for EMS auto scaling group, default=%(default)s")
parser.add_argument('--merger-asg-size', type=int, default=3,
help="Size for merger auto scaling group, default=%(default)s")
parser.add_argument('--recorder-asg-size', type=int, default=1,
help="Size for recorder auto scaling group, default=%(default)s")
parser.add_argument('--num-generators', type=int, default=0,
help="Max number of student generators to power up, default=%(default)s")
parser.add_argument(
'-r',
'--region',
type=str,
default=None,
help='Override default region',
)
parser.add_argument(
'-s',
'--stack-name',
type=str,
required=True,
help='Destination stack name',
)
parser.add_argument(
'-o',
'--power-on',
action='store_true',
help='By default the script will power off the stack',
)
parser.add_argument(
'--ems-asg-size',
type=int,
default=1,
help="Size for EMS auto scaling group, default=%(default)s",
)
parser.add_argument(
'--merger-asg-size',
type=int,
default=3,
help="Size for merger auto scaling group, default=%(default)s",
)
parser.add_argument(
'--recorder-asg-size',
type=int,
default=1,
help="Size for recorder auto scaling group, default=%(default)s",
)
parser.add_argument(
'--num-generators',
type=int,
default=0,
help="Max number of student generators to power up, default=%(default)s",
)
parser.add_argument(
'-p', '--profile', type=str, default=None, help='AWS Profile to use'
)
return parser.parse_args()


def power_instances(args):
ec2 = boto3.client('ec2', args.region)
instances = defaultdict(list)
state_filter = 'stopped' if args.power_on else 'running'
stack_instances = ec2.describe_instances(Filters=[
{'Name': 'tag:StackName', 'Values': [args.stack_name]},
{'Name': 'instance-state-name', 'Values': [state_filter]}])
stack_instances = ec2.describe_instances(
Filters=[
{'Name': 'tag:StackName', 'Values': [args.stack_name]},
{'Name': 'instance-state-name', 'Values': [state_filter]},
]
)
for res in stack_instances['Reservations']:
for instance in res['Instances']:
for tag in instance['Tags']:
Expand All @@ -43,31 +81,33 @@ def power_instances(args):
# pprint(instances)
for role in ('bastion', 'engageli-api', 'gen-student'):
if not instances[role]:
print('No instances for role:', role)
print(f'No {state_filter} instances for role: {role}')
continue
if args.power_on:
if role == 'gen-student':
if args.num_generators == 0:
print('Skipping student generators')
continue
instancesToStart = instances[role][0:min(len(instances[role]), args.num_generators)]
instancesToStart = instances[role][
0 : min(len(instances[role]), args.num_generators)
]
else:
instancesToStart = instances[role]
print('Starting:', instancesToStart)
print(f'Starting role {role} instance: {instancesToStart}')
response = ec2.start_instances(InstanceIds=instancesToStart)
else:
print('Stopping:', instances[role])
print(f'Stopping role {role} instance: {instances[role]}')
response = ec2.stop_instances(InstanceIds=instances[role])
pprint(response)
if not args.power_on:
# terminate ASG instances instead of waiting for cooldown
for role in ('ems', 'recorder', 'merger'):
if not instances[role]:
print('No instances for role:', role)
print(f'No {state_filter} instances for role: {role}')
continue
print('Terminating:', instances[role])
print(f'Terminating {role}\'s instances: {instances[role]}')
ec2.terminate_instances(InstanceIds=instances[role])


def power_rds(args):
rds = boto3.client('rds', args.region)
result = rds.describe_db_instances()
Expand All @@ -78,36 +118,75 @@ def power_rds(args):
# pprint(instance)
res = 'nothing'
# Make sure status change is valid
if args.power_on and instance['DBInstanceStatus'] not in {'available', 'starting', 'configuring-enhanced-monitoring'}:
if instance['DBInstanceStatus'] != 'stopped':
print('DB state:', instance['DBInstanceStatus'], 'cannot start')
if args.power_on and instance['DBInstanceStatus'] not in {
'available',
'starting',
'configuring-enhanced-monitoring',
}:
if instance['DBInstanceStatus'] not in ['stopped', 'stopping']:
print(f'DB state: {instance["DBInstanceStatus"]} cannot start')
sys.exit(1)
print(datetime.now(), 'Starting:', instance['DBInstanceIdentifier'])
if instance['DBInstanceStatus'] == 'stopping':
# Loop until DB is up
print(
f'{datetime.now()} Waiting for RDS DB to stop: {instance["DBInstanceIdentifier"]}'
)
start = datetime.now()
while True:
time.sleep(5)
now = datetime.now()
res = rds.describe_db_instances(
DBInstanceIdentifier=instance['DBInstanceIdentifier']
)
print(
f"{now} {now - start} State: {res['DBInstances'][0]['DBInstanceStatus']}"
)
if res['DBInstances'][0]['DBInstanceStatus'] == 'stopped':
break
print(
f'{datetime.now()} Starting RDS DB: {instance["DBInstanceIdentifier"]}'
)
res = rds.start_db_instance(
DBInstanceIdentifier=instance['DBInstanceIdentifier'])
DBInstanceIdentifier=instance['DBInstanceIdentifier']
)
# Loop until DB is up
start = datetime.now()
while True:
time.sleep(5)
now = datetime.now()
res = rds.describe_db_instances(DBInstanceIdentifier=instance['DBInstanceIdentifier'])
print(now, now - start, 'State:', res['DBInstances'][0]['DBInstanceStatus'])
res = rds.describe_db_instances(
DBInstanceIdentifier=instance['DBInstanceIdentifier']
)
print(
f"{now} {now - start} State: {res['DBInstances'][0]['DBInstanceStatus']}"
)
if res['DBInstances'][0]['DBInstanceStatus'] == 'available':
break
elif not args.power_on and instance['DBInstanceStatus'] not in {'stopping', 'stopped'}:
elif not args.power_on and instance['DBInstanceStatus'] not in {
'stopping',
'stopped',
}:
print(
f'Stopping RDS DB instance: {instance["DBInstanceIdentifier"]}'
)
res = rds.stop_db_instance(
DBInstanceIdentifier=instance['DBInstanceIdentifier'])
pprint(res)
DBInstanceIdentifier=instance['DBInstanceIdentifier']
)


def auto_scaling_group(args):
asg = boto3.client('autoscaling', args.region)
res = asg.describe_auto_scaling_groups()
res = asg.describe_auto_scaling_groups(
Filters=[
{'Name': 'tag:StackName', 'Values': [args.stack_name]},
]
)
for group in res['AutoScalingGroups']:
# parse group's tags
tags = {tag['Key']: tag['Value'] for tag in group['Tags']}
stackName = tags.get('StackName')
if not stackName or stackName != args.stack_name:
print('Skipping ASG', group['AutoScalingGroupName'])
print(f'Skipping ASG {group["AutoScalingGroupName"]}')
continue
if args.power_on:
if tags['Role'] == 'ems-asg':
Expand All @@ -120,32 +199,52 @@ def auto_scaling_group(args):
# hardcode this to 1, let the cluster-autoscaler do its job
size = 1
else:
print('Skipping ASG', group['AutoScalingGroupName'], 'Role:', tags['Role'])
print(
f'Skipping ASG {group["AutoScalingGroupName"]} Role: {tags["Role"]}'
)
continue
# ensure ASG's max size is at least as big as the requested size
maxSize = size if group.get("MaxSize") < size else group.get("MaxSize")
maxSize = max(size, group.get("MaxSize"))
else:
size = 0
# Don't change max size on shutdown - rely on desired-capcity.
# Don't change max size on shutdown - rely on desired-capacity.
maxSize = group.get("MaxSize")
# pprint(group)
try:
response = asg.update_auto_scaling_group(
AutoScalingGroupName=group['AutoScalingGroupName'],
MinSize=size,
MaxSize=maxSize,
DesiredCapacity=size)
if (
group.get("MaxSize") == maxSize
and group.get("MinSize") == group.get("DesiredCapacity") == size
):
print(
f'Requested sizes {group["AutoScalingGroupName"]} are already set.'
)
else:
print(
f'Scaling {group["AutoScalingGroupName"]} MinSize={size}, MaxSize={maxSize}.'
)
response = asg.update_auto_scaling_group(
AutoScalingGroupName=group['AutoScalingGroupName'],
MinSize=size,
MaxSize=maxSize,
DesiredCapacity=size,
)
# pprint(response)
except Exception as ex:
print("auto_scaling_group exception for:", group['AutoScalingGroupName'], ex)
raise ex
print(
f'auto_scaling_group exception for: {group["AutoScalingGroupName"]}', ex
)
raise


def main():
args = parse_args()
# Start DB first so it's ready for the other services
if args.profile:
boto3.setup_default_session(profile_name=args.profile)
# Start DB first, so it's ready for the other services
power_rds(args)
auto_scaling_group(args)
power_instances(args)


if __name__ == '__main__':
main()

0 comments on commit 60870e4

Please sign in to comment.