
Commit

V0.6.0 Introduced Cloud-Orchestration and Scalable Loading and Maintaining Components (#102)

* Prepare next release

* Masterscript: maintaining does not change timer settings of benchmarker

* Masterscript: reconnect and try again if the failure is not due to "not found"

* Masterscript: improved output about workflow

* Masterscript: aws example nodegroup scale

* Masterscript: aws example nodegroup get size

* Masterscript: aws example nodegroup wait for size

* Masterscript: aws example nodegroup show size

* Masterscript: aws example nodegroup show and check size

* Masterscript: aws example nodegroup name and type

* Masterscript: aws example dict of nodegroups

* Masterscript: aws example nodegroup name necessary for scaling

* Masterscript: aws example nodegroup name and type

* Masterscript: maintaining duration default 4h

* Masterscript: maintaining parameters and nodeSelector

* Masterscript: nodeSelector for sut, monitoring and benchmarker

* Masterscript: maintaining is accepted as running even when num_maintaining=0

* Masterscript: request resources from command line

* Masterscript: prepare max_sut per cluster and per experiment

* Masterscript: catch json exception in getNode()

* Masterscript: maintaining example TSBS as experiment setup

* Masterscript: jobtemplate_maintaining per experiment

* Masterscript: initContainers in maintaining

* Masterscript: maintaining also watches succeeded pods

* Masterscript: maintaining also respects long-pending pods

* Masterscript: loading pods controlled by redis queue (see the first sketch after this list)

* Masterscript: loading pods controlled by redis queue, include params

* Masterscript: initContainers parameters set correctly

* Masterscript: Stop also loading jobs and pods

* Masterscript: Number of parallel loaders

* Masterscript: Empty schema before loading pods

* Masterscript: Stop also loading jobs and pods when putting sut down

* Masterscript: Loading is only finished when work both outside and inside the cluster is done

* Masterscript: Stop also loading jobs and pods - in all configurations

* Masterscript: Stop also loading jobs and pods - in all configurations (config, experiment, cluster)

* Masterscript: Check status of parallel loading

* Masterscript: Job status explained

* Masterscript: Job status returns true iff all pods are completed

* Masterscript: Job status more output

* Masterscript: Job status returns true iff all pods are completed

* Masterscript: Job status returns true iff all pods are completed, then delete all loading pods

* Masterscript: Job status returns true iff all pods are completed, copy loading pods logs

* Masterscript: Copy logs of all containers of loading pods

* Masterscript: Mark SUT as loaded as soon as all pods have status success - include this as timeLoading (see the second sketch after this list)

* Masterscript: Use maintaining structure for setting loading parameters

* Masterscript: Mark SUT as loaded

* Masterscript: Mark SUT as loaded, read old labels at first

* Masterscript: Mark SUT as loaded, read old labels at first and convert to float

* Masterscript: Mark SUT as loaded, read old labels at first and convert to float, debug output

* Masterscript: Mark SUT as loaded, read old labels at first and convert to int

* Masterscript: Mark SUT as loaded, read old labels at first and convert to int, cleaned
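
The messages above outline the new scalable loading workflow: loading pods pull their work from a shared Redis queue, loading counts as finished only when all loading pods have completed, and the SUT is then labelled as loaded together with the measured timeLoading. The two sketches below illustrate these ideas only in outline; the Redis host, queue name, namespace, label selector and deployment name are assumptions chosen for illustration, not bexhoma's actual identifiers.

# First sketch: the work-queue pattern behind "loading pods controlled by redis queue".
# REDIS_HOST, QUEUE_NAME and load_chunk are hypothetical, not bexhoma's API.
import os
import redis

REDIS_HOST = os.environ.get('REDIS_HOST', 'redis')      # assumed service name
QUEUE_NAME = os.environ.get('QUEUE_NAME', 'loading')    # assumed queue key

def load_chunk(chunk_id):
    # Placeholder for loading one chunk of data into the SUT.
    print("loading chunk {}".format(chunk_id))

def worker():
    # Each parallel loading pod runs this loop: pop one work item from the
    # shared queue and exit once the queue is empty.
    client = redis.StrictRedis(host=REDIS_HOST, port=6379, db=0)
    while True:
        item = client.lpop(QUEUE_NAME)
        if item is None:
            break                                        # queue drained, pod completes
        load_chunk(item.decode('utf-8'))

if __name__ == '__main__':
    worker()

Because each worker exits once the queue is drained, the masterscript can equate "loading finished inside the cluster" with all loading pods reaching status Succeeded, which is what the second sketch checks before labelling the SUT:

# Second sketch: completion check and labelling, assuming the official kubernetes
# Python client; namespace, label selector and deployment name are made up.
from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()
apps = client.AppsV1Api()

namespace = 'default'                                    # assumed
pods = core.list_namespaced_pod(namespace, label_selector='app=bexhoma,component=loading')
if pods.items and all(p.status.phase == 'Succeeded' for p in pods.items):
    # Read the old labels first, convert timeLoading to int, then write back as strings.
    deployment = apps.read_namespaced_deployment('bexhoma-sut', namespace)
    labels = deployment.metadata.labels or {}
    time_loading = int(float(labels.get('timeLoading', 0))) + 42    # 42 = hypothetical measured seconds
    body = {'metadata': {'labels': {'loaded': 'True', 'timeLoading': str(time_loading)}}}
    apps.patch_namespaced_deployment('bexhoma-sut', namespace, body)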
perdelt committed Oct 18, 2022
1 parent 0929756 commit ec6bd6e
Showing 7 changed files with 676 additions and 92 deletions.
75 changes: 75 additions & 0 deletions bexhoma/clusters.py
@@ -65,3 +65,78 @@ def store_pod_log(self, pod_name, container=''):




class aws(kubernetes):
    def __init__(self, clusterconfig='cluster.config', configfolder='experiments/', yamlfolder='k8s/', context=None, code=None, instance=None, volume=None, docker=None, script=None, queryfile=None):
        self.code = code
        kubernetes.__init__(self, clusterconfig=clusterconfig, configfolder=configfolder, context=context, yamlfolder=yamlfolder, code=self.code, instance=instance, volume=volume, docker=docker, script=script, queryfile=queryfile)
        self.cluster = self.contextdata['cluster']
    def eksctl(self, command):
        # Run an eksctl command and return its console output
        #fullcommand = 'eksctl --context {context} {command}'.format(context=self.context, command=command)
        fullcommand = 'eksctl {command}'.format(command=command)
        self.logger.debug('aws.eksctl({})'.format(fullcommand))
        #print(fullcommand)
        return os.popen(fullcommand).read()  # os.system(fullcommand)
    def getNodes(self, app='', nodegroup_type='', nodegroup_name=''):
        # Return the cluster nodes matching the app, nodegroup type and nodegroup name labels
        self.logger.debug('aws.getNodes()')
        label = ''
        if len(app) == 0:
            app = self.appname
        label += 'app='+app
        if len(nodegroup_type) > 0:
            label += ',type='+nodegroup_type
        if len(nodegroup_name) > 0:
            label += ',alpha.eksctl.io/nodegroup-name='+nodegroup_name
        try:
            api_response = self.v1core.list_node(label_selector=label)
            #pprint(api_response)
            if len(api_response.items) > 0:
                return api_response.items
            else:
                return []
        except ApiException as e:
            # Access token may have expired: renew cluster access and retry
            print("Exception when calling CoreV1Api->list_node for getNodes: %s\n" % e)
            print("Create new access token")
            self.cluster_access()
            self.wait(2)
            return self.getNodes(app=app, nodegroup_type=nodegroup_type, nodegroup_name=nodegroup_name)
    def scale_nodegroups(self, nodegroup_names, size=None):
        # Scale several nodegroups; an explicit size overrides the per-nodegroup default
        print("aws.scale_nodegroups({nodegroup_names}, {size})".format(nodegroup_names=nodegroup_names, size=size))
        for nodegroup_name, size_default in nodegroup_names.items():
            if size is not None:
                size_default = size
            self.scale_nodegroup(nodegroup_name, size_default)
    def scale_nodegroup(self, nodegroup_name, size):
        # Scale a single nodegroup via eksctl, but only if it does not already have the planned size
        print("aws.scale_nodegroup({nodegroup_name}, {size})".format(nodegroup_name=nodegroup_name, size=size))
        if not self.check_nodegroup(nodegroup_name=nodegroup_name, num_nodes_aux_planned=size):
            #fullcommand = "eksctl scale nodegroup --cluster=Test-2 --nodes=0 --nodes-min=0 --name=Kleine_Gruppe"
            command = "scale nodegroup --cluster={cluster} --nodes={size} --name={nodegroup_name}".format(cluster=self.cluster, size=size, nodegroup_name=nodegroup_name)
            return self.eksctl(command)
        #if not self.check_nodegroup(nodegroup_type, num_nodes_aux_planned):
        #    command = "scale nodegroup --cluster={cluster} --nodes={size} --name={nodegroup}".format(cluster=self.cluster, size=size, nodegroup=nodegroup)
        #    return self.eksctl(command)
        #else:
        #    return ""
    def get_nodegroup_size(self, nodegroup_type='', nodegroup_name=''):
        # Number of nodes currently present in the nodegroup
        resp = self.getNodes(nodegroup_type=nodegroup_type, nodegroup_name=nodegroup_name)
        num_nodes_aux_actual = len(resp)
        self.logger.debug('aws.get_nodegroup_size({},{}) = {}'.format(nodegroup_type, nodegroup_name, num_nodes_aux_actual))
        return num_nodes_aux_actual
    def check_nodegroup(self, nodegroup_type='', nodegroup_name='', num_nodes_aux_planned=0):
        # True iff the nodegroup has exactly the planned number of nodes
        num_nodes_aux_actual = self.get_nodegroup_size(nodegroup_type=nodegroup_type, nodegroup_name=nodegroup_name)
        self.logger.debug('aws.check_nodegroup({}, {}, {}) = {}'.format(nodegroup_type, nodegroup_name, num_nodes_aux_planned, num_nodes_aux_actual))
        return num_nodes_aux_planned == num_nodes_aux_actual
    def wait_for_nodegroups(self, nodegroup_names, size=None):
        # Wait until all given nodegroups have reached their planned sizes
        print("aws.wait_for_nodegroups({nodegroup_names})".format(nodegroup_names=nodegroup_names))
        for nodegroup_name, size_default in nodegroup_names.items():
            if size is not None:
                size_default = size
            self.wait_for_nodegroup(nodegroup_name=nodegroup_name, num_nodes_aux_planned=size_default)
    def wait_for_nodegroup(self, nodegroup_type='', nodegroup_name='', num_nodes_aux_planned=0):
        # Poll every 30 seconds until the nodegroup has the planned number of nodes
        while not self.check_nodegroup(nodegroup_type=nodegroup_type, nodegroup_name=nodegroup_name, num_nodes_aux_planned=num_nodes_aux_planned):
            self.wait(30)
        print("Nodegroup {},{} ready".format(nodegroup_type, nodegroup_name))
        return True
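
The new aws class wraps eksctl for scaling nodegroups and the kubernetes client for counting their nodes: scale_nodegroup only calls eksctl if the planned size differs from the actual one, and wait_for_nodegroup polls every 30 seconds until they match. A minimal usage sketch follows, assuming the imports at the top of clusters.py (os, the kubernetes client) and making up the context, nodegroup names and sizes:

# Hypothetical usage of the aws class added in this commit; the context,
# nodegroup names and sizes are invented examples, not repository values.
from bexhoma import clusters

cluster = clusters.aws(clusterconfig='cluster.config', context='my-eks-context')

# Scale two (assumed) nodegroups to their per-group default sizes ...
nodegroups = {'auxiliary': 1, 'loading': 4}
cluster.scale_nodegroups(nodegroups)

# ... and block until the cluster actually reports that many nodes per group.
cluster.wait_for_nodegroups(nodegroups)

# Scale everything back down when the experiment is finished.
cluster.scale_nodegroups(nodegroups, size=0)
cluster.wait_for_nodegroups(nodegroups, size=0)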


