V0.5.13 (#91)
* Prepare next release

* Include Docker images for TPC-H in subfolder

* More TPC examples

* Experiments cleaned

* TPC-DS: Some example DDL and query files

* TPC-H: Some docs
perdelt committed Aug 25, 2021
1 parent 0b79480 commit 5d2f83b
Showing 123 changed files with 36,850 additions and 25 deletions.
5 changes: 2 additions & 3 deletions README.md
@@ -32,10 +32,9 @@ A more advanced workflow is: Plan a sequence of such experiments, run plan as a

## Quickstart

-The repository contains a tool for running TPC-H (reading) queries at MonetDB and PostgreSQL.
+The repository contains a [tool](experiments/tpch/) for running TPC-H (reading) queries at MonetDB and PostgreSQL.

-1. Run `tpch run`.
-   This is equivalent to `python tpch.py run`.
+1. Run `tpch run -sf 1 -t 30`.
1. You can watch status using `bexperiments status` while running.
This is equivalent to `python cluster.py status`.
1. After benchmarking has finished, run `bexperiments dashboard` to connect to a dashboard. You can open dashboard in browser at `http://localhost:8050`.
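Taken together, the updated quickstart corresponds to roughly the following session; a sketch, assuming the `tpch` and `bexperiments` console scripts of this package are installed and on the PATH:

    # run the TPC-H experiment at scale factor 1 with a 30 s per-query timeout
    tpch run -sf 1 -t 30

    # in a second terminal: watch the status while the experiment is running
    bexperiments status

    # after benchmarking has finished: start the dashboard, then open http://localhost:8050
    bexperiments dashboard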
2 changes: 1 addition & 1 deletion bexhoma/scripts/__init__.py
@@ -1,4 +1,4 @@
"""
The clustermanager module
"""
__all__ = ["experimentsmanager","tpch"]
__all__ = ["experimentsmanager","tpch","tpcds"]
192 changes: 192 additions & 0 deletions bexhoma/scripts/tpcds.py
@@ -0,0 +1,192 @@
"""
:Date: 2021-02-12
:Version: 0.1
:Authors: Patrick Erdelt
Perform TPC-DS inspired benchmarks in a Kubernetes cluster.
This either profiles the imported data in several DBMS and compares some statistics, or runs the TPC-DS queries.
Optionally, monitoring is activated.
The user can choose to detach the components of the benchmarking system, so that as much as possible is run inside a Kubernetes (K8s) cluster.
The user can also choose some parameters like the number of runs per query and configuration, and request some resources.
"""
from bexhoma import *
from dbmsbenchmarker import *
#import experiments
import logging
import urllib3
import argparse
import time
from timeit import default_timer
import datetime


urllib3.disable_warnings()
logging.basicConfig(level=logging.ERROR)

def do_benchmark():
    description = """Perform TPC-DS inspired benchmarks in a Kubernetes cluster.
    This either profiles the imported data in several DBMS and compares some statistics, or runs the TPC-DS queries.
    Optionally, monitoring is activated.
    The user can choose to detach the components of the benchmarking system, so that as much as possible is run inside a Kubernetes (K8s) cluster.
    The user can also choose some parameters like the number of runs per query and configuration, and request some resources.
    """
    # argparse
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('mode', help='profile the import of TPC-DS data, or run the TPC-DS queries, or start DBMS and load data, or just start the DBMS', choices=['profiling', 'run', 'start', 'load'])
    parser.add_argument('-cx', '--context', help='context of Kubernetes (for a multi cluster environment), default is current context', default=None)
    parser.add_argument('-e', '--experiment', help='sets experiment code for continuing a started experiment', default=None)
    parser.add_argument('-d', '--detached', help='puts most of the experiment workflow inside the cluster', action='store_true')
    parser.add_argument('-m', '--monitoring', help='activates monitoring', action='store_true')
    parser.add_argument('-ms', '--max-sut', help='maximum number of parallel DBMS configurations, default is no limit', default=None)
    parser.add_argument('-dt', '--datatransfer', help='activates datatransfer', action='store_true', default=False)
    parser.add_argument('-md', '--monitoring-delay', help='time to wait [s] before execution of the runs of a query', default=10)
    parser.add_argument('-nr', '--num-run', help='number of runs per query', default=1)
    parser.add_argument('-nc', '--num-config', help='number of runs per configuration', default=1)
    parser.add_argument('-ne', '--num-query-executors', help='comma separated list of number of parallel clients', default="1")
    parser.add_argument('-sf', '--scaling-factor', help='scaling factor (SF)', default=1)
    parser.add_argument('-t', '--timeout', help='timeout for a run of a query', default=180)
    parser.add_argument('-rr', '--request-ram', help='request ram', default='16Gi')
    parser.add_argument('-rc', '--request-cpu', help='request cpus', default='4')
    parser.add_argument('-rct', '--request-cpu-type', help='request node having node label cpu=', default='')
    parser.add_argument('-rg', '--request-gpu', help='request number of gpus', default=1)
    parser.add_argument('-rgt', '--request-gpu-type', help='request node having node label gpu=', default='a100')
    parser.add_argument('-rst', '--request-storage-type', help='request persistent storage of certain type', default=None, choices=[None, '', 'local-hdd', 'shared'])
    parser.add_argument('-rss', '--request-storage-size', help='request persistent storage of certain size', default='10Gi')
    parser.add_argument('-rnn', '--request-node-name', help='request a specific node', default=None)
    args = parser.parse_args()
    # set parameter
    monitoring = args.monitoring
    mode = str(args.mode)
    SF = str(args.scaling_factor)
    timeout = int(args.timeout)
    numRun = int(args.num_run)
    numExperiments = int(args.num_config)
    cpu = str(args.request_cpu)
    memory = str(args.request_ram)
    cpu_type = str(args.request_cpu_type)
    gpu_type = str(args.request_gpu_type)
    gpus = str(args.request_gpu)
    request_storage_type = args.request_storage_type
    request_storage_size = args.request_storage_size
    request_node_name = args.request_node_name
    datatransfer = args.datatransfer
    code = args.experiment
    # set cluster
    cluster = clusters.kubernetes(context=args.context)
    cluster_name = cluster.contextdata['clustername']
    if args.max_sut is not None:
        cluster.max_sut = int(args.max_sut)
    # set experiment
    if code is None:
        code = cluster.code
    experiment = experiments.tpcds(cluster=cluster, SF=SF, timeout=timeout, detached=True, code=code, numExperiments=numExperiments)
    if mode == 'run':
        # we want all TPC-DS queries
        experiment.set_queries_full()
        experiment.set_workload(
            name = 'TPC-DS Queries SF='+str(SF),
            info = 'This experiment compares run time and resource consumption of TPC-DS queries in different DBMS.'
            )
    else:
        # we want to profile the import
        experiment.set_queries_profiling()
        experiment.set_workload(
            name = 'TPC-DS Data Profiling SF='+str(SF),
            info = 'This experiment compares imported TPC-DS data sets in different DBMS.'
            )
    if monitoring:
        # we want to monitor resource consumption
        experiment.set_querymanagement_monitoring(numRun=numRun, delay=10, datatransfer=datatransfer)
    else:
        # we want to just run the queries
        experiment.set_querymanagement_quicktest(numRun=numRun, datatransfer=datatransfer)
    # set resources for dbms
    experiment.set_resources(
        requests = {
            'cpu': cpu,
            'memory': memory,
            'gpu': 0
        },
        limits = {
            'cpu': 0,
            'memory': 0
        },
        nodeSelector = {
            'cpu': cpu_type,
            'gpu': '',
        })
    if request_node_name is not None:
        experiment.set_resources(
            nodeSelector = {
                'cpu': cpu_type,
                'gpu': '',
                'kubernetes.io/hostname': request_node_name
            })
    # persistent storage
    print(request_storage_type)
    experiment.set_storage(
        storageClassName = request_storage_type,
        storageSize = request_storage_size, #'100Gi',
        keep = False
    )
    cluster.start_dashboard()
    # add configs
    config = configurations.default(experiment=experiment, docker='MonetDB', configuration='MonetDB-{}'.format(cluster_name), alias='DBMS A', dialect='MonetDB')
    #config = configurations.default(experiment=experiment, docker='MemSQL', configuration='MemSQL-{}'.format(cluster_name), alias='DBMS B', dialect='MonetDB')
    #config = configurations.default(experiment=experiment, docker='MariaDB', configuration='MariaDB-{}'.format(cluster_name), alias='DBMS C')
    config = configurations.default(experiment=experiment, docker='PostgreSQL', configuration='PostgreSQL-{}'.format(cluster_name), alias='DBMS D', dialect='MonetDB')
    #config = configurations.default(experiment=experiment, docker='Citus', configuration='Citus-{}'.format(cluster_name), alias='DBMS E', dialect='OmniSci')
    #config = configurations.default(experiment=experiment, docker='MySQL', configuration='MySQL-{}'.format(cluster_name), alias='DBMS F')
    #config = configurations.default(experiment=experiment, docker='MariaDBCS', configuration='MariaDBCS-{}'.format(cluster_name), alias='DBMS G')
    #config = configurations.default(experiment=experiment, docker='Exasol', configuration='Exasol-{}'.format(cluster_name), alias='DBMS H')
    #config = configurations.default(experiment=experiment, docker='DB2', configuration='DB2-{}'.format(cluster_name), alias='DBMS I')
    #config = configurations.default(experiment=experiment, docker='SAPHANA', configuration='SAPHANA-{}'.format(cluster_name), alias='DBMS J', dialect='MonetDB')
    #config = configurations.default(experiment=experiment, docker='Clickhouse', configuration='Clickhouse-{}'.format(cluster_name), alias='DBMS K')
    #config = configurations.default(experiment=experiment, docker='SQLServer', configuration='SQLServer-{}'.format(cluster_name), alias='DBMS L')
    #config = configurations.default(experiment=experiment, docker='OmniSci', configuration='OmniSci-{}'.format(cluster_name), alias='DBMS M')
    if args.mode == 'start':
        experiment.start_sut()
    elif args.mode == 'load':
        # start all DBMS
        experiment.start_sut()
        # configure number of clients per config = 0
        list_clients = []
        # total time of experiment
        experiment.add_benchmark_list(list_clients)
        start = default_timer()
        start_datetime = str(datetime.datetime.now())
        print("Experiment starts at {} ({})".format(start_datetime, start))
        # run workflow
        experiment.work_benchmark_list()
        # total time of experiment
        end = default_timer()
        end_datetime = str(datetime.datetime.now())
        duration_experiment = end - start
        print("Experiment ends at {} ({}): {}s total".format(end_datetime, end, duration_experiment))
    else:
        # configure number of clients per config
        list_clients = args.num_query_executors.split(",")
        if len(list_clients) > 0:
            list_clients = [int(x) for x in list_clients]
        experiment.add_benchmark_list(list_clients)
        # total time of experiment
        start = default_timer()
        start_datetime = str(datetime.datetime.now())
        print("Experiment starts at {} ({})".format(start_datetime, start))
        # run workflow
        experiment.work_benchmark_list()
        # total time of experiment
        end = default_timer()
        end_datetime = str(datetime.datetime.now())
        duration_experiment = end - start
        print("Experiment ends at {} ({}): {}s total".format(end_datetime, end, duration_experiment))
        ##################
        experiment.evaluate_results()
        experiment.stop_benchmarker()
        experiment.stop_sut()
        cluster.stop_dashboard()
        cluster.start_dashboard()
        # OOM? exit code 137
        #experiment.zip()
    exit()
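For orientation, a hypothetical invocation of this new TPC-DS script; the flags correspond to the argparse options defined above, while the `tpcds` entry point name is an assumption by analogy to the existing `tpch` script (it is not defined in this diff):

    # run the TPC-DS queries at scale factor 1 with monitoring,
    # 3 runs per query and 1 or 2 parallel query executors (example values)
    tpcds run -sf 1 -t 120 -m -nr 3 -ne 1,2

    # only profile the imported TPC-DS data instead of running the query workload
    tpcds profiling -sf 1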
26 changes: 13 additions & 13 deletions bexhoma/scripts/tpch.py
@@ -132,19 +132,19 @@ def do_benchmark():
    )
    cluster.start_dashboard()
    # add configs
-    config = configurations.default(experiment=experiment, docker='MonetDB', alias='DBMS A', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='MemSQL', alias='DBMS B', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='MariaDB', alias='DBMS C', numExperiments=1, clients=[1])
-    config = configurations.default(experiment=experiment, docker='PostgreSQL', alias='DBMS D', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='Citus', alias='DBMS E', numExperiments=1, dialect='OmniSci', clients=[1])
-    #config = configurations.default(experiment=experiment, docker='MySQL', alias='DBMS F', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='MariaDBCS', alias='DBMS G', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='Exasol', alias='DBMS H', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='DB2', alias='DBMS I', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='SAPHANA', alias='DBMS J', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='Clickhouse', alias='DBMS K', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='SQLServer', alias='DBMS L', numExperiments=1, clients=[1])
-    #config = configurations.default(experiment=experiment, docker='OmniSci', alias='DBMS M', numExperiments=1, clients=[1])
+    config = configurations.default(experiment=experiment, docker='MonetDB', configuration='MonetDB-{}'.format(cluster_name), alias='DBMS A')
+    #config = configurations.default(experiment=experiment, docker='MemSQL', configuration='MemSQL-{}'.format(cluster_name), alias='DBMS B')
+    #config = configurations.default(experiment=experiment, docker='MariaDB', configuration='MariaDB-{}'.format(cluster_name), alias='DBMS C')
+    config = configurations.default(experiment=experiment, docker='PostgreSQL', configuration='PostgreSQL-{}'.format(cluster_name), alias='DBMS D')
+    #config = configurations.default(experiment=experiment, docker='Citus', configuration='Citus-{}'.format(cluster_name), alias='DBMS E', dialect='OmniSci')
+    #config = configurations.default(experiment=experiment, docker='MySQL', configuration='MySQL-{}'.format(cluster_name), alias='DBMS F')
+    #config = configurations.default(experiment=experiment, docker='MariaDBCS', configuration='MariaDBCS-{}'.format(cluster_name), alias='DBMS G')
+    #config = configurations.default(experiment=experiment, docker='Exasol', configuration='Exasol-{}'.format(cluster_name), alias='DBMS H')
+    #config = configurations.default(experiment=experiment, docker='DB2', configuration='DB2-{}'.format(cluster_name), alias='DBMS I')
+    #config = configurations.default(experiment=experiment, docker='SAPHANA', configuration='SAPHANA-{}'.format(cluster_name), alias='DBMS J')
+    #config = configurations.default(experiment=experiment, docker='Clickhouse', configuration='Clickhouse-{}'.format(cluster_name), alias='DBMS K')
+    #config = configurations.default(experiment=experiment, docker='SQLServer', configuration='SQLServer-{}'.format(cluster_name), alias='DBMS L')
+    #config = configurations.default(experiment=experiment, docker='OmniSci', configuration='OmniSci-{}'.format(cluster_name), alias='DBMS M')
    if args.mode == 'start':
        experiment.start_sut()
    elif args.mode == 'load':
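The effect of this change is that the per-config arguments `numExperiments=1, clients=[1]` are replaced by an explicit `configuration` name derived from the cluster name, so run and client counts are presumably taken from the command line instead. A hypothetical call, assuming tpch.py accepts the same flags as the TPC-DS parser above:

    # one configuration per DBMS, one run per query, a single client
    tpch run -sf 1 -t 30 -nc 1 -nr 1 -ne 1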
3 changes: 0 additions & 3 deletions experiments/example/OmniSci/initdata.sql

This file was deleted.

4 changes: 0 additions & 4 deletions experiments/example/OmniSci/initschema.sql

This file was deleted.

1 change: 1 addition & 0 deletions experiments/tpcds/Citus/.gitignore
@@ -0,0 +1 @@
filled*.sql
