#!/bin/bash
# Strict mode: abort on unhandled non-zero statuses (also inside pipelines) and
# on unset variable references; errtrace propagates the ERR trap into functions.
set -o pipefail
set -o errexit
set -o nounset
set -o errtrace
# Make command substitutions inherit errexit as well (Bash 4.4+).
shopt -s inherit_errexit
####################################################################################################
# VARIABLES/CONSTANTS
####################################################################################################
# v_min/v_max are mutually exclusive with v_thread_numbers.
#
v_min_threads=2 # int
v_max_threads=128 # int
v_thread_numbers_list=() # array
# Guest SSH access; the port is forwarded by QEMU (see the qemu boot script).
c_ssh_user=root
c_ssh_password=busybear
c_ssh_host=localhost
c_ssh_port=10000
c_components_dir=$(readlink -f "$(dirname "$0")")/components
c_output_dir=$(readlink -f "$(dirname "$0")")/output
# System temporary directory. `-u` (dry run) yields the same location as a real
# mktemp invocation, without leaking a stray temp file on every script run.
c_temp_dir=$(dirname "$(mktemp -u)")
# Pidfile of the (sudoed) shell wrapping perf; see the start_perf_* functions.
c_perf_shell_pidfile=$c_temp_dir/$(basename "$0").perf.pid
c_qemu_binary=$c_components_dir/qemu-system-riscv64
# Relative filenames: created in the current working directory - TODO confirm intended.
c_qemu_debug_file=$(basename "$0").qemu_debug.log
c_qemu_output_log_file=$(basename "$0").qemu_out.log
# Easier to run on a fresh copy each time, as an image can be easily broken, and leads to problems on
# startup.
#
c_guest_memory=8G
c_guest_image_source=$c_components_dir/busybear.qcow2
c_guest_image_temp=$c_temp_dir/busybear.temp.qcow2 # must be qcow2
c_kernel_image=$c_components_dir/Image
c_bios_image=$c_components_dir/fw_dynamic.bin
c_qemu_pidfile=$c_temp_dir/$(basename "$0").qemu.pid
# see above for the SSH port
c_perf_stat_events=L1-dcache-load-misses,context-switches,migrations,cycles,sched:sched_switch
c_perf_record_events=cpu-cycles
c_debug_log_file=$(basename "$0").log
c_help='Usage: '"$(basename "$0")"' [-s|--no-smt] [-p|--perf-stat] [-P|--perf-record] [-m|--perf-record-match <pattern>] [-t|--threads <threads_spec>] <bench_name> <runs> <qemu_boot_script> <benchmark_script>
Runs the specified benchmark with different vCPU/thread numbers, and stores the results.
Example usage:
./'"$(basename "$0")"' blackscholes_mytest 1 support_scripts/qemu_basic.sh support_scripts/bench_parsec_blackscholes.sh
Options:
- `--no-smt`: Disables SMT
- `--perf-stat`: Run perf stat; when enabled, the timings file is not written
- `--perf-record`: Run perf record; only one run per thread group is executed, ignoring the <runs> parameter
- `--threads <threads_spec>`: Set threads number specification (see below); defaults to '"$v_min_threads-$v_max_threads"'
- `--perf-record-match <pattern>`: Start profiling only when a certain (sed) pattern matches
Some benchmarks may override the min/max for different reasons (they will print a warning).
WATCH OUT! Specifying any of the perf options will disable the standard benchmark.
WATCH OUT! If the benchmark is very short-running, `--perf-record-match` may miss the window; empirically, it'\''s quick to react (subsecond), but it'\''s certainly not instantaneous.
WATCH OUT! It'\''s advisable to lock the CPU clock (typically, this is done in the BIOS), in order to avoid the clock decreasing when the number of threads increase.
Perf stat events recorded: '"$c_perf_stat_events"'
Perf record events recorded: '"$c_perf_record_events"'
The number of threads can be specified in two formats:
- `<min>-<max>`: the thread numbers are calculated by the benchmark script, with minimum and maximum specified by this option;
- `a,b,c`: the thread numbers are those specified; the benchmark script calculation is ignored.
---
Requires the components built by `setup_system.sh` to be in place.
Powers of two below or equal to '"$v_max_threads"' are used for each run; the number of host processors is added if it'\''s not a power of 2.
The `sshpass` program must be available on the host.
The output CSV is stored in the `'"$c_output_dir"'` subdirectory, with name `<bench_name>.csv`.
'
# User-defined
#
v_bench_name= # string (was assigned in decode_cmdline_args without being declared here)
v_count_runs= # int
v_qemu_script= # string
v_bench_script= # string
v_enable_perf_stat= # boolean (false=blank, true=anything else)
v_enable_perf_record= # boolean (false=blank, true=anything else)
v_perf_record_pattern= # string
v_disable_smt= # boolean (false=blank, true=anything else)
# Computed internally
#
v_previous_smt_configuration= # string
v_isolated_processors=() # array
####################################################################################################
# MAIN FUNCTIONS
####################################################################################################
function decode_cmdline_args {
  # Parses the command line, setting the user-defined `v_*` globals.
  # Prints the help and exits on `-h`/`--help`, or on a wrong number of
  # positional arguments.
  #
  # getopt normalizes the options (short/long unification, `--` separator),
  # so the case statement below only needs to handle the canonical forms.
  eval set -- "$(getopt --options hspPm:t: --long help,no-smt,perf-stat,perf-record,perf-record-match:,threads: --name "$(basename "$0")" -- "$@")"
  while true ; do
    case "$1" in
      -h|--help)
        echo "$c_help"
        exit 0 ;;
      -s|--no-smt)
        v_disable_smt=1
        shift ;;
      -p|--perf-stat)
        v_enable_perf_stat=1
        shift ;;
      -P|--perf-record)
        v_enable_perf_record=1
        shift ;;
      -m|--perf-record-match)
        v_perf_record_pattern=$2
        shift 2 ;;
      -t|--threads)
        # Validates the spec, and sets the min/max or the explicit list.
        set_thread_numbers "$2"
        shift 2 ;;
      --)
        shift
        break ;;
    esac
  done
  if [[ $# -ne 4 ]]; then
    echo "$c_help"
    exit 1
  fi
  v_bench_name=$1
  v_count_runs=$2
  v_qemu_script=$3
  v_bench_script=$4
}
function load_includes {
  # Sources the shared APIs first, then the user-selected qemu/benchmark
  # scripts. Later files may override functions defined by earlier ones; this
  # is relied upon in one very specific case only.
  local include_file
  for include_file in \
    "$(dirname "$0")/support_scripts/benchmark_apis.sh" \
    "$v_qemu_script" \
    "$v_bench_script"
  do
    # shellcheck source=/dev/null
    source "$include_file"
  done
}
function copy_busybear_image {
  # Creates a throwaway qcow2 overlay backed by the pristine image, so that
  # every execution boots from a clean guest.
  printf '%s\n' "Creating BusyBear run image..."
  qemu-img create -f qcow2 -b "$c_guest_image_source" "$c_guest_image_temp"
}
# Since we copy the image each time, we can just kill QEMU. We leave the run image, if debug is needed.
#
function register_exit_handlers {
  # Cleanup order matters: stop perf first (it's attached to QEMU), then reset
  # the host configuration, then kill QEMU itself.
  # The trap body is single-quoted, so all expansions happen at exit time.
  trap '{
    if [[ -f $c_perf_shell_pidfile ]]; then
      # The pidfile holds the sudoed wrapper shell pid; its child is perf.
      sudo pkill -P "$(< "$c_perf_shell_pidfile")"
      sudo rm "$c_perf_shell_pidfile"
    fi
    # Defined in the sourced support scripts - TODO confirm.
    exit_system_configuration_reset
    if [[ -f $c_qemu_pidfile ]]; then
      pkill -F "$c_qemu_pidfile"
      # The pidfile is 600, so we need `-f`.
      rm -f "$c_qemu_pidfile"
    fi
  }' EXIT
}
function clear_existing_data {
  # Removes every previous output file (log, CSVs, pid lists...) belonging to
  # the current benchmark name. `-f` keeps this a no-op when nothing matches.
  local stale_file
  for stale_file in "$c_output_dir/$v_bench_name."*; do
    rm -f "$stale_file"
  done
}
function run_benchmark {
  # Top-level driver: for each thread count in v_thread_numbers_list, boots the
  # guest, runs the selected mode(s) (standard timing, perf stat, perf record),
  # then shuts the guest down. Results/logs go under $c_output_dir.
  local benchmark_log_file_name=$c_output_dir/$v_bench_name.log
  # Truncate the benchmark log.
  true > "$benchmark_log_file_name"
  echo "Threads number list: ${v_thread_numbers_list[*]}"
  # See note in the help.
  #
  # Originally, the strategy was to use the run number in the outer cycle, with the rationale that variations
  # between runs would not cluster across a number of threads (inner cycle).
  # Later, the nesting has been reversed; this has been made possible by giving the guideline of setting
  # a fixed CPU clock.
  #
  for threads in "${v_thread_numbers_list[@]}"; do
    # boot_guest is defined in the sourced qemu script - TODO confirm.
    boot_guest "$threads"
    wait_guest_online
    echo "
################################################################################
> Threads: $threads ($v_bench_name)
################################################################################
" | tee -a "$benchmark_log_file_name"
    # Standard (non-perf) benchmark: runs only when no perf mode is enabled (see help).
    if [[ -z $v_enable_perf_stat && -z $v_enable_perf_record ]]; then
      local standard_timing_file_name="$c_output_dir/$v_bench_name.timings.csv"
      run_benchmark_thread_group "$threads" "$v_count_runs" "$benchmark_log_file_name" "$standard_timing_file_name"
    fi
    # The perf shell pidfile is created in the `start_perf_*`` functions.
    # Using a pidfile inside a sudoed shell is the simplest way, as finding the exact process to kill
    # in the background/subshell/sudo processes tree is a bloody mess (e.g. the tree is different based
    # on how the commands are grouped - `{ cmds ;}` or `( cmds )`).
    # The process is killed inside the run_benchmark_thread_group() function, as we want to run profiling as
    # tightly as possible around the benchmark execution.
    # Note that we kill the shell pid child(ren) (which is the `perf` process), via `pkill -P`.
    #
    if [[ -n $v_enable_perf_stat ]]; then
      local perf_stat_timing_file_name="$c_output_dir/$v_bench_name.perf_stat.timings.csv"
      start_perf_stat "$threads"
      store_vcpu_pids "$threads"
      run_benchmark_thread_group "$threads" "$v_count_runs" "$benchmark_log_file_name" "$perf_stat_timing_file_name"
      sudo rm "$c_perf_shell_pidfile"
    fi
    if [[ -n $v_enable_perf_record ]]; then
      # perf record executes a single run, regardless of the <runs> parameter (see help).
      local runs=1
      local perf_record_timing_file_name="$c_output_dir/$v_bench_name.perf_record.timings.csv"
      start_perf_record "$threads" "$benchmark_log_file_name"
      # We don't need to store the vcpu pids in this case, since perf is not recording per-thread.
      run_benchmark_thread_group "$threads" "$runs" "$benchmark_log_file_name" "$perf_record_timing_file_name"
      sudo rm "$c_perf_shell_pidfile"
    fi
    shutdown_guest
  done
  echo "> Benchmark completed."
}
####################################################################################################
# INNER FUNCTIONS
####################################################################################################
function run_benchmark_thread_group {
  # Runs `runs` iterations of the benchmark on the (already booted) guest,
  # stops any active perf session right after, and stores the walltimes.
  #
  # $1: threads count
  # $2: number of runs (may differ from $v_count_runs, e.g. for perf record)
  # $3: benchmark log filename
  # $4: timings CSV filename
  local threads=$1
  local runs=$2
  local benchmark_log_file_name=$3
  local timings_file_name=$4
  # The `cd` is for simulating a new session.
  #
  local benchmark_command
  benchmark_command=$(compose_benchmark_command "$threads")
  if ((runs > 1)); then
    benchmark_command="for ((run=0; run < $runs; run++)); do
${benchmark_command}
cd
done"
  fi
  # Write the CSV header only once; the file is appended across thread groups.
  if [[ ! -s $timings_file_name ]]; then
    echo "threads,run,run_time" > "$timings_file_name"
  fi
  # Don't store the output in the script debug log - too verbose, and it has its own log.
  #
  set +x
  # Ideally, we wouldn't run another program (`tee`) during the benchmarks, however, it's crucial for
  # being able to start `perf` when a certain pattern is matched.
  #
  local command_output
  command_output=$(run_remote_command "$benchmark_command" | tee -a "$benchmark_log_file_name")
  if [[ -n $v_enable_perf_stat || -n $v_enable_perf_record ]]; then
    if [[ ! -f $c_perf_shell_pidfile ]]; then
      >&2 echo "Perf was enabled, but the pidfile hasn't been created - possibly, the pattern didn't match."
      exit 1
    fi
    local perf_shell_pid
    perf_shell_pid=$(< "$c_perf_shell_pidfile")
    echo "> Killing perf (may take a while)..."
    sudo pkill -P "$perf_shell_pid"
    # There may be a lot of data to write, and the process is in the background.
    #
    while pgrep -P "$perf_shell_pid" > /dev/null; do sleep 0.1; done
  fi
  local run_walltimes
  # Pass the actual runs count: with --perf-record a single run is executed,
  # which differs from the global $v_count_runs (fixes a spurious count error).
  run_walltimes=$(extract_run_walltimes "$command_output" "$runs")
  echo "
> TIMES: $(echo -n "$run_walltimes" | tr $'\n' ',')
" | tee -a "$benchmark_log_file_name"
  store_timings "$threads" "$run_walltimes" "$timings_file_name"
  # Restore logging.
  #
  set -x
}
function store_vcpu_pids {
local threads=$1
local output_file_name
output_file_name="$c_output_dir/$v_bench_name.pids.$(printf %03d "$threads").txt"
# Sample lines:
#
# Creating thread 'worker' -> PID 11042
# Creating thread 'CPU 0/TCG' -> PID 11043
#
local vcpu_pids
mapfile -t vcpu_pids < <(perl -lne 'print $1 if /CPU.+PID (\d+)/' "$c_qemu_debug_file")
if ((${#vcpu_pids[@]} != threads)); then
>&2 echo "Unexpected number of QEMU vCPU thread PIDS found: ${vcpu_pids[*]}"
exit 1
fi
printf '%s\n' "${vcpu_pids[@]}" > "$output_file_name"
}
# Starts `perf stat` (per-thread, attached to the QEMU process) in a
# backgrounded sudoed shell, and writes that shell's pid to
# $c_perf_shell_pidfile; perf itself is later terminated via `pkill -P` on
# that pid (see run_benchmark_thread_group).
#
function start_perf_stat {
  local threads=$1
  local output_file_name
  output_file_name="$c_output_dir/$v_bench_name.perf_stat.data.$(printf %03d "$threads").csv"
  # `\$\$` is expanded by the inner (sudoed) shell, recording its own pid;
  # killing its children kills perf. `q` shell-quotes the interpolated values.
  sudo sh -c "
echo \$\$ > $(q "$c_perf_shell_pidfile")
perf stat -e $(q "$c_perf_stat_events") --per-thread -p $(< "$c_qemu_pidfile") --field-separator "," 2> $(q "$output_file_name") > /dev/null
" &
}
# Starts `perf record` (attached to the QEMU process) in a backgrounded sudoed
# shell, whose pid is written to $c_perf_shell_pidfile. If a match pattern was
# given (`--perf-record-match`), the start is delayed until the pattern appears
# in the benchmark log; see the help for the timing caveats.
#
function start_perf_record {
  local threads=$1
  local benchmark_log_file_name=$2
  local output_file_name
  output_file_name="$c_output_dir/$v_bench_name.perf_record.data.$(printf %03d "$threads").dat"
  {
    if [[ -n $v_perf_record_pattern ]]; then
      # `sed -n '/pattern/ q'` exits on the first match; the `kill \$\$` then
      # terminates the wrapper shell, which also stops the `tail` (via
      # --pid). `|| true` absorbs the resulting non-zero status under errexit.
      sh -c "tail --pid=\$\$ -f $(printf "%q" "$benchmark_log_file_name") | { sed -n $(printf "%q" "/$v_perf_record_pattern/ q") && kill \$\$; }" || true
    fi
    echo "> Starting perf record..."
    # As in start_perf_stat: record the sudoed shell pid for later pkill -P.
    sudo sh -c "
echo \$\$ > $(q "$c_perf_shell_pidfile")
perf record -e $(q "$c_perf_record_events") -g -p $(< "$c_qemu_pidfile") -o $(q "$output_file_name")
"
  } &
}
# Returns (prints) the run walltimes (one per line).
#
# Watch out: The last newline is stripped; this avoids makes it simpler to handle it, due to commands
# generally appending a newline (echo, <<<), but it must not be forgotten.
#
function extract_run_walltimes {
local command_output=$1
local run_walltimes
run_walltimes=$(echo "$command_output" | perl -lne 'print $1 if /^ROI time measured: (\d+[.,]\d+)s/' | perl -pe 'chomp if eof')
local count_run_walltimes
count_run_walltimes=$(wc -l <<< "$run_walltimes")
if (( count_run_walltimes != v_count_runs )); then
>&2 echo "Unexpected number of walltimes found: $count_run_walltimes ($v_count_runs expected)"
exit 1
fi
echo -n "$run_walltimes"
}
function store_timings {
  # Appends one CSV row (`threads,run,run_time`) per walltime to the timings file.
  #
  # $1: threads count
  # $2: newline-separated walltimes (as produced by extract_run_walltimes)
  # $3: timings CSV filename
  local threads=$1
  local run_walltimes=$2
  local timings_file_name=$3
  local run=0
  local run_walltime
  # Plain scalar read (the previous `-a` array read relied on element-0
  # expansion semantics); IFS= preserves the line verbatim.
  while IFS= read -r run_walltime; do
    # Replace the time's decimal comma with a dot, if present.
    #
    echo "$threads,$run,${run_walltime/,/.}" >> "$timings_file_name"
    (( ++run ))
  done <<< "$run_walltimes"
}
####################################################################################################
# HELPERS
####################################################################################################
# Shell-quotes the given string for safe interpolation into composed commands.
# Quoting is kept as minimal as possible, since heavier quoting can make the
# resulting command ugly.
#
function q {
  printf '%q' "$1"
}
function set_thread_numbers {
  # Parses the threads spec (see help): either `<min>-<max>`, which sets
  # v_min_threads/v_max_threads, or a comma-separated list, which sets
  # v_thread_numbers_list. An empty spec leaves the defaults in place.
  # Exits with an error on an invalid spec.
  local threads_spec=$1
  if [[ -z $threads_spec ]]; then
    return
  elif [[ $threads_spec =~ ^([[:digit:]]+)-([[:digit:]]+)$ ]]; then
    v_min_threads=${BASH_REMATCH[1]}
    v_max_threads=${BASH_REMATCH[2]}
    # Easy mistakes :) - reject a zero minimum, and also an inverted range
    # (e.g. `5-2`), which the previous version silently accepted.
    if ((v_min_threads > 0 && v_min_threads <= v_max_threads)); then
      return
    fi
  elif [[ $threads_spec =~ ^[[:digit:]]+(,[[:digit:]]+)*$ ]]; then
    mapfile -td, v_thread_numbers_list < <(echo -n "$threads_spec")
    return
  fi
  >&2 echo 'Invalid threads spec (see help): `'"$threads_spec"'`'
  exit 1
}
# Runs a command on the guest, over password-authenticated SSH (via sshpass).
#
# Input: $@=ssh params
#
function run_remote_command {
  # If there is an error, the output may never be shown, so we send it to stderr regardless.
  #
  # Disabling the host checking is required, both because sshpass doesn't get along with the host checking
  # prompt, and because if the guest is changed (reset), SSH will complain.
  # Set the ERROR log level, in order to skip the warning about the host added to the known list.
  #
  sshpass -p "$c_ssh_password" \
    ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR \
    -p "$c_ssh_port" "$c_ssh_user"@"$c_ssh_host" "$@" | tee /dev/stderr
}
# Tricky, for a simple concept:
#
# - waiting for the port to be open is not enough, as QEMU leaves it open regardless;
# - we can't use a single attempt with a long timeout (due to the first SSH connection being slower);
#   in some cases, the connection times out - possibly this is due to QEMU receiving the packets before
#   the SSH server is up, and discarding them instead of queuing them.
#
function wait_guest_online {
  # First, wait for QEMU to expose the forwarded SSH port at all.
  while ! nc -z localhost "$c_ssh_port"; do sleep 1; done
  SECONDS=0
  local single_attempt_timeout=2
  local wait_time=60
  while (( SECONDS < wait_time )); do
    if run_remote_command -o ConnectTimeout="$single_attempt_timeout" exit 2> /dev/null; then
      return
    fi
    # If the connection is refused immediately, ConnectTimeout doesn't throttle
    # the attempts; sleep briefly to avoid a busy loop.
    sleep 0.5
  done
  >&2 echo "Couldn't connect to the VM within $wait_time seconds"
  exit 1
}
# The guest may not (for RISC-V, it won't) respond to an ACPI shutdown, so the QEMU monitor strategy
# is not suitable.
#
function shutdown_guest {
  # NOTE(review): if halt drops the SSH connection before the remote command
  # returns, ssh may exit non-zero and trip errexit/pipefail - confirm whether
  # this needs an explicit `|| true`.
  run_remote_command "/sbin/halt"
  # Shutdown is asynchronous, so just wait for the pidfile to go.
  # In some cases (unclear why, for PARSEC freqmine), the image file would still be locked by the next
  # thread group, implying that QEMU is still on. For this reason, an extra check is needed:
  # lsof on the run image confirms no process still holds it open.
  #
  while [[ -f $c_qemu_pidfile || $(lsof "$c_guest_image_temp" 2> /dev/null || true) != "" ]]; do
    sleep 0.5
  done
}
####################################################################################################
# EXECUTION
####################################################################################################
decode_cmdline_args "$@"
load_includes
# The setup helpers below (create_directories, init_debug_log, cache_sudo,
# find_host_system_configuration_options, set_host_system_configuration,
# prepare_isolated_processors_list, prepare_threads_number_list) are provided
# by support_scripts/benchmark_apis.sh and the sourced scripts - TODO confirm.
create_directories
copy_busybear_image
init_debug_log
cache_sudo
find_host_system_configuration_options
# Must run before any system configuration change, so the EXIT trap can undo it.
register_exit_handlers
set_host_system_configuration
prepare_isolated_processors_list
# An explicit `--threads a,b,c` list takes precedence over the computed one.
if ((${#v_thread_numbers_list[@]} == 0)); then
  prepare_threads_number_list
fi
clear_existing_data
run_benchmark