bpf: lbr: enable reading LBR from tracing bpf programs
The typical way to access LBR is via a hardware perf_event. For CPUs with
FREEZE_LBRS_ON_PMI support, the PMI can capture a reliable LBR snapshot. On
the other hand, LBR can also be useful in non-PMI scenarios. For example, in
a kretprobe or a BPF fexit program, LBR can provide a lot of information
about what happened inside the function.

In this RFC, we try to enable LBR for BPF programs. This works as follows:
  1. Create a hardware perf_event with PERF_SAMPLE_BRANCH_* on each CPU;
  2. Call a new bpf helper (bpf_get_branch_trace) from the BPF program;
  3. Before calling the BPF program, the kernel stops LBR on the local CPU,
     makes a copy of the LBR entries, and resumes LBR;
  4. In the BPF program, the helper accesses the copy from #3.

Please see tools/testing/selftests/bpf/[progs|prog_tests]/get_branch_trace.c
for a detailed example. Note that this process is far from ideal, but it
allows quick prototyping of this feature.
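
As a rough, untested illustration (not part of this patch), the BPF side of
steps 2 and 4 could look something like the sketch below. It assumes the new
bpf_get_branch_trace() helper and the bpf_fexit_loop_test1() function added
later in this series; the program name dump_lbr is arbitrary. The complete
version is in the selftest.

  /* minimal sketch: print the snapshotted LBR entries from an fexit prog */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  struct perf_branch_entry entries[32] = {};

  SEC("fexit/bpf_fexit_loop_test1")
  int BPF_PROG(dump_lbr, int n, int ret)
  {
          long i, cnt;

          cnt = bpf_get_branch_trace(entries, sizeof(entries), 0);
          for (i = 0; i < 32 && i < cnt; i++)
                  bpf_printk("lbr %ld: 0x%llx -> 0x%llx",
                             i, entries[i].from, entries[i].to);
          return 0;
  }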

AFAICT, the biggest challenge here is that we are now sharing the LBR between
the PMI path and the non-PMI path, which could trigger some interesting race
conditions. However, if we can tolerate some level of missed/corrupted
samples, this should still be very useful.

Please share your thoughts and comments on this. Thanks in advance!

Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Like Xu <like.xu@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
liu-song-6 authored and intel-lab-lkp committed Aug 18, 2021
1 parent 8cacfc8 commit bdd1320
Showing 14 changed files with 245 additions and 2 deletions.
16 changes: 16 additions & 0 deletions arch/x86/events/intel/lbr.c
@@ -1862,3 +1862,19 @@ EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
struct event_constraint vlbr_constraint =
__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT);

DEFINE_PER_CPU(struct perf_branch_entry, bpf_lbr_entries[MAX_LBR_ENTRIES]);
DEFINE_PER_CPU(int, bpf_lbr_cnt);

/*
* Snapshot the local CPU's LBR into the per-CPU bpf_lbr_entries buffer.
* LBR recording is paused while the copy is made so the snapshot is
* internally consistent, then resumed.
*/
int bpf_branch_record_read(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

intel_pmu_lbr_disable_all();
intel_pmu_lbr_read();
memcpy(this_cpu_ptr(&bpf_lbr_entries), cpuc->lbr_entries,
sizeof(struct perf_branch_entry) * x86_pmu.lbr_nr);
*this_cpu_ptr(&bpf_lbr_cnt) = x86_pmu.lbr_nr;
intel_pmu_lbr_enable_all(false);
return 0;
}
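
For reference, each snapshotted entry above is a struct perf_branch_entry
from include/uapi/linux/perf_event.h, which at the time of this patch looks
roughly like this (newer kernels may add fields):

struct perf_branch_entry {
	__u64	from;		/* branch source address */
	__u64	to;		/* branch target address */
	__u64	mispred:1,	/* target mispredicted */
		predicted:1,	/* target predicted */
		in_tx:1,	/* in hardware transaction */
		abort:1,	/* transaction abort */
		cycles:16,	/* cycle count to last branch */
		type:4,		/* branch type */
		reserved:40;
};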
3 changes: 2 additions & 1 deletion include/linux/filter.h
@@ -575,7 +575,8 @@ struct bpf_prog {
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1; /* Do we call get_func_ip() */
call_get_func_ip:1, /* Do we call get_func_ip() */
call_get_branch:1; /* Do we call get_branch_trace() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
2 changes: 2 additions & 0 deletions include/linux/perf_event.h
@@ -116,6 +116,8 @@ struct perf_branch_stack {
struct perf_branch_entry entries[];
};

int bpf_branch_record_read(void);

struct task_struct;

/*
8 changes: 8 additions & 0 deletions include/uapi/linux/bpf.h
@@ -4871,6 +4871,13 @@ union bpf_attr {
* Return
* Value specified by user at BPF link creation/attachment time
* or 0, if it was not specified.
*
* long bpf_get_branch_trace(void *entries, u32 size, u64 flags)
* Description
* Get branch trace from hardware engines like Intel LBR.
* Return
* > 0, number of branch entries.
* **-EOPNOTSUPP**, the hardware/kernel does not support this helper.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5048,6 +5055,7 @@ union bpf_attr {
FN(timer_cancel), \
FN(get_func_ip), \
FN(get_attach_cookie), \
FN(get_branch_trace), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
3 changes: 3 additions & 0 deletions kernel/bpf/trampoline.c
@@ -566,6 +566,9 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
{
rcu_read_lock();
migrate_disable();
if (prog->call_get_branch)
bpf_branch_record_read();

if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
7 changes: 7 additions & 0 deletions kernel/bpf/verifier.c
@@ -6446,6 +6446,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->prog->call_get_func_ip = true;
}

if (func_id == BPF_FUNC_get_branch_trace) {
if (env->prog->aux->sleepable) {
verbose(env, "sleepable progs cannot call get_branch_trace\n");
return -ENOTSUPP;
}
env->prog->call_get_branch = true;
}
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
5 changes: 5 additions & 0 deletions kernel/events/core.c
@@ -13434,3 +13434,8 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */

int __weak bpf_branch_record_read(void)
{
return -EOPNOTSUPP;
}
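
The __weak definition above is only a generic fallback: on x86 it is replaced
at link time by the strong definition in arch/x86/events/intel/lbr.c, while
other architectures keep returning -EOPNOTSUPP. A purely illustrative
userspace sketch of the same weak/strong linkage pattern (not kernel code;
the function name is reused only for illustration):

/* compiles and runs as-is; add a strong definition to see it win at link time */
#include <errno.h>
#include <stdio.h>

/* weak fallback, analogous to the stub in kernel/events/core.c */
__attribute__((weak)) int bpf_branch_record_read(void)
{
	return -EOPNOTSUPP;
}

int main(void)
{
	printf("bpf_branch_record_read() = %d\n", bpf_branch_record_read());
	return 0;
}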
27 changes: 27 additions & 0 deletions kernel/trace/bpf_trace.c
@@ -1002,6 +1002,29 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.arg1_type = ARG_PTR_TO_CTX,
};

#ifndef MAX_LBR_ENTRIES
#define MAX_LBR_ENTRIES 32
#endif

DECLARE_PER_CPU(struct perf_branch_entry, bpf_lbr_entries[MAX_LBR_ENTRIES]);
DECLARE_PER_CPU(int, bpf_lbr_cnt);
BPF_CALL_3(bpf_get_branch_trace, void *, buf, u32, size, u64, flags)
{
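/* Copy at most @size bytes of the latest per-CPU LBR snapshot into @buf.
* The return value is the number of entries in the snapshot, which can be
* larger than what fits in @buf when @size is small.
*/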
memcpy(buf, *this_cpu_ptr(&bpf_lbr_entries),
min_t(u32, size,
sizeof(struct perf_branch_entry) * MAX_LBR_ENTRIES));
return *this_cpu_ptr(&bpf_lbr_cnt);
}

static const struct bpf_func_proto bpf_get_branch_trace_proto = {
.func = bpf_get_branch_trace,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};

static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1115,6 +1138,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
case BPF_FUNC_get_branch_trace:
return &bpf_get_branch_trace_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -1851,6 +1876,8 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
rcu_read_lock();
if (prog->call_get_branch)
bpf_branch_record_read();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
}
15 changes: 14 additions & 1 deletion net/bpf/test_run.c
@@ -231,6 +231,18 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
return sk;
}

int noinline bpf_fexit_loop_test1(int n)
{
int i, sum = 0;

/* The primary goal of this test is to test LBR. Create a lot of
* branches in the function, so that we can catch them easily in LBR.
*/
for (i = 0; i < n; i++)
sum += i;
return sum;
}

__diag_pop();

ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
@@ -293,7 +305,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 ||
bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 ||
bpf_fentry_test8(&arg) != 0)
bpf_fentry_test8(&arg) != 0 ||
bpf_fexit_loop_test1(101) != 5050)
goto out;
break;
case BPF_MODIFY_RETURN:
8 changes: 8 additions & 0 deletions tools/include/uapi/linux/bpf.h
@@ -4871,6 +4871,13 @@ union bpf_attr {
* Return
* Value specified by user at BPF link creation/attachment time
* or 0, if it was not specified.
*
* long bpf_get_branch_trace(void *entries, u32 size, u64 flags)
* Description
* Get branch trace from hardware engines like Intel LBR.
* Return
* > 0, number of branch entries.
* **-EOPNOTSUPP**, the hardware/kernel does not support this helper.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5048,6 +5055,7 @@ union bpf_attr {
FN(timer_cancel), \
FN(get_func_ip), \
FN(get_attach_cookie), \
FN(get_branch_trace), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
82 changes: 82 additions & 0 deletions tools/testing/selftests/bpf/prog_tests/get_branch_trace.c
@@ -0,0 +1,82 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <test_progs.h>
#include "get_branch_trace.skel.h"

static int pfd_array[128] = {-1}; /* TODO: remove hardcoded 128 */

static int create_perf_events(void)
{
struct perf_event_attr attr = {0};
int cpu;

/* create perf event */
attr.size = sizeof(attr);
attr.type = PERF_TYPE_HARDWARE;
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.freq = 1;
attr.sample_freq = 4000;
attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL |
PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
for (cpu = 0; cpu < libbpf_num_possible_cpus(); cpu++) {
pfd_array[cpu] = syscall(__NR_perf_event_open, &attr,
-1, cpu, -1, PERF_FLAG_FD_CLOEXEC);
if (pfd_array[cpu] < 0)
break;
}
return cpu == 0;
}

static void close_perf_events(void)
{
int cpu = 0;
int fd;

while (cpu < 128) {
fd = pfd_array[cpu];
if (fd < 0)
break;
close(fd);
cpu++;
}
}

void test_get_branch_trace(void)
{
struct get_branch_trace *skel = NULL;
int err, prog_fd;
__u32 retval;

if (create_perf_events()) {
test__skip(); /* system doesn't support LBR */
goto cleanup;
}

skel = get_branch_trace__open_and_load();
if (!ASSERT_OK_PTR(skel, "get_branch_trace__open_and_load"))
goto cleanup;

err = kallsyms_find("bpf_fexit_loop_test1", &skel->bss->address_low);
if (!ASSERT_OK(err, "kallsyms_find"))
goto cleanup;

err = kallsyms_find_next("bpf_fexit_loop_test1", &skel->bss->address_high);
if (!ASSERT_OK(err, "kallsyms_find_next"))
goto cleanup;

err = get_branch_trace__attach(skel);
if (!ASSERT_OK(err, "get_branch_trace__attach"))
goto cleanup;

prog_fd = bpf_program__fd(skel->progs.test1);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, 0, &retval, NULL);

if (!ASSERT_OK(err, "bpf_prog_test_run"))
goto cleanup;
ASSERT_GT(skel->bss->test1_hits, 5, "find_test1_in_lbr");

cleanup:
get_branch_trace__destroy(skel);
close_perf_events();
}
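
Assuming the usual BPF selftests workflow (not covered by this patch), the
new test can then be run with something like:

  cd tools/testing/selftests/bpf
  make
  sudo ./test_progs -t get_branch_trace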
36 changes: 36 additions & 0 deletions tools/testing/selftests/bpf/progs/get_branch_trace.c
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

__u64 test1_hits = 0;
__u64 address_low = 0;
__u64 address_high = 0;

#define MAX_LBR_ENTRIES 32

struct perf_branch_entry entries[MAX_LBR_ENTRIES] = {};

static inline bool in_range(__u64 val)
{
return (val >= address_low) && (val < address_high);
}

SEC("fexit/bpf_fexit_loop_test1")
int BPF_PROG(test1, int n, int ret)
{
long cnt, i;

cnt = bpf_get_branch_trace(entries, sizeof(entries), 0);

for (i = 0; i < MAX_LBR_ENTRIES; i++) {
if (i >= cnt)
break;
if (in_range(entries[i].from) && in_range(entries[i].to))
test1_hits++;
}
return 0;
}
30 changes: 30 additions & 0 deletions tools/testing/selftests/bpf/trace_helpers.c
@@ -117,6 +117,36 @@ int kallsyms_find(const char *sym, unsigned long long *addr)
return err;
}

/* find the address of the next symbol; this can be used to determine
* the end of a function
*/
int kallsyms_find_next(const char *sym, unsigned long long *addr)
{
char type, name[500];
unsigned long long value;
bool found = false;
int err = 0;
FILE *f;

f = fopen("/proc/kallsyms", "r");
if (!f)
return -EINVAL;

while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
if (found) {
*addr = value;
goto out;
}
if (strcmp(name, sym) == 0)
found = true;
}
err = -ENOENT;

out:
fclose(f);
return err;
}

void read_trace_pipe(void)
{
int trace_fd;
5 changes: 5 additions & 0 deletions tools/testing/selftests/bpf/trace_helpers.h
@@ -16,6 +16,11 @@ long ksym_get_addr(const char *name);
/* open kallsyms and find addresses on the fly, faster than load + search. */
int kallsyms_find(const char *sym, unsigned long long *addr);

/* find the address of the next symbol; this can be used to determine
* the end of a function
*/
int kallsyms_find_next(const char *sym, unsigned long long *addr);

void read_trace_pipe(void);

ssize_t get_uprobe_offset(const void *addr, ssize_t base);
