bpf: lbr: enable reading LBR from tracing bpf programs
The typical way to access LBR is via a hardware perf_event. For CPUs with
FREEZE_LBRS_ON_PMI support, the PMI can capture a reliable LBR snapshot. On
the other hand, LBR can also be useful in non-PMI scenarios. For example, in
a kretprobe or a BPF fexit program, LBR can provide a lot of information
about what happened inside the function.

In this RFC, we try to enable LBR for BPF programs. This works as follows:
  1. Create a hardware perf_event with PERF_SAMPLE_BRANCH_* on each CPU;
  2. Call a new bpf helper (bpf_get_branch_trace) from the BPF program;
  3. Before calling the BPF program, the kernel stops LBR on the local CPU,
     makes a copy of the LBR entries, and resumes LBR;
  4. In the BPF program, the helper accesses the copy from #3.

Please see tools/testing/selftests/bpf/[progs|prog_tests]/get_branch_trace.c
for a detailed example. Note that this process is far from ideal, but it
allows quick prototyping of this feature.
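
As a rough, untested illustration (not part of this patch), the BPF side of
steps 2 and 4 could look something like the sketch below. It assumes the new
bpf_get_branch_trace() helper and the bpf_fexit_loop_test1() function added
later in this series; the program name dump_lbr is arbitrary. The complete
version is in the selftest.

  /* minimal sketch: print the snapshotted LBR entries from an fexit prog */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  struct perf_branch_entry entries[32] = {};

  SEC("fexit/bpf_fexit_loop_test1")
  int BPF_PROG(dump_lbr, int n, int ret)
  {
          long i, cnt;

          cnt = bpf_get_branch_trace(entries, sizeof(entries), 0);
          for (i = 0; i < 32 && i < cnt; i++)
                  bpf_printk("lbr %ld: 0x%llx -> 0x%llx",
                             i, entries[i].from, entries[i].to);
          return 0;
  }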

AFAICT, the biggest challenge here is that we are now sharing the LBR between
the PMI path and the non-PMI path, which could trigger some interesting race
conditions. However, if we can tolerate some level of missed/corrupted
samples, this should still be very useful.

Please share your thoughts and comments on this. Thanks in advance!

Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Like Xu <like.xu@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
liu-song-6 authored and intel-lab-lkp committed Aug 18, 2021
1 parent 8cacfc8 commit bdd1320
Showing 14 changed files with 245 additions and 2 deletions.
16 changes: 16 additions & 0 deletions arch/x86/events/intel/lbr.c
@@ -1862,3 +1862,19 @@ EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
struct event_constraint vlbr_constraint =
__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT);

DEFINE_PER_CPU(struct perf_branch_entry, bpf_lbr_entries[MAX_LBR_ENTRIES]);
DEFINE_PER_CPU(int, bpf_lbr_cnt);

/*
* Snapshot the local CPU's LBR into the per-CPU bpf_lbr_entries buffer.
* LBR recording is paused while the copy is made so the snapshot is
* internally consistent, then resumed.
*/
int bpf_branch_record_read(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

intel_pmu_lbr_disable_all();
intel_pmu_lbr_read();
memcpy(this_cpu_ptr(&bpf_lbr_entries), cpuc->lbr_entries,
sizeof(struct perf_branch_entry) * x86_pmu.lbr_nr);
*this_cpu_ptr(&bpf_lbr_cnt) = x86_pmu.lbr_nr;
intel_pmu_lbr_enable_all(false);
return 0;
}
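
For reference, each snapshotted entry above is a struct perf_branch_entry
from include/uapi/linux/perf_event.h, which at the time of this patch looks
roughly like this (newer kernels may add fields):

struct perf_branch_entry {
	__u64	from;		/* branch source address */
	__u64	to;		/* branch target address */
	__u64	mispred:1,	/* target mispredicted */
		predicted:1,	/* target predicted */
		in_tx:1,	/* in hardware transaction */
		abort:1,	/* transaction abort */
		cycles:16,	/* cycle count to last branch */
		type:4,		/* branch type */
		reserved:40;
};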
3 changes: 2 additions & 1 deletion include/linux/filter.h
@@ -575,7 +575,8 @@ struct bpf_prog {
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1; /* Do we call get_func_ip() */
call_get_func_ip:1, /* Do we call get_func_ip() */
call_get_branch:1; /* Do we call get_branch_trace() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
2 changes: 2 additions & 0 deletions include/linux/perf_event.h
@@ -116,6 +116,8 @@ struct perf_branch_stack {
struct perf_branch_entry entries[];
};

int bpf_branch_record_read(void);

struct task_struct;

/*
8 changes: 8 additions & 0 deletions include/uapi/linux/bpf.h
@@ -4871,6 +4871,13 @@ union bpf_attr {
* Return
* Value specified by user at BPF link creation/attachment time
* or 0, if it was not specified.
*
* long bpf_get_branch_trace(void *entries, u32 size, u64 flags)
* Description
* Get branch trace from hardware engines like Intel LBR.
* Return
* > 0, number of branch entries.
* **-EOPNOTSUPP**, the hardware/kernel does not support this helper.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5048,6 +5055,7 @@ union bpf_attr {
FN(timer_cancel), \
FN(get_func_ip), \
FN(get_attach_cookie), \
FN(get_branch_trace), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
3 changes: 3 additions & 0 deletions kernel/bpf/trampoline.c
@@ -566,6 +566,9 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
{
rcu_read_lock();
migrate_disable();
if (prog->call_get_branch)
bpf_branch_record_read();

if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
7 changes: 7 additions & 0 deletions kernel/bpf/verifier.c
@@ -6446,6 +6446,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->prog->call_get_func_ip = true;
}

if (func_id == BPF_FUNC_get_branch_trace) {
if (env->prog->aux->sleepable) {
verbose(env, "sleepable progs cannot call get_branch_trace\n");
return -ENOTSUPP;
}
env->prog->call_get_branch = true;
}
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
5 changes: 5 additions & 0 deletions kernel/events/core.c
@@ -13434,3 +13434,8 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */

int __weak bpf_branch_record_read(void)
{
return -EOPNOTSUPP;
}
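
The __weak definition above is only a generic fallback: on x86 it is replaced
at link time by the strong definition in arch/x86/events/intel/lbr.c, while
other architectures keep returning -EOPNOTSUPP. A purely illustrative
userspace sketch of the same weak/strong linkage pattern (not kernel code;
the function name is reused only for illustration):

/* compiles and runs as-is; add a strong definition to see it win at link time */
#include <errno.h>
#include <stdio.h>

/* weak fallback, analogous to the stub in kernel/events/core.c */
__attribute__((weak)) int bpf_branch_record_read(void)
{
	return -EOPNOTSUPP;
}

int main(void)
{
	printf("bpf_branch_record_read() = %d\n", bpf_branch_record_read());
	return 0;
}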
27 changes: 27 additions & 0 deletions kernel/trace/bpf_trace.c
@@ -1002,6 +1002,29 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.arg1_type = ARG_PTR_TO_CTX,
};

#ifndef MAX_LBR_ENTRIES
#define MAX_LBR_ENTRIES 32
#endif

DECLARE_PER_CPU(struct perf_branch_entry, bpf_lbr_entries[MAX_LBR_ENTRIES]);
DECLARE_PER_CPU(int, bpf_lbr_cnt);
BPF_CALL_3(bpf_get_branch_trace, void *, buf, u32, size, u64, flags)
{
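/* Copy at most @size bytes of the latest per-CPU LBR snapshot into @buf.
* The return value is the number of entries in the snapshot, which can be
* larger than what fits in @buf when @size is small.
*/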
memcpy(buf, *this_cpu_ptr(&bpf_lbr_entries),
min_t(u32, size,
sizeof(struct perf_branch_entry) * MAX_LBR_ENTRIES));
return *this_cpu_ptr(&bpf_lbr_cnt);
}

static const struct bpf_func_proto bpf_get_branch_trace_proto = {
.func = bpf_get_branch_trace,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};

static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1115,6 +1138,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
case BPF_FUNC_get_branch_trace:
return &bpf_get_branch_trace_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -1851,6 +1876,8 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
rcu_read_lock();
if (prog->call_get_branch)
bpf_branch_record_read();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
}
15 changes: 14 additions & 1 deletion net/bpf/test_run.c
@@ -231,6 +231,18 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
return sk;
}

int noinline bpf_fexit_loop_test1(int n)
{
int i, sum = 0;

/* The primary goal of this test is to test LBR. Create a lot of
* branches in the function, so that we can catch them easily in LBR.
*/
for (i = 0; i < n; i++)
sum += i;
return sum;
}

__diag_pop();

ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
@@ -293,7 +305,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 ||
bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 ||
bpf_fentry_test8(&arg) != 0)
bpf_fentry_test8(&arg) != 0 ||
bpf_fexit_loop_test1(101) != 5050)
goto out;
break;
case BPF_MODIFY_RETURN:
8 changes: 8 additions & 0 deletions tools/include/uapi/linux/bpf.h
@@ -4871,6 +4871,13 @@ union bpf_attr {
* Return
* Value specified by user at BPF link creation/attachment time
* or 0, if it was not specified.
*
* long bpf_get_branch_trace(void *entries, u32 size, u64 flags)
* Description
* Get branch trace from hardware engines like Intel LBR.
* Return
* > 0, number of branch entries.
* **-EOPNOTSUPP**, the hardware/kernel does not support this helper.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5048,6 +5055,7 @@ union bpf_attr {
FN(timer_cancel), \
FN(get_func_ip), \
FN(get_attach_cookie), \
FN(get_branch_trace), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
82 changes: 82 additions & 0 deletions tools/testing/selftests/bpf/prog_tests/get_branch_trace.c
@@ -0,0 +1,82 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <test_progs.h>
#include "get_branch_trace.skel.h"

static int pfd_array[128] = {-1}; /* TODO: remove hardcoded 128 */

static int create_perf_events(void)
{
struct perf_event_attr attr = {0};
int cpu;

/* create perf event */
attr.size = sizeof(attr);
attr.type = PERF_TYPE_HARDWARE;
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.freq = 1;
attr.sample_freq = 4000;
attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL |
PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
for (cpu = 0; cpu < libbpf_num_possible_cpus(); cpu++) {
pfd_array[cpu] = syscall(__NR_perf_event_open, &attr,
-1, cpu, -1, PERF_FLAG_FD_CLOEXEC);
if (pfd_array[cpu] < 0)
break;
}
return cpu == 0;
}

static void close_perf_events(void)
{
int cpu = 0;
int fd;

while (cpu < 128) {
fd = pfd_array[cpu];
if (fd < 0)
break;
close(fd);
cpu++;
}
}

void test_get_branch_trace(void)
{
struct get_branch_trace *skel = NULL;
int err, prog_fd;
__u32 retval;

if (create_perf_events()) {
test__skip(); /* system doesn't support LBR */
goto cleanup;
}

skel = get_branch_trace__open_and_load();
if (!ASSERT_OK_PTR(skel, "get_branch_trace__open_and_load"))
goto cleanup;

err = kallsyms_find("bpf_fexit_loop_test1", &skel->bss->address_low);
if (!ASSERT_OK(err, "kallsyms_find"))
goto cleanup;

err = kallsyms_find_next("bpf_fexit_loop_test1", &skel->bss->address_high);
if (!ASSERT_OK(err, "kallsyms_find_next"))
goto cleanup;

err = get_branch_trace__attach(skel);
if (!ASSERT_OK(err, "get_branch_trace__attach"))
goto cleanup;

prog_fd = bpf_program__fd(skel->progs.test1);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, 0, &retval, NULL);

if (!ASSERT_OK(err, "bpf_prog_test_run"))
goto cleanup;
ASSERT_GT(skel->bss->test1_hits, 5, "find_test1_in_lbr");

cleanup:
get_branch_trace__destroy(skel);
close_perf_events();
}
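
Assuming the usual BPF selftests workflow (not covered by this patch), the
new test can then be run with something like:

  cd tools/testing/selftests/bpf
  make
  sudo ./test_progs -t get_branch_trace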
36 changes: 36 additions & 0 deletions tools/testing/selftests/bpf/progs/get_branch_trace.c
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

__u64 test1_hits = 0;
__u64 address_low = 0;
__u64 address_high = 0;

#define MAX_LBR_ENTRIES 32

struct perf_branch_entry entries[MAX_LBR_ENTRIES] = {};

static inline bool in_range(__u64 val)
{
return (val >= address_low) && (val < address_high);
}

SEC("fexit/bpf_fexit_loop_test1")
int BPF_PROG(test1, int n, int ret)
{
long cnt, i;

cnt = bpf_get_branch_trace(entries, sizeof(entries), 0);

for (i = 0; i < MAX_LBR_ENTRIES; i++) {
if (i >= cnt)
break;
if (in_range(entries[i].from) && in_range(entries[i].to))
test1_hits++;
}
return 0;
}
30 changes: 30 additions & 0 deletions tools/testing/selftests/bpf/trace_helpers.c
@@ -117,6 +117,36 @@ int kallsyms_find(const char *sym, unsigned long long *addr)
return err;
}

/* find the address of the next symbol; this can be used to determine
* the end of a function
*/
int kallsyms_find_next(const char *sym, unsigned long long *addr)
{
char type, name[500];
unsigned long long value;
bool found = false;
int err = 0;
FILE *f;

f = fopen("/proc/kallsyms", "r");
if (!f)
return -EINVAL;

while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
if (found) {
*addr = value;
goto out;
}
if (strcmp(name, sym) == 0)
found = true;
}
err = -ENOENT;

out:
fclose(f);
return err;
}

void read_trace_pipe(void)
{
int trace_fd;
5 changes: 5 additions & 0 deletions tools/testing/selftests/bpf/trace_helpers.h
@@ -16,6 +16,11 @@ long ksym_get_addr(const char *name);
/* open kallsyms and find addresses on the fly, faster than load + search. */
int kallsyms_find(const char *sym, unsigned long long *addr);

/* find the address of the next symbol; this can be used to determine
* the end of a function
*/
int kallsyms_find_next(const char *sym, unsigned long long *addr);

void read_trace_pipe(void);

ssize_t get_uprobe_offset(const void *addr, ssize_t base);
