Skip to content
Permalink
Browse files
net_sched: introduce eBPF based Qdisc
This *incomplete* patch introduces a programmable Qdisc with
eBPF.  The goal is to make this Qdisc as programmable as possible,
that is, to replace as many existing Qdisc's as we can, no matter
in tree or out of tree. And we want to make programmers' and researchers'
lives as easy as possible, so that they don't have to write a complete
Qdisc kernel module just to experiment with some queuing theory.

The design was discussed during last LPC:
https://linuxplumbersconf.org/event/7/contributions/679/attachments/520/1188/sch_bpf.pdf

Here is a summary of design decisions I made:

1. Avoid eBPF struct_ops, as it would be really hard to program
   a Qdisc with this approach, literally all the struct Qdisc_ops
   and struct Qdisc_class_ops are needed to implement. This is almost
   as hard as programming a Qdisc kernel module.

2. Avoid exposing skb's to user-space, which means we don't introduce
   a map to store skb's. Instead, store them in kernel without exposure
   to user-space. There are three different reasons behind this:

   2a) User-space does not need to read skb, there is no use case to let
   user-space make decisions, so far.

   2b) Kernel would lose the visibility of the "queues", as maps are only
   shared between eBPF programs and user-space. These queues still have to
   interact with the kernel, for example, kernel wants to reset all queues
   when we reset the network interface, kernel wants to adjust number of
   queues if they are mapped to hardware queues.

   2c) It is harder to interact with existing TC infra. See below.

3. Integrate with existing TC infra. For example, if the user doesn't want
   to implement her own filters (e.g. a flow dissector), she should be able
   to re-use the existing TC filters. And each queue can be easily mapped
   to a TC class and dump its stats easily via netlink. Users can use this
   Qdisc together with any other Qdisc's too, pretty much like a regular
   Qdisc.

So I choose to use priority queues to store skb's inside a flow and to
store flows inside a Qdisc, and let eBPF programs decide the *relative*
position of the skb within the flow and the *relative* order of the flows
too, upon each enqueue and dequeue. Each flow is also exposed to user as
a TC class, like many other classful Qdisc's.

Although the biggest limitation is obviously that users can not traverse
the packets or flows inside the Qdisc, I think at least they could store
those global information of interest inside their own hashmap.

Any high-level feedback is welcome. Please do not review any coding details
until RFC tag is removed.

TODO:
1. actually test it
2. write a document for this Qdisc
3. add test cases and sample code

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
  • Loading branch information
Cong Wang authored and intel-lab-lkp committed Sep 13, 2021
1 parent 13bb842 commit 11b7d639a3edc1182a3027ae62cc35dcf0924178
Show file tree
Hide file tree
Showing 8 changed files with 754 additions and 0 deletions.
@@ -8,6 +8,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act,
struct __sk_buff, struct sk_buff)
BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act,
struct __sk_buff, struct sk_buff)
BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_QDISC, tc_cls_act,
struct __sk_buff, struct sk_buff)
BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp,
struct xdp_md, struct xdp_buff)
#ifdef CONFIG_CGROUP_BPF
@@ -0,0 +1,90 @@
// SPDX-License-Identifier: GPL-2.0
/*
* A priority queue implementation based on rbtree
*
* Copyright (C) 2021, Bytedance, Cong Wang <cong.wang@bytedance.com>
*/

#ifndef _LINUX_PRIORITY_QUEUE_H
#define _LINUX_PRIORITY_QUEUE_H

#include <linux/rbtree.h>

/*
 * A queue entry. Embed this in the queued object and recover the
 * container with rb_entry()/container_of().
 */
struct pq_node {
	struct rb_node rb_node;
};

/*
 * The priority queue itself: a cached rbtree (leftmost node cached for
 * O(1) access to the top element) plus the user-supplied ordering
 * predicate. cmp(l, r) returning true means l sorts before r.
 */
struct pq_root {
	struct rb_root_cached rb_root;
	bool (*cmp)(struct pq_node *l, struct pq_node *r);
};

/* Initialize @root as an empty queue ordered by @cmp. */
static inline void pq_root_init(struct pq_root *root,
				bool (*cmp)(struct pq_node *l, struct pq_node *r))
{
	root->cmp = cmp;
	root->rb_root = RB_ROOT_CACHED;
}

/*
 * Insert @node into @root according to the queue's cmp() predicate.
 * Nodes that compare equal stay together; we don't care about their
 * mutual order.
 */
static inline void pq_push(struct pq_root *root, struct pq_node *node)
{
	struct rb_node **pos = &root->rb_root.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	/* Walk down to the insertion point. */
	while (*pos) {
		struct pq_node *cur;

		parent = *pos;
		cur = rb_entry(parent, struct pq_node, rb_node);
		if (root->cmp(cur, node)) {
			pos = &parent->rb_left;
		} else {
			pos = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&node->rb_node, parent, pos);
	rb_insert_color_cached(&node->rb_node, &root->rb_root, leftmost);
}

/* Return the highest-priority node without removing it, or NULL if empty. */
static inline struct pq_node *pq_top(struct pq_root *root)
{
	struct rb_node *first = rb_first_cached(&root->rb_root);

	return first ? rb_entry(first, struct pq_node, rb_node) : NULL;
}

static inline struct pq_node *pq_pop(struct pq_root *root)
{
struct pq_node *t = pq_top(root);

if (t)
rb_erase_cached(&t->rb_node, &root->rb_root);
return t;
}

static inline void pq_flush(struct pq_root *root, void (*destroy)(struct pq_node *))
{
struct rb_node *node, *next;

for (node = rb_first(&root->rb_root.rb_root);
next = node ? rb_next(node) : NULL, node != NULL;
node = next) {
struct pq_node *pqe;

pqe = rb_entry(node, struct pq_node, rb_node);
if (destroy)
destroy(pqe);
}
}
#endif /* _LINUX_PRIORITY_QUEUE_H */
@@ -36,6 +36,7 @@
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <linux/priority_queue.h>
#include <net/flow.h>
#include <net/page_pool.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -735,6 +736,7 @@ struct sk_buff {
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct pq_node pqnode; /* used in ebpf qdisc */
struct list_head list;
};

@@ -949,6 +949,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LSM,
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_SCHED_QDISC,
};

enum bpf_attach_type {
@@ -6258,4 +6259,23 @@ enum {
BTF_F_ZERO = (1ULL << 3),
};

/* Context passed to the eBPF qdisc enqueue/dequeue programs. */
struct sch_bpf_ctx {
	/* Input */
	struct __sk_buff *skb;	/* the packet being scheduled */
	__u32 nr_flows;		/* presumably the current number of flows (TC classes) — verify in sch_bpf */
	__u32 handle;		/* NOTE(review): looks like the qdisc/class handle — confirm against sch_bpf */

	/* Output */
	__u64 rank;		/* relative position of the skb/flow decided by the program */
	__u64 delay;		/* presumably used with SCH_BPF_THROTTLE — verify in sch_bpf */
	__u32 classid;		/* flow (TC class) selected by the program */
};

/* Return codes of the eBPF qdisc programs. */
enum {
	SCH_BPF_OK,		/* proceed normally */
	SCH_BPF_REQUEUE,	/* presumably put the skb back — verify in sch_bpf */
	SCH_BPF_DROP,		/* drop the skb */
	SCH_BPF_THROTTLE,	/* presumably delay dequeue (see sch_bpf_ctx.delay) — verify */
};

#endif /* _UAPI__LINUX_BPF_H__ */
@@ -1265,4 +1265,19 @@ enum {

#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)

/*
 * Netlink attributes for configuring the eBPF qdisc: one attribute set
 * for the enqueue program and a parallel set for the dequeue program,
 * each identified by name, fd, id and tag.
 */
enum {
	TCA_SCH_BPF_UNSPEC,
	TCA_SCH_BPF_ENQUEUE_PROG_NAME,	/* string */
	TCA_SCH_BPF_ENQUEUE_PROG_FD,	/* u32 */
	TCA_SCH_BPF_ENQUEUE_PROG_ID,	/* u32 */
	TCA_SCH_BPF_ENQUEUE_PROG_TAG,	/* data */
	TCA_SCH_BPF_DEQUEUE_PROG_NAME,	/* string */
	TCA_SCH_BPF_DEQUEUE_PROG_FD,	/* u32 */
	TCA_SCH_BPF_DEQUEUE_PROG_ID,	/* u32 */
	TCA_SCH_BPF_DEQUEUE_PROG_TAG,	/* data */
	__TCA_SCH_BPF_MAX,
};

#define TCA_SCH_BPF_MAX (__TCA_SCH_BPF_MAX - 1)

#endif
@@ -439,6 +439,21 @@ config NET_SCH_ETS

If unsure, say N.

config NET_SCH_BPF
tristate "eBPF based programmable queue discipline"
help
This eBPF based queue discipline offers a way to program your
own packet scheduling algorithm. This is a classful qdisc which
also allows you to decide the hierarchy.

Say Y here if you want to use the eBPF based programmable queue
discipline.

To compile this driver as a module, choose M here: the module
will be called sch_bpf.

If unsure, say N.

menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
help
@@ -65,6 +65,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
obj-$(CONFIG_NET_SCH_BPF) += sch_bpf.o

obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o

0 comments on commit 11b7d63

Please sign in to comment.