Skip to content

Commit

Permalink
net: sched: add bpf_link API for bpf classifier
Browse files Browse the repository at this point in the history
This commit introduces a bpf_link based kernel API for creating tc
filters and using the cls_bpf classifier. Only a subset of what netlink
API offers is supported, things like TCA_BPF_POLICE, TCA_RATE and
embedded actions are unsupported.

The kernel API and the libbpf wrapper added in a subsequent patch are
more opinionated and mirror the semantics of low level netlink based
TC-BPF API, i.e. always setting direct action mode, always setting
protocol to ETH_P_ALL, and only exposing handle and priority as the
variables the user can control. We add an additional gen_flags parameter
though to allow for offloading use cases. It would be trivial to extend
the current API to support specifying other attributes in the future,
but for now I'm sticking to how we want to push usage.

The semantics around bpf_link support are as follows:

A user can create a classifier attached to a filter using the bpf_link
API, after which changing it and deleting it only happens through the
bpf_link API. It is not possible to bind the bpf_link to an existing
filter, and any such attempt will fail with EEXIST. Hence EEXIST can be
returned in two cases: when an existing bpf_link owned filter exists, or
when an existing netlink owned filter exists.

Removing bpf_link owned filter from netlink returns EPERM, denoting that
netlink is locked out from filter manipulation when bpf_link is
involved.

Whenever a filter is detached due to chain removal, or qdisc tear down,
or net_device shutdown, the bpf_link becomes automatically detached.

In this way, the netlink API and bpf_link creation path are exclusive
and don't stomp over one another. Filters created using bpf_link API
cannot be replaced by netlink API, and filters created by netlink API are
never replaced by bpf_link. Netlink also cannot detach bpf_link filters.

We serialize all changes over rtnl_lock as cls_bpf API doesn't support the
unlocked classifier API.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
  • Loading branch information
kkdwivedi authored and intel-lab-lkp committed May 28, 2021
1 parent 7699aad commit 7f3d2ad
Show file tree
Hide file tree
Showing 7 changed files with 426 additions and 6 deletions.
3 changes: 3 additions & 0 deletions include/linux/bpf_types.h
Expand Up @@ -135,3 +135,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
#ifdef CONFIG_NET
BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
#endif
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
BPF_LINK_TYPE(BPF_LINK_TYPE_TC, tc)
#endif
13 changes: 13 additions & 0 deletions include/net/pkt_cls.h
Expand Up @@ -2,6 +2,7 @@
#ifndef __NET_PKT_CLS_H
#define __NET_PKT_CLS_H

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/workqueue.h>
#include <net/sch_generic.h>
Expand Down Expand Up @@ -45,6 +46,9 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);
struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block,
u32 chain_index);
void tcf_chain_put_by_act(struct tcf_chain *chain);
void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack);
struct tcf_chain *tcf_get_next_chain(struct tcf_block *block,
struct tcf_chain *chain);
struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain,
Expand Down Expand Up @@ -1004,4 +1008,13 @@ struct tc_fifo_qopt_offload {
};
};

#if IS_ENABLED(CONFIG_NET_CLS_BPF)
int bpf_tc_link_attach(union bpf_attr *attr, struct bpf_prog *prog);
#else
/* Fallback for builds without CONFIG_NET_CLS_BPF: TC link attach is refused. */
static inline int bpf_tc_link_attach(union bpf_attr *attr,
				     struct bpf_prog *prog)
{
	return -EOPNOTSUPP;
}
#endif

#endif
6 changes: 5 additions & 1 deletion include/net/sch_generic.h
Expand Up @@ -341,7 +341,11 @@ struct tcf_proto_ops {
int (*tmplt_dump)(struct sk_buff *skb,
struct net *net,
void *tmplt_priv);

#if IS_ENABLED(CONFIG_NET_CLS_BPF)
int (*bpf_link_change)(struct net *net, struct tcf_proto *tp,
struct bpf_prog *filter, void **arg, u32 handle,
u32 gen_flags);
#endif
struct module *owner;
int flags;
};
Expand Down
15 changes: 15 additions & 0 deletions include/uapi/linux/bpf.h
Expand Up @@ -994,6 +994,7 @@ enum bpf_attach_type {
BPF_SK_LOOKUP,
BPF_XDP,
BPF_SK_SKB_VERDICT,
BPF_TC,
__MAX_BPF_ATTACH_TYPE
};

Expand All @@ -1007,6 +1008,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_ITER = 4,
BPF_LINK_TYPE_NETNS = 5,
BPF_LINK_TYPE_XDP = 6,
BPF_LINK_TYPE_TC = 7,

MAX_BPF_LINK_TYPE,
};
Expand Down Expand Up @@ -1447,6 +1449,12 @@ union bpf_attr {
__aligned_u64 iter_info; /* extra bpf_iter_link_info */
__u32 iter_info_len; /* iter_info length */
};
struct { /* used by BPF_TC */
__u32 parent; /* passed as the parent to tcf_block_find() by bpf_tc_link_attach() */
__u32 handle; /* filter handle given to the classifier's ->get() and ->bpf_link_change() */
__u32 gen_flags; /* must pass tc_flags_valid(); forwarded to ->bpf_link_change() */
__u16 priority; /* filter priority; 0 makes bpf_tc_link_attach() pick one via tcf_auto_prio() */
} tc;
};
} link_create;

Expand Down Expand Up @@ -5519,6 +5527,13 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
struct {
__u32 ifindex;
__u32 parent;
__u32 handle;
__u32 gen_flags;
__u16 priority;
} tc;
};
} __attribute__((aligned(8)));

Expand Down
10 changes: 9 additions & 1 deletion kernel/bpf/syscall.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <net/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
Expand Down Expand Up @@ -3027,6 +3028,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
case BPF_TC:
return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
}
Expand Down Expand Up @@ -4085,7 +4088,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -EINVAL;
}

#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
#define BPF_LINK_CREATE_LAST_FIELD link_create.tc.priority
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
Expand Down Expand Up @@ -4136,6 +4139,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
case BPF_PROG_TYPE_SCHED_CLS:
ret = bpf_tc_link_attach(attr, prog);
break;
#endif
default:
ret = -EINVAL;
Expand Down
138 changes: 135 additions & 3 deletions net/sched/cls_api.c
Expand Up @@ -9,6 +9,7 @@
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*/

#include <linux/bpf.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
Expand Down Expand Up @@ -1720,9 +1721,9 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
return tp_new;
}

static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tcf_chain_info chain_info;
struct tcf_proto *tp_iter;
Expand Down Expand Up @@ -1760,6 +1761,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,

tcf_proto_put(tp, rtnl_held, extack);
}
EXPORT_SYMBOL_GPL(tcf_chain_tp_delete_empty);

static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
struct tcf_chain_info *chain_info,
Expand Down Expand Up @@ -3917,3 +3919,133 @@ static int __init tc_filter_init(void)
}

subsys_initcall(tc_filter_init);

#if IS_ENABLED(CONFIG_NET_CLS_BPF)

int bpf_tc_link_attach(union bpf_attr *attr, struct bpf_prog *prog)
{
struct net *net = current->nsproxy->net_ns;
u32 chain_index, prio, protocol, parent;
struct tcf_chain_info chain_info;
struct tcf_block *block;
struct tcf_chain *chain;
struct tcf_proto *tp;
int err, tp_created;
unsigned long cl;
struct Qdisc *q;
void *fh;

/* Caller already checks bpf_capable */
if (!ns_capable(current->nsproxy->net_ns->user_ns, CAP_NET_ADMIN))
return -EPERM;

if (attr->link_create.flags ||
!attr->link_create.target_ifindex ||
!tc_flags_valid(attr->link_create.tc.gen_flags))
return -EINVAL;

replay:
parent = attr->link_create.tc.parent;
prio = attr->link_create.tc.priority;
protocol = htons(ETH_P_ALL);
chain_index = 0;
tp_created = 0;
prio <<= 16;
cl = 0;

/* Address this when cls_bpf switches to RTNL_FLAG_DOIT_UNLOCKED */
rtnl_lock();

block = tcf_block_find(net, &q, &parent, &cl,
attr->link_create.target_ifindex, parent, NULL);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto out_unlock;
}
block->classid = parent;

chain = tcf_chain_get(block, chain_index, true);
if (!chain) {
err = -ENOMEM;
goto out_block;
}

mutex_lock(&chain->filter_chain_lock);

tp = tcf_chain_tp_find(chain, &chain_info, protocol,
prio ?: TC_H_MAKE(0x80000000U, 0U),
!prio);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
goto out_chain_unlock;
}

if (!tp) {
struct tcf_proto *tp_new = NULL;

if (chain->flushing) {
err = -EAGAIN;
goto out_chain_unlock;
}

if (!prio)
prio = tcf_auto_prio(tcf_chain_tp_prev(chain,
&chain_info));

mutex_unlock(&chain->filter_chain_lock);

tp_new = tcf_proto_create("bpf", protocol, prio, chain, true,
NULL);
if (IS_ERR(tp_new)) {
err = PTR_ERR(tp_new);
goto out_chain;
}

tp_created = 1;
tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
true);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
goto out_chain;
}
} else {
mutex_unlock(&chain->filter_chain_lock);
}

fh = tp->ops->get(tp, attr->link_create.tc.handle);

if (!tp->ops->bpf_link_change)
err = -EDEADLK;
else
err = tp->ops->bpf_link_change(net, tp, prog, &fh,
attr->link_create.tc.handle,
attr->link_create.tc.gen_flags);
if (err >= 0 && q)
q->flags &= ~TCQ_F_CAN_BYPASS;

out:
if (err < 0 && tp_created)
tcf_chain_tp_delete_empty(chain, tp, true, NULL);
out_chain:
if (chain) {
if (!IS_ERR_OR_NULL(tp))
tcf_proto_put(tp, true, NULL);
/* Chain reference only kept for tp creation
* to pair with tcf_chain_put from tcf_proto_destroy
*/
if (!tp_created)
tcf_chain_put(chain);
}
out_block:
tcf_block_release(q, block, true);
out_unlock:
rtnl_unlock();
if (err == -EAGAIN)
goto replay;
return err;
out_chain_unlock:
mutex_unlock(&chain->filter_chain_lock);
goto out;
}

#endif

0 comments on commit 7f3d2ad

Please sign in to comment.