Skip to content

Commit a8da2c7

Browse files
kkdwvd authored and intel-lab-lkp committed
net: sched: add bpf_link API for bpf classifier
This commit introduces a bpf_link based kernel API for creating tc filters and using the cls_bpf classifier. Only a subset of what the netlink API offers is supported; things like TCA_BPF_POLICE, TCA_RATE and embedded actions are unsupported. The kernel API and the libbpf wrapper added in a subsequent patch are more opinionated and mirror the semantics of the low level netlink based TC-BPF API, i.e. always setting direct action mode, always setting protocol to ETH_P_ALL, and only exposing handle and priority as the variables the user can control. We add an additional gen_flags parameter though to allow for offloading use cases. It would be trivial to extend the current API to support specifying other attributes in the future, but for now I'm sticking to how we want to push usage. The semantics around bpf_link support are as follows: A user can create a classifier attached to a filter using the bpf_link API, after which changing it and deleting it only happens through the bpf_link API. It is not possible to bind the bpf_link to an existing filter, and any such attempt will fail with EEXIST. Hence EEXIST can be returned in two cases: when an existing bpf_link owned filter exists, or when an existing netlink owned filter exists. Removing a bpf_link owned filter from netlink returns EPERM, denoting that netlink is locked out from filter manipulation when bpf_link is involved. Whenever a filter is detached due to chain removal, or qdisc tear down, or net_device shutdown, the bpf_link becomes automatically detached. In this way, the netlink API and bpf_link creation path are exclusive and don't stomp over one another. Filters created using the bpf_link API cannot be replaced by the netlink API, and filters created by the netlink API are never replaced by bpf_link. Netlink also cannot detach bpf_link filters. We serialize all changes over rtnl_lock as the cls_bpf API doesn't support the unlocked classifier API. Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>. 
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
1 parent 2d87eba commit a8da2c7

File tree

7 files changed

+430
-6
lines changed

7 files changed

+430
-6
lines changed

include/linux/bpf_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
135135
#ifdef CONFIG_NET
136136
BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
137137
#endif
138+
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
139+
BPF_LINK_TYPE(BPF_LINK_TYPE_TC, tc)
140+
#endif

include/net/pkt_cls.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#ifndef __NET_PKT_CLS_H
33
#define __NET_PKT_CLS_H
44

5+
#include <linux/bpf.h>
56
#include <linux/pkt_cls.h>
67
#include <linux/workqueue.h>
78
#include <net/sch_generic.h>
@@ -45,6 +46,9 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);
4546
struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block,
4647
u32 chain_index);
4748
void tcf_chain_put_by_act(struct tcf_chain *chain);
49+
void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
50+
struct tcf_proto *tp, bool rtnl_held,
51+
struct netlink_ext_ack *extack);
4852
struct tcf_chain *tcf_get_next_chain(struct tcf_block *block,
4953
struct tcf_chain *chain);
5054
struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain,
@@ -1004,4 +1008,13 @@ struct tc_fifo_qopt_offload {
10041008
};
10051009
};
10061010

1011+
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
1012+
int bpf_tc_link_attach(union bpf_attr *attr, struct bpf_prog *prog);
1013+
#else
1014+
/* Fallback used when CONFIG_NET_CLS_BPF is not enabled: creating a TC
 * bpf_link is not available, so every request is rejected with
 * -EOPNOTSUPP. Both arguments are ignored.
 */
static inline int bpf_tc_link_attach(union bpf_attr *attr,
				     struct bpf_prog *prog)
{
	return -EOPNOTSUPP;
}
1018+
#endif
1019+
10071020
#endif

include/net/sch_generic.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,11 @@ struct tcf_proto_ops {
341341
int (*tmplt_dump)(struct sk_buff *skb,
342342
struct net *net,
343343
void *tmplt_priv);
344-
344+
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
345+
int (*bpf_link_change)(struct net *net, struct tcf_proto *tp,
346+
struct bpf_prog *filter, void **arg, u32 handle,
347+
u32 gen_flags);
348+
#endif
345349
struct module *owner;
346350
int flags;
347351
};

include/uapi/linux/bpf.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,7 @@ enum bpf_attach_type {
994994
BPF_SK_LOOKUP,
995995
BPF_XDP,
996996
BPF_SK_SKB_VERDICT,
997+
BPF_TC,
997998
__MAX_BPF_ATTACH_TYPE
998999
};
9991000

@@ -1007,6 +1008,7 @@ enum bpf_link_type {
10071008
BPF_LINK_TYPE_ITER = 4,
10081009
BPF_LINK_TYPE_NETNS = 5,
10091010
BPF_LINK_TYPE_XDP = 6,
1011+
BPF_LINK_TYPE_TC = 7,
10101012

10111013
MAX_BPF_LINK_TYPE,
10121014
};
@@ -1447,6 +1449,12 @@ union bpf_attr {
14471449
__aligned_u64 iter_info; /* extra bpf_iter_link_info */
14481450
__u32 iter_info_len; /* iter_info length */
14491451
};
1452+
struct { /* used by BPF_TC */
1453+
__u32 parent;
1454+
__u32 handle;
1455+
__u32 gen_flags;
1456+
__u16 priority;
1457+
} tc;
14501458
};
14511459
} link_create;
14521460

@@ -5519,6 +5527,13 @@ struct bpf_link_info {
55195527
struct {
55205528
__u32 ifindex;
55215529
} xdp;
5530+
struct {
5531+
__u32 ifindex;
5532+
__u32 parent;
5533+
__u32 handle;
5534+
__u32 gen_flags;
5535+
__u16 priority;
5536+
} tc;
55225537
};
55235538
} __attribute__((aligned(8)));
55245539

kernel/bpf/syscall.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0-only
22
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
33
*/
4+
#include <net/pkt_cls.h>
45
#include <linux/bpf.h>
56
#include <linux/bpf_trace.h>
67
#include <linux/bpf_lirc.h>
@@ -3027,6 +3028,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
30273028
return BPF_PROG_TYPE_SK_LOOKUP;
30283029
case BPF_XDP:
30293030
return BPF_PROG_TYPE_XDP;
3031+
case BPF_TC:
3032+
return BPF_PROG_TYPE_SCHED_CLS;
30303033
default:
30313034
return BPF_PROG_TYPE_UNSPEC;
30323035
}
@@ -4085,7 +4088,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
40854088
return -EINVAL;
40864089
}
40874090

4088-
#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
4091+
#define BPF_LINK_CREATE_LAST_FIELD link_create.tc.priority
40894092
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
40904093
{
40914094
enum bpf_prog_type ptype;
@@ -4136,6 +4139,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
41364139
case BPF_PROG_TYPE_XDP:
41374140
ret = bpf_xdp_link_attach(attr, prog);
41384141
break;
4142+
#endif
4143+
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
4144+
case BPF_PROG_TYPE_SCHED_CLS:
4145+
ret = bpf_tc_link_attach(attr, prog);
4146+
break;
41394147
#endif
41404148
default:
41414149
ret = -EINVAL;

net/sched/cls_api.c

Lines changed: 136 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
1010
*/
1111

12+
#include <linux/bpf.h>
1213
#include <linux/module.h>
1314
#include <linux/types.h>
1415
#include <linux/kernel.h>
@@ -1720,9 +1721,9 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
17201721
return tp_new;
17211722
}
17221723

1723-
static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
1724-
struct tcf_proto *tp, bool rtnl_held,
1725-
struct netlink_ext_ack *extack)
1724+
void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
1725+
struct tcf_proto *tp, bool rtnl_held,
1726+
struct netlink_ext_ack *extack)
17261727
{
17271728
struct tcf_chain_info chain_info;
17281729
struct tcf_proto *tp_iter;
@@ -1760,6 +1761,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
17601761

17611762
tcf_proto_put(tp, rtnl_held, extack);
17621763
}
1764+
EXPORT_SYMBOL_GPL(tcf_chain_tp_delete_empty);
17631765

17641766
static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
17651767
struct tcf_chain_info *chain_info,
@@ -3917,3 +3919,134 @@ static int __init tc_filter_init(void)
39173919
}
39183920

39193921
subsys_initcall(tc_filter_init);
3922+
3923+
#if IS_ENABLED(CONFIG_NET_CLS_BPF)
3924+
3925+
int bpf_tc_link_attach(union bpf_attr *attr, struct bpf_prog *prog)
3926+
{
3927+
struct net *net = current->nsproxy->net_ns;
3928+
struct tcf_chain_info chain_info;
3929+
u32 chain_index, prio, parent;
3930+
struct tcf_block *block;
3931+
struct tcf_chain *chain;
3932+
struct tcf_proto *tp;
3933+
int err, tp_created;
3934+
unsigned long cl;
3935+
struct Qdisc *q;
3936+
__be16 protocol;
3937+
void *fh;
3938+
3939+
/* Caller already checks bpf_capable */
3940+
if (!ns_capable(current->nsproxy->net_ns->user_ns, CAP_NET_ADMIN))
3941+
return -EPERM;
3942+
3943+
if (attr->link_create.flags ||
3944+
!attr->link_create.target_ifindex ||
3945+
!tc_flags_valid(attr->link_create.tc.gen_flags))
3946+
return -EINVAL;
3947+
3948+
replay:
3949+
parent = attr->link_create.tc.parent;
3950+
prio = attr->link_create.tc.priority;
3951+
protocol = htons(ETH_P_ALL);
3952+
chain_index = 0;
3953+
tp_created = 0;
3954+
prio <<= 16;
3955+
cl = 0;
3956+
3957+
/* Address this when cls_bpf switches to RTNL_FLAG_DOIT_UNLOCKED */
3958+
rtnl_lock();
3959+
3960+
block = tcf_block_find(net, &q, &parent, &cl,
3961+
attr->link_create.target_ifindex, parent, NULL);
3962+
if (IS_ERR(block)) {
3963+
err = PTR_ERR(block);
3964+
goto out_unlock;
3965+
}
3966+
block->classid = parent;
3967+
3968+
chain = tcf_chain_get(block, chain_index, true);
3969+
if (!chain) {
3970+
err = -ENOMEM;
3971+
goto out_block;
3972+
}
3973+
3974+
mutex_lock(&chain->filter_chain_lock);
3975+
3976+
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
3977+
prio ?: TC_H_MAKE(0x80000000U, 0U),
3978+
!prio);
3979+
if (IS_ERR(tp)) {
3980+
err = PTR_ERR(tp);
3981+
goto out_chain_unlock;
3982+
}
3983+
3984+
if (!tp) {
3985+
struct tcf_proto *tp_new = NULL;
3986+
3987+
if (chain->flushing) {
3988+
err = -EAGAIN;
3989+
goto out_chain_unlock;
3990+
}
3991+
3992+
if (!prio)
3993+
prio = tcf_auto_prio(tcf_chain_tp_prev(chain,
3994+
&chain_info));
3995+
3996+
mutex_unlock(&chain->filter_chain_lock);
3997+
3998+
tp_new = tcf_proto_create("bpf", protocol, prio, chain, true,
3999+
NULL);
4000+
if (IS_ERR(tp_new)) {
4001+
err = PTR_ERR(tp_new);
4002+
goto out_chain;
4003+
}
4004+
4005+
tp_created = 1;
4006+
tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
4007+
true);
4008+
if (IS_ERR(tp)) {
4009+
err = PTR_ERR(tp);
4010+
goto out_chain;
4011+
}
4012+
} else {
4013+
mutex_unlock(&chain->filter_chain_lock);
4014+
}
4015+
4016+
fh = tp->ops->get(tp, attr->link_create.tc.handle);
4017+
4018+
if (!tp->ops->bpf_link_change)
4019+
err = -EDEADLK;
4020+
else
4021+
err = tp->ops->bpf_link_change(net, tp, prog, &fh,
4022+
attr->link_create.tc.handle,
4023+
attr->link_create.tc.gen_flags);
4024+
if (err >= 0 && q)
4025+
q->flags &= ~TCQ_F_CAN_BYPASS;
4026+
4027+
out:
4028+
if (err < 0 && tp_created)
4029+
tcf_chain_tp_delete_empty(chain, tp, true, NULL);
4030+
out_chain:
4031+
if (chain) {
4032+
if (!IS_ERR_OR_NULL(tp))
4033+
tcf_proto_put(tp, true, NULL);
4034+
/* Chain reference only kept for tp creation
4035+
* to pair with tcf_chain_put from tcf_proto_destroy
4036+
*/
4037+
if (!tp_created)
4038+
tcf_chain_put(chain);
4039+
}
4040+
out_block:
4041+
tcf_block_release(q, block, true);
4042+
out_unlock:
4043+
rtnl_unlock();
4044+
if (err == -EAGAIN)
4045+
goto replay;
4046+
return err;
4047+
out_chain_unlock:
4048+
mutex_unlock(&chain->filter_chain_lock);
4049+
goto out;
4050+
}
4051+
4052+
#endif

0 commit comments

Comments
 (0)