Skip to content

Commit 02db34d

Browse files
committed
Merge branch 'bpf-BASE_RTT'
Lawrence Brakmo says: ==================== bpf: add support for BASE_RTT This patch set adds the following functionality to socket_ops BPF programs. 1) Add bpf helper function bpf_getsocketops. Currently only supports TCP_CONGESTION 2) Add BPF_SOCKET_OPS_BASE_RTT op to get the base RTT of the connection. In general, the base RTT indicates the threshold such that RTTs above it indicate congestion. More details in the relevant patches. Consists of the following patches: [PATCH net-next 1/5] bpf: add support for BPF_SOCK_OPS_BASE_RTT [PATCH net-next 2/5] bpf: Adding helper function bpf_getsockops [PATCH net-next 3/5] bpf: Add BPF_SOCKET_OPS_BASE_RTT support to [PATCH net-next 4/5] bpf: sample BPF_SOCKET_OPS_BASE_RTT program [PATCH net-next 5/5] bpf: create samples/bpf/tcp_bpf.readme ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 62d3f60 + bfdf756 commit 02db34d

File tree

7 files changed

+213
-7
lines changed

7 files changed

+213
-7
lines changed

include/uapi/linux/bpf.h

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -613,12 +613,22 @@ union bpf_attr {
613613
* int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
614614
* Calls setsockopt. Not all opts are available, only those with
615615
* integer optvals plus TCP_CONGESTION.
616-
* Supported levels: SOL_SOCKET and IPROTO_TCP
616+
* Supported levels: SOL_SOCKET and IPPROTO_TCP
617617
* @bpf_socket: pointer to bpf_socket
618-
* @level: SOL_SOCKET or IPROTO_TCP
618+
* @level: SOL_SOCKET or IPPROTO_TCP
619619
* @optname: option name
620620
* @optval: pointer to option value
621-
* @optlen: length of optval in byes
621+
* @optlen: length of optval in bytes
622+
* Return: 0 or negative error
623+
*
624+
* int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
625+
* Calls getsockopt. Not all opts are available.
626+
* Supported levels: IPPROTO_TCP
627+
* @bpf_socket: pointer to bpf_socket
628+
* @level: IPPROTO_TCP
629+
* @optname: option name
630+
* @optval: pointer to option value
631+
* @optlen: length of optval in bytes
622632
* Return: 0 or negative error
623633
*
624634
* int bpf_skb_adjust_room(skb, len_diff, mode, flags)
@@ -721,7 +731,8 @@ union bpf_attr {
721731
FN(sock_map_update), \
722732
FN(xdp_adjust_meta), \
723733
FN(perf_event_read_value), \
724-
FN(perf_prog_read_value),
734+
FN(perf_prog_read_value), \
735+
FN(getsockopt),
725736

726737
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
727738
* function eBPF program intends to call
@@ -955,6 +966,13 @@ enum {
955966
BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
956967
* needs ECN
957968
*/
969+
BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is
970+
* based on the path and may be
971+
* dependent on the congestion control
972+
* algorithm. In general it indicates
973+
* a congestion threshold. RTTs above
974+
* this indicate congestion
975+
*/
958976
};
959977

960978
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */

net/core/filter.c

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3273,7 +3273,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
32733273

32743274
static const struct bpf_func_proto bpf_setsockopt_proto = {
32753275
.func = bpf_setsockopt,
3276-
.gpl_only = true,
3276+
.gpl_only = false,
32773277
.ret_type = RET_INTEGER,
32783278
.arg1_type = ARG_PTR_TO_CTX,
32793279
.arg2_type = ARG_ANYTHING,
@@ -3282,6 +3282,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = {
32823282
.arg5_type = ARG_CONST_SIZE,
32833283
};
32843284

3285+
BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
3286+
int, level, int, optname, char *, optval, int, optlen)
3287+
{
3288+
struct sock *sk = bpf_sock->sk;
3289+
int ret = 0;
3290+
3291+
if (!sk_fullsock(sk))
3292+
goto err_clear;
3293+
3294+
#ifdef CONFIG_INET
3295+
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
3296+
if (optname == TCP_CONGESTION) {
3297+
struct inet_connection_sock *icsk = inet_csk(sk);
3298+
3299+
if (!icsk->icsk_ca_ops || optlen <= 1)
3300+
goto err_clear;
3301+
strncpy(optval, icsk->icsk_ca_ops->name, optlen);
3302+
optval[optlen - 1] = 0;
3303+
} else {
3304+
goto err_clear;
3305+
}
3306+
} else {
3307+
goto err_clear;
3308+
}
3309+
return ret;
3310+
#endif
3311+
err_clear:
3312+
memset(optval, 0, optlen);
3313+
return -EINVAL;
3314+
}
3315+
3316+
static const struct bpf_func_proto bpf_getsockopt_proto = {
3317+
.func = bpf_getsockopt,
3318+
.gpl_only = false,
3319+
.ret_type = RET_INTEGER,
3320+
.arg1_type = ARG_PTR_TO_CTX,
3321+
.arg2_type = ARG_ANYTHING,
3322+
.arg3_type = ARG_ANYTHING,
3323+
.arg4_type = ARG_PTR_TO_UNINIT_MEM,
3324+
.arg5_type = ARG_CONST_SIZE,
3325+
};
3326+
32853327
static const struct bpf_func_proto *
32863328
bpf_base_func_proto(enum bpf_func_id func_id)
32873329
{
@@ -3460,6 +3502,8 @@ static const struct bpf_func_proto *
34603502
switch (func_id) {
34613503
case BPF_FUNC_setsockopt:
34623504
return &bpf_setsockopt_proto;
3505+
case BPF_FUNC_getsockopt:
3506+
return &bpf_getsockopt_proto;
34633507
case BPF_FUNC_sock_map_update:
34643508
return &bpf_sock_map_update_proto;
34653509
default:

net/ipv4/tcp_nv.c

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
* nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
4040
* nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8
4141
* nv_rtt_factor RTT averaging factor
42-
* nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur
42+
* nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur
4343
* nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd
4444
* nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd
4545
* nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping
@@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2;
6161
static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
6262
static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
6363
static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
64-
static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */
64+
static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */
6565
static int nv_cwnd_growth_rate_neg __read_mostly = 8;
6666
static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
6767
static int nv_dec_eval_min_calls __read_mostly = 60;
@@ -101,6 +101,11 @@ struct tcpnv {
101101
u32 nv_last_rtt; /* last rtt */
102102
u32 nv_min_rtt; /* active min rtt. Used to determine slope */
103103
u32 nv_min_rtt_new; /* min rtt for future use */
104+
u32 nv_base_rtt; /* If non-zero it represents the threshold for
105+
* congestion */
106+
u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is
107+
* set to 80% of nv_base_rtt. It helps reduce
108+
* unfairness between flows */
104109
u32 nv_rtt_max_rate; /* max rate seen during current RTT */
105110
u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
106111
* acking beyond nv_rtt_start_seq */
@@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
132137
static void tcpnv_init(struct sock *sk)
133138
{
134139
struct tcpnv *ca = inet_csk_ca(sk);
140+
int base_rtt;
135141

136142
tcpnv_reset(ca, sk);
137143

144+
/* See if base_rtt is available from socket_ops bpf program.
145+
* It is meant to be used in environments, such as communication
146+
* within a datacenter, where we have reasonable estimates of
147+
* RTTs
148+
*/
149+
base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
150+
if (base_rtt > 0) {
151+
ca->nv_base_rtt = base_rtt;
152+
ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
153+
} else {
154+
ca->nv_base_rtt = 0;
155+
ca->nv_lower_bound_rtt = 0;
156+
}
157+
138158
ca->nv_allow_cwnd_growth = 1;
139159
ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
140160
ca->nv_min_rtt = NV_INIT_RTT;
@@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk)
144164
ca->cwnd_growth_factor = 0;
145165
}
146166

167+
/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt)
168+
* bounds to RTT.
169+
*/
170+
inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
171+
{
172+
if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt)
173+
return ca->nv_lower_bound_rtt;
174+
else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt)
175+
return ca->nv_base_rtt;
176+
else
177+
return val;
178+
}
179+
147180
static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
148181
{
149182
struct tcp_sock *tp = tcp_sk(sk);
@@ -265,6 +298,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
265298
if (ca->nv_eval_call_cnt < 255)
266299
ca->nv_eval_call_cnt++;
267300

301+
/* Apply bounds to rtt. Only used to update min_rtt */
302+
avg_rtt = nv_get_bounded_rtt(ca, avg_rtt);
303+
268304
/* update min rtt if necessary */
269305
if (avg_rtt < ca->nv_min_rtt)
270306
ca->nv_min_rtt = avg_rtt;

samples/bpf/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ always += tcp_bufs_kern.o
129129
always += tcp_cong_kern.o
130130
always += tcp_iw_kern.o
131131
always += tcp_clamp_kern.o
132+
always += tcp_basertt_kern.o
132133
always += xdp_redirect_kern.o
133134
always += xdp_redirect_map_kern.o
134135
always += xdp_redirect_cpu_kern.o

samples/bpf/tcp_basertt_kern.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/* Copyright (c) 2017 Facebook
2+
*
3+
* This program is free software; you can redistribute it and/or
4+
* modify it under the terms of version 2 of the GNU General Public
5+
* License as published by the Free Software Foundation.
6+
*
7+
* BPF program to set base_rtt to 80us when host is running TCP-NV and
8+
* both hosts are in the same datacenter (as determined by IPv6 prefix).
9+
*
10+
* Use load_sock_ops to load this BPF program.
11+
*/
12+
13+
#include <uapi/linux/bpf.h>
14+
#include <uapi/linux/tcp.h>
15+
#include <uapi/linux/if_ether.h>
16+
#include <uapi/linux/if_packet.h>
17+
#include <uapi/linux/ip.h>
18+
#include <linux/socket.h>
19+
#include "bpf_helpers.h"
20+
#include "bpf_endian.h"
21+
22+
#define DEBUG 1
23+
24+
#define bpf_printk(fmt, ...) \
25+
({ \
26+
char ____fmt[] = fmt; \
27+
bpf_trace_printk(____fmt, sizeof(____fmt), \
28+
##__VA_ARGS__); \
29+
})
30+
31+
SEC("sockops")
/* Return a base RTT of 80us for BPF_SOCK_OPS_BASE_RTT callbacks when
 * both endpoints are in the same datacenter (same IPv6 /36 prefix in
 * this example) and the socket's congestion control is "nv"; -1
 * (or the bpf_getsockopt error) otherwise.  The result is passed back
 * to the kernel through skops->reply.
 */
int bpf_basertt(struct bpf_sock_ops *skops)
{
	char cong[20];
	char nv[] = "nv";
	int rv = 0, n;
	int op;

	op = (int) skops->op;

#ifdef DEBUG
	bpf_printk("BPF command: %d\n", op);
#endif

	/* Check if both hosts are in the same datacenter. For this
	 * example they are if the 1st 5.5 bytes in the IPv6 address
	 * are the same.
	 */
	if (skops->family == AF_INET6 &&
	    skops->local_ip6[0] == skops->remote_ip6[0] &&
	    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
	    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
		switch (op) {
		case BPF_SOCK_OPS_BASE_RTT:
			n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
					   cong, sizeof(cong));
			/* Compare sizeof(nv) bytes: "nv" plus its NUL.
			 * (sizeof(nv)+1 would read one byte past the end
			 * of nv[] -- an out-of-bounds access.)
			 */
			if (!n && !__builtin_memcmp(cong, nv, sizeof(nv))) {
				/* Set base_rtt to 80us */
				rv = 80;
			} else if (n) {
				rv = n;
			} else {
				rv = -1;
			}
			break;
		default:
			rv = -1;
		}
	} else {
		rv = -1;
	}
#ifdef DEBUG
	bpf_printk("Returning %d\n", rv);
#endif
	skops->reply = rv;
	return 1;
}
78+
char _license[] SEC("license") = "GPL";

samples/bpf/tcp_bbf.readme

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops)
2+
programs. These programs attach to a cgroupv2. The following commands create
3+
a cgroupv2 and attach a bash shell to the group.
4+
5+
mkdir -p /tmp/cgroupv2
6+
mount -t cgroup2 none /tmp/cgroupv2
7+
mkdir -p /tmp/cgroupv2/foo
8+
bash
9+
echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
10+
11+
Anything that runs under this shell belongs to the foo cgroupv2. To load
12+
(attach) one of the tcp_*_kern.o programs:
13+
14+
./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o
15+
16+
If the "-l" flag is used, the load_sock_ops program will continue to run
17+
printing the BPF log buffer. The tcp_*_kern.o programs use special print
18+
functions to print logging information (if enabled by the ifdef).
19+
20+
If using netperf/netserver to create traffic, you need to run them under the
21+
cgroupv2 to which the BPF programs are attached (i.e. under bash shell
22+
attached to the cgroupv2).
23+
24+
To remove (detach) a socket_ops BPF program from a cgroupv2:
25+
26+
./load_sock_ops -r /tmp/cgroupv2/foo

tools/testing/selftests/bpf/bpf_helpers.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
6767
static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
6868
int optlen) =
6969
(void *) BPF_FUNC_setsockopt;
70+
static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval,
71+
int optlen) =
72+
(void *) BPF_FUNC_getsockopt;
7073
static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
7174
(void *) BPF_FUNC_sk_redirect_map;
7275
static int (*bpf_sock_map_update)(void *map, void *key, void *value,

0 commit comments

Comments
 (0)