Skip to content
Permalink
Browse files
tcp: Delay sending non-probes for RFC4821 mtu probing
According to RFC 4821 Section 7.4, "Protocols MAY delay sending non-probes
in order to accumulate enough data", but Linux almost never does that.

Linux waits for probe_size + (1 + retries) * mss_cache bytes to be available
in the send buffer; if that condition is not met, it sends anyway using the
current MSS. The feature can be made to work by sending very large chunks of
data from userspace (for example 128k), but for small writes on fast links,
probes almost never happen.

This patch tries to implement the "MAY" by adding an extra flag,
"wait_data", to icsk_mtup. The flag is set to 1 if a probe is possible but
insufficient data is available. Data is then held back in tcp_write_xmit
until a probe is sent, probing conditions are no longer met, or the
tcp_probe_wait timeout (500 ms by default) expires.

Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
  • Loading branch information
cdleonard authored and intel-lab-lkp committed Apr 21, 2021
1 parent 593ef16 commit 0b2ec1d5d492384a89a499c7227c7f6a21f283d0
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 5 deletions.
@@ -552,6 +552,10 @@ tcp_probe_threshold - INTEGER
will stop in respect to the width of search range in bytes. Default
is 8 bytes.

tcp_probe_wait - INTEGER
How long to wait, in milliseconds, for enough data to accumulate for a
TCP MTU probe. The default is 500 milliseconds; setting it to zero
disables this feature.

tcp_no_metrics_save - BOOLEAN
By default, TCP saves various connection metrics in the route cache
when the connection closes, so that connections established in the
@@ -125,11 +125,16 @@ struct inet_connection_sock {
int search_low;

/* Information on the current probe. */
u32 probe_size:31,
u32 probe_size:30,
/* Are we actively accumulating data for an mtu probe? */
wait_data:1,
/* Is the MTUP feature enabled for this connection? */
enabled:1;

u32 probe_timestamp;

/* Timer for wait_data */
struct timer_list wait_data_timer;
} icsk_mtup;
u32 icsk_probes_tstamp;
u32 icsk_user_timeout;
@@ -128,6 +128,7 @@ struct netns_ipv4 {
int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
int sysctl_tcp_probe_wait;

int sysctl_tcp_keepalive_time;
int sysctl_tcp_keepalive_intvl;
@@ -1377,6 +1377,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
ca_ops->cong_control)
return;
if (inet_csk(sk)->icsk_mtup.wait_data)
return;
delta = tcp_jiffies32 - tp->lsndtime;
if (delta > inet_csk(sk)->icsk_rto)
tcp_cwnd_restart(sk, delta);
@@ -844,6 +844,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_douintvec_minmax,
.extra2 = &u32_max_div_HZ,
},
{
.procname = "tcp_probe_wait",
.data = &init_net.ipv4.sysctl_tcp_probe_wait,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
.procname = "igmp_link_local_mcast_reports",
.data = &init_net.ipv4.sysctl_igmp_llm_reports,
@@ -2891,6 +2891,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_wait = HZ / 2;

net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -1756,6 +1756,35 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
EXPORT_SYMBOL(tcp_mss_to_mtu);

/* Abandon an in-progress wait for MTU probe data on @sk.
 *
 * If the wait_data flag is set, clear it and stop the associated
 * wait_data_timer. Does nothing when no wait is in progress.
 */
void tcp_mtu_probe_wait_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Nothing to cancel unless we are currently holding data back. */
	if (!icsk->icsk_mtup.wait_data)
		return;

	icsk->icsk_mtup.wait_data = false;
	sk_stop_timer(sk, &icsk->icsk_mtup.wait_data_timer);
}

/* Timer callback for icsk_mtup.wait_data_timer.
 *
 * Runs when the wait for MTU probe data has lasted long enough: clears
 * the wait_data flag and pushes pending frames. If the socket is
 * currently owned by user context, the flush is postponed by re-arming
 * the timer for HZ / 10 jiffies later.
 */
static void tcp_mtu_probe_wait_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk =
		from_timer(icsk, t, icsk_mtup.wait_data_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Socket is locked by user: retry the flush a bit later. */
		sk_reset_timer(sk, &icsk->icsk_mtup.wait_data_timer,
			       jiffies + HZ / 10);
	} else {
		/* Stop holding data back and send what is queued now. */
		icsk->icsk_mtup.wait_data = false;
		tcp_push_pending_frames(sk);
	}
	bh_unlock_sock(sk);

	/* NOTE(review): presumably drops the reference taken when the
	 * timer was armed through sk_reset_timer() - confirm.
	 */
	sock_put(sk);
}


/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
@@ -1770,6 +1799,7 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
timer_setup(&icsk->icsk_mtup.wait_data_timer, tcp_mtu_probe_wait_timer, 0);
}
EXPORT_SYMBOL(tcp_mtup_init);

@@ -2368,12 +2398,14 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}

/* Can probe ever fit inside window? */
if (tp->snd_wnd < size_needed)
return -1;

/* Have enough data in the send queue to probe? */
if (tp->write_seq - tp->snd_nxt < size_needed)
return -1;
return net->ipv4.sysctl_tcp_probe_wait ? 0 : -1;

if (tp->snd_wnd < size_needed)
return -1;
if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
return 0;

@@ -2598,7 +2630,9 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
@@ -2609,13 +2643,25 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;

tcp_mstamp_refresh(tp);
if (!push_one) {
/*
* Waiting for tcp probe data also applies when push_one=1
* If user does many small writes we hold them until we have enough
* for a probe.
*/
if (!push_one || (push_one < 2 && net->ipv4.sysctl_tcp_probe_wait)) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
if (net->ipv4.sysctl_tcp_probe_wait && !icsk->icsk_mtup.wait_data) {
icsk->icsk_mtup.wait_data = true;
sk_reset_timer(sk, &icsk->icsk_mtup.wait_data_timer, jiffies + net->ipv4.sysctl_tcp_probe_wait);
}
return false;
} else if (result > 0) {
tcp_mtu_probe_wait_stop(sk);
sent_pkts = 1;
} else {
tcp_mtu_probe_wait_stop(sk);
}
}

0 comments on commit 0b2ec1d

Please sign in to comment.