From 3a6117bbe0ed6a87605c1e43e12a1438d8844380 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 27 May 2008 05:25:36 +0000 Subject: [PATCH] Get rid of an old and terrible hack. Local stream sockets enqueue packets directly on the peer's sockbuf, rather then the sender's sockbuf. That part of the code is fine, but in order to prevent the sender from queueing infinite mbufs (because its sockbuf appears to be empty when you do that) the code dynamically messed around with the sender's high water mark. This blew up the new SOCK_SEQPACKET. In particular, it blows up the use of the PR_ATOMIC on stream sockets and can cause spurious EMSGSIZE errors to be returned instead of the EWOULDBLOCK that should have been returned. Also fix, or partially the resource limit code which tries to reduce the high water mark when a user is using too many mbufs. This never worked well and still doesn't, but it is in better shape now. Get rid of the crufty code and simply add a flag to the signalsockbuf, SSB_STOP, to stop the sender. Also adjust the vkernel to increase the default socket buffer when connecting to vknet instead of if_tap. VKE currently issues non-blocking writes to vknet/tap and we do not want to lose packets for no good reason. --- sys/dev/virtual/net/if_vke.c | 12 +++++-- sys/kern/kern_resource.c | 19 +++++++--- sys/kern/uipc_socket.c | 9 ++--- sys/kern/uipc_socket2.c | 4 +-- sys/kern/uipc_usrreq.c | 52 ++++++++++++---------------- sys/platform/vkernel/platform/init.c | 7 +++- sys/sys/socketvar.h | 24 ++++++++++--- sys/sys/unpcb.h | 6 ++-- 8 files changed, 81 insertions(+), 52 deletions(-) diff --git a/sys/dev/virtual/net/if_vke.c b/sys/dev/virtual/net/if_vke.c index 498777a5827b..2216a1388b52 100644 --- a/sys/dev/virtual/net/if_vke.c +++ b/sys/dev/virtual/net/if_vke.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/dev/virtual/net/if_vke.c,v 1.8 2008/05/27 01:10:38 dillon Exp $ + * $DragonFly: src/sys/dev/virtual/net/if_vke.c,v 1.9 2008/05/27 05:25:33 dillon Exp $ */ #include @@ -156,11 +156,17 @@ vke_start(struct ifnet *ifp) return; while ((m = ifq_dequeue(&ifp->if_snd, NULL)) != NULL) { + /* + * Copy the data into a single mbuf and write it out + * non-blocking. + */ if (m->m_pkthdr.len <= MCLBYTES) { m_copydata(m, 0, m->m_pkthdr.len, sc->sc_txbuf); BPF_MTAP(ifp, m); - write(sc->sc_fd, sc->sc_txbuf, m->m_pkthdr.len); - ifp->if_opackets++; + if (write(sc->sc_fd, sc->sc_txbuf, m->m_pkthdr.len) < 0) + ifp->if_oerrors++; + else + ifp->if_opackets++; } else { ifp->if_oerrors++; } diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index 614d7342c19e..2e074294da6a 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -37,7 +37,7 @@ * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 * $FreeBSD: src/sys/kern/kern_resource.c,v 1.55.2.5 2001/11/03 01:41:08 ps Exp $ - * $DragonFly: src/sys/kern/kern_resource.c,v 1.34 2007/08/20 05:40:40 dillon Exp $ + * $DragonFly: src/sys/kern/kern_resource.c,v 1.35 2008/05/27 05:25:34 dillon Exp $ */ #include "opt_compat.h" @@ -682,10 +682,19 @@ chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t max) crit_enter(); new = uip->ui_sbsize + to - *hiwat; - /* don't allow them to exceed max, but allow subtraction */ - if (to > *hiwat && new > max) { - crit_exit(); - return (0); + + /* + * If we are trying to increase the socket buffer size + * Scale down the hi water mark when we exceed the user's + * allowed socket buffer space. + * + * We can't scale down too much or we will blow up atomic packet + * operations. + */ + if (to > *hiwat && to > MCLBYTES && new > max) { + to = to * max / new; + if (to < MCLBYTES) + to = MCLBYTES; } uip->ui_sbsize = new; *hiwat = to; diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index c908b26788d6..f9b1afe3797e 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -65,7 +65,7 @@ * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $ - * $DragonFly: src/sys/kern/uipc_socket.c,v 1.47 2008/01/05 14:02:38 swildner Exp $ + * $DragonFly: src/sys/kern/uipc_socket.c,v 1.48 2008/05/27 05:25:34 dillon Exp $ */ #include "opt_inet.h" @@ -554,12 +554,13 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ? ENOTCONN : EDESTADDRREQ); } + if ((atomic && resid > so->so_snd.ssb_hiwat) || + clen > so->so_snd.ssb_hiwat) { + gotoerr(EMSGSIZE); + } space = ssb_space(&so->so_snd); if (flags & MSG_OOB) space += 1024; - if ((atomic && resid > so->so_snd.ssb_hiwat) || - clen > so->so_snd.ssb_hiwat) - gotoerr(EMSGSIZE); if (space < resid + clen && uio && (atomic || space < so->so_snd.ssb_lowat || space < clen)) { if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 4d96f6354412..0d7faff27533 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -33,7 +33,7 @@ * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ - * $DragonFly: src/sys/kern/uipc_socket2.c,v 1.30 2008/04/20 13:44:25 swildner Exp $ + * $DragonFly: src/sys/kern/uipc_socket2.c,v 1.31 2008/05/27 05:25:34 dillon Exp $ */ #include "opt_param.h" @@ -423,7 +423,7 @@ ssb_reserve(struct signalsockbuf *ssb, u_long cc, struct socket *so, * or when called from netgraph (ie, ngd_attach) */ if (cc > sb_max_adj) - return (0); + cc = sb_max_adj; if (!chgsbsize(so->so_cred->cr_uidinfo, &ssb->ssb_hiwat, cc, rl ? rl->rlim_cur : RLIM_INFINITY)) { return (0); diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index c414391ad66a..a851a6b5826b 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -32,7 +32,7 @@ * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 * $FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.54.2.10 2003/03/04 17:28:09 nectar Exp $ - * $DragonFly: src/sys/kern/uipc_usrreq.c,v 1.40 2008/05/27 01:10:39 dillon Exp $ + * $DragonFly: src/sys/kern/uipc_usrreq.c,v 1.41 2008/05/27 05:25:34 dillon Exp $ */ #include @@ -71,7 +71,7 @@ static struct unp_head unp_shead, unp_dhead; * Unix communications domain. * * TODO: - * SEQPACKET, RDM + * RDM * rethink name space problems * need a proper out-of-band * lock pushdown @@ -233,7 +233,6 @@ uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp = so->so_pcb; struct socket *so2; - u_long newhiwat; if (unp == NULL) return EINVAL; @@ -246,19 +245,18 @@ uipc_rcvd(struct socket *so, int flags) case SOCK_SEQPACKET: if (unp->unp_conn == NULL) break; - so2 = unp->unp_conn->unp_socket; /* - * Adjust backpressure on sender - * and wakeup any waiting to write. + * Because we are transfering mbufs directly to the + * peer socket we have to use SSB_STOP on the sender + * to prevent it from building up infinite mbufs. */ - so2->so_snd.ssb_mbmax += unp->unp_mbcnt - so->so_rcv.ssb_mbcnt; - unp->unp_mbcnt = so->so_rcv.ssb_mbcnt; - newhiwat = - so2->so_snd.ssb_hiwat + unp->unp_cc - so->so_rcv.ssb_cc; - chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.ssb_hiwat, - newhiwat, RLIM_INFINITY); - unp->unp_cc = so->so_rcv.ssb_cc; - sowwakeup(so2); + so2 = unp->unp_conn->unp_socket; + if (so->so_rcv.ssb_cc < so2->so_snd.ssb_hiwat && + so->so_rcv.ssb_mbcnt < so2->so_snd.ssb_mbmax + ) { + so2->so_snd.ssb_flags &= ~SSB_STOP; + sowwakeup(so2); + } break; default: @@ -276,7 +274,6 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, int error = 0; struct unpcb *unp = so->so_pcb; struct socket *so2; - u_long newhiwat; if (unp == NULL) { error = EINVAL; @@ -368,14 +365,17 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, sbappend(&so2->so_rcv.sb, m); m = NULL; } - so->so_snd.ssb_mbmax -= - so2->so_rcv.ssb_mbcnt - unp->unp_conn->unp_mbcnt; - unp->unp_conn->unp_mbcnt = so2->so_rcv.ssb_mbcnt; - newhiwat = so->so_snd.ssb_hiwat - - (so2->so_rcv.ssb_cc - unp->unp_conn->unp_cc); - chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.ssb_hiwat, - newhiwat, RLIM_INFINITY); - unp->unp_conn->unp_cc = so2->so_rcv.ssb_cc; + + /* + * Because we are transfering mbufs directly to the + * peer socket we have to use SSB_STOP on the sender + * to prevent it from building up infinite mbufs. + */ + if (so2->so_rcv.ssb_cc >= so->so_snd.ssb_hiwat || + so2->so_rcv.ssb_mbcnt >= so->so_snd.ssb_mbmax + ) { + so->so_snd.ssb_flags |= SSB_STOP; + } sorwakeup(so2); break; @@ -406,16 +406,10 @@ static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp = so->so_pcb; - struct socket *so2; if (unp == NULL) return EINVAL; sb->st_blksize = so->so_snd.ssb_hiwat; - if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) && - unp->unp_conn != NULL) { - so2 = unp->unp_conn->unp_socket; - sb->st_blksize += so2->so_rcv.ssb_cc; - } sb->st_dev = NOUDEV; if (unp->unp_ino == 0) /* make up a non-zero inode number */ unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; diff --git a/sys/platform/vkernel/platform/init.c b/sys/platform/vkernel/platform/init.c index 45ba458473b4..52da57d0f0e9 100644 --- a/sys/platform/vkernel/platform/init.c +++ b/sys/platform/vkernel/platform/init.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/platform/vkernel/platform/init.c,v 1.54 2008/05/27 01:10:45 dillon Exp $ + * $DragonFly: src/sys/platform/vkernel/platform/init.c,v 1.55 2008/05/27 05:25:35 dillon Exp $ */ #include @@ -973,6 +973,8 @@ unix_connect(const char *path) struct sockaddr_un sunx; int len; int net_fd; + int sndbuf = 262144; + struct stat st; snprintf(sunx.sun_path, sizeof(sunx.sun_path), "%s", path); len = offsetof(struct sockaddr_un, sun_path[strlen(sunx.sun_path)]); @@ -987,6 +989,9 @@ unix_connect(const char *path) close(net_fd); return(-1); } + setsockopt(net_fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)); + if (fstat(net_fd, &st) == 0) + printf("Network socket buffer: %d bytes\n", st.st_blksize); fcntl(net_fd, F_SETFL, O_NONBLOCK); return(net_fd); } diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index e8037393eacd..ea5cebed47d4 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -32,7 +32,7 @@ * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * $FreeBSD: src/sys/sys/socketvar.h,v 1.46.2.10 2003/08/24 08:24:39 hsu Exp $ - * $DragonFly: src/sys/sys/socketvar.h,v 1.30 2007/11/07 18:24:04 dillon Exp $ + * $DragonFly: src/sys/sys/socketvar.h,v 1.31 2008/05/27 05:25:36 dillon Exp $ */ #ifndef _SYS_SOCKETVAR_H_ @@ -83,6 +83,7 @@ struct signalsockbuf { #define SSB_AIO 0x80 /* AIO operations queued */ #define SSB_KNOTE 0x100 /* kernel note attached */ #define SSB_MEVENT 0x200 /* need message event notification */ +#define SSB_STOP 0x400 /* backpressure indicator */ /* * Per-socket kernel structure. Contains universal send and receive queues, @@ -220,11 +221,24 @@ struct xsocket { * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (cc > hiwat or mbcnt > mbmax). Should detect - * overflow and return 0. Should use "lmin" but it doesn't exist now. + * overflow and return 0. + * + * SSB_STOP ignores cc/hiwat and returns 0. This is used by unix domain + * stream sockets to signal backpressure. */ -#define ssb_space(ssb) \ - ((long)imin((int)((ssb)->ssb_hiwat - (ssb)->ssb_cc), \ - (int)((ssb)->ssb_mbmax - (ssb)->ssb_mbcnt))) +static __inline +long +ssb_space(struct signalsockbuf *ssb) +{ + long bleft; + long mleft; + + if (ssb->ssb_flags & SSB_STOP) + return(0); + bleft = ssb->ssb_hiwat - ssb->ssb_cc; + mleft = ssb->ssb_mbmax - ssb->ssb_mbcnt; + return((bleft < mleft) ? bleft : mleft); +} #define ssb_append(ssb, m) \ sbappend(&(ssb)->sb, m) diff --git a/sys/sys/unpcb.h b/sys/sys/unpcb.h index 2c5cb3f46036..bb39f0e0a41b 100644 --- a/sys/sys/unpcb.h +++ b/sys/sys/unpcb.h @@ -32,7 +32,7 @@ * * @(#)unpcb.h 8.1 (Berkeley) 6/2/93 * $FreeBSD: src/sys/sys/unpcb.h,v 1.9.2.1 2002/03/09 05:22:23 dd Exp $ - * $DragonFly: src/sys/sys/unpcb.h,v 1.4 2005/04/20 19:38:22 hsu Exp $ + * $DragonFly: src/sys/sys/unpcb.h,v 1.5 2008/05/27 05:25:36 dillon Exp $ */ #ifndef _SYS_UNPCB_H_ @@ -79,8 +79,8 @@ struct unpcb { struct unp_head unp_refs; /* referencing socket linked list */ LIST_ENTRY(unpcb) unp_reflink; /* link in unp_refs list */ struct sockaddr_un *unp_addr; /* bound address of socket */ - int unp_cc; /* copy of rcv.sb_cc */ - int unp_mbcnt; /* copy of rcv.sb_mbcnt */ + int unused01; + int unused02; unp_gen_t unp_gencnt; /* generation count of this instance */ int unp_flags; /* flags */ struct xucred unp_peercred; /* peer credentials, if applicable */