Fix unaligned access issues in host-network byte I/O on ARM

elprans · elprans · commit c04576db852d · 2017-10-31T16:38:49.000-04:00
Per report in MagicStack#216 and https://bugs.debian.org/872408, the current implementation may raise SIGBUS on ARM platforms that do not support unaligned memory access. Fix this by using a stack variable and memcpy, which gives the compiler more opportunity to generate correct code. While at it, implement hton* and ntoh* in terms of byteswap intrinsics with an open-coded fallback. This drops the dependency on the platform's implementation, which might not necessarily be consistently fast. Smoke-tested locally on cross-compiled armv5tel-softfloat-linux-gnueabi and armv6j-hardfloat-linux-gnueabi with QEMU TCG. Fixes: MagicStack#216.
diff --git a/asyncpg/protocol/codecs/base.pyx b/asyncpg/protocol/codecs/base.pyx
@@ -181,15 +181,15 @@ cdef class Codec:
                           FastReadBuffer buf):
         cdef:
             object result
-            uint32_t elem_count
-            uint32_t i
+            ssize_t elem_count
+            ssize_t i
             int32_t elem_len
             uint32_t elem_typ
             uint32_t received_elem_typ
             Codec elem_codec
             FastReadBuffer elem_buf = FastReadBuffer.new()
 
-        elem_count = <uint32_t>hton.unpack_int32(buf.read(4))
+        elem_count = <ssize_t><uint32_t>hton.unpack_int32(buf.read(4))
         result = record.ApgRecord_New(self.element_names, elem_count)
         for i in range(elem_count):
             elem_typ = self.element_type_oids[i]
diff --git a/asyncpg/protocol/codecs/record.pyx b/asyncpg/protocol/codecs/record.pyx
@@ -17,14 +17,14 @@ cdef inline record_encode_frame(ConnectionSettings settings, WriteBuffer buf,
 cdef anonymous_record_decode(ConnectionSettings settings, FastReadBuffer buf):
     cdef:
         tuple result
-        uint32_t elem_count
+        ssize_t elem_count
+        ssize_t i
         int32_t elem_len
         uint32_t elem_typ
-        uint32_t i
         Codec elem_codec
         FastReadBuffer elem_buf = FastReadBuffer.new()
 
-    elem_count = <uint32_t>hton.unpack_int32(buf.read(4))
+    elem_count = <ssize_t><uint32_t>hton.unpack_int32(buf.read(4))
     result = cpython.PyTuple_New(elem_count)
 
     for i in range(elem_count):
diff --git a/asyncpg/protocol/hton.h b/asyncpg/protocol/hton.h
@@ -0,0 +1,199 @@
+#include <stdint.h>
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <endian.h>
+#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+#include <sys/endian.h>
+#elif defined(__DragonFly__)
+#include <sys/endian.h>
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#define __BYTE_ORDER BYTE_ORDER
+#define __BIG_ENDIAN BIG_ENDIAN
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#elif defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+/* Assume Windows is always LE.  There seems to be no reliable way
+   to detect endianness there */
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#else
+#error Cannot determine platform byte order.
+#endif
+
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+
+#define apg_bswap16(x) __builtin_bswap16(x)
+#define apg_bswap32(x) __builtin_bswap32(x)
+#define apg_bswap64(x) __builtin_bswap64(x)
+
+#elif defined(_MSC_VER)
+
+#define apg_bswap16(x) _byteswap_ushort(x)
+#define apg_bswap32(x) _byteswap_ulong(x)
+#define apg_bswap64(x) _byteswap_uint64(x)
+
+#else
+
+static inline uint16_t
+apg_bswap16(uint16_t)
+{
+    return ((x << 8) & 0xff00) | (x >> 8) & 0x00ff));
+}
+
+static inline uint32_t
+apg_bswap32(uint32_t x)
+{
+    return (
+        ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
+		((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff)
+    );
+}
+
+static inline uint64_t
+apg_bswap64(uint64_t x)
+{
+	return (
+		((x << 56) & 0xff00000000000000ULL) |
+		((x << 40) & 0x00ff000000000000ULL) |
+		((x << 24) & 0x0000ff0000000000ULL) |
+		((x << 8) & 0x000000ff00000000ULL) |
+		((x >> 8) & 0x00000000ff000000ULL) |
+		((x >> 24) & 0x0000000000ff0000ULL) |
+		((x >> 40) & 0x000000000000ff00ULL) |
+		((x >> 56) & 0x00000000000000ffULL);
+    );
+}
+
+#endif
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+
+#define apg_hton16(x) (x)
+#define apg_hton32(x) (x)
+#define apg_hton64(x) (x)
+
+#define apg_ntoh16(x) (x)
+#define apg_ntoh32(x) (x)
+#define apg_ntoh64(x) (x)
+
+#elif __BYTE_ORDER == __LITTLE_ENDIAN
+
+#define apg_hton16(x) apg_bswap16(x)
+#define apg_hton32(x) apg_bswap32(x)
+#define apg_hton64(x) apg_bswap64(x)
+
+#define apg_ntoh16(x) apg_bswap16(x)
+#define apg_ntoh32(x) apg_bswap32(x)
+#define apg_ntoh64(x) apg_bswap64(x)
+
+#else
+
+#error Unsupported byte order.
+
+#endif
+
+
+static inline void
+pack_int16(char *buf, int16_t x)
+{
+    uint16_t nx = apg_hton16((uint16_t)x);
+    /* NOTE: the memcpy below is _important_ to support systems
+       which disallow unaligned access.  On systems, which do
+       allow unaligned access it will be optimized away by the
+       compiler
+    */
+    memcpy(buf, &nx, sizeof(uint16_t));
+}
+
+
+static inline void
+pack_int32(char *buf, int64_t x)
+{
+    uint32_t nx = apg_hton32((uint32_t)x);
+    memcpy(buf, &nx, sizeof(uint32_t));
+}
+
+
+static inline void
+pack_int64(char *buf, int64_t x)
+{
+    uint64_t nx = apg_hton64((uint64_t)x);
+    memcpy(buf, &nx, sizeof(uint64_t));
+}
+
+
+static inline int16_t
+unpack_int16(const char *buf)
+{
+    uint16_t nx;
+    memcpy((char *)&nx, buf, sizeof(uint16_t));
+    return (int16_t)apg_ntoh16(nx);
+}
+
+
+static inline int32_t
+unpack_int32(const char *buf)
+{
+    uint32_t nx;
+    memcpy((char *)&nx, buf, sizeof(uint32_t));
+    return (int32_t)apg_ntoh32(nx);
+}
+
+
+static inline int64_t
+unpack_int64(const char *buf)
+{
+    uint64_t nx;
+    memcpy((char *)&nx, buf, sizeof(uint64_t));
+    return (int64_t)apg_ntoh64(nx);
+}
+
+
+union _apg_floatconv {
+    uint32_t i;
+    float f;
+};
+
+
+union _apg_doubleconv {
+    uint64_t i;
+    double f;
+};
+
+
+static inline void
+pack_float(char *buf, float f)
+{
+    union _apg_floatconv v;
+    v.f = f;
+    pack_int32(buf, (int32_t)v.i);
+}
+
+
+static inline void
+pack_double(char *buf, double f)
+{
+    union _apg_doubleconv v;
+    v.f = f;
+    pack_int64(buf, (int64_t)v.i);
+}
+
+
+static inline float
+unpack_float(const char *buf)
+{
+    union _apg_floatconv v;
+    v.i = (uint32_t)unpack_int32(buf);
+    return v.f;
+}
+
+
+static inline double
+unpack_double(const char *buf)
+{
+    union _apg_doubleconv v;
+    v.i = (uint64_t)unpack_int64(buf);
+    return v.f;
+}
diff --git a/asyncpg/protocol/hton.pxd b/asyncpg/protocol/hton.pxd
@@ -8,77 +8,14 @@
 from libc.stdint cimport int16_t, int32_t, uint16_t, uint32_t, int64_t, uint64_t
 
 
-IF UNAME_SYSNAME == "Windows":
-    cdef extern from "winsock2.h":
-        uint32_t htonl(uint32_t hostlong)
-        uint16_t htons(uint16_t hostshort)
-        uint32_t ntohl(uint32_t netlong)
-        uint16_t ntohs(uint16_t netshort)
-ELSE:
-    cdef extern from "arpa/inet.h":
-        uint32_t htonl(uint32_t hostlong)
-        uint16_t htons(uint16_t hostshort)
-        uint32_t ntohl(uint32_t netlong)
-        uint16_t ntohs(uint16_t netshort)
-
-
-cdef inline void pack_int16(char* buf, int16_t x):
-    (<uint16_t*>buf)[0] = htons(<uint16_t>x)
-
-
-cdef inline int16_t unpack_int16(const char* buf):
-    return <int16_t>ntohs((<uint16_t*>buf)[0])
-
-
-cdef inline void pack_int32(char* buf, int32_t x):
-    (<uint32_t*>buf)[0] = htonl(<uint32_t>x)
-
-
-cdef inline int32_t unpack_int32(const char* buf):
-    return <int32_t>ntohl((<uint32_t*>buf)[0])
-
-
-cdef inline void pack_int64(char* buf, int64_t x):
-    (<uint32_t*>buf)[0] = htonl(<uint32_t>(<uint64_t>(x) >> 32))
-    (<uint32_t*>&buf[4])[0] = htonl(<uint32_t>(x))
-
-
-cdef inline int64_t unpack_int64(const char* buf):
-    cdef int64_t hh = unpack_int32(buf)
-    cdef uint32_t hl = <uint32_t>unpack_int32(&buf[4])
-
-    return (hh << 32) | hl
-
-
-cdef union _floatconv:
-    uint32_t i
-    float f
-
-
-cdef inline int32_t pack_float(char* buf, float f):
-    cdef _floatconv v
-    v.f = f
-    pack_int32(buf, <int32_t>v.i)
-
-
-cdef inline float unpack_float(const char* buf):
-    cdef _floatconv v
-    v.i = <uint32_t>unpack_int32(buf)
-    return v.f
-
-
-cdef union _doubleconv:
-    uint64_t i
-    double f
-
-
-cdef inline int64_t pack_double(char* buf, double f):
-    cdef _doubleconv v
-    v.f = f
-    pack_int64(buf, <int64_t>v.i)
-
-
-cdef inline double unpack_double(const char* buf):
-    cdef _doubleconv v
-    v.i = <uint64_t>unpack_int64(buf)
-    return v.f
+cdef extern from "hton.h":
+    cdef void pack_int16(char *buf, int16_t x);
+    cdef void pack_int32(char *buf, int32_t x);
+    cdef void pack_int64(char *buf, int64_t x);
+    cdef void pack_float(char *buf, float f);
+    cdef void pack_double(char *buf, double f);
+    cdef int16_t unpack_int16(const char *buf);
+    cdef int32_t unpack_int32(const char *buf);
+    cdef int64_t unpack_int64(const char *buf);
+    cdef float unpack_float(const char *buf);
+    cdef double unpack_double(const char *buf);
diff --git a/setup.py b/setup.py
@@ -21,10 +21,8 @@
 CFLAGS = ['-O2']
 LDFLAGS = []
 
-if platform.uname().system == 'Windows':
-    LDFLAGS.append('ws2_32.lib')
-else:
-    CFLAGS.extend(['-Wall', '-Wsign-compare', '-Wconversion'])
+if platform.uname().system != 'Windows':
+    CFLAGS.extend(['-fsigned-char', '-Wall', '-Wsign-compare', '-Wconversion'])
 
 
 class build_ext(_build_ext.build_ext):