Replace CBC_MAC_ROTATE_IN_PLACE with an N lg N rotation.

Really the only thing we should be doing with these ciphers is hastening their demise, but it was the weekend and this seemed like fun. EVP_tls_cbc_copy_mac needs to rotate a buffer by a secret amount. (It extracts the MAC, but rotated.) We have two codepaths for this. If CBC_MAC_ROTATE_IN_PLACE is defined (always on), we make some assumptions about cache lines, play games with volatile, and hope that doesn't leak anything. Otherwise, we do O(N^2) work to constant-time select the rotation indices. But we can do O(N lg N). Rotate by powers of two and constant-time select by the offset's bit positions. (Hand-wavy lower-bound: an array position has N possible values, so, armed with only a constant-time select, we need O(lg N) work to resolve it. There are N array positions, so O(N lg N).) A microbenchmark of EVP_tls_cbc_copy_mac shows this is 27% faster than the old one, but still 32% slower than the in-place version. in-place: Did 15724000 CopyFromMAC operations in 20000744us (786170.8 ops/sec) N^2: Did 8443000 CopyFromMAC operations in 20001582us (422116.6 ops/sec) N lg N: Did 10718000 CopyFromMAC operations in 20000763us (535879.6 ops/sec) This results in the following for the CBC ciphers. I measured AES-128-CBC-SHA1 and AES-256-CBC-SHA384 which are, respectively, the cipher where the other bits are the fastest and the cipher where N is largest. 
in-place: Did 2634000 AES-128-CBC-SHA1 (16 bytes) open operations in 10000739us (263380.5 ops/sec): 4.2 MB/s Did 1424000 AES-128-CBC-SHA1 (1350 bytes) open operations in 10002782us (142360.4 ops/sec): 192.2 MB/s Did 531000 AES-128-CBC-SHA1 (8192 bytes) open operations in 10002460us (53086.9 ops/sec): 434.9 MB/s N^2: Did 2529000 AES-128-CBC-SHA1 (16 bytes) open operations in 10001474us (252862.7 ops/sec): 4.0 MB/s Did 1392000 AES-128-CBC-SHA1 (1350 bytes) open operations in 10006659us (139107.4 ops/sec): 187.8 MB/s Did 528000 AES-128-CBC-SHA1 (8192 bytes) open operations in 10001276us (52793.3 ops/sec): 432.5 MB/s N lg N: Did 2531000 AES-128-CBC-SHA1 (16 bytes) open operations in 10003057us (253022.7 ops/sec): 4.0 MB/s Did 1390000 AES-128-CBC-SHA1 (1350 bytes) open operations in 10003287us (138954.3 ops/sec): 187.6 MB/s Did 531000 AES-128-CBC-SHA1 (8192 bytes) open operations in 10002448us (53087.0 ops/sec): 434.9 MB/s in-place: Did 1249000 AES-256-CBC-SHA384 (16 bytes) open operations in 10001767us (124877.9 ops/sec): 2.0 MB/s Did 879000 AES-256-CBC-SHA384 (1350 bytes) open operations in 10009244us (87818.8 ops/sec): 118.6 MB/s Did 344000 AES-256-CBC-SHA384 (8192 bytes) open operations in 10025897us (34311.1 ops/sec): 281.1 MB/s N^2: Did 1072000 AES-256-CBC-SHA384 (16 bytes) open operations in 10008090us (107113.3 ops/sec): 1.7 MB/s Did 780000 AES-256-CBC-SHA384 (1350 bytes) open operations in 10007787us (77939.3 ops/sec): 105.2 MB/s Did 333000 AES-256-CBC-SHA384 (8192 bytes) open operations in 10016332us (33245.7 ops/sec): 272.3 MB/s N lg N: Did 1168000 AES-256-CBC-SHA384 (16 bytes) open operations in 10007671us (116710.5 ops/sec): 1.9 MB/s Did 836000 AES-256-CBC-SHA384 (1350 bytes) open operations in 10001536us (83587.2 ops/sec): 112.8 MB/s Did 339000 AES-256-CBC-SHA384 (8192 bytes) open operations in 10018522us (33837.3 ops/sec): 277.2 MB/s TLS CBC performance isn't as important as it was before, and the costs aren't that high, so avoid making assumptions about 
cache lines. (If we care much about CBC open performance, we probably should get the malloc out of EVP_tls_cbc_digest_record at the end.) Change-Id: Ib8d8271be4b09e5635062cd3b039e1e96f0d9d3d Reviewed-on: https://boringssl-review.googlesource.com/11003 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
google · Sep 12, 2016 · c763a40 · c763a40
1 parent 84b5c00
commit c763a40
Showing 1 changed file with 25 additions and 36 deletions.
diff --git a/crypto/cipher/tls_cbc.c b/crypto/cipher/tls_cbc.c
@@ -123,22 +123,12 @@ int EVP_tls_cbc_remove_padding(unsigned *out_padding_ok, unsigned *out_len,
   return 1;
 }
 
-/* If CBC_MAC_ROTATE_IN_PLACE is defined then EVP_tls_cbc_copy_mac is performed
- * with variable accesses in a 64-byte-aligned buffer. Assuming that this fits
- * into a single or pair of cache-lines, then the variable memory accesses don't
- * actually affect the timing. CPUs with smaller cache-lines [if any] are not
- * multi-core and are not considered vulnerable to cache-timing attacks. */
-#define CBC_MAC_ROTATE_IN_PLACE
-
 void EVP_tls_cbc_copy_mac(uint8_t *out, unsigned md_size,
                           const uint8_t *in, unsigned in_len,
                           unsigned orig_len) {
-#if defined(CBC_MAC_ROTATE_IN_PLACE)
-  uint8_t rotated_mac_buf[64 + EVP_MAX_MD_SIZE];
-  uint8_t *rotated_mac;
-#else
-  uint8_t rotated_mac[EVP_MAX_MD_SIZE];
-#endif
+  uint8_t rotated_mac1[EVP_MAX_MD_SIZE], rotated_mac2[EVP_MAX_MD_SIZE];
+  uint8_t *rotated_mac = rotated_mac1;
+  uint8_t *rotated_mac_tmp = rotated_mac2;
 
   /* mac_end is the index of |in| just after the end of the MAC. */
   unsigned mac_end = in_len;
@@ -153,10 +143,6 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, unsigned md_size,
   assert(in_len >= md_size);
   assert(md_size <= EVP_MAX_MD_SIZE);
 
-#if defined(CBC_MAC_ROTATE_IN_PLACE)
-  rotated_mac = rotated_mac_buf + ((0 - (size_t)rotated_mac_buf) & 63);
-#endif
-
   /* This information is public so it's safe to branch based on it. */
   if (orig_len > md_size + 255 + 1) {
     scan_start = orig_len - (md_size + 255 + 1);
@@ -250,27 +236,30 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, unsigned md_size,
     j &= constant_time_lt(j, md_size);
   }
 
-/* Now rotate the MAC */
-#if defined(CBC_MAC_ROTATE_IN_PLACE)
-  j = 0;
-  for (i = 0; i < md_size; i++) {
-    /* in case cache-line is 32 bytes, touch second line */
-    ((volatile uint8_t *)rotated_mac)[rotate_offset ^ 32];
-    out[j++] = rotated_mac[rotate_offset++];
-    rotate_offset &= constant_time_lt(rotate_offset, md_size);
-  }
-#else
-  memset(out, 0, md_size);
-  rotate_offset = md_size - rotate_offset;
-  rotate_offset &= constant_time_lt(rotate_offset, md_size);
-  for (i = 0; i < md_size; i++) {
-    for (j = 0; j < md_size; j++) {
-      out[j] |= rotated_mac[i] & constant_time_eq_8(j, rotate_offset);
+  /* Now rotate the MAC. We rotate in log(md_size) steps, one for each bit
+   * position. */
+  for (unsigned offset = 1; offset < md_size;
+       offset <<= 1, rotate_offset >>= 1) {
+    /* Rotate by |offset| iff the corresponding bit is set in
+     * |rotate_offset|, placing the result in |rotated_mac_tmp|. */
+    const uint8_t skip_rotate = (rotate_offset & 1) - 1;
+    for (i = 0, j = offset; i < md_size; i++, j++) {
+      if (j >= md_size) {
+        j -= md_size;
+      }
+      rotated_mac_tmp[i] =
+          constant_time_select_8(skip_rotate, rotated_mac[i], rotated_mac[j]);
     }
-    rotate_offset++;
-    rotate_offset &= constant_time_lt(rotate_offset, md_size);
+
+    /* Swap pointers so |rotated_mac| contains the (possibly) rotated value.
+     * Note the number of iterations and thus the identity of these pointers is
+     * public information. */
+    uint8_t *tmp = rotated_mac;
+    rotated_mac = rotated_mac_tmp;
+    rotated_mac_tmp = tmp;
   }
-#endif
+
+  memcpy(out, rotated_mac, md_size);
 }
 
 /* u32toBE serialises an unsigned, 32-bit number (n) as four bytes at (p) in