From 3d2cf6c5bd35b0d72716b47bdd7e3892388aafc4 Mon Sep 17 00:00:00 2001 From: PiRK Date: Fri, 29 Jan 2021 11:49:51 +0100 Subject: [PATCH 01/59] initialize variable in tests This was detected while running the tests with the `-Wconditional-uninitialized` flag ``` ./autogen.sh CC=clang CFLAGS="-Wconditional-uninitialized" ./configure make check ``` The resulting warning is a false positive, but setting the value to -1 ensures that the CHECK below will fail if recid is never written to. --- src/tests.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tests.c b/src/tests.c index c2d5e2892..4110bb9a7 100644 --- a/src/tests.c +++ b/src/tests.c @@ -4324,8 +4324,10 @@ void test_ecdsa_sign_verify(void) { secp256k1_scalar one; secp256k1_scalar msg, key; secp256k1_scalar sigr, sigs; - int recid; int getrec; + /* Initialize recid to suppress a false positive -Wconditional-uninitialized in clang. + VG_UNDEF ensures that valgrind will still treat the variable as uninitialized. */ + int recid = -1; VG_UNDEF(&recid, sizeof(recid)); random_scalar_order_test(&msg); random_scalar_order_test(&key); secp256k1_ecmult_gen(&ctx->ecmult_gen_ctx, &pubj, &key); From 99a1cfec1740a914aa416a87fd0acbde5426b969 Mon Sep 17 00:00:00 2001 From: PiRK Date: Sun, 31 Jan 2021 18:41:35 +0100 Subject: [PATCH 02/59] print warnings for conditional-uninitialized This compiler flag is available for clang but not gcc. Test plan: ``` autogen.sh ./configure make check CC=clang ./configure make check ``` If a variable is used uninitialized, the warning should look something like: ``` CC src/tests-tests.o src/tests.c:4336:15: warning: variable 'recid' may be uninitialized when used here [-Wconditional-uninitialized] CHECK(recid >= 0 && recid < 4); ^~~~~ ./src/util.h:54:18: note: expanded from macro 'CHECK' if (EXPECT(!(cond), 0)) { \ ^~~~ ./src/util.h:41:39: note: expanded from macro 'EXPECT' ^ src/tests.c:4327:14: note: initialize the variable 'recid' to silence this warning int recid; ^ = 0 1 warning generated. ``` --- configure.ac | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configure.ac b/configure.ac index 451915ccf..d9a7ddb6a 100644 --- a/configure.ac +++ b/configure.ac @@ -79,6 +79,15 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[char foo;]])], CFLAGS="$saved_CFLAGS" ]) +saved_CFLAGS="$CFLAGS" +CFLAGS="-Wconditional-uninitialized $CFLAGS" +AC_MSG_CHECKING([if ${CC} supports -Wconditional-uninitialized]) +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[char foo;]])], + [ AC_MSG_RESULT([yes]) ], + [ AC_MSG_RESULT([no]) + CFLAGS="$saved_CFLAGS" + ]) + saved_CFLAGS="$CFLAGS" CFLAGS="-fvisibility=hidden $CFLAGS" AC_MSG_CHECKING([if ${CC} supports -fvisibility=hidden]) From de0a643c3dc2c40a447e670cfa1c1683c79c9297 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 11 Oct 2020 19:10:58 -0700 Subject: [PATCH 03/59] Add secp256k1_ctz{32,64}_var functions These functions count the number of trailing zeroes in non-zero integers. 
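
For illustration, the intended semantics can be sketched against a naive shift loop
(a standalone Python sketch for this commit message, not part of the patch):

```python
def ctz_ref(x):
    """Count trailing zero bits of a non-zero integer."""
    n = 0
    while x & 1 == 0:
        x >>= 1
        n += 1
    return n

assert ctz_ref(1) == 0
assert ctz_ref(0x80000000) == 31
assert ctz_ref(0xbcd02462139b3fc3 << 7) == 7  # one of the new 64-bit test vectors, shifted

# secp256k1_ctz32_var and secp256k1_ctz64_var are expected to agree with such a
# reference for every non-zero input; run_ctz_tests below exercises this across shifts.
```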
--- src/tests.c | 21 +++++++++++++++++ src/util.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/src/tests.c b/src/tests.c index c2d5e2892..ab981b5a7 100644 --- a/src/tests.c +++ b/src/tests.c @@ -416,6 +416,25 @@ void run_scratch_tests(void) { secp256k1_context_destroy(none); } +void run_ctz_tests(void) { + static const uint32_t b32[] = {1, 0xffffffff, 0x5e56968f, 0xe0d63129}; + static const uint64_t b64[] = {1, 0xffffffffffffffff, 0xbcd02462139b3fc3, 0x98b5f80c769693ef}; + int shift; + unsigned i; + for (i = 0; i < sizeof(b32) / sizeof(b32[0]); ++i) { + for (shift = 0; shift < 32; ++shift) { + CHECK(secp256k1_ctz32_var_debruijn(b32[i] << shift) == shift); + CHECK(secp256k1_ctz32_var(b32[i] << shift) == shift); + } + } + for (i = 0; i < sizeof(b64) / sizeof(b64[0]); ++i) { + for (shift = 0; shift < 64; ++shift) { + CHECK(secp256k1_ctz64_var_debruijn(b64[i] << shift) == shift); + CHECK(secp256k1_ctz64_var(b64[i] << shift) == shift); + } + } +} + /***** HASH TESTS *****/ void run_sha256_tests(void) { @@ -5606,6 +5625,8 @@ int main(int argc, char **argv) { run_rand_bits(); run_rand_int(); + run_ctz_tests(); + run_sha256_tests(); run_hmac_sha256_tests(); run_rfc6979_hmac_sha256_tests(); diff --git a/src/util.h b/src/util.h index b7e457c48..f78846836 100644 --- a/src/util.h +++ b/src/util.h @@ -276,4 +276,69 @@ SECP256K1_GNUC_EXT typedef __int128 int128_t; # endif #endif +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +/* Determine the number of trailing zero bits in a (non-zero) 32-bit x. + * This function is only intended to be used as fallback for + * secp256k1_ctz32_var, but permits it to be tested separately. */ +static SECP256K1_INLINE int secp256k1_ctz32_var_debruijn(uint32_t x) { + static const uint8_t debruijn[32] = { + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B + }; + return debruijn[((x & -x) * 0x04D7651F) >> 27]; +} + +/* Determine the number of trailing zero bits in a (non-zero) 64-bit x. + * This function is only intended to be used as fallback for + * secp256k1_ctz64_var, but permits it to be tested separately. */ +static SECP256K1_INLINE int secp256k1_ctz64_var_debruijn(uint64_t x) { + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; + return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +} + +/* Determine the number of trailing zero bits in a (non-zero) 32-bit x. */ +static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) { + VERIFY_CHECK(x != 0); +#if (__has_builtin(__builtin_ctz) || SECP256K1_GNUC_PREREQ(3,4)) + /* If the unsigned type is sufficient to represent the largest uint32_t, consider __builtin_ctz. */ + if (((unsigned)UINT32_MAX) == UINT32_MAX) { + return __builtin_ctz(x); + } +#endif +#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4)) + /* Otherwise consider __builtin_ctzl (the unsigned long type is always at least 32 bits). */ + return __builtin_ctzl(x); +#else + /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */ + return secp256k1_ctz32_var_debruijn(x); +#endif +} + +/* Determine the number of trailing zero bits in a (non-zero) 64-bit x. 
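+ * The builtin selection mirrors secp256k1_ctz32_var above: prefer a builtin whose operand
+ * type is guaranteed wide enough for uint64_t, and otherwise fall back to the (variable
+ * time) de Bruijn emulation.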
*/ +static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) { + VERIFY_CHECK(x != 0); +#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4)) + /* If the unsigned long type is sufficient to represent the largest uint64_t, consider __builtin_ctzl. */ + if (((unsigned long)UINT64_MAX) == UINT64_MAX) { + return __builtin_ctzl(x); + } +#endif +#if (__has_builtin(__builtin_ctzll) || SECP256K1_GNUC_PREREQ(3,4)) + /* Otherwise consider __builtin_ctzll (the unsigned long long type is always at least 64 bits). */ + return __builtin_ctzll(x); +#else + /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */ + return secp256k1_ctz64_var_debruijn(x); +#endif +} + #endif /* SECP256K1_UTIL_H */ From 8e415acba25830da9c23a4dd5531ebfc6b65aae7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 29 Nov 2020 14:01:03 -0800 Subject: [PATCH 04/59] Add safegcd based modular inverse modules Refactored by: Pieter Wuille --- Makefile.am | 4 + src/modinv32.h | 31 ++++ src/modinv32_impl.h | 364 +++++++++++++++++++++++++++++++++++++++++++ src/modinv64.h | 35 +++++ src/modinv64_impl.h | 367 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 801 insertions(+) create mode 100644 src/modinv32.h create mode 100644 src/modinv32_impl.h create mode 100644 src/modinv64.h create mode 100644 src/modinv64_impl.h diff --git a/Makefile.am b/Makefile.am index 023fa6067..c399cff08 100644 --- a/Makefile.am +++ b/Makefile.am @@ -34,6 +34,10 @@ noinst_HEADERS += src/field_5x52.h noinst_HEADERS += src/field_5x52_impl.h noinst_HEADERS += src/field_5x52_int128_impl.h noinst_HEADERS += src/field_5x52_asm_impl.h +noinst_HEADERS += src/modinv32.h +noinst_HEADERS += src/modinv32_impl.h +noinst_HEADERS += src/modinv64.h +noinst_HEADERS += src/modinv64_impl.h noinst_HEADERS += src/assumptions.h noinst_HEADERS += src/util.h noinst_HEADERS += src/scratch.h diff --git a/src/modinv32.h b/src/modinv32.h new file mode 100644 index 000000000..2678d816f --- /dev/null +++ b/src/modinv32.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV32_H +#define SECP256K1_MODINV32_H + +#if defined HAVE_CONFIG_H +#include "libsecp256k1-config.h" +#endif + +#include "util.h" + +typedef struct { + int32_t v[9]; +} secp256k1_modinv32_signed30; + +typedef struct { + /* The modulus in signed30 notation. 
*/ + secp256k1_modinv32_signed30 modulus; + + /* modulus^{-1} mod 2^30 */ + uint32_t modulus_inv30; +} secp256k1_modinv32_modinfo; + +static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo); +static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo); + +#endif /* SECP256K1_MODINV32_H */ diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h new file mode 100644 index 000000000..d2fecc31c --- /dev/null +++ b/src/modinv32_impl.h @@ -0,0 +1,364 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV32_IMPL_H +#define SECP256K1_MODINV32_IMPL_H + +#include "modinv32.h" + +#include "util.h" + +static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int32_t sign, const secp256k1_modinv32_modinfo *modinfo) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4], + r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8]; + int32_t cond_add, cond_negate; + + cond_add = r8 >> 31; + + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + r5 += modinfo->modulus.v[5] & cond_add; + r6 += modinfo->modulus.v[6] & cond_add; + r7 += modinfo->modulus.v[7] & cond_add; + r8 += modinfo->modulus.v[8] & cond_add; + + cond_negate = sign >> 31; + + r0 = (r0 ^ cond_negate) - cond_negate; + r1 = (r1 ^ cond_negate) - cond_negate; + r2 = (r2 ^ cond_negate) - cond_negate; + r3 = (r3 ^ cond_negate) - cond_negate; + r4 = (r4 ^ cond_negate) - cond_negate; + r5 = (r5 ^ cond_negate) - cond_negate; + r6 = (r6 ^ cond_negate) - cond_negate; + r7 = (r7 ^ cond_negate) - cond_negate; + r8 = (r8 ^ cond_negate) - cond_negate; + + r1 += r0 >> 30; r0 &= M30; + r2 += r1 >> 30; r1 &= M30; + r3 += r2 >> 30; r2 &= M30; + r4 += r3 >> 30; r3 &= M30; + r5 += r4 >> 30; r4 &= M30; + r6 += r5 >> 30; r5 &= M30; + r7 += r6 >> 30; r6 &= M30; + r8 += r7 >> 30; r7 &= M30; + + cond_add = r8 >> 31; + + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + r5 += modinfo->modulus.v[5] & cond_add; + r6 += modinfo->modulus.v[6] & cond_add; + r7 += modinfo->modulus.v[7] & cond_add; + r8 += modinfo->modulus.v[8] & cond_add; + + r1 += r0 >> 30; r0 &= M30; + r2 += r1 >> 30; r1 &= M30; + r3 += r2 >> 30; r2 &= M30; + r4 += r3 >> 30; r3 &= M30; + r5 += r4 >> 30; r4 &= M30; + r6 += r5 >> 30; r5 &= M30; + r7 += r6 >> 30; r6 &= M30; + r8 += r7 >> 30; r7 &= M30; + + r->v[0] = r0; + r->v[1] = r1; + r->v[2] = r2; + r->v[3] = r3; + r->v[4] = r4; + r->v[5] = r5; + r->v[6] = r6; + r->v[7] = r7; + r->v[8] = r8; +} + +typedef struct { + int32_t u, v, q, r; +} secp256k1_modinv32_trans2x2; + +static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + uint32_t u = 1, v = 0, q = 0, r = 1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 30; ++i) { + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u 
* f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); + + c1 = eta >> 31; + c2 = -(g & 1); + + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; + + g += x & c2; + q += y & c2; + r += z & c2; + + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); + + f += g & c1; + u += q & c1; + v += r & c1; + + g >>= 1; + u <<= 1; + v <<= 1; + } + + t->u = (int32_t)u; + t->v = (int32_t)v; + t->q = (int32_t)q; + t->r = (int32_t)r; + + return eta; +} + +static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + /* inv256[i] = -(2*i+1)^-1 (mod 256) */ + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + + uint32_t u = 1, v = 0, q = 0, r = 1; + uint32_t f = f0, g = g0, m; + uint16_t w; + int i = 30, limit, zeros; + + for (;;) { + /* Use a sentinel bit to count zeros only up to i. */ + zeros = secp256k1_ctz32_var(g | (UINT32_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); + + if (eta < 0) { + uint32_t tmp; + eta = -eta; + tmp = f; f = g; g = -tmp; + tmp = u; u = q; q = -tmp; + tmp = v; v = r; r = -tmp; + } + + /* Handle up to 8 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT32_MAX >> (32 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; + + g += f * w; + q += u * w; + r += v * w; + + VERIFY_CHECK((g & m) == 0); + } + + t->u = (int32_t)u; + t->v = (int32_t)v; + t->q = (int32_t)q; + t->r = (int32_t)r; + + return eta; +} + +static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp256k1_modinv32_signed30 *e, const secp256k1_modinv32_trans2x2 *t, const secp256k1_modinv32_modinfo* modinfo) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t->u, v = t->v, q = t->q, r = t->r; + int32_t di, ei, md, me, sd, se; + int64_t cd, ce; + int i; + + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^30). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d->v[8] >> 31; + se = e->v[8] >> 31; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + + di = d->v[0]; + ei = e->v[0]; + + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; + + /* + * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each + * sum of products will be 0. This allows clean division by 2^30. 
On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. + */ + + md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30; + me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30; + + cd += (int64_t)modinfo->modulus.v[0] * md; + ce += (int64_t)modinfo->modulus.v[0] * me; + + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; + + for (i = 1; i < 9; ++i) { + di = d->v[i]; + ei = e->v[i]; + + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; + + cd += (int64_t)modinfo->modulus.v[i] * md; + ce += (int64_t)modinfo->modulus.v[i] * me; + + d->v[i - 1] = (int32_t)cd & M30; cd >>= 30; + e->v[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + + d->v[8] = (int32_t)cd; + e->v[8] = (int32_t)ce; +} + +static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t->u, v = t->v, q = t->q, r = t->r; + int32_t fi, gi; + int64_t cf, cg; + int i; + + fi = f->v[0]; + gi = g->v[0]; + + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; + + VERIFY_CHECK(((int32_t)cf & M30) == 0); + VERIFY_CHECK(((int32_t)cg & M30) == 0); + + cf >>= 30; + cg >>= 30; + + for (i = 1; i < 9; ++i) { + fi = f->v[i]; + gi = g->v[i]; + + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; + + f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; + g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + + f->v[8] = (int32_t)cf; + g->v[8] = (int32_t)cg; +} + +static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ + secp256k1_modinv32_signed30 d = {{0}}; + secp256k1_modinv32_signed30 e = {{1}}; + secp256k1_modinv32_signed30 f = modinfo->modulus; + secp256k1_modinv32_signed30 g = *x; + int i; + int32_t eta; + + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ + eta = -1; + + for (i = 0; i < 25; ++i) { + secp256k1_modinv32_trans2x2 t; + eta = secp256k1_modinv32_divsteps_30(eta, f.v[0], g.v[0], &t); + secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + secp256k1_modinv32_update_fg_30(&f, &g, &t); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ + VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4] | g.v[5] | g.v[6] | g.v[7] | g.v[8]) == 0); + + secp256k1_modinv32_normalize_30(&d, f.v[8] >> 31, modinfo); + + *x = d; +} + +static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 
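+     * This is the variable time variant: the loop below runs until g reaches zero, so the
+     * number of iterations depends on the input (unlike secp256k1_modinv32 above, which
+     * always performs 25 fixed iterations of 30 divsteps each).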
*/ + secp256k1_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}}; + secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}}; + secp256k1_modinv32_signed30 f = modinfo->modulus; + secp256k1_modinv32_signed30 g = *x; + int j; + int32_t eta; + int32_t cond; + + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ + eta = -1; + + while (1) { + secp256k1_modinv32_trans2x2 t; + eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t); + secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + secp256k1_modinv32_update_fg_30(&f, &g, &t); + if (g.v[0] == 0) { + cond = 0; + for (j = 1; j < 9; ++j) { + cond |= g.v[j]; + } + if (cond == 0) break; + } + } + + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ + + secp256k1_modinv32_normalize_30(&d, f.v[8] >> 31, modinfo); + + *x = d; +} + +#endif /* SECP256K1_MODINV32_IMPL_H */ diff --git a/src/modinv64.h b/src/modinv64.h new file mode 100644 index 000000000..e70fea0d6 --- /dev/null +++ b/src/modinv64.h @@ -0,0 +1,35 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV64_H +#define SECP256K1_MODINV64_H + +#if defined HAVE_CONFIG_H +#include "libsecp256k1-config.h" +#endif + +#include "util.h" + +#ifndef SECP256K1_WIDEMUL_INT128 +#error "modinv64 requires 128-bit wide multiplication support" +#endif + +typedef struct { + int64_t v[5]; +} secp256k1_modinv64_signed62; + +typedef struct { + /* The modulus in signed62 notation. 
*/ + secp256k1_modinv64_signed62 modulus; + + /* modulus^{-1} mod 2^62 */ + uint64_t modulus_inv62; +} secp256k1_modinv64_modinfo; + +static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); +static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); + +#endif /* SECP256K1_MODINV64_H */ diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h new file mode 100644 index 000000000..4d9105571 --- /dev/null +++ b/src/modinv64_impl.h @@ -0,0 +1,367 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV64_IMPL_H +#define SECP256K1_MODINV64_IMPL_H + +#include "modinv64.h" + +#include "util.h" + +static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int64_t sign, const secp256k1_modinv64_modinfo *modinfo) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4]; + int64_t cond_add, cond_negate; + + cond_add = r4 >> 63; + + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + + cond_negate = sign >> 63; + + r0 = (r0 ^ cond_negate) - cond_negate; + r1 = (r1 ^ cond_negate) - cond_negate; + r2 = (r2 ^ cond_negate) - cond_negate; + r3 = (r3 ^ cond_negate) - cond_negate; + r4 = (r4 ^ cond_negate) - cond_negate; + + r1 += r0 >> 62; r0 &= M62; + r2 += r1 >> 62; r1 &= M62; + r3 += r2 >> 62; r2 &= M62; + r4 += r3 >> 62; r3 &= M62; + + cond_add = r4 >> 63; + + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + + r1 += r0 >> 62; r0 &= M62; + r2 += r1 >> 62; r1 &= M62; + r3 += r2 >> 62; r2 &= M62; + r4 += r3 >> 62; r3 &= M62; + + r->v[0] = r0; + r->v[1] = r1; + r->v[2] = r2; + r->v[3] = r3; + r->v[4] = r4; +} + +typedef struct { + int64_t u, v, q, r; +} secp256k1_modinv64_trans2x2; + +static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { + + uint64_t u = 1, v = 0, q = 0, r = 1; + uint64_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); + + c1 = eta >> 63; + c2 = -(g & 1); + + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; + + g += x & c2; + q += y & c2; + r += z & c2; + + c1 &= c2; + eta = (eta ^ c1) - (c1 + 1); + + f += g & c1; + u += q & c1; + v += r & c1; + + g >>= 1; + u <<= 1; + v <<= 1; + } + + t->u = (int64_t)u; + t->v = (int64_t)v; + t->q = (int64_t)q; + t->r = (int64_t)r; + + return eta; +} + +static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { + /* inv256[i] = -(2*i+1)^-1 (mod 256) */ + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 
0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + + uint64_t u = 1, v = 0, q = 0, r = 1; + uint64_t f = f0, g = g0, m; + uint32_t w; + int i = 62, limit, zeros; + + for (;;) { + /* Use a sentinel bit to count zeros only up to i. */ + zeros = secp256k1_ctz64_var(g | (UINT64_MAX << i)); + + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + + if (i <= 0) { + break; + } + + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); + + if (eta < 0) { + uint64_t tmp; + eta = -eta; + tmp = f; f = g; g = -tmp; + tmp = u; u = q; q = -tmp; + tmp = v; v = r; r = -tmp; + } + + /* Handle up to 8 divsteps at once, subject to eta and i. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + m = (UINT64_MAX >> (64 - limit)) & 255U; + + w = (g * inv256[(f >> 1) & 127]) & m; + + g += f * w; + q += u * w; + r += v * w; + + VERIFY_CHECK((g & m) == 0); + } + + t->u = (int64_t)u; + t->v = (int64_t)v; + t->q = (int64_t)q; + t->r = (int64_t)r; + + return eta; +} + +static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp256k1_modinv64_signed62 *e, const secp256k1_modinv64_trans2x2 *t, const secp256k1_modinv64_modinfo* modinfo) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t d0 = d->v[0], d1 = d->v[1], d2 = d->v[2], d3 = d->v[3], d4 = d->v[4]; + const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4]; + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int64_t md, me, sd, se; + int128_t cd, ce; + + /* + * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add + * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior + * to division by 2^62). This has the same effect as if we added the modulus to the input(s). + */ + + sd = d4 >> 63; + se = e4 >> 63; + + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * e0; + + /* + * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each + * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in + * the range (-2.P, P), consistent with the input constraint. 
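+     * (Concretely: after the adjustment below, md == -cd * modulus^-1 (mod 2^62), so that
+     * cd + modulus*md == 0 (mod 2^62); likewise for me and ce. The VERIFY_CHECKs that
+     * follow assert exactly this cancellation.)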
+ */ + + md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62; + me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62; + + cd += (int128_t)modinfo->modulus.v[0] * md; + ce += (int128_t)modinfo->modulus.v[0] * me; + + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; + + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; + + cd += (int128_t)modinfo->modulus.v[1] * md; + ce += (int128_t)modinfo->modulus.v[1] * me; + + d->v[0] = (int64_t)cd & M62; cd >>= 62; + e->v[0] = (int64_t)ce & M62; ce >>= 62; + + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; + + cd += (int128_t)modinfo->modulus.v[2] * md; + ce += (int128_t)modinfo->modulus.v[2] * me; + + d->v[1] = (int64_t)cd & M62; cd >>= 62; + e->v[1] = (int64_t)ce & M62; ce >>= 62; + + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; + + cd += (int128_t)modinfo->modulus.v[3] * md; + ce += (int128_t)modinfo->modulus.v[3] * me; + + d->v[2] = (int64_t)cd & M62; cd >>= 62; + e->v[2] = (int64_t)ce & M62; ce >>= 62; + + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; + + cd += (int128_t)modinfo->modulus.v[4] * md; + ce += (int128_t)modinfo->modulus.v[4] * me; + + d->v[3] = (int64_t)cd & M62; cd >>= 62; + e->v[3] = (int64_t)ce & M62; ce >>= 62; + + d->v[4] = (int64_t)cd; + e->v[4] = (int64_t)ce; +} + +static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4]; + const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4]; + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int128_t cf, cg; + + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; + + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; + + f->v[0] = (int64_t)cf & M62; cf >>= 62; + g->v[0] = (int64_t)cg & M62; cg >>= 62; + + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; + + f->v[1] = (int64_t)cf & M62; cf >>= 62; + g->v[1] = (int64_t)cg & M62; cg >>= 62; + + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; + + f->v[2] = (int64_t)cf & M62; cf >>= 62; + g->v[2] = (int64_t)cg & M62; cg >>= 62; + + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; + + f->v[3] = (int64_t)cf & M62; cf >>= 62; + g->v[3] = (int64_t)cg & M62; cg >>= 62; + + f->v[4] = (int64_t)cf; + g->v[4] = (int64_t)cg; +} + +static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ + + secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 f = modinfo->modulus; + secp256k1_modinv64_signed62 g = *x; + int i; + int64_t eta; + + /* The paper uses 'delta'; eta == -delta (a performance tweak). 
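+     * (Negating delta lets the frequently-tested condition delta > 0 become a sign check
+     * eta < 0, which secp256k1_modinv64_divsteps_62 above evaluates branchlessly with an
+     * arithmetic shift: c1 = eta >> 63.)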
+ * + * If the maximum bitlength of g is known to be less than 256, then eta can be set + * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total + * divsteps are needed. */ + eta = -1; + + for (i = 0; i < 12; ++i) { + secp256k1_modinv64_trans2x2 t; + eta = secp256k1_modinv64_divsteps_62(eta, f.v[0], g.v[0], &t); + secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + secp256k1_modinv64_update_fg_62(&f, &g, &t); + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ + VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4]) == 0); + + secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); + + *x = d; +} + +static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { + /* Modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ + + secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 f = modinfo->modulus; + secp256k1_modinv64_signed62 g = *x; + int j; + uint64_t eta; + int64_t cond; + + /* The paper uses 'delta'; eta == -delta (a performance tweak). + * + * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to + * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ + eta = -1; + + while (1) { + secp256k1_modinv64_trans2x2 t; + eta = secp256k1_modinv64_divsteps_62_var(eta, f.v[0], g.v[0], &t); + secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + secp256k1_modinv64_update_fg_62(&f, &g, &t); + if (g.v[0] == 0) { + cond = 0; + for (j = 1; j < 5; ++j) { + cond |= g.v[j]; + } + if (cond == 0) break; + } + } + + secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); + + *x = d; +} + +#endif /* SECP256K1_MODINV64_IMPL_H */ From d8a92fcc4c65cf189ec7bd5298dad8479347c442 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Thu, 3 Dec 2020 16:26:58 -0800 Subject: [PATCH 05/59] Add extensive comments on the safegcd algorithm and implementation This adds a long comment explaining the algorithm and implementation choices by building it up step by step in Python. Comments in the code are also reworked/added, with references to the long explanation. --- doc/safegcd_implementation.md | 750 ++++++++++++++++++++++++++++++++++ src/modinv32.h | 15 +- src/modinv32_impl.h | 209 ++++++---- src/modinv64.h | 15 +- src/modinv64_impl.h | 218 +++++----- 5 files changed, 1019 insertions(+), 188 deletions(-) create mode 100644 doc/safegcd_implementation.md diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md new file mode 100644 index 000000000..8346d22e5 --- /dev/null +++ b/doc/safegcd_implementation.md @@ -0,0 +1,750 @@ +# The safegcd implementation in libsecp256k1 explained + +This document explains the modular inverse implementation in the `src/modinv*.h` files. It is based +on the paper +["Fast constant-time gcd computation and modular inversion"](https://gcd.cr.yp.to/papers.html#safegcd) +by Daniel J. Bernstein and Bo-Yin Yang. The references below are for the Date: 2019.04.13 version. + +The actual implementation is in C of course, but for demonstration purposes Python3 is used here. 
+Most implementation aspects and optimizations are explained, except those that depend on the
+specific number representation used in the C code.
+
+## 1. Computing the Greatest Common Divisor (GCD) using divsteps
+
+The algorithm from the paper (section 11), at a very high level, is this:
+
+```python
+def gcd(f, g):
+    """Compute the GCD of an odd integer f and another integer g."""
+    assert f & 1  # require f to be odd
+    delta = 1     # additional state variable
+    while g != 0:
+        assert f & 1  # f will be odd in every iteration
+        if delta > 0 and g & 1:
+            delta, f, g = 1 - delta, g, (g - f) // 2
+        elif g & 1:
+            delta, f, g = 1 + delta, f, (g + f) // 2
+        else:
+            delta, f, g = 1 + delta, f, (g    ) // 2
+    return abs(f)
+```
+
+It computes the greatest common divisor of an odd integer *f* and any integer *g*. Its inner loop
+keeps rewriting the variables *f* and *g* alongside a state variable *δ* that starts at *1*, until
+*g=0* is reached. At that point, *|f|* gives the GCD. Each of the transitions in the loop is called a
+"division step" (referred to as divstep in what follows).
+
+For example, *gcd(21, 14)* would be computed as:
+- Start with *δ=1 f=21 g=14*
+- Take the third branch: *δ=2 f=21 g=7*
+- Take the first branch: *δ=-1 f=7 g=-7*
+- Take the second branch: *δ=0 f=7 g=0*
+- The answer *|f| = 7*.
+
+Why it works:
+- Divsteps can be decomposed into two steps (see paragraph 8.2 in the paper):
+  - (a) If *g* is odd, replace *(f,g)* with *(g,g-f)* or *(f,g+f)*, resulting in an even *g*.
+  - (b) Replace *(f,g)* with *(f,g/2)* (where *g* is guaranteed to be even).
+- Neither of those two operations change the GCD:
+  - For (a), assume *gcd(f,g)=c*, then it must be the case that *f=a c* and *g=b c* for some integers *a*
+    and *b*. As *(g,g-f)=(b c,(b-a)c)* and *(f,f+g)=(a c,(a+b)c)*, the result clearly still has
+    common factor *c*. Reasoning in the other direction shows that no common factor can be added by
+    doing so either.
+  - For (b), we know that *f* is odd, so *gcd(f,g)* clearly has no factor *2*, and we can remove
+    it from *g*.
+- The algorithm will eventually converge to *g=0*. This is proven in the paper (see theorem G.3).
+- It follows that eventually we find a final value *f'* for which *gcd(f,g) = gcd(f',0)*. As the
+  gcd of *f'* and *0* is *|f'|* by definition, that is our answer.
+
+Compared to more [traditional GCD algorithms](https://en.wikipedia.org/wiki/Euclidean_algorithm), this one has the property of only ever looking at
+the low-order bits of the variables to decide the next steps, and being easy to make
+constant-time (in more low-level languages than Python). The *δ* parameter is necessary to
+guide the algorithm towards shrinking the numbers' magnitudes without explicitly needing to look
+at high order bits.
+
+Properties that will become important later:
+- Performing more divsteps than needed is not a problem, as *f* does not change anymore after *g=0*.
+- Only even numbers are divided by *2*. This means that when reasoning about it algebraically we
+  do not need to worry about rounding.
+- At every point during the algorithm's execution the next *N* steps only depend on the bottom *N*
+  bits of *f* and *g*, and on *δ*.
+
+
+## 2. From GCDs to modular inverses
+
+We want an algorithm to compute the inverse *a* of *x* modulo *M*, i.e. the number *a* such that
+*a x=1 mod M*. This inverse only exists if the GCD of *x* and *M* is *1*, but that is always the case
+if *M* is prime and *0 < x < M*. In what follows, assume that the modular inverse exists.
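+
+As a small illustration of the existence condition (a standalone check, not part of the original
+text), an inverse of *x* mod *M* exists exactly when *gcd(x,M)=1*:
+
+```python
+import math
+
+M = 15  # odd but not prime, so some x lack an inverse
+for x in range(1, M):
+    has_inverse = any((x * a) % M == 1 for a in range(M))
+    assert has_inverse == (math.gcd(x, M) == 1)
+```
+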
+It turns out this inverse can be computed as a side effect of computing the GCD by keeping track
+of how the internal variables can be written as linear combinations of the inputs at every step
+(see the [extended Euclidean algorithm](https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm)).
+Since the GCD is *1*, such an algorithm will compute numbers *a* and *b* such that *a x + b M = 1*.
+Taking that expression *mod M* gives *a x mod M = 1*, and we see that *a* is the modular inverse of
+*x mod M*.
+
+A similar approach can be used to calculate modular inverses using the divsteps-based GCD
+algorithm shown above, if the modulus *M* is odd. To do so, compute *gcd(f=M,g=x)*, while keeping
+track of extra variables *d* and *e*, for which at every step *d = f/x (mod M)* and *e = g/x (mod M)*.
+*f/x* here means the number which multiplied with *x* gives *f mod M*. As *f* and *g* are initialized to *M*
+and *x* respectively, *d* and *e* just start off being *0* (*M/x mod M = 0/x mod M = 0*) and *1* (*x/x mod M
+= 1*).
+
+```python
+def div2(M, x):
+    """Helper routine to compute x/2 mod M (where M is odd)."""
+    assert M & 1
+    if x & 1:  # If x is odd, make it even by adding M.
+        x += M
+    # x must be even now, so a clean division by 2 is possible.
+    return x // 2
+
+def modinv(M, x):
+    """Compute the inverse of x mod M (given that it exists, and M is odd)."""
+    assert M & 1
+    delta, f, g, d, e = 1, M, x, 0, 1
+    while g != 0:
+        # Note that while division by two for f and g is only ever done on even inputs, this is
+        # not true for d and e, so we need the div2 helper function.
+        if delta > 0 and g & 1:
+            delta, f, g, d, e = 1 - delta, g, (g - f) // 2, e, div2(M, e - d)
+        elif g & 1:
+            delta, f, g, d, e = 1 + delta, f, (g + f) // 2, d, div2(M, e + d)
+        else:
+            delta, f, g, d, e = 1 + delta, f, (g    ) // 2, d, div2(M, e    )
+        # Verify that the invariants d=f/x mod M, e=g/x mod M are maintained.
+        assert f % M == (d * x) % M
+        assert g % M == (e * x) % M
+    assert f == 1 or f == -1  # |f| is the GCD, it must be 1
+    # Because of invariant d = f/x (mod M), 1/x = d/f (mod M). As |f|=1, d/f = d*f.
+    return (d * f) % M
+```
+
+Also note that this approach to track *d* and *e* throughout the computation to determine the inverse
+is different from the paper. There (see paragraph 12.1 in the paper) a transition matrix for the
+entire computation is determined (see section 3 below) and the inverse is computed from that.
+The approach here avoids the need for 2x2 matrix multiplications of various sizes, and appears to
+be faster at the level of optimization we're able to do in C.
+
+
+## 3. Batching multiple divsteps
+
+Every divstep can be expressed as a matrix multiplication, applying a transition matrix *(1/2 t)*
+to both vectors *[f, g]* and *[d, e]* (see paragraph 8.1 in the paper):
+
+```
+  t = [ u,  v ]
+      [ q,  r ]
+
+  [ out_f ] = (1/2 * t) * [ in_f ]
+  [ out_g ]               [ in_g ]
+
+  [ out_d ] = (1/2 * t) * [ in_d ]  (mod M)
+  [ out_e ]               [ in_e ]
+```
+
+where *(u, v, q, r)* is *(0, 2, -1, 1)*, *(2, 0, 1, 1)*, or *(2, 0, 0, 1)*, depending on which branch is
+taken. As above, the resulting *f* and *g* are always integers.
+
+Performing multiple divsteps corresponds to a multiplication with the product of all the
+individual divsteps' transition matrices. As each transition matrix consists of integers
+divided by *2*, the product of these matrices will consist of integers divided by *2^N* (see also
+theorem 9.2 in the paper).
These divisions are expensive when updating *d* and *e*, so we delay
+them: we compute the integer coefficients of the combined transition matrix scaled by *2^N*, and
+do one division by *2^N* as a final step:
+
+```python
+def divsteps_n_matrix(delta, f, g):
+    """Compute delta and transition matrix t after N divsteps (multiplied by 2^N)."""
+    u, v, q, r = 1, 0, 0, 1  # start with identity matrix
+    for _ in range(N):
+        if delta > 0 and g & 1:
+            delta, f, g, u, v, q, r = 1 - delta, g, (g - f) // 2, 2*q, 2*r, q-u, r-v
+        elif g & 1:
+            delta, f, g, u, v, q, r = 1 + delta, f, (g + f) // 2, 2*u, 2*v, q+u, r+v
+        else:
+            delta, f, g, u, v, q, r = 1 + delta, f, (g    ) // 2, 2*u, 2*v, q  , r
+    return delta, (u, v, q, r)
+```
+
+As the branches in the divsteps are completely determined by the bottom *N* bits of *f* and *g*, this
+function to compute the transition matrix only needs to see those bottom bits. Furthermore all
+intermediate results and outputs fit in *(N+1)*-bit numbers (unsigned for *f* and *g*; signed for *u*, *v*,
+*q*, and *r*) (see also paragraph 8.3 in the paper). This means that an implementation using 64-bit
+integers could set *N=62* and compute the full transition matrix for 62 steps at once without any
+big integer arithmetic at all. This is the reason why this algorithm is efficient: it only needs
+to update the full-size *f*, *g*, *d*, and *e* numbers once every *N* steps.
+
+We still need functions to compute:
+
+```
+  [ out_f ] = (1/2^N * [ u, v ]) * [ in_f ]
+  [ out_g ]   (        [ q, r ])   [ in_g ]
+
+  [ out_d ] = (1/2^N * [ u, v ]) * [ in_d ]  (mod M)
+  [ out_e ]   (        [ q, r ])   [ in_e ]
+```
+
+Because the divsteps transformation only ever divides even numbers by two, the result of *t [f,g]*
+is always even. When *t* is a composition of *N* divsteps, it follows that the resulting *f*
+and *g* will be a multiple of *2^N*, and division by *2^N* is simply shifting them down:
+
+```python
+def update_fg(f, g, t):
+    """Multiply matrix t/2^N with [f, g]."""
+    u, v, q, r = t
+    cf, cg = u*f + v*g, q*f + r*g
+    # (t / 2^N) should cleanly apply to [f,g] so the result of t*[f,g] should have N zero
+    # bottom bits.
+    assert cf % 2**N == 0
+    assert cg % 2**N == 0
+    return cf >> N, cg >> N
+```
+
+The same is not true for *d* and *e*, and we need an equivalent of the `div2` function for division
+by *2^N mod M*. This is easy if we have precomputed *1/M mod 2^N* (which always exists for odd *M*):
+
+```python
+def div2n(M, Mi, x):
+    """Compute x/2^N mod M, given Mi = 1/M mod 2^N."""
+    assert (M * Mi) % 2**N == 1
+    # Find a factor m such that m*M has the same bottom N bits as x. We want:
+    #     (m * M) mod 2^N = x mod 2^N
+    # <=> m mod 2^N = (x / M) mod 2^N
+    # <=> m mod 2^N = (x * Mi) mod 2^N
+    m = (Mi * x) % 2**N
+    # Subtract that multiple from x, cancelling its bottom N bits.
+    x -= m * M
+    # Now a clean division by 2^N is possible.
+    assert x % 2**N == 0
+    return (x >> N) % M
+
+def update_de(d, e, t, M, Mi):
+    """Multiply matrix t/2^N with [d, e], modulo M."""
+    u, v, q, r = t
+    cd, ce = u*d + v*e, q*d + r*e
+    return div2n(M, Mi, cd), div2n(M, Mi, ce)
+```
+
+With all of those, we can write a version of `modinv` that performs *N* divsteps at once:
+
+```python
+def modinv(M, Mi, x):
+    """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N."""
+    assert M & 1
+    delta, f, g, d, e = 1, M, x, 0, 1
+    while g != 0:
+        # Compute the delta and transition matrix t for the next N divsteps (this only needs
+        # (N+1)-bit signed integer arithmetic).
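+        # (Passing f % 2**N and g % 2**N suffices: as noted in section 1, the next N
+        # divsteps depend only on the bottom N bits of f and g, and on delta.)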
+        delta, t = divsteps_n_matrix(delta, f % 2**N, g % 2**N)
+        # Apply the transition matrix t to [f, g]:
+        f, g = update_fg(f, g, t)
+        # Apply the transition matrix t to [d, e]:
+        d, e = update_de(d, e, t, M, Mi)
+    return (d * f) % M
+```
+
+This means that in practice we'll always perform a multiple of *N* divsteps. This is not a problem
+because once *g=0*, further divsteps do not affect *f*, *g*, *d*, or *e* anymore (only *δ* keeps
+increasing). For variable time code such excess iterations will be mostly optimized away in
+section 6.
+
+
+## 4. Avoiding modulus operations
+
+So far, there are two places where we compute a remainder of big numbers modulo *M*: at the end of
+`div2n` in every `update_de`, and at the very end of `modinv` after potentially negating *d* due to the
+sign of *f*. These are relatively expensive operations when done generically.
+
+To deal with the modulus operation in `div2n`, we simply stop requiring *d* and *e* to be in range
+*[0,M)* all the time. Let's start by inlining `div2n` into `update_de`, and dropping the modulus
+operation at the end:
+
+```python
+def update_de(d, e, t, M, Mi):
+    """Multiply matrix t/2^N with [d, e] mod M, given Mi=1/M mod 2^N."""
+    u, v, q, r = t
+    cd, ce = u*d + v*e, q*d + r*e
+    # Cancel out bottom N bits of cd and ce.
+    md = -((Mi * cd) % 2**N)
+    me = -((Mi * ce) % 2**N)
+    cd += md * M
+    ce += me * M
+    # And cleanly divide by 2**N.
+    return cd >> N, ce >> N
+```
+
+Let's look at bounds on the ranges of these numbers. It can be shown that *|u|+|v|* and *|q|+|r|*
+never exceed *2^N* (see paragraph 8.3 in the paper), and thus a multiplication with *t* will have
+outputs whose absolute values are at most *2^N* times the maximum absolute input value. In case the
+inputs *d* and *e* are in *(-M,M)*, which is certainly true for the initial values *d=0* and *e=1* assuming
+*M > 1*, the multiplication results in numbers in range *(-2^N M,2^N M)*. Subtracting less than *2^N*
+times *M* to cancel out *N* bits brings that up to *(-2^(N+1) M,2^N M)*, and
+dividing by *2^N* at the end takes it to *(-2M,M)*. Another application of `update_de` would take that
+to *(-3M,2M)*, and so forth. This progressive expansion of the variables' ranges can be
+counteracted by incrementing *d* and *e* by *M* whenever they're negative:
+
+```python
+    ...
+    if d < 0:
+        d += M
+    if e < 0:
+        e += M
+    cd, ce = u*d + v*e, q*d + r*e
+    # Cancel out bottom N bits of cd and ce.
+    ...
+```
+
+With inputs in *(-2M,M)*, they will first be shifted into range *(-M,M)*, which means that the
+output will again be in *(-2M,M)*, and this remains the case regardless of how many `update_de`
+invocations there are. In what follows, we will try to make this more efficient.
+
+Note that increasing *d* by *M* is equal to incrementing *cd* by *u M* and *ce* by *q M*. Similarly,
+increasing *e* by *M* is equal to incrementing *cd* by *v M* and *ce* by *r M*. So we could instead write:
+
+```python
+    ...
+    cd, ce = u*d + v*e, q*d + r*e
+    # Perform the equivalent of incrementing d, e by M when they're negative.
+    if d < 0:
+        cd += u*M
+        ce += q*M
+    if e < 0:
+        cd += v*M
+        ce += r*M
+    # Cancel out bottom N bits of cd and ce.
+    md = -((Mi * cd) % 2**N)
+    me = -((Mi * ce) % 2**N)
+    cd += md * M
+    ce += me * M
+    ...
+```
+
+Now note that we have two steps of corrections to *cd* and *ce* that add multiples of *M*: this
+increment, and the decrement that cancels out bottom bits.
The second one depends on the first
+one, but they can still be efficiently combined by only computing the bottom bits of *cd* and *ce*
+at first, and using that to compute the final *md*, *me* values:
+
+```python
+def update_de(d, e, t, M, Mi):
+    """Multiply matrix t/2^N with [d, e], modulo M."""
+    u, v, q, r = t
+    md, me = 0, 0
+    # Compute what multiples of M to add to cd and ce.
+    if d < 0:
+        md += u
+        me += q
+    if e < 0:
+        md += v
+        me += r
+    # Compute bottom N bits of t*[d,e] + M*[md,me].
+    cd, ce = (u*d + v*e + md*M) % 2**N, (q*d + r*e + me*M) % 2**N
+    # Correct md and me such that the bottom N bits of t*[d,e] + M*[md,me] are zero.
+    md -= (Mi * cd) % 2**N
+    me -= (Mi * ce) % 2**N
+    # Do the full computation.
+    cd, ce = u*d + v*e + md*M, q*d + r*e + me*M
+    # And cleanly divide by 2**N.
+    return cd >> N, ce >> N
+```
+
+One last optimization: we can avoid the *md M* and *me M* multiplications in the bottom bits of *cd*
+and *ce* by moving them to the *md* and *me* correction:
+
+```python
+    ...
+    # Compute bottom N bits of t*[d,e].
+    cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N
+    # Correct md and me such that the bottom N bits of t*[d,e]+M*[md,me] are zero.
+    # Note that this is not the same as {md = (-Mi * cd) % 2**N} etc. That would also result in N
+    # zero bottom bits, but isn't guaranteed to be a reduction of [0,2^N) compared to the
+    # previous md and me values, and thus would violate our bounds analysis.
+    md -= (Mi*cd + md) % 2**N
+    me -= (Mi*ce + me) % 2**N
+    ...
+```
+
+The resulting function takes *d* and *e* in range *(-2M,M)* as inputs, and outputs values in the same
+range. That also means that the *d* value at the end of `modinv` will be in that range, while we want
+a result in *[0,M)*. To do that, we need a normalization function. It's easy to integrate the
+conditional negation of *d* (based on the sign of *f*) into it as well:
+
+```python
+def normalize(sign, v, M):
+    """Compute sign*v mod M, where v is in range (-2*M,M); output in [0,M)."""
+    assert sign == 1 or sign == -1
+    # v in (-2*M,M)
+    if v < 0:
+        v += M
+    # v in (-M,M). Now multiply v with sign (which can only be 1 or -1).
+    if sign == -1:
+        v = -v
+    # v in (-M,M)
+    if v < 0:
+        v += M
+    # v in [0,M)
+    return v
+```
+
+And calling it in `modinv` is simply:
+
+```python
+    ...
+    return normalize(f, d, M)
+```
+
+
+## 5. Constant-time operation
+
+The primary selling point of the algorithm is fast constant-time operation. What code flow still
+depends on the input data so far?
+
+- the number of iterations of the while *g ≠ 0* loop in `modinv`
+- the branches inside `divsteps_n_matrix`
+- the sign checks in `update_de`
+- the sign checks in `normalize`
+
+To make the while loop in `modinv` constant time it can be replaced with a constant number of
+iterations. The paper proves (Theorem 11.2) that *741* divsteps are sufficient for any *256*-bit
+inputs, and [safegcd-bounds](https://github.com/sipa/safegcd-bounds) shows that even the slightly
+better bound *724* is sufficient. Given that every loop iteration performs *N* divsteps, it will
+run a total of *⌈724/N⌉* times.
+
+To deal with the branches in `divsteps_n_matrix` we will replace them with constant-time bitwise
+operations (and hope the C compiler isn't smart enough to turn them back into branches; see
+`valgrind_ctime_test.c` for automated tests that this isn't the case). To do so, observe that a
+divstep can be written instead as (compare to the inner loop of `gcd` in section 1):
+
+```python
+x = -f if delta > 0 else f         # set x equal to (input) -f or f
+if g & 1:
+    g += x                         # set g to (input) g-f or g+f
+    if delta > 0:
+        delta = -delta
+        f += g                     # set f to (input) g (note that g was set to g-f before)
+delta += 1
+g >>= 1
+```
+
+To convert the above to bitwise operations, we rely on a trick to negate conditionally: per the
+definition of negative numbers in two's complement, (*-v == ~v + 1*) holds for every number *v*. As
+*-1* in two's complement is all *1* bits, bitflipping can be expressed as xor with *-1*. It follows
+that *-v == (v ^ -1) - (-1)*. Thus, if we have a variable *c* that takes on values *0* or *-1*, then
+*(v ^ c) - c* is *v* if *c=0* and *-v* if *c=-1*.
+
+Using this we can write:
+
+```python
+x = -f if delta > 0 else f
+```
+
+in constant-time form as:
+
+```python
+c1 = (-delta) >> 63
+# Conditionally negate f based on c1:
+x = (f ^ c1) - c1
+```
+
+To use that trick, we need a helper mask variable *c1* that resolves the condition *δ>0* to *-1*
+(if true) or *0* (if false). We compute *c1* using right shifting, which is equivalent to dividing by
+the specified power of *2* and rounding down (in Python, and also in C under the assumption of a
+typical two's complement system; see `assumptions.h` for tests that this is the case). Right
+shifting by *63* thus maps all numbers in range *[-2^63,0)* to *-1*, and numbers in range *[0,2^63)*
+to *0*.
+
+Using the facts that *x&0=0* and *x&(-1)=x* (on two's complement systems again), we can write:
+
+```python
+if g & 1:
+    g += x
+```
+
+as:
+
+```python
+# Compute c2=0 if g is even and c2=-1 if g is odd.
+c2 = -(g & 1)
+# This masks out x if g is even, and leaves x be if g is odd.
+g += x & c2
+```
+
+Using the conditional negation trick again we can write:
+
+```python
+if g & 1:
+    if delta > 0:
+        delta = -delta
+```
+
+as:
+
+```python
+# Compute c3=-1 if g is odd and delta>0, and 0 otherwise.
+c3 = c1 & c2
+# Conditionally negate delta based on c3:
+delta = (delta ^ c3) - c3
+```
+
+Finally:
+
+```python
+if g & 1:
+    if delta > 0:
+        f += g
+```
+
+becomes:
+
+```python
+f += g & c3
+```
+
+It turns out that this can be implemented more efficiently by applying the substitution
+*η=-δ*. In this representation, negating *δ* corresponds to negating *η*, and incrementing
+*δ* corresponds to decrementing *η*. This allows us to remove the negation in the *c1*
+computation:
+
+```python
+# Compute a mask c1 for eta < 0, and compute the conditional negation x of f:
+c1 = eta >> 63
+x = (f ^ c1) - c1
+# Compute a mask c2 for odd g, and conditionally add x to g:
+c2 = -(g & 1)
+g += x & c2
+# Compute a mask c for (eta < 0) and odd (input) g, and use it to conditionally negate eta,
+# and add g to f:
+c3 = c1 & c2
+eta = (eta ^ c3) - c3
+f += g & c3
+# Incrementing delta corresponds to decrementing eta.
+eta -= 1
+g >>= 1
+```
+
+By replacing the loop in `divsteps_n_matrix` with a variant of the divstep code above (extended to
+also apply all *f* operations to *u*, *v* and all *g* operations to *q*, *r*), a constant-time version of
+`divsteps_n_matrix` is obtained. The full code will be in section 7.
+
+These bit fiddling tricks can also be used to make the conditional negations and additions in
+`update_de` and `normalize` constant-time.
+
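+The negation trick is easy to check exhaustively on small values (a standalone sanity check, not
+part of the original text):
+
+```python
+for v in range(-8, 8):
+    assert (v ^ 0) - 0 == v      # c = 0: v unchanged
+    assert (v ^ -1) - -1 == -v   # c = -1: v negated
+```
+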
+
+## 6. Variable-time optimizations
+
+In section 5, we modified the `divsteps_n_matrix` function (and a few others) to be constant time.
+Constant time operations are only necessary when computing modular inverses of secret data. In
+other cases, it slows down calculations unnecessarily. In this section, we will construct a
+faster non-constant time `divsteps_n_matrix` function.
+
+To do so, first consider yet another way of writing the inner loop of divstep operations in
+`gcd` from section 1. This decomposition is also explained in the paper in section 8.2.
+
+```python
+for _ in range(N):
+    if g & 1 and eta < 0:
+        eta, f, g = -eta, g, -f
+    if g & 1:
+        g += f
+    eta -= 1
+    g >>= 1
+```
+
+Whenever *g* is even, the loop only shifts *g* down and decreases *η*. When *g* ends in multiple zero
+bits, these iterations can be consolidated into one step. This requires counting the bottom zero
+bits efficiently, which is possible on most platforms; it is abstracted here as the function
+`count_trailing_zeros`.
+
+```python
+def count_trailing_zeros(v):
+    """For a non-zero value v, find z such that v=(d<<z) for some odd d."""
+    return (v & -v).bit_length() - 1
+
+i = N
+while True:
+    zeros = min(i, count_trailing_zeros(g))
+    eta -= zeros
+    g >>= zeros
+    i -= zeros
+    if i == 0:
+        break
+    # We know g is odd now
+    if eta < 0:
+        eta, f, g = -eta, g, -f
+    g += f
+    # g is even now, and the eta decrement and g shift will happen in the next loop.
+```
+
+We can now remove multiple bottom *0* bits from *g* at once, but still need a full iteration whenever
+there is a bottom *1* bit. In what follows, we will get rid of multiple *1* bits simultaneously as
+well.
+
+Observe that as long as *η ≥ 0*, the loop does not modify *f*. Instead, it cancels out bottom
+bits of *g* and shifts them out, and decreases *η* and *i* accordingly - interrupting only when *η*
+becomes negative, or when *i* reaches *0*. Combined, this is equivalent to adding a multiple of *f* to
+*g* to cancel out multiple bottom bits, and then shifting them out.
+
+It is easy to find what that multiple is: we want a number *w* such that *g+w f* has a few bottom
+zero bits. If that number of bits is *L*, we want *g+w f mod 2^L = 0*, or *w = -g/f mod 2^L*. Since *f*
+is odd, such a *w* exists for any *L*. *L* cannot be more than *i* steps (as we'd finish the loop before
+doing more) or more than *η+1* steps (as we'd run `eta, f, g = -eta, g, -f` at that point), but
+apart from that, we're only limited by the complexity of computing *w*.
+
+This code demonstrates how to cancel up to 4 bits per step:
+
+```python
+NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1]  # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n
+i = N
+while True:
+    zeros = min(i, count_trailing_zeros(g))
+    eta -= zeros
+    g >>= zeros
+    i -= zeros
+    if i == 0:
+        break
+    # We know g is odd now
+    if eta < 0:
+        eta, f, g = -eta, g, -f
+    # Compute limit on number of bits to cancel
+    limit = min(min(eta + 1, i), 4)
+    # Compute w = -g/f mod 2**limit, using the table value for -1/f mod 2**4. Note that f is
+    # always odd, so its inverse modulo a power of two always exists.
+    w = (g * NEGINV16[(f & 15) // 2]) % (2**limit)
+    # As w = -g/f mod (2**limit), g+w*f mod 2**limit = 0 mod 2**limit.
+    g += w * f
+    assert g % (2**limit) == 0
+    # The next iteration will now shift out at least limit bottom zero bits from g.
+```
+
+By using a bigger table more bits can be cancelled at once. The table can also be implemented
+as a formula. Several formulas are known for computing modular inverses modulo powers of two;
+some can be found in Hacker's Delight second edition by Henry S. Warren, Jr. pages 245-247.
+Here we need the negated modular inverse, which is a simple transformation of those (all of the
+following are verified by the brute-force check after this list):
+
+- Instead of a 3-bit table:
+  - *-f* or *f ^ 6*
+- Instead of a 4-bit table:
+  - *1 - f(f + 1)*
+  - *-(f + (((f + 1) & 4) << 1))*
+- For larger tables the following technique can be used: if *w=-1/f mod 2^L*, then *w(w f+2)* is
+  *-1/f mod 2^(2L)*. This allows extending the previous formulas (or tables). In particular we
+  have this 6-bit function (based on the 3-bit function above):
+  - *f(f^2 - 2)*
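+
+These identities are easy to verify by brute force (a standalone check, not part of the original
+text):
+
+```python
+for f in range(1, 64, 2):                                 # all odd residues mod 64
+    assert (-f * f) % 8 == 7                              # 3-bit: -f
+    assert ((f ^ 6) * f) % 8 == 7                         # 3-bit: f ^ 6
+    assert ((1 - f*(f + 1)) * f) % 16 == 15               # 4-bit: 1 - f(f+1)
+    assert (-(f + (((f + 1) & 4) << 1)) * f) % 16 == 15   # 4-bit variant
+    assert (f*(f*f - 2) * f) % 64 == 63                   # 6-bit: f(f^2 - 2)
+```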
+Here we need the negated modular inverse, which is a simple transformation of those:
+
+- Instead of a 3-bit table:
+  - *-f* or *f ^ 6*
+- Instead of a 4-bit table:
+  - *1 - f(f + 1)*
+  - *-(f + (((f + 1) & 4) << 1))*
+- For larger tables the following technique can be used: if *w=-1/f mod 2^L*, then *w(w f+2)* is
+  *-1/f mod 2^(2L)*. This allows extending the previous formulas (or tables). In particular we
+  have this 6-bit function (based on the 3-bit function above):
+  - *f(f^2 - 2)*
+
+This loop, again extended to also handle *u*, *v*, *q*, and *r* alongside *f* and *g*, placed in
+`divsteps_n_matrix`, gives a significantly faster, but non-constant time version.
+
+
+## 7. Final Python version
+
+All together we need the following functions:
+
+- A way to compute the transition matrix in constant time, using the `divsteps_n_matrix` function
+  from section 2, but with its loop replaced by a variant of the constant-time divstep from
+  section 5, extended to handle *u*, *v*, *q*, *r*:
+
+```python
+def divsteps_n_matrix(eta, f, g):
+    """Compute eta and transition matrix t after N divsteps (multiplied by 2^N)."""
+    u, v, q, r = 1, 0, 0, 1 # start with identity matrix
+    for _ in range(N):
+        c1 = eta >> 63
+        # Compute x, y, z as conditionally-negated versions of f, u, v.
+        x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1
+        c2 = -(g & 1)
+        # Conditionally add x, y, z to g, q, r.
+        g, q, r = g + (x & c2), q + (y & c2), r + (z & c2)
+        c1 &= c2 # reusing c1 here for the earlier c3 variable
+        eta = (eta ^ c1) - (c1 + 1) # inlining the unconditional eta decrement here
+        # Conditionally add g, q, r to f, u, v.
+        f, u, v = f + (g & c1), u + (q & c1), v + (r & c1)
+        # When shifting g down, don't shift q, r, as we construct a transition matrix multiplied
+        # by 2^N. Instead, shift f's coefficients u and v up.
+        g, u, v = g >> 1, u << 1, v << 1
+    return eta, (u, v, q, r)
+```
+
+- The functions to update *f* and *g*, and *d* and *e*, from section 2 and section 4, with the
+  constant-time changes to `update_de` from section 5:
+
+```python
+def update_fg(f, g, t):
+    """Multiply matrix t/2^N with [f, g]."""
+    u, v, q, r = t
+    cf, cg = u*f + v*g, q*f + r*g
+    return cf >> N, cg >> N
+
+def update_de(d, e, t, M, Mi):
+    """Multiply matrix t/2^N with [d, e], modulo M."""
+    u, v, q, r = t
+    d_sign, e_sign = d >> 257, e >> 257
+    md, me = (u & d_sign) + (v & e_sign), (q & d_sign) + (r & e_sign)
+    cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N
+    md -= (Mi*cd + md) % 2**N
+    me -= (Mi*ce + me) % 2**N
+    cd, ce = u*d + v*e + M*md, q*d + r*e + M*me
+    return cd >> N, ce >> N
+```
+
+- The `normalize` function from section 4, made constant time as well:
+
+```python
+def normalize(sign, v, M):
+    """Compute sign*v mod M, where v in (-2*M,M); output in [0,M)."""
+    v_sign = v >> 257
+    # Conditionally add M to v.
+    v += M & v_sign
+    c = (sign - 1) >> 1
+    # Conditionally negate v.
+    v = (v ^ c) - c
+    v_sign = v >> 257
+    # Conditionally add M to v again.
+    v += M & v_sign
+    return v
+```
+
+- And finally the `modinv` function too, adapted to use *η* instead of *δ*, and using the fixed
+  iteration count from section 5:
+
+```python
+def modinv(M, Mi, x):
+    """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N."""
+    eta, f, g, d, e = -1, M, x, 0, 1
+    for _ in range((724 + N - 1) // N):
+        eta, t = divsteps_n_matrix(eta, f % 2**N, g % 2**N)
+        f, g = update_fg(f, g, t)
+        d, e = update_de(d, e, t, M, Mi)
+    return normalize(f, d, M)
+```
+
+- To get a variable time version, replace the `divsteps_n_matrix` function with one that uses the
+  divsteps loop from section 6, and a `modinv` version that calls it without the fixed iteration
+  count:
+
+```python
+NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n
+def divsteps_n_matrix_var(eta, f, g):
+    """Compute eta and transition matrix t after N divsteps (multiplied by 2^N)."""
+    u, v, q, r = 1, 0, 0, 1
+    i = N
+    while True:
+        zeros = min(i, count_trailing_zeros(g))
+        eta, i = eta - zeros, i - zeros
+        g, u, v = g >> zeros, u << zeros, v << zeros
+        if i == 0:
+            break
+        if eta < 0:
+            eta, f, u, v, g, q, r = -eta, g, q, r, -f, -u, -v
+        limit = min(min(eta + 1, i), 4)
+        w = (g * NEGINV16[(f & 15) // 2]) % (2**limit)
+        g, q, r = g + w*f, q + w*u, r + w*v
+    return eta, (u, v, q, r)
+
+def modinv_var(M, Mi, x):
+    """Compute the modular inverse of x mod M, given Mi = 1/M mod 2^N."""
+    eta, f, g, d, e = -1, M, x, 0, 1
+    while g != 0:
+        eta, t = divsteps_n_matrix_var(eta, f % 2**N, g % 2**N)
+        f, g = update_fg(f, g, t)
+        d, e = update_de(d, e, t, M, Mi)
+    return normalize(f, d, M)
+```
diff --git a/src/modinv32.h b/src/modinv32.h
index 2678d816f..0efdda9ab 100644
--- a/src/modinv32.h
+++ b/src/modinv32.h
@@ -13,19 +13,30 @@
 
 #include "util.h"
 
+/* A signed 30-bit limb representation of integers.
+ *
+ * Its value is sum(v[i] * 2^(30*i), i=0..8). */
 typedef struct {
     int32_t v[9];
 } secp256k1_modinv32_signed30;
 
 typedef struct {
-    /* The modulus in signed30 notation. */
+    /* The modulus in signed30 notation, must be odd and in [3, 2^256]. */
     secp256k1_modinv32_signed30 modulus;
 
     /* modulus^{-1} mod 2^30 */
     uint32_t modulus_inv30;
 } secp256k1_modinv32_modinfo;
 
-static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
+/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus).
+ * If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of
+ * x and modulus must be 1). These rules are automatically satisfied if the modulus is prime.
+ *
+ * On output, all of x's limbs will be in [0, 2^30).
+ */
 static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
 
+/* Same as secp256k1_modinv32_var, but constant time in x (not in the modulus). */
+static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
+
 #endif /* SECP256K1_MODINV32_H */
diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
index d2fecc31c..3a6579df6 100644
--- a/src/modinv32_impl.h
+++ b/src/modinv32_impl.h
@@ -11,14 +11,31 @@
 
 #include "util.h"
 
+#include <stdlib.h>
+
+/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
+ * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
+ *
+ * For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an
+ * implementation for N=30, using 30-bit signed limbs represented as int32_t.
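+ *
+ * The correspondence with the Python reference in that document is direct: f, g, d and e
+ * become signed30 numbers, divsteps_n_matrix and divsteps_n_matrix_var become the
+ * secp256k1_modinv32_divsteps_30{,_var} functions, and update_fg, update_de and normalize
+ * map to the similarly named _30 helpers below.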
+ */ + +/* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus + * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the + * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range + * [0,2^30). */ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int32_t sign, const secp256k1_modinv32_modinfo *modinfo) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4], r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8]; int32_t cond_add, cond_negate; + /* In a first step, add the modulus if the input is negative, and then negate if requested. + * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input + * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right + * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is + * indeed the behavior of the right shift operator). */ cond_add = r8 >> 31; - r0 += modinfo->modulus.v[0] & cond_add; r1 += modinfo->modulus.v[1] & cond_add; r2 += modinfo->modulus.v[2] & cond_add; @@ -28,9 +45,7 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 r6 += modinfo->modulus.v[6] & cond_add; r7 += modinfo->modulus.v[7] & cond_add; r8 += modinfo->modulus.v[8] & cond_add; - cond_negate = sign >> 31; - r0 = (r0 ^ cond_negate) - cond_negate; r1 = (r1 ^ cond_negate) - cond_negate; r2 = (r2 ^ cond_negate) - cond_negate; @@ -40,7 +55,7 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 r6 = (r6 ^ cond_negate) - cond_negate; r7 = (r7 ^ cond_negate) - cond_negate; r8 = (r8 ^ cond_negate) - cond_negate; - + /* Propagate the top bits, to bring limbs back to range (-2^30,2^30). */ r1 += r0 >> 30; r0 &= M30; r2 += r1 >> 30; r1 &= M30; r3 += r2 >> 30; r2 &= M30; @@ -50,8 +65,9 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 r7 += r6 >> 30; r6 &= M30; r8 += r7 >> 30; r7 &= M30; + /* In a second step add the modulus again if the result is still negative, bringing r to range + * [0,modulus). */ cond_add = r8 >> 31; - r0 += modinfo->modulus.v[0] & cond_add; r1 += modinfo->modulus.v[1] & cond_add; r2 += modinfo->modulus.v[2] & cond_add; @@ -61,7 +77,7 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 r6 += modinfo->modulus.v[6] & cond_add; r7 += modinfo->modulus.v[7] & cond_add; r8 += modinfo->modulus.v[8] & cond_add; - + /* And propagate again. */ r1 += r0 >> 30; r0 &= M30; r2 += r1 >> 30; r1 &= M30; r3 += r2 >> 30; r2 &= M30; @@ -82,51 +98,82 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 r->v[8] = r8; } +/* Data type for transition matrices (see section 3 of explanation). + * + * t = [ u v ] + * [ q r ] + */ typedef struct { int32_t u, v, q, r; } secp256k1_modinv32_trans2x2; +/* Compute the transition matrix and eta for 30 divsteps. + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix function from the explanation. + */ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. 
Semantically they are signed integers + * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^31,2^31) means that casting to signed works correctly. + */ uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 30; ++i) { - VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - + /* Compute conditional masks for (eta < 0) and for (g & 1). */ c1 = eta >> 31; c2 = -(g & 1); - + /* Compute x,y,z, conditionally negated versions of f,u,v. */ x = (f ^ c1) - c1; y = (u ^ c1) - c1; z = (v ^ c1) - c1; - + /* Conditionally add x,y,z to g,q,r. */ g += x & c2; q += y & c2; r += z & c2; - + /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ c1 &= c2; + /* Conditionally negate eta, and unconditionally subtract 1. */ eta = (eta ^ c1) - (c1 + 1); - + /* Conditionally add g,q,r to f,u,v. */ f += g & c1; u += q & c1; v += r & c1; - + /* Shifts */ g >>= 1; u <<= 1; v <<= 1; } - + /* Return data in t and return value. */ t->u = (int32_t)u; t->v = (int32_t)v; t->q = (int32_t)q; t->r = (int32_t)r; - return eta; } +/* Compute the transition matrix and eta for 30 divsteps (variable time). + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix_var function from the explanation. + */ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { /* inv256[i] = -(2*i+1)^-1 (mod 256) */ static const uint8_t inv256[128] = { @@ -143,6 +190,7 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 }; + /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */ uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m; uint16_t w; @@ -151,22 +199,19 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint for (;;) { /* Use a sentinel bit to count zeros only up to i. */ zeros = secp256k1_ctz32_var(g | (UINT32_MAX << i)); - + /* Perform zeros divsteps at once; they all just divide g by two. */ g >>= zeros; u <<= zeros; v <<= zeros; eta -= zeros; i -= zeros; - - if (i <= 0) { - break; - } - + /* We're done once we've done 30 divsteps. */ + if (i == 0) break; VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); - + /* If eta is negative, negate it and replace f,g with g,-f. */ if (eta < 0) { uint32_t tmp; eta = -eta; @@ -174,141 +219,128 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint tmp = u; u = q; q = -tmp; tmp = v; v = r; r = -tmp; } - - /* Handle up to 8 divsteps at once, subject to eta and i. */ + /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more + * than i can be cancelled out (as we'd be done before that point), and no more than eta+1 + * can be done as its sign will flip once that happens. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). 
*/ m = (UINT32_MAX >> (32 - limit)) & 255U; - + /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ w = (g * inv256[(f >> 1) & 127]) & m; - + /* Do so. */ g += f * w; q += u * w; r += v * w; - VERIFY_CHECK((g & m) == 0); } - + /* Return data in t and return value. */ t->u = (int32_t)u; t->v = (int32_t)v; t->q = (int32_t)q; t->r = (int32_t)r; - return eta; } +/* Compute (t/2^30) * [d, e] mod modulus, where t is a transition matrix for 30 divsteps. + * + * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range + * (-2^30,2^30). + * + * This implements the update_de function from the explanation. + */ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp256k1_modinv32_signed30 *e, const secp256k1_modinv32_trans2x2 *t, const secp256k1_modinv32_modinfo* modinfo) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t->u, v = t->v, q = t->q, r = t->r; int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; - - /* - * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add - * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior - * to division by 2^30). This has the same effect as if we added the modulus to the input(s). - */ - + /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ sd = d->v[8] >> 31; se = e->v[8] >> 31; - md = (u & sd) + (v & se); me = (q & sd) + (r & se); - + /* Begin computing t*[d,e]. */ di = d->v[0]; ei = e->v[0]; - cd = (int64_t)u * di + (int64_t)v * ei; ce = (int64_t)q * di + (int64_t)r * ei; - - /* - * Subtract from md/me an extra term in the range [0, 2^30) such that the low 30 bits of each - * sum of products will be 0. This allows clean division by 2^30. On output, d/e are thus in - * the range (-2.P, P), consistent with the input constraint. - */ - + /* Correct md,me so that t*[d,e]+modulus*[md,me] has 30 zero bottom bits. */ md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30; me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30; - + /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */ cd += (int64_t)modinfo->modulus.v[0] * md; ce += (int64_t)modinfo->modulus.v[0] * me; - + /* Verify that the low 30 bits of the computation are indeed zero, and then throw them away. */ VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; - + /* Now iteratively compute limb i=1..8 of t*[d,e]+modulus*[md,me], and store them in output + * limb i-1 (shifting down by 30 bits). */ for (i = 1; i < 9; ++i) { di = d->v[i]; ei = e->v[i]; - cd += (int64_t)u * di + (int64_t)v * ei; ce += (int64_t)q * di + (int64_t)r * ei; - cd += (int64_t)modinfo->modulus.v[i] * md; ce += (int64_t)modinfo->modulus.v[i] * me; - d->v[i - 1] = (int32_t)cd & M30; cd >>= 30; e->v[i - 1] = (int32_t)ce & M30; ce >>= 30; } - + /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. */ d->v[8] = (int32_t)cd; e->v[8] = (int32_t)ce; } +/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. + * + * This implements the update_fg function from the explanation. 
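+ * (Unlike update_de, no modulus is involved here: t*[f,g] is divisible by 2^30 by
+ * construction, so the division below is exact.)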
+ */ static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) { const int32_t M30 = (int32_t)(UINT32_MAX >> 2); const int32_t u = t->u, v = t->v, q = t->q, r = t->r; int32_t fi, gi; int64_t cf, cg; int i; - + /* Start computing t*[f,g]. */ fi = f->v[0]; gi = g->v[0]; - cf = (int64_t)u * fi + (int64_t)v * gi; cg = (int64_t)q * fi + (int64_t)r * gi; - - VERIFY_CHECK(((int32_t)cf & M30) == 0); - VERIFY_CHECK(((int32_t)cg & M30) == 0); - - cf >>= 30; - cg >>= 30; - + /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + /* Now iteratively compute limb i=1..8 of t*[f,g], and store them in output limb i-1 (shifting + * down by 30 bits). */ for (i = 1; i < 9; ++i) { fi = f->v[i]; gi = g->v[i]; - cf += (int64_t)u * fi + (int64_t)v * gi; cg += (int64_t)q * fi + (int64_t)r * gi; - f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; } - + /* What remains is limb 9 of t*[f,g]; store it as output limb 8. */ f->v[8] = (int32_t)cf; g->v[8] = (int32_t)cg; } +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { - /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ secp256k1_modinv32_signed30 d = {{0}}; secp256k1_modinv32_signed30 e = {{1}}; secp256k1_modinv32_signed30 f = modinfo->modulus; secp256k1_modinv32_signed30 g = *x; int i; - int32_t eta; - - /* The paper uses 'delta'; eta == -delta (a performance tweak). - * - * If the maximum bitlength of g is known to be less than 256, then eta can be set - * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total - * divsteps are needed. */ - eta = -1; + int32_t eta = -1; + /* Do 25 iterations of 30 divsteps each = 750 divsteps. 724 suffices for 256-bit inputs. */ for (i = 0; i < 25; ++i) { + /* Compute transition matrix and new eta after 30 divsteps. */ secp256k1_modinv32_trans2x2 t; eta = secp256k1_modinv32_divsteps_30(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ secp256k1_modinv32_update_fg_30(&f, &g, &t); } @@ -317,38 +349,39 @@ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_m * values i.e. +/- 1, and d now contains +/- the modular inverse. */ VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4] | g.v[5] | g.v[6] | g.v[7] | g.v[8]) == 0); - secp256k1_modinv32_normalize_30(&d, f.v[8] >> 31, modinfo); - + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo); *x = d; } +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { - /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. 
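+     * Throughout, the invariants x*d == f (mod modulus) and x*e == g (mod modulus) hold
+     * (see section 2 of the explanation), so once f reaches +/-1, d holds +/- the
+     * modular inverse of x.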
*/ secp256k1_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}}; secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}}; secp256k1_modinv32_signed30 f = modinfo->modulus; secp256k1_modinv32_signed30 g = *x; int j; - int32_t eta; + int32_t eta = -1; int32_t cond; - /* The paper uses 'delta'; eta == -delta (a performance tweak). - * - * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to - * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ - eta = -1; - + /* Do iterations of 30 divsteps each until g=0. */ while (1) { + /* Compute transition matrix and new eta after 30 divsteps. */ secp256k1_modinv32_trans2x2 t; eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ secp256k1_modinv32_update_fg_30(&f, &g, &t); + /* If the bottom limb of g is 0, there is a chance g=0. */ if (g.v[0] == 0) { cond = 0; + /* Check if the other limbs are also 0. */ for (j = 1; j < 9; ++j) { cond |= g.v[j]; } + /* If so, we're done. */ if (cond == 0) break; } } @@ -356,8 +389,8 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ - secp256k1_modinv32_normalize_30(&d, f.v[8] >> 31, modinfo); - + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo); *x = d; } diff --git a/src/modinv64.h b/src/modinv64.h index e70fea0d6..da506dfa9 100644 --- a/src/modinv64.h +++ b/src/modinv64.h @@ -17,19 +17,30 @@ #error "modinv64 requires 128-bit wide multiplication support" #endif +/* A signed 62-bit limb representation of integers. + * + * Its value is sum(v[i] * 2^(62*i), i=0..4). */ typedef struct { int64_t v[5]; } secp256k1_modinv64_signed62; typedef struct { - /* The modulus in signed62 notation. */ + /* The modulus in signed62 notation, must be odd and in [3, 2^256]. */ secp256k1_modinv64_signed62 modulus; /* modulus^{-1} mod 2^62 */ uint64_t modulus_inv62; } secp256k1_modinv64_modinfo; -static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); +/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus). + * If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of + * x and modulus must be 1). These rules are automatically satisfied if the modulus is prime. + * + * On output, all of x's limbs will be in [0, 2^62). + */ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); +/* Same as secp256k1_modinv64_var, but constant time in x (not in the modulus). */ +static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); + #endif /* SECP256K1_MODINV64_H */ diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index 4d9105571..91eaf05c4 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -11,40 +11,54 @@ #include "util.h" +/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + * + * For an explanation of the algorithm, see doc/safegcd_implementation.md. 
This file contains an + * implementation for N=62, using 62-bit signed limbs represented as int64_t. + */ + +/* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus + * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the + * process. The input must have limbs in range (-2^62,2^62). The output will have limbs in range + * [0,2^62). */ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int64_t sign, const secp256k1_modinv64_modinfo *modinfo) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4]; int64_t cond_add, cond_negate; + /* In a first step, add the modulus if the input is negative, and then negate if requested. + * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input + * limbs are in range (-2^62,2^62), this cannot overflow an int64_t. Note that the right + * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is + * indeed the behavior of the right shift operator). */ cond_add = r4 >> 63; - r0 += modinfo->modulus.v[0] & cond_add; r1 += modinfo->modulus.v[1] & cond_add; r2 += modinfo->modulus.v[2] & cond_add; r3 += modinfo->modulus.v[3] & cond_add; r4 += modinfo->modulus.v[4] & cond_add; - cond_negate = sign >> 63; - r0 = (r0 ^ cond_negate) - cond_negate; r1 = (r1 ^ cond_negate) - cond_negate; r2 = (r2 ^ cond_negate) - cond_negate; r3 = (r3 ^ cond_negate) - cond_negate; r4 = (r4 ^ cond_negate) - cond_negate; - + /* Propagate the top bits, to bring limbs back to range (-2^62,2^62). */ r1 += r0 >> 62; r0 &= M62; r2 += r1 >> 62; r1 &= M62; r3 += r2 >> 62; r2 &= M62; r4 += r3 >> 62; r3 &= M62; + /* In a second step add the modulus again if the result is still negative, bringing + * r to range [0,modulus). */ cond_add = r4 >> 63; - r0 += modinfo->modulus.v[0] & cond_add; r1 += modinfo->modulus.v[1] & cond_add; r2 += modinfo->modulus.v[2] & cond_add; r3 += modinfo->modulus.v[3] & cond_add; r4 += modinfo->modulus.v[4] & cond_add; - + /* And propagate again. */ r1 += r0 >> 62; r0 &= M62; r2 += r1 >> 62; r1 &= M62; r3 += r2 >> 62; r2 &= M62; @@ -57,53 +71,82 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6 r->v[4] = r4; } +/* Data type for transition matrices (see section 3 of explanation). + * + * t = [ u v ] + * [ q r ] + */ typedef struct { int64_t u, v, q, r; } secp256k1_modinv64_trans2x2; +/* Compute the transition matrix and eta for 62 divsteps. + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix function from the explanation. + */ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { - + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. Semantically they are signed integers + * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^63,2^63) means that casting to signed works correctly. 
+ */ uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t c1, c2, f = f0, g = g0, x, y, z; int i; for (i = 0; i < 62; ++i) { - - VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - + /* Compute conditional masks for (eta < 0) and for (g & 1). */ c1 = eta >> 63; c2 = -(g & 1); - + /* Compute x,y,z, conditionally negated versions of f,u,v. */ x = (f ^ c1) - c1; y = (u ^ c1) - c1; z = (v ^ c1) - c1; - + /* Conditionally add x,y,z to g,q,r. */ g += x & c2; q += y & c2; r += z & c2; - + /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ c1 &= c2; + /* Conditionally negate eta, and unconditionally subtract 1. */ eta = (eta ^ c1) - (c1 + 1); - + /* Conditionally add g,q,r to f,u,v. */ f += g & c1; u += q & c1; v += r & c1; - + /* Shifts */ g >>= 1; u <<= 1; v <<= 1; } - + /* Return data in t and return value. */ t->u = (int64_t)u; t->v = (int64_t)v; t->q = (int64_t)q; t->r = (int64_t)r; - return eta; } +/* Compute the transition matrix and eta for 62 divsteps (variable time). + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix_var function from the explanation. + */ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { /* inv256[i] = -(2*i+1)^-1 (mod 256) */ static const uint8_t inv256[128] = { @@ -120,6 +163,7 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 }; + /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */ uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m; uint32_t w; @@ -128,22 +172,19 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint for (;;) { /* Use a sentinel bit to count zeros only up to i. */ zeros = secp256k1_ctz64_var(g | (UINT64_MAX << i)); - + /* Perform zeros divsteps at once; they all just divide g by two. */ g >>= zeros; u <<= zeros; v <<= zeros; eta -= zeros; i -= zeros; - - if (i <= 0) { - break; - } - + /* We're done once we've done 62 divsteps. */ + if (i == 0) break; VERIFY_CHECK((f & 1) == 1); VERIFY_CHECK((g & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); - + /* If eta is negative, negate it and replace f,g with g,-f. */ if (eta < 0) { uint64_t tmp; eta = -eta; @@ -151,28 +192,35 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint tmp = u; u = q; q = -tmp; tmp = v; v = r; r = -tmp; } - - /* Handle up to 8 divsteps at once, subject to eta and i. */ + /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more + * than i can be cancelled out (as we'd be done before that point), and no more than eta+1 + * can be done as its sign will flip once that happens. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */ m = (UINT64_MAX >> (64 - limit)) & 255U; - + /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ w = (g * inv256[(f >> 1) & 127]) & m; - + /* Do so. */ g += f * w; q += u * w; r += v * w; - VERIFY_CHECK((g & m) == 0); } - + /* Return data in t and return value. 
*/ t->u = (int64_t)u; t->v = (int64_t)v; t->q = (int64_t)q; t->r = (int64_t)r; - return eta; } +/* Compute (t/2^62) * [d, e] mod modulus, where t is a transition matrix for 62 divsteps. + * + * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range + * (-2^62,2^62). + * + * This implements the update_de function from the explanation. + */ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp256k1_modinv64_signed62 *e, const secp256k1_modinv64_trans2x2 *t, const secp256k1_modinv64_modinfo* modinfo) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t d0 = d->v[0], d1 = d->v[1], d2 = d->v[2], d3 = d->v[3], d4 = d->v[4]; @@ -180,140 +228,115 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp const int64_t u = t->u, v = t->v, q = t->q, r = t->r; int64_t md, me, sd, se; int128_t cd, ce; - - /* - * On input, d/e must be in the range (-2.P, P). For initially negative d (resp. e), we add - * u and/or v (resp. q and/or r) multiples of the modulus to the corresponding output (prior - * to division by 2^62). This has the same effect as if we added the modulus to the input(s). - */ - + /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ sd = d4 >> 63; se = e4 >> 63; - md = (u & sd) + (v & se); me = (q & sd) + (r & se); - + /* Begin computing t*[d,e]. */ cd = (int128_t)u * d0 + (int128_t)v * e0; ce = (int128_t)q * d0 + (int128_t)r * e0; - - /* - * Subtract from md/me an extra term in the range [0, 2^62) such that the low 62 bits of each - * sum of products will be 0. This allows clean division by 2^62. On output, d/e are thus in - * the range (-2.P, P), consistent with the input constraint. - */ - + /* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */ md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62; me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62; - + /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */ cd += (int128_t)modinfo->modulus.v[0] * md; ce += (int128_t)modinfo->modulus.v[0] * me; - + /* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */ VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; - + /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */ cd += (int128_t)u * d1 + (int128_t)v * e1; ce += (int128_t)q * d1 + (int128_t)r * e1; - cd += (int128_t)modinfo->modulus.v[1] * md; ce += (int128_t)modinfo->modulus.v[1] * me; - d->v[0] = (int64_t)cd & M62; cd >>= 62; e->v[0] = (int64_t)ce & M62; ce >>= 62; - + /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */ cd += (int128_t)u * d2 + (int128_t)v * e2; ce += (int128_t)q * d2 + (int128_t)r * e2; - cd += (int128_t)modinfo->modulus.v[2] * md; ce += (int128_t)modinfo->modulus.v[2] * me; - d->v[1] = (int64_t)cd & M62; cd >>= 62; e->v[1] = (int64_t)ce & M62; ce >>= 62; - + /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */ cd += (int128_t)u * d3 + (int128_t)v * e3; ce += (int128_t)q * d3 + (int128_t)r * e3; - cd += (int128_t)modinfo->modulus.v[3] * md; ce += (int128_t)modinfo->modulus.v[3] * me; - d->v[2] = (int64_t)cd & M62; cd >>= 62; e->v[2] = (int64_t)ce & M62; ce >>= 62; - + /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. 
*/ cd += (int128_t)u * d4 + (int128_t)v * e4; ce += (int128_t)q * d4 + (int128_t)r * e4; - cd += (int128_t)modinfo->modulus.v[4] * md; ce += (int128_t)modinfo->modulus.v[4] * me; - d->v[3] = (int64_t)cd & M62; cd >>= 62; e->v[3] = (int64_t)ce & M62; ce >>= 62; - + /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */ d->v[4] = (int64_t)cd; e->v[4] = (int64_t)ce; } +/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps. + * + * This implements the update_fg function from the explanation. + */ static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) { const int64_t M62 = (int64_t)(UINT64_MAX >> 2); const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4]; const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4]; const int64_t u = t->u, v = t->v, q = t->q, r = t->r; int128_t cf, cg; - + /* Start computing t*[f,g]. */ cf = (int128_t)u * f0 + (int128_t)v * g0; cg = (int128_t)q * f0 + (int128_t)r * g0; - + /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; - + /* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */ cf += (int128_t)u * f1 + (int128_t)v * g1; cg += (int128_t)q * f1 + (int128_t)r * g1; - f->v[0] = (int64_t)cf & M62; cf >>= 62; g->v[0] = (int64_t)cg & M62; cg >>= 62; - + /* Compute limb 2 of t*[f,g], and store it as output limb 1. */ cf += (int128_t)u * f2 + (int128_t)v * g2; cg += (int128_t)q * f2 + (int128_t)r * g2; - f->v[1] = (int64_t)cf & M62; cf >>= 62; g->v[1] = (int64_t)cg & M62; cg >>= 62; - + /* Compute limb 3 of t*[f,g], and store it as output limb 2. */ cf += (int128_t)u * f3 + (int128_t)v * g3; cg += (int128_t)q * f3 + (int128_t)r * g3; - f->v[2] = (int64_t)cf & M62; cf >>= 62; g->v[2] = (int64_t)cg & M62; cg >>= 62; - + /* Compute limb 4 of t*[f,g], and store it as output limb 3. */ cf += (int128_t)u * f4 + (int128_t)v * g4; cg += (int128_t)q * f4 + (int128_t)r * g4; - f->v[3] = (int64_t)cf & M62; cf >>= 62; g->v[3] = (int64_t)cg & M62; cg >>= 62; - + /* What remains is limb 5 of t*[f,g]; store it as output limb 4. */ f->v[4] = (int64_t)cf; g->v[4] = (int64_t)cg; } +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { - /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; int i; - int64_t eta; - - /* The paper uses 'delta'; eta == -delta (a performance tweak). - * - * If the maximum bitlength of g is known to be less than 256, then eta can be set - * initially to -(1 + (256 - maxlen(g))), and only (741 - (256 - maxlen(g))) total - * divsteps are needed. */ - eta = -1; + int64_t eta = -1; + /* Do 12 iterations of 62 divsteps each = 744 divsteps. 724 suffices for 256-bit inputs. */ for (i = 0; i < 12; ++i) { + /* Compute transition matrix and new eta after 62 divsteps. 
*/ secp256k1_modinv64_trans2x2 t; eta = secp256k1_modinv64_divsteps_62(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ secp256k1_modinv64_update_fg_62(&f, &g, &t); } @@ -322,45 +345,48 @@ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_m * values i.e. +/- 1, and d now contains +/- the modular inverse. */ VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4]) == 0); + /* Optionally negate d, normalize to [0,modulus), and return it. */ secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); - *x = d; } +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { - /* Modular inversion based on the paper "Fast constant-time gcd computation and - * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. */ - + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; int j; - uint64_t eta; + int64_t eta = -1; int64_t cond; - /* The paper uses 'delta'; eta == -delta (a performance tweak). - * - * If g has leading zeros (w.r.t 256 bits), then eta can be set initially to - * -(1 + clz(g)), and the worst-case divstep count would be only (741 - clz(g)). */ - eta = -1; - + /* Do iterations of 62 divsteps each until g=0. */ while (1) { + /* Compute transition matrix and new eta after 62 divsteps. */ secp256k1_modinv64_trans2x2 t; eta = secp256k1_modinv64_divsteps_62_var(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ secp256k1_modinv64_update_fg_62(&f, &g, &t); + /* If the bottom limb of g is zero, there is a chance that g=0. */ if (g.v[0] == 0) { cond = 0; + /* Check if the other limbs are also 0. */ for (j = 1; j < 5; ++j) { cond |= g.v[j]; } + /* If so, we're done. */ if (cond == 0) break; } } - secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); *x = d; } From 151aac00d31ba5e94800376f6fda4193071168af Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Tue, 22 Dec 2020 18:24:36 -0800 Subject: [PATCH 06/59] Add tests for modinv modules This adds tests for the modinv{32,64}_impl.h directly (before the functions are used inside the field/scalar code). It uses a naive implementation of modular multiplication and gcds in order to verify the modular inverses themselves. 
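
Roughly, the property exercised for random odd moduli, in Python terms (a sketch only;
the helper names here are illustrative, not the C functions):

```
from math import gcd
from random import randrange

def check_modinv(modinv, m):
    """modinv(x, m) should invert x mod m (odd m > 1); modinv(0, m) == 0."""
    x = randrange(1, m)
    while gcd(x, m) != 1:
        x = randrange(1, m)
    i = modinv(x, m)
    assert (i * x) % m == 1
    assert modinv(i, m) == x   # inverting twice returns the input
```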
---
 src/tests.c | 444 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 444 insertions(+)

diff --git a/src/tests.c b/src/tests.c
index ab981b5a7..32d9340f0 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -18,6 +18,7 @@
 #include "include/secp256k1.h"
 #include "include/secp256k1_preallocated.h"
 #include "testrand_impl.h"
+#include "util.h"
 
 #ifdef ENABLE_OPENSSL_TESTS
 #include "openssl/bn.h"
@@ -32,6 +33,11 @@ void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **pr, const BIGNUM **ps)
 #include "contrib/lax_der_parsing.c"
 #include "contrib/lax_der_privatekey_parsing.c"
 
+#include "modinv32_impl.h"
+#ifdef SECP256K1_WIDEMUL_INT128
+#include "modinv64_impl.h"
+#endif
+
 static int count = 64;
 static secp256k1_context *ctx = NULL;
 
@@ -816,8 +822,444 @@ void run_num_smalltests(void) {
 }
 #endif
 
+/***** MODINV TESTS *****/
+
+/* Compute the modular inverse of (odd) x mod 2^64. */
+uint64_t modinv2p64(uint64_t x) {
+    /* If w = 1/x mod 2^(2^L), then w*(2 - w*x) = 1/x mod 2^(2^(L+1)). See
+     * Hacker's Delight second edition, Henry S. Warren, Jr., pages 245-247 for
+     * why. Start with L=0, for which it is true for every odd x that
+     * 1/x=1 mod 2. Iterating 6 times gives us 1/x mod 2^64. */
+    int l;
+    uint64_t w = 1;
+    CHECK(x & 1);
+    for (l = 0; l < 6; ++l) w *= (2 - w*x);
+    return w;
+}
+
+/* compute out = (a*b) mod m; if b=NULL, treat b=1.
+ *
+ * All arguments are 256-bit numbers (represented as 16 uint16_t's in LE order);
+ * the reduction guarantees the result fits in 256 bits. */
+void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16_t* m) {
+    uint16_t mul[32];
+    uint64_t c = 0;
+    int i, j;
+    int m_bitlen = 0;
+    int mul_bitlen = 0;
+
+    if (b != NULL) {
+        /* Compute the product of a and b, and put it in mul. */
+        for (i = 0; i < 32; ++i) {
+            for (j = i <= 15 ? 0 : i - 15; j <= i && j <= 15; j++) {
+                c += (uint64_t)a[j] * b[i - j];
+            }
+            mul[i] = c & 0xFFFF;
+            c >>= 16;
+        }
+        CHECK(c == 0);
+
+        /* compute the highest set bit in mul */
+        for (i = 511; i >= 0; --i) {
+            if ((mul[i >> 4] >> (i & 15)) & 1) {
+                mul_bitlen = i;
+                break;
+            }
+        }
+    } else {
+        /* if b==NULL, set mul=a. */
+        memcpy(mul, a, 32);
+        memset(mul + 16, 0, 32);
+        /* compute the highest set bit in mul */
+        for (i = 255; i >= 0; --i) {
+            if ((mul[i >> 4] >> (i & 15)) & 1) {
+                mul_bitlen = i;
+                break;
+            }
+        }
+    }
+
+    /* Compute the highest set bit in m. */
+    for (i = 255; i >= 0; --i) {
+        if ((m[i >> 4] >> (i & 15)) & 1) {
+            m_bitlen = i;
+            break;
+        }
+    }
+
+    /* Try to do mul -= m<<i, for i going down to 0, whenever the result is not negative. */
+    for (i = mul_bitlen - m_bitlen; i >= 0; --i) {
+        uint16_t mul2[32];
+        int64_t cs;
+
+        /* Compute mul2 = mul - m<<i. */
+        cs = 0; /* accumulator */
+        for (j = 0; j < 32; ++j) { /* j loops over the output limbs in mul2. */
+            /* Compute sub: the 16 bits in m that will be subtracted from mul2[j]. */
+            uint16_t sub = 0;
+            int p;
+            for (p = 0; p < 16; ++p) { /* p loops over the bit positions in mul2[j]. */
+                int bitpos = j * 16 - i + p; /* bitpos is the corresponding bit position in m. */
+                if (bitpos >= 0 && bitpos < 256) {
+                    sub |= ((m[bitpos >> 4] >> (bitpos & 15)) & 1) << p;
+                }
+            }
+            /* Add mul[j]-sub to accumulator, and shift bottom 16 bits out to mul2[j]. */
+            cs += mul[j];
+            cs -= sub;
+            mul2[j] = (cs & 0xFFFF);
+            cs >>= 16;
+        }
+        /* If remainder of subtraction is 0, set mul = mul2. */
+        if (cs == 0) {
+            memcpy(mul, mul2, sizeof(mul));
+        }
+    }
+    /* Sanity check: test that all limbs higher than m's highest are zero */
+    for (i = (m_bitlen >> 4) + 1; i < 32; ++i) {
+        CHECK(mul[i] == 0);
+    }
+    memcpy(out, mul, 32);
+}
+
+/* Convert a 256-bit number represented as 16 uint16_t's to signed30 notation.
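+ * (9 limbs of 30 bits hold 270 bits, so any 256-bit value fits.)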
+ */
+void uint16_to_signed30(secp256k1_modinv32_signed30* out, const uint16_t* in) {
+    int i;
+    memset(out->v, 0, sizeof(out->v));
+    for (i = 0; i < 256; ++i) {
+        out->v[i / 30] |= (int32_t)(((in[i >> 4]) >> (i & 15)) & 1) << (i % 30);
+    }
+}
+
+/* Convert a 256-bit number in signed30 notation to a representation as 16 uint16_t's. */
+void signed30_to_uint16(uint16_t* out, const secp256k1_modinv32_signed30* in) {
+    int i;
+    memset(out, 0, 32);
+    for (i = 0; i < 256; ++i) {
+        out[i >> 4] |= (((in->v[i / 30]) >> (i % 30)) & 1) << (i & 15);
+    }
+}
+
+/* Randomly mutate the sign of limbs in signed30 representation, without changing the value. */
+void mutate_sign_signed30(secp256k1_modinv32_signed30* x) {
+    int i;
+    for (i = 0; i < 16; ++i) {
+        int pos = secp256k1_testrand_int(8);
+        if (x->v[pos] > 0 && x->v[pos + 1] <= 0x3fffffff) {
+            x->v[pos] -= 0x40000000;
+            x->v[pos + 1] += 1;
+        } else if (x->v[pos] < 0 && x->v[pos + 1] >= -0x3fffffff) {
+            x->v[pos] += 0x40000000;
+            x->v[pos + 1] -= 1;
+        }
+    }
+}
+
+/* Test secp256k1_modinv32{_var}, using inputs in 16-bit limb format, and returning inverse. */
+void test_modinv32_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod) {
+    uint16_t tmp[16];
+    secp256k1_modinv32_signed30 x;
+    secp256k1_modinv32_modinfo m;
+    int i, vartime, nonzero;
+
+    uint16_to_signed30(&x, in);
+    nonzero = (x.v[0] | x.v[1] | x.v[2] | x.v[3] | x.v[4] | x.v[5] | x.v[6] | x.v[7] | x.v[8]) != 0;
+    uint16_to_signed30(&m.modulus, mod);
+    mutate_sign_signed30(&m.modulus);
+
+    /* compute 1/modulus mod 2^30 */
+    m.modulus_inv30 = modinv2p64(m.modulus.v[0]) & 0x3fffffff;
+    CHECK(((m.modulus_inv30 * m.modulus.v[0]) & 0x3fffffff) == 1);
+
+    for (vartime = 0; vartime < 2; ++vartime) {
+        /* compute inverse */
+        (vartime ? secp256k1_modinv32_var : secp256k1_modinv32)(&x, &m);
+
+        /* produce output */
+        signed30_to_uint16(out, &x);
+
+        /* check if the inverse times the input is 1 (mod m), unless x is 0. */
+        mulmod256(tmp, out, in, mod);
+        CHECK(tmp[0] == nonzero);
+        for (i = 1; i < 16; ++i) CHECK(tmp[i] == 0);
+
+        /* invert again */
+        (vartime ? secp256k1_modinv32_var : secp256k1_modinv32)(&x, &m);
+
+        /* check if the result is equal to the input */
+        signed30_to_uint16(tmp, &x);
+        for (i = 0; i < 16; ++i) CHECK(tmp[i] == in[i]);
+    }
+}
+
+#ifdef SECP256K1_WIDEMUL_INT128
+/* Convert a 256-bit number represented as 16 uint16_t's to signed62 notation. */
+void uint16_to_signed62(secp256k1_modinv64_signed62* out, const uint16_t* in) {
+    int i;
+    memset(out->v, 0, sizeof(out->v));
+    for (i = 0; i < 256; ++i) {
+        out->v[i / 62] |= (int64_t)(((in[i >> 4]) >> (i & 15)) & 1) << (i % 62);
+    }
+}
+
+/* Convert a 256-bit number in signed62 notation to a representation as 16 uint16_t's. */
+void signed62_to_uint16(uint16_t* out, const secp256k1_modinv64_signed62* in) {
+    int i;
+    memset(out, 0, 32);
+    for (i = 0; i < 256; ++i) {
+        out[i >> 4] |= (((in->v[i / 62]) >> (i % 62)) & 1) << (i & 15);
+    }
+}
+
+/* Randomly mutate the sign of limbs in signed62 representation, without changing the value. */
+void mutate_sign_signed62(secp256k1_modinv64_signed62* x) {
+    static const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
+    int i;
+    for (i = 0; i < 8; ++i) {
+        int pos = secp256k1_testrand_int(4);
+        if (x->v[pos] > 0 && x->v[pos + 1] <= M62) {
+            x->v[pos] -= (M62 + 1);
+            x->v[pos + 1] += 1;
+        } else if (x->v[pos] < 0 && x->v[pos + 1] >= -M62) {
+            x->v[pos] += (M62 + 1);
+            x->v[pos + 1] -= 1;
+        }
+    }
+}
+
+/* Test secp256k1_modinv64{_var}, using inputs in 16-bit limb format, and returning inverse.
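+ * (Same checks as test_modinv32_uint16 above: multiply back with mulmod256, and invert a
+ * second time to compare with the input.)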
*/ +void test_modinv64_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod) { + static const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + uint16_t tmp[16]; + secp256k1_modinv64_signed62 x; + secp256k1_modinv64_modinfo m; + int i, vartime, nonzero; + + uint16_to_signed62(&x, in); + nonzero = (x.v[0] | x.v[1] | x.v[2] | x.v[3] | x.v[4]) != 0; + uint16_to_signed62(&m.modulus, mod); + mutate_sign_signed62(&m.modulus); + + /* compute 1/modulus mod 2^62 */ + m.modulus_inv62 = modinv2p64(m.modulus.v[0]) & M62; + CHECK(((m.modulus_inv62 * m.modulus.v[0]) & M62) == 1); + + for (vartime = 0; vartime < 2; ++vartime) { + /* compute inverse */ + (vartime ? secp256k1_modinv64_var : secp256k1_modinv64)(&x, &m); + + /* produce output */ + signed62_to_uint16(out, &x); + + /* check if the inverse times the input is 1 (mod m), unless x is 0. */ + mulmod256(tmp, out, in, mod); + CHECK(tmp[0] == nonzero); + for (i = 1; i < 16; ++i) CHECK(tmp[i] == 0); + + /* invert again */ + (vartime ? secp256k1_modinv64_var : secp256k1_modinv64)(&x, &m); + + /* check if the result is equal to the input */ + signed62_to_uint16(tmp, &x); + for (i = 0; i < 16; ++i) CHECK(tmp[i] == in[i]); + } +} +#endif + +/* test if a and b are coprime */ +int coprime(const uint16_t* a, const uint16_t* b) { + uint16_t x[16], y[16], t[16]; + int i; + int iszero; + memcpy(x, a, 32); + memcpy(y, b, 32); + + /* simple gcd loop: while x!=0, (x,y)=(y%x,x) */ + while (1) { + iszero = 1; + for (i = 0; i < 16; ++i) { + if (x[i] != 0) { + iszero = 0; + break; + } + } + if (iszero) break; + mulmod256(t, y, NULL, x); + memcpy(y, x, 32); + memcpy(x, t, 32); + } + + /* return whether y=1 */ + if (y[0] != 1) return 0; + for (i = 1; i < 16; ++i) { + if (y[i] != 0) return 0; + } + return 1; +} + +void run_modinv_tests(void) { + /* Fixed test cases. Each tuple is (input, modulus, output), each as 16x16 bits in LE order. */ + static const uint16_t CASES[][3][16] = { + /* Test case known to need 713 divsteps */ + {{0x1513, 0x5389, 0x54e9, 0x2798, 0x1957, 0x66a0, 0x8057, 0x3477, + 0x7784, 0x1052, 0x326a, 0x9331, 0x6506, 0xa95c, 0x91f3, 0xfb5e}, + {0x2bdd, 0x8df4, 0xcc61, 0x481f, 0xdae5, 0x5ca7, 0xf43b, 0x7d54, + 0x13d6, 0x469b, 0x2294, 0x20f4, 0xb2a4, 0xa2d1, 0x3ff1, 0xfd4b}, + {0xffd8, 0xd9a0, 0x456e, 0x81bb, 0xbabd, 0x6cea, 0x6dbd, 0x73ab, + 0xbb94, 0x3d3c, 0xdf08, 0x31c4, 0x3e32, 0xc179, 0x2486, 0xb86b}}, + /* Test case known to need 589 divsteps, reaching delta=-140 and + delta=141. */ + {{0x3fb1, 0x903b, 0x4eb7, 0x4813, 0xd863, 0x26bf, 0xd89f, 0xa8a9, + 0x02fe, 0x57c6, 0x554a, 0x4eab, 0x165e, 0x3d61, 0xee1e, 0x456c}, + {0x9295, 0x823b, 0x5c1f, 0x5386, 0x48e0, 0x02ff, 0x4c2a, 0xa2da, + 0xe58f, 0x967c, 0xc97e, 0x3f5a, 0x69fb, 0x52d9, 0x0a86, 0xb4a3}, + {0x3d30, 0xb893, 0xa809, 0xa7a8, 0x26f5, 0x5b42, 0x55be, 0xf4d0, + 0x12c2, 0x7e6a, 0xe41a, 0x90c7, 0xebfa, 0xf920, 0x304e, 0x1419}}, + /* Test case known to need 650 divsteps, and doing 65 consecutive (f,g/2) steps. */ + {{0x8583, 0x5058, 0xbeae, 0xeb69, 0x48bc, 0x52bb, 0x6a9d, 0xcc94, + 0x2a21, 0x87d5, 0x5b0d, 0x42f6, 0x5b8a, 0x2214, 0xe9d6, 0xa040}, + {0x7531, 0x27cb, 0x7e53, 0xb739, 0x6a5f, 0x83f5, 0xa45c, 0xcb1d, + 0x8a87, 0x1c9c, 0x51d7, 0x851c, 0xb9d8, 0x1fbe, 0xc241, 0xd4a3}, + {0xcdb4, 0x275c, 0x7d22, 0xa906, 0x0173, 0xc054, 0x7fdf, 0x5005, + 0x7fb8, 0x9059, 0xdf51, 0x99df, 0x2654, 0x8f6e, 0x070f, 0xb347}}, + /* Test case with the group order as modulus, needing 635 divsteps. 
*/ + {{0x95ed, 0x6c01, 0xd113, 0x5ff1, 0xd7d0, 0x29cc, 0x5817, 0x6120, + 0xca8e, 0xaad1, 0x25ae, 0x8e84, 0x9af6, 0x30bf, 0xf0ed, 0x1686}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x1631, 0xbf4a, 0x286a, 0x2716, 0x469f, 0x2ac8, 0x1312, 0xe9bc, + 0x04f4, 0x304b, 0x9931, 0x113b, 0xd932, 0xc8f4, 0x0d0d, 0x01a1}}, + /* Test case with the field size as modulus, needing 637 divsteps. */ + {{0x9ec3, 0x1919, 0xca84, 0x7c11, 0xf996, 0x06f3, 0x5408, 0x6688, + 0x1320, 0xdb8a, 0x632a, 0x0dcb, 0x8a84, 0x6bee, 0x9c95, 0xe34e}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x18e5, 0x19b6, 0xdf92, 0x1aaa, 0x09fb, 0x8a3f, 0x52b0, 0x8701, + 0xac0c, 0x2582, 0xda44, 0x9bcc, 0x6828, 0x1c53, 0xbd8f, 0xbd2c}}, + /* Test case with the field size as modulus, needing 935 divsteps with + broken eta handling. */ + {{0x1b37, 0xbdc3, 0x8bcd, 0x25e3, 0x1eae, 0x567d, 0x30b6, 0xf0d8, + 0x9277, 0x0cf8, 0x9c2e, 0xecd7, 0x631d, 0xe38f, 0xd4f8, 0x5c93}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x1622, 0xe05b, 0xe880, 0x7de9, 0x3e45, 0xb682, 0xee6c, 0x67ed, + 0xa179, 0x15db, 0x6b0d, 0xa656, 0x7ccb, 0x8ef7, 0xa2ff, 0xe279}}, + /* Test case with the group size as modulus, needing 981 divsteps with + broken eta handling. */ + {{0xfeb9, 0xb877, 0xee41, 0x7fa3, 0x87da, 0x94c4, 0x9d04, 0xc5ae, + 0x5708, 0x0994, 0xfc79, 0x0916, 0xbf32, 0x3ad8, 0xe11c, 0x5ca2}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0f12, 0x075e, 0xce1c, 0x6f92, 0xc80f, 0xca92, 0x9a04, 0x6126, + 0x4b6c, 0x57d6, 0xca31, 0x97f3, 0x1f99, 0xf4fd, 0xda4d, 0x42ce}}, + /* Test case with the field size as modulus, input = 0. */ + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the field size as modulus, input = 1. */ + {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the field size as modulus, input = 2. */ + {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xfe18, 0x7fff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}}, + /* Test case with the field size as modulus, input = field - 1. 
 */
+        {{0xfc2e, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0xfc2e, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}},
+        /* Test case with the group size as modulus, input = 0. */
+        {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+          0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+         {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+          0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}},
+        /* Test case with the group size as modulus, input = 1. */
+        {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+          0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+         {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+          0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}},
+        /* Test case with the group size as modulus, input = 2. */
+        {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+          0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+         {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0x20a1, 0x681b, 0x2f46, 0xdfe9, 0x501d, 0x57a4, 0x6e73, 0x5d57,
+          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}},
+        /* Test case with the group size as modulus, input = group - 1. */
+        {{0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff},
+         {0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae,
+          0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}
+    };
+
+    int i, j, ok;
+
+    /* Test known inputs/outputs */
+    for (i = 0; (size_t)i < sizeof(CASES) / sizeof(CASES[0]); ++i) {
+        uint16_t out[16];
+        test_modinv32_uint16(out, CASES[i][0], CASES[i][1]);
+        for (j = 0; j < 16; ++j) CHECK(out[j] == CASES[i][2][j]);
+#ifdef SECP256K1_WIDEMUL_INT128
+        test_modinv64_uint16(out, CASES[i][0], CASES[i][1]);
+        for (j = 0; j < 16; ++j) CHECK(out[j] == CASES[i][2][j]);
+#endif
+    }
+
+    for (i = 0; i < 100 * count; ++i) {
+        /* 256-bit numbers in 16-uint16_t's notation */
+        static const uint16_t ZERO[16] = {0};
+        uint16_t xd[16]; /* the number (in range [0,2^256)) to be inverted */
+        uint16_t md[16]; /* the modulus (odd, in range [3,2^256)) */
+        uint16_t id[16]; /* the inverse of xd mod md */
+
+        /* generate random xd and md, so that md is odd, md>1, xd<md, and gcd(xd,md)=1 */
+        do {
+            secp256k1_testrand256((unsigned char*)xd);
+            secp256k1_testrand256((unsigned char*)md);
+            md[0] |= 1; /* modulus must be odd */
+            /* if the modulus is 1, find another one */
+            ok = md[0] != 1;
+            for (j = 1; j < 16; ++j) ok |= md[j] != 0;
+            mulmod256(xd, xd, NULL, md); /* make xd = xd mod md */
+        } while (!(ok && coprime(xd, md)));
+
+        /* test the inverse, using both the 32-bit and (if available) 64-bit code paths */
+        test_modinv32_uint16(id, xd, md);
+#ifdef SECP256K1_WIDEMUL_INT128
+        test_modinv64_uint16(id, xd, md);
+#endif
+
+        /* the inverse of 0 mod md must be 0 */
+        test_modinv32_uint16(id, ZERO, md);
+        for (j = 0; j < 16; ++j) CHECK(id[j] == 0);
+#ifdef SECP256K1_WIDEMUL_INT128
+        test_modinv64_uint16(id, ZERO, md);
+        for (j = 0; j < 16; ++j) CHECK(id[j] == 0);
+#endif
+    }
+}
+

From: Pieter Wuille
Date: Wed, 23 Dec 2020 11:13:57 -0800
Subject: [PATCH 07/59] Improve bounds checks in modinv modules

This commit adds functions to verify and compare numbers in signed{30,62}
notation, and uses that to do more extensive bounds checking on various
variables in the modinv code.
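
In terms of the represented values, the new checks boil down to assertions like
the following Python sketch (names illustrative, not the C helpers):

```
def value_signed30(v):
    """Value of a signed30 number: sum of limbs times powers of 2^30."""
    return sum(limb << (30 * i) for i, limb in enumerate(v))

# e.g. on entry to and exit from update_de:
#   -2 * modulus < value_signed30(d.v) < modulus
```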
---
 src/modinv32_impl.h | 143 +++++++++++++++++++++++++++++++++++++++++-
 src/modinv64_impl.h | 147 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 288 insertions(+), 2 deletions(-)

diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
index 3a6579df6..1da47bd22 100644
--- a/src/modinv32_impl.h
+++ b/src/modinv32_impl.h
@@ -20,6 +20,42 @@
  * implementation for N=30, using 30-bit signed limbs represented as int32_t.
  */

+#ifdef VERIFY
+static const secp256k1_modinv32_signed30 SECP256K1_SIGNED30_ONE = {{1}};
+
+/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^30). */
+static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int32_t factor) {
+    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
+    int64_t c = 0;
+    int i;
+    for (i = 0; i < 8; ++i) {
+        c += (int64_t)a->v[i] * factor;
+        r->v[i] = (int32_t)c & M30; c >>= 30;
+    }
+    c += (int64_t)a->v[8] * factor;
+    VERIFY_CHECK(c == (int32_t)c);
+    r->v[8] = (int32_t)c;
+}
+
+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. */
+static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, const secp256k1_modinv32_signed30 *b, int32_t factor) {
+    int i;
+    secp256k1_modinv32_signed30 am, bm;
+    secp256k1_modinv32_mul_30(&am, a, 1); /* Normalize all but the top limb of a. */
+    secp256k1_modinv32_mul_30(&bm, b, factor);
+    for (i = 0; i < 8; ++i) {
+        /* Verify that all but the top limb of a and b are normalized. */
+        VERIFY_CHECK(am.v[i] >> 30 == 0);
+        VERIFY_CHECK(bm.v[i] >> 30 == 0);
+    }
+    for (i = 8; i >= 0; --i) {
+        if (am.v[i] < bm.v[i]) return -1;
+        if (am.v[i] > bm.v[i]) return 1;
+    }
+    return 0;
+}
+#endif
+
 /* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus
  * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the
  * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range
@@ -30,6 +66,17 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3
         r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8];
     int32_t cond_add, cond_negate;

+#ifdef VERIFY
+    /* Verify that all limbs are in range (-2^30,2^30). */
+    int i;
+    for (i = 0; i < 9; ++i) {
+        VERIFY_CHECK(r->v[i] >= -M30);
+        VERIFY_CHECK(r->v[i] <= M30);
+    }
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, -2) > 0); /* r > -2*modulus */
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 1) < 0); /* r < modulus */
+#endif
+
     /* In a first step, add the modulus if the input is negative, and then negate if requested.
      * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
      * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right
@@ -96,6 +143,20 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3
     r->v[6] = r6;
     r->v[7] = r7;
     r->v[8] = r8;
+
+#ifdef VERIFY
+    VERIFY_CHECK(r0 >> 30 == 0);
+    VERIFY_CHECK(r1 >> 30 == 0);
+    VERIFY_CHECK(r2 >> 30 == 0);
+    VERIFY_CHECK(r3 >> 30 == 0);
+    VERIFY_CHECK(r4 >> 30 == 0);
+    VERIFY_CHECK(r5 >> 30 == 0);
+    VERIFY_CHECK(r6 >> 30 == 0);
+    VERIFY_CHECK(r7 >> 30 == 0);
+    VERIFY_CHECK(r8 >> 30 == 0);
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 0) >= 0); /* r >= 0 */
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 1) < 0); /* r < modulus */
+#endif
 }

 /* Data type for transition matrices (see section 3 of explanation).
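The normalization above avoids data-dependent branches by materializing conditions as all-zero or all-one masks (cond_add and cond_negate are arithmetic right shifts of a sign bit). A minimal sketch of the idiom on a single limb, with hypothetical helper names rather than the library's API:

```
#include <stdint.h>

/* mask must be 0 or -1 (all bits set). */
static int32_t cond_add(int32_t x, int32_t y, int32_t mask) {
    return x + (y & mask);       /* x when mask == 0, x + y when mask == -1 */
}

static int32_t cond_negate(int32_t x, int32_t mask) {
    return (x ^ mask) - mask;    /* x when mask == 0, -x when mask == -1 */
}
```

Applied limb by limb, with carries propagated between limbs, this is what brings r into [0,modulus) without branching on secret data.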
@@ -155,12 +216,19 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g >>= 1; u <<= 1; v <<= 1; + /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ + VERIFY_CHECK(eta >= -751 && eta <= 751); } /* Return data in t and return value. */ t->u = (int32_t)u; t->v = (int32_t)v; t->q = (int32_t)q; t->r = (int32_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 30 of them will have determinant 2^30. */ + VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); return eta; } @@ -211,6 +279,8 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint VERIFY_CHECK((g & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); + /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ + VERIFY_CHECK(eta >= -751 && eta <= 751); /* If eta is negative, negate it and replace f,g with g,-f. */ if (eta < 0) { uint32_t tmp; @@ -224,6 +294,7 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint * can be done as its sign will flip once that happens. */ limit = ((int)eta + 1) > i ? i : ((int)eta + 1); /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */ + VERIFY_CHECK(limit > 0 && limit <= 30); m = (UINT32_MAX >> (32 - limit)) & 255U; /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ w = (g * inv256[(f >> 1) & 127]) & m; @@ -238,6 +309,11 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint t->v = (int32_t)v; t->q = (int32_t)q; t->r = (int32_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 30 of them will have determinant 2^30. */ + VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); return eta; } @@ -254,6 +330,16 @@ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp int32_t di, ei, md, me, sd, se; int64_t cd, ce; int i; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK((labs(u) + labs(v)) >= 0); /* |u|+|v| doesn't overflow */ + VERIFY_CHECK((labs(q) + labs(r)) >= 0); /* |q|+|r| doesn't overflow */ + VERIFY_CHECK((labs(u) + labs(v)) <= M30 + 1); /* |u|+|v| <= 2^30 */ + VERIFY_CHECK((labs(q) + labs(r)) <= M30 + 1); /* |q|+|r| <= 2^30 */ +#endif /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ sd = d->v[8] >> 31; se = e->v[8] >> 31; @@ -288,6 +374,12 @@ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. 
*/ d->v[8] = (int32_t)cd; e->v[8] = (int32_t)ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, 1) < 0); /* e < modulus */ +#endif } /* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. @@ -341,13 +433,35 @@ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_m /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif secp256k1_modinv32_update_fg_30(&f, &g, &t); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4] | g.v[5] | g.v[6] | g.v[7] | g.v[8]) == 0); +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &SECP256K1_SIGNED30_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, -1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, 1) == 0 || + (secp256k1_modinv32_mul_cmp_30(x, &SECP256K1_SIGNED30_ONE, 0) == 0 && + secp256k1_modinv32_mul_cmp_30(&d, &SECP256K1_SIGNED30_ONE, 0) == 0 && + (secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) == 0))); +#endif /* Optionally negate d, normalize to [0,modulus), and return it. */ secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo); @@ -361,6 +475,9 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}}; secp256k1_modinv32_signed30 f = modinfo->modulus; secp256k1_modinv32_signed30 g = *x; +#ifdef VERIFY + int i = 0; +#endif int j; int32_t eta = -1; int32_t cond; @@ -373,6 +490,12 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. 
*/
+#ifdef VERIFY
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */
+#endif
         secp256k1_modinv32_update_fg_30(&f, &g, &t);
         /* If the bottom limb of g is 0, there is a chance g=0. */
         if (g.v[0] == 0) {
@@ -384,10 +507,28 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256
             /* If so, we're done. */
             if (cond == 0) break;
         }
+#ifdef VERIFY
+        VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */
+        VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */
+#endif
     }

     /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of
      * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse.
      */
+#ifdef VERIFY
+    /* g == 0 */
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &SECP256K1_SIGNED30_ONE, 0) == 0);
+    /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, -1) == 0 ||
+                 secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, 1) == 0 ||
+                 (secp256k1_modinv32_mul_cmp_30(x, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
+                  secp256k1_modinv32_mul_cmp_30(&d, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
+                  (secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) == 0 ||
+                   secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) == 0)));
+#endif

     /* Optionally negate d, normalize to [0,modulus), and return it. */
     secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo);
diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
index 91eaf05c4..3ab21cdc0 100644
--- a/src/modinv64_impl.h
+++ b/src/modinv64_impl.h
@@ -18,6 +18,50 @@
  * implementation for N=62, using 62-bit signed limbs represented as int64_t.
  */

+#ifdef VERIFY
+/* Helper function to compute the absolute value of an int64_t.
+ * (we don't use abs/labs/llabs as it depends on the int sizes). */
+static int64_t secp256k1_modinv64_abs(int64_t v) {
+    VERIFY_CHECK(v > INT64_MIN);
+    if (v < 0) return -v;
+    return v;
+}
+
+static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
+
+/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
+static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int64_t factor) {
+    const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
+    int128_t c = 0;
+    int i;
+    for (i = 0; i < 4; ++i) {
+        c += (int128_t)a->v[i] * factor;
+        r->v[i] = (int64_t)c & M62; c >>= 62;
+    }
+    c += (int128_t)a->v[4] * factor;
+    VERIFY_CHECK(c == (int64_t)c);
+    r->v[4] = (int64_t)c;
+}
+
+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. */
+static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, const secp256k1_modinv64_signed62 *b, int64_t factor) {
+    int i;
+    secp256k1_modinv64_signed62 am, bm;
+    secp256k1_modinv64_mul_62(&am, a, 1); /* Normalize all but the top limb of a.
*/ + secp256k1_modinv64_mul_62(&bm, b, factor); + for (i = 0; i < 4; ++i) { + /* Verify that all but the top limb of a and b are normalized. */ + VERIFY_CHECK(am.v[i] >> 62 == 0); + VERIFY_CHECK(bm.v[i] >> 62 == 0); + } + for (i = 4; i >= 0; --i) { + if (am.v[i] < bm.v[i]) return -1; + if (am.v[i] > bm.v[i]) return 1; + } + return 0; +} +#endif + /* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the * process. The input must have limbs in range (-2^62,2^62). The output will have limbs in range @@ -27,6 +71,17 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6 int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4]; int64_t cond_add, cond_negate; +#ifdef VERIFY + /* Verify that all limbs are in range (-2^62,2^62). */ + int i; + for (i = 0; i < 5; ++i) { + VERIFY_CHECK(r->v[i] >= -M62); + VERIFY_CHECK(r->v[i] <= M62); + } + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 1) < 0); /* r < modulus */ +#endif + /* In a first step, add the modulus if the input is negative, and then negate if requested. * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input * limbs are in range (-2^62,2^62), this cannot overflow an int64_t. Note that the right @@ -69,6 +124,16 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6 r->v[2] = r2; r->v[3] = r3; r->v[4] = r4; + +#ifdef VERIFY + VERIFY_CHECK(r0 >> 62 == 0); + VERIFY_CHECK(r1 >> 62 == 0); + VERIFY_CHECK(r2 >> 62 == 0); + VERIFY_CHECK(r3 >> 62 == 0); + VERIFY_CHECK(r4 >> 62 == 0); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 0) >= 0); /* r >= 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 1) < 0); /* r < modulus */ +#endif } /* Data type for transition matrices (see section 3 of explanation). @@ -128,12 +193,19 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g >>= 1; u <<= 1; v <<= 1; + /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */ + VERIFY_CHECK(eta >= -745 && eta <= 745); } /* Return data in t and return value. */ t->u = (int64_t)u; t->v = (int64_t)v; t->q = (int64_t)q; t->r = (int64_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 62 of them will have determinant 2^62. */ + VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62); return eta; } @@ -184,6 +256,8 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint VERIFY_CHECK((g & 1) == 1); VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i)); VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i)); + /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */ + VERIFY_CHECK(eta >= -745 && eta <= 745); /* If eta is negative, negate it and replace f,g with g,-f. */ if (eta < 0) { uint64_t tmp; @@ -197,6 +271,7 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint * can be done as its sign will flip once that happens. */ limit = ((int)eta + 1) > i ? 
i : ((int)eta + 1); /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */ + VERIFY_CHECK(limit > 0 && limit <= 62); m = (UINT64_MAX >> (64 - limit)) & 255U; /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ w = (g * inv256[(f >> 1) & 127]) & m; @@ -211,6 +286,11 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint t->v = (int64_t)v; t->q = (int64_t)q; t->r = (int64_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 62 of them will have determinant 2^62. */ + VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62); return eta; } @@ -228,6 +308,16 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp const int64_t u = t->u, v = t->v, q = t->q, r = t->r; int64_t md, me, sd, se; int128_t cd, ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) >= 0); /* |u|+|v| doesn't overflow */ + VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) >= 0); /* |q|+|r| doesn't overflow */ + VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) <= M62 + 1); /* |u|+|v| <= 2^62 */ + VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) <= M62 + 1); /* |q|+|r| <= 2^62 */ +#endif /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ sd = d4 >> 63; se = e4 >> 63; @@ -276,6 +366,12 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */ d->v[4] = (int64_t)cd; e->v[4] = (int64_t)ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, 1) < 0); /* e < modulus */ +#endif } /* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps. @@ -337,13 +433,35 @@ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_m /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. 
*/ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif secp256k1_modinv64_update_fg_62(&f, &g, &t); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif } /* At this point sufficient iterations have been performed that g must have reached 0 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g * values i.e. +/- 1, and d now contains +/- the modular inverse. */ - VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4]) == 0); +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &SECP256K1_SIGNED62_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) == 0))); +#endif /* Optionally negate d, normalize to [0,modulus), and return it. */ secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); @@ -358,6 +476,9 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; int j; +#ifdef VERIFY + int i = 0; +#endif int64_t eta = -1; int64_t cond; @@ -369,6 +490,12 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif secp256k1_modinv64_update_fg_62(&f, &g, &t); /* If the bottom limb of g is zero, there is a chance that g=0. */ if (g.v[0] == 0) { @@ -380,10 +507,28 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 /* If so, we're done. 
*/ if (cond == 0) break; } +#ifdef VERIFY + VERIFY_CHECK(++i < 12); /* We should never need more than 12*62 = 744 divsteps */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif } /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &SECP256K1_SIGNED62_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) == 0))); +#endif /* Optionally negate d, normalize to [0,modulus), and return it. */ secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); From aa404d53bef21d252a23171381d4bfda6e7e25c6 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 11 Oct 2020 15:30:37 -0700 Subject: [PATCH 08/59] Move secp256k1_scalar_{inverse{_var},is_even} to per-impl files This temporarily duplicates the inversion code across the 4x64 and 8x32 implementations. Those implementations will be replaced in a later commit. --- src/scalar_4x64_impl.h | 179 ++++++++++++++++++++++++++++++++++++++ src/scalar_8x32_impl.h | 179 ++++++++++++++++++++++++++++++++++++++ src/scalar_impl.h | 191 ----------------------------------------- src/scalar_low_impl.h | 15 ++++ 4 files changed, 373 insertions(+), 191 deletions(-) diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 3eaa0418c..6ba38e25e 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -955,4 +955,183 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. 
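+     * (Concretely: u2 = x^2, x2 = x^3, u5 = x^5, x3 = x^7, u9 = x^9,
+     * u11 = x^11, u13 = x^13; later x6 = x^63 = x^(2^6-1) and x8 = x^255.)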
*/ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). */ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + 
secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(USE_SCALAR_INV_BUILTIN) + secp256k1_scalar_inverse(r, x); +#elif defined(USE_SCALAR_INV_NUM) + unsigned char b[32]; + secp256k1_num n, m; + secp256k1_scalar t = *x; + secp256k1_scalar_get_b32(b, &t); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_scalar_order_get_num(&m); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + secp256k1_scalar_set_b32(r, b, NULL); + /* Verify that the inverse was computed correctly, without GMP code. */ + secp256k1_scalar_mul(&t, &t, r); + CHECK(secp256k1_scalar_is_one(&t)); +#else +#error "Please select scalar inverse implementation" +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index bf98e01d7..53b8d4ec4 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -731,4 +731,183 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. */ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). 
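+     * (This accumulation sequence is identical to the 4x64 implementation
+     * above; per the commit message, the duplication is temporary.)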
*/ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(USE_SCALAR_INV_BUILTIN) + secp256k1_scalar_inverse(r, x); +#elif defined(USE_SCALAR_INV_NUM) + unsigned char b[32]; + secp256k1_num n, m; + secp256k1_scalar t = *x; + secp256k1_scalar_get_b32(b, &t); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_scalar_order_get_num(&m); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + secp256k1_scalar_set_b32(r, b, NULL); + /* Verify that the inverse was computed correctly, without GMP code. 
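+     * (It multiplies the input by the computed inverse and checks that the
+     * product is the scalar 1.)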
*/ + secp256k1_scalar_mul(&t, &t, r); + CHECK(secp256k1_scalar_is_one(&t)); +#else +#error "Please select scalar inverse implementation" +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 61c1fbd58..b328afdb9 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -65,197 +65,6 @@ static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned c return (!overflow) & (!secp256k1_scalar_is_zero(r)); } -static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(EXHAUSTIVE_TEST_ORDER) - int i; - *r = 0; - for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) - if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) - *r = i; - /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus - * have a composite group order; fix it in exhaustive_tests.c). */ - VERIFY_CHECK(*r != 0); -} -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); - - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); - - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); - - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); - - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); - - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} - -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} -#endif - -static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(USE_SCALAR_INV_BUILTIN) - secp256k1_scalar_inverse(r, x); -#elif defined(USE_SCALAR_INV_NUM) - unsigned char b[32]; - secp256k1_num n, m; - secp256k1_scalar t = *x; - secp256k1_scalar_get_b32(b, &t); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_scalar_order_get_num(&m); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - secp256k1_scalar_set_b32(r, b, NULL); - /* Verify that the inverse was computed correctly, without GMP code. 
*/ - secp256k1_scalar_mul(&t, &t, r); - CHECK(secp256k1_scalar_is_one(&t)); -#else -#error "Please select scalar inverse implementation" -#endif -} - /* These parameters are generated using sage/gen_exhaustive_groups.sage. */ #if defined(EXHAUSTIVE_TEST_ORDER) # if EXHAUSTIVE_TEST_ORDER == 13 diff --git a/src/scalar_low_impl.h b/src/scalar_low_impl.h index 98ffd1536..eff270720 100644 --- a/src/scalar_low_impl.h +++ b/src/scalar_low_impl.h @@ -125,4 +125,19 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se *r = (*r & mask0) | (*a & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + int i; + *r = 0; + for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar_inverse(r, x); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ From 436281afdcb68991395f97338197d208212965e2 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 11 Oct 2020 15:41:54 -0700 Subject: [PATCH 09/59] Move secp256k1_fe_inverse{_var} to per-impl files This temporarily duplicates the inversion code across the 5x52 and 10x26 implementations. Those implementations will be replaced in a next commit. --- src/field_10x26_impl.h | 127 +++++++++++++++++++++++++++++++++++++++++ src/field_5x52_impl.h | 127 +++++++++++++++++++++++++++++++++++++++++ src/field_impl.h | 127 ----------------------------------------- 3 files changed, 254 insertions(+), 127 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 62bffdc21..3539d5b89 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -1164,4 +1164,131 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. 
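+     * (In binary, p - 2 is 223 ones, a zero, 22 ones, and then 0000101101,
+     * giving the 5 blocks of lengths 223, 22, 1, 2 and 1.)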
Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. */ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#if defined(USE_FIELD_INV_BUILTIN) + secp256k1_fe_inv(r, a); +#elif defined(USE_FIELD_INV_NUM) + secp256k1_num n, m; + static const secp256k1_fe negone = SECP256K1_FE_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL + ); + /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ + static const unsigned char prime[32] = { + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F + }; + unsigned char b[32]; + int res; + secp256k1_fe c = *a; + secp256k1_fe_normalize_var(&c); + secp256k1_fe_get_b32(b, &c); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_num_set_bin(&m, prime, 32); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + res = secp256k1_fe_set_b32(r, b); + (void)res; + VERIFY_CHECK(res); + /* Verify the result is the (unique) valid inverse using non-GMP code. 
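+     * (The check below multiplies a by the claimed inverse, adds -1, and
+     * confirms the sum normalizes to zero.)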
*/ + secp256k1_fe_mul(&c, &c, r); + secp256k1_fe_add(&c, &negone); + CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); +#else +#error "Please select field inverse implementation" +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 3465ea324..b56456749 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -498,4 +498,131 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. */ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#if defined(USE_FIELD_INV_BUILTIN) + secp256k1_fe_inv(r, a); +#elif defined(USE_FIELD_INV_NUM) + secp256k1_num n, m; + static const secp256k1_fe negone = SECP256K1_FE_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL + ); + /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. 
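+     * (That is, p = 2^256 - 2^32 - 977.)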
*/ + static const unsigned char prime[32] = { + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F + }; + unsigned char b[32]; + int res; + secp256k1_fe c = *a; + secp256k1_fe_normalize_var(&c); + secp256k1_fe_get_b32(b, &c); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_num_set_bin(&m, prime, 32); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + res = secp256k1_fe_set_b32(r, b); + (void)res; + VERIFY_CHECK(res); + /* Verify the result is the (unique) valid inverse using non-GMP code. */ + secp256k1_fe_mul(&c, &c, r); + secp256k1_fe_add(&c, &negone); + CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); +#else +#error "Please select field inverse implementation" +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index f0096f631..7b75e9860 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -136,133 +136,6 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; - - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] - */ - - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); - - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); - - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); - - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); - - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); - } - secp256k1_fe_mul(&x11, &x11, &x2); - - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); - - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); - - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); - - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); - } - secp256k1_fe_mul(&x176, &x176, &x88); - - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); - - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); - - /* The final result is then assembled using a sliding window over the blocks. 
*/ - - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); -} - -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#if defined(USE_FIELD_INV_BUILTIN) - secp256k1_fe_inv(r, a); -#elif defined(USE_FIELD_INV_NUM) - secp256k1_num n, m; - static const secp256k1_fe negone = SECP256K1_FE_CONST( - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL - ); - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - unsigned char b[32]; - int res; - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - res = secp256k1_fe_set_b32(r, b); - (void)res; - VERIFY_CHECK(res); - /* Verify the result is the (unique) valid inverse using non-GMP code. */ - secp256k1_fe_mul(&c, &c, r); - secp256k1_fe_add(&c, &negone); - CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); -#else -#error "Please select field inverse implementation" -#endif -} - static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) { #ifndef USE_NUM_NONE unsigned char b[32]; From 1e0e885c8ac814c3621d9e43e66d60f25e324e8e Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 29 Nov 2020 14:02:01 -0800 Subject: [PATCH 10/59] Make field/scalar code use the new modinv modules for inverses --- README.md | 2 +- src/field_10x26_impl.h | 186 +++++++++++++------------------- src/field_5x52_impl.h | 172 +++++++++++------------------ src/scalar_4x64_impl.h | 217 ++++++++++--------------------------- src/scalar_8x32_impl.h | 239 +++++++++++++---------------------------- 5 files changed, 268 insertions(+), 548 deletions(-) diff --git a/README.md b/README.md index 9918678e2..197a56fff 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,11 @@ Implementation details * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1). * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys). * Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan). - * Field inverses and square roots using a sliding window over blocks of 1s (by Peter Dettman). * Scalar operations * Optimized implementation without data-dependent branches of arithmetic modulo the curve's order. * Using 4 64-bit limbs (relying on __int128 support in the compiler). * Using 8 32-bit limbs. +* Modular inverses (both field elements and scalars) based on [safegcd](https://gcd.cr.yp.to/index.html) with some modifications, and a variable-time variant (by Peter Dettman). * Group operations * Point addition formula specifically simplified for the curve equation (y^2 = x^3 + 7). * Use addition between points in Jacobian and affine coordinates where possible. 
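For orientation before the field and scalar changes below: the core of safegcd is the divstep recurrence from the Bernstein-Yang paper. A plain-integer reference, sketched here under the paper's delta convention (the modinv code tracks eta = -delta and applies 30 or 62 such steps at a time in matrix form), purely as an illustration:

```
#include <stdint.h>

/* One division step on (delta, f, g); f must be odd. Reference only. */
static void divstep(int64_t *delta, int64_t *f, int64_t *g) {
    if (*delta > 0 && (*g & 1)) {
        int64_t old_f = *f;
        *delta = 1 - *delta;
        *f = *g;
        *g = (*g - old_f) / 2;  /* odd - odd is even, so halving is exact */
    } else if (*g & 1) {
        *delta = 1 + *delta;
        *g = (*g + *f) / 2;     /* odd + odd is even */
    } else {
        *delta = 1 + *delta;
        *g /= 2;
    }
}
```

Iterating this drives g to 0 while keeping f odd; |f| then equals the gcd of the inputs, and the transition matrices accumulated along the way carry the modular inverse.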
diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 3539d5b89..c2802514d 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -9,6 +9,7 @@ #include "util.h" #include "field.h" +#include "modinv32_impl.h" #ifdef VERIFY static void secp256k1_fe_verify(const secp256k1_fe *a) { @@ -1164,131 +1165,92 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_from_signed30(secp256k1_fe *r, const secp256k1_modinv32_signed30 *a) { + const uint32_t M26 = UINT32_MAX >> 6; + const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4], + a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8]; - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8). */ + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK(a8 >> 16 == 0); + + r->n[0] = a0 & M26; + r->n[1] = (a0 >> 26 | a1 << 4) & M26; + r->n[2] = (a1 >> 22 | a2 << 8) & M26; + r->n[3] = (a2 >> 18 | a3 << 12) & M26; + r->n[4] = (a3 >> 14 | a4 << 16) & M26; + r->n[5] = (a4 >> 10 | a5 << 20) & M26; + r->n[6] = (a5 >> 6 | a6 << 24) & M26; + r->n[7] = (a6 >> 2 ) & M26; + r->n[8] = (a6 >> 28 | a7 << 2) & M26; + r->n[9] = (a7 >> 24 | a8 << 6); - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); - - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); - - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); - - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); - } - secp256k1_fe_mul(&x11, &x11, &x2); +static void secp256k1_fe_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_fe *a) { + const uint32_t M30 = UINT32_MAX >> 2; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4], + a5 = a->n[5], a6 = a->n[6], a7 = a->n[7], a8 = a->n[8], a9 = a->n[9]; - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + r->v[0] = (a0 | a1 << 26) & M30; + r->v[1] = (a1 >> 4 | a2 << 22) & M30; + r->v[2] = (a2 >> 8 | a3 << 18) & M30; + r->v[3] = (a3 >> 12 | a4 << 14) & M30; + r->v[4] = (a4 >> 16 | a5 << 10) & M30; + r->v[5] = (a5 >> 20 | a6 << 6) & M30; + r->v[6] = (a6 >> 24 | a7 << 2 + | a8 << 28) & M30; + r->v[7] = (a8 >> 2 | a9 << 24) & M30; + r->v[8] = a9 >> 6; +} - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); +static const secp256k1_modinv32_modinfo 
secp256k1_const_modinfo_fe = { + {{-0x3D1, -4, 0, 0, 0, 0, 0, 0, 65536}}, + 0x2DDACACFL +}; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); - } - secp256k1_fe_mul(&x176, &x176, &x88); +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv32_signed30 s; - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + tmp = *x; + secp256k1_fe_normalize(&tmp); + secp256k1_fe_to_signed30(&s, &tmp); + secp256k1_modinv32(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed30(r, &s); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +} - /* The final result is then assembled using a sliding window over the blocks. */ +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv32_signed30 s; - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); -} + tmp = *x; + secp256k1_fe_normalize_var(&tmp); + secp256k1_fe_to_signed30(&s, &tmp); + secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed30(r, &s); -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#if defined(USE_FIELD_INV_BUILTIN) - secp256k1_fe_inv(r, a); -#elif defined(USE_FIELD_INV_NUM) - secp256k1_num n, m; - static const secp256k1_fe negone = SECP256K1_FE_CONST( - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL - ); - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - unsigned char b[32]; - int res; - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - res = secp256k1_fe_set_b32(r, b); - (void)res; - VERIFY_CHECK(res); - /* Verify the result is the (unique) valid inverse using non-GMP code. 
*/ - secp256k1_fe_mul(&c, &c, r); - secp256k1_fe_add(&c, &negone); - CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); -#else -#error "Please select field inverse implementation" -#endif + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); } #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index b56456749..b73cfea20 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -13,6 +13,7 @@ #include "util.h" #include "field.h" +#include "modinv64_impl.h" #if defined(USE_ASM_X86_64) #include "field_5x52_asm_impl.h" @@ -498,130 +499,79 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; +static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { + const uint64_t M52 = UINT64_MAX >> 12; + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). */ + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK(a4 >> 8 == 0); + + r->n[0] = a0 & M52; + r->n[1] = (a0 >> 52 | a1 << 10) & M52; + r->n[2] = (a1 >> 42 | a2 << 20) & M52; + r->n[3] = (a2 >> 32 | a3 << 30) & M52; + r->n[4] = (a3 >> 22 | a4 << 40); - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); - - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); - - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); - - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); - } - secp256k1_fe_mul(&x11, &x11, &x2); +static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) { + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); + r->v[0] = (a0 | a1 << 52) & M62; + r->v[1] = (a1 >> 10 | a2 << 42) & M62; + r->v[2] = (a2 >> 20 | a3 << 32) & M62; + r->v[3] = (a3 >> 30 | a4 << 22) & M62; + r->v[4] = a4 >> 40; +} - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); +static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = { + {{-0x1000003D1LL, 0, 0, 0, 256}}, + 0x27C7F6E22DDACACFLL +}; - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); - } - secp256k1_fe_mul(&x176, &x176, &x88); +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + 
secp256k1_modinv64_signed62 s; - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); + tmp = *x; + secp256k1_fe_normalize(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +#endif +} - /* The final result is then assembled using a sliding window over the blocks. */ +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv64_signed62 s; - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); -} + tmp = *x; + secp256k1_fe_normalize_var(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#if defined(USE_FIELD_INV_BUILTIN) - secp256k1_fe_inv(r, a); -#elif defined(USE_FIELD_INV_NUM) - secp256k1_num n, m; - static const secp256k1_fe negone = SECP256K1_FE_CONST( - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL - ); - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - unsigned char b[32]; - int res; - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - res = secp256k1_fe_set_b32(r, b); - (void)res; - VERIFY_CHECK(res); - /* Verify the result is the (unique) valid inverse using non-GMP code. */ - secp256k1_fe_mul(&c, &c, r); - secp256k1_fe_add(&c, &negone); - CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); -#else -#error "Please select field inverse implementation" +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); #endif } diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 6ba38e25e..ea33919bf 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -7,6 +7,8 @@ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H +#include "modinv64_impl.h" + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) @@ -955,178 +957,73 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } -static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. 
*/ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; +static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) { + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); + /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). + */ + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK(a4 >> 8 == 0); - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); +#endif +} - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_scalar *a) { + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3]; - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); + r->v[0] = a0 & M62; + r->v[1] = (a0 >> 62 | a1 << 2) & M62; + r->v[2] = (a1 >> 60 | a2 << 4) & M62; + r->v[3] = (a2 >> 58 | a3 << 6) & M62; + r->v[4] = a3 >> 56; +} - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); +static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_scalar = { + {{0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256}}, + 0x34F20099AA774EC1LL +}; - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv64_signed62 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed62(&s, x); + secp256k1_modinv64(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed62(r, &s); - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif } static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(USE_SCALAR_INV_BUILTIN) - secp256k1_scalar_inverse(r, x); -#elif defined(USE_SCALAR_INV_NUM) - unsigned char b[32]; - secp256k1_num n, m; - secp256k1_scalar t = *x; - secp256k1_scalar_get_b32(b, &t); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_scalar_order_get_num(&m); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - secp256k1_scalar_set_b32(r, b, NULL); - /* Verify that the inverse was computed correctly, without GMP code. 
*/ - secp256k1_scalar_mul(&t, &t, r); - CHECK(secp256k1_scalar_is_one(&t)); -#else -#error "Please select scalar inverse implementation" + secp256k1_modinv64_signed62 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed62(&s, x); + secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); #endif } diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index 53b8d4ec4..ccc2c2424 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -7,6 +7,8 @@ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H +#include "modinv32_impl.h" + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint32_t)0xD0364141UL) #define SECP256K1_N_1 ((uint32_t)0xBFD25E8CUL) @@ -731,178 +733,87 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } -static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); +static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_modinv32_signed30 *a) { + const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4], + a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8]; + + /* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8). 
+ */ + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK(a8 >> 16 == 0); + + r->d[0] = a0 | a1 << 30; + r->d[1] = a1 >> 2 | a2 << 28; + r->d[2] = a2 >> 4 | a3 << 26; + r->d[3] = a3 >> 6 | a4 << 24; + r->d[4] = a4 >> 8 | a5 << 22; + r->d[5] = a5 >> 10 | a6 << 20; + r->d[6] = a6 >> 12 | a7 << 18; + r->d[7] = a7 >> 14 | a8 << 16; - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); +#endif +} - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); +static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_scalar *a) { + const uint32_t M30 = UINT32_MAX >> 2; + const uint32_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3], + a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7]; - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); + r->v[0] = a0 & M30; + r->v[1] = (a0 >> 30 | a1 << 2) & M30; + r->v[2] = (a1 >> 28 | a2 << 4) & M30; + r->v[3] = (a2 >> 26 | a3 << 6) & M30; + r->v[4] = (a3 >> 24 | a4 << 8) & M30; + r->v[5] = (a4 >> 22 | a5 << 10) & M30; + r->v[6] = (a5 >> 20 | a6 << 12) & M30; + r->v[7] = (a6 >> 18 | a7 << 14) & M30; + r->v[8] = a7 >> 16; +} - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_scalar = { + {{0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536}}, + 0x2A774EC1L +}; + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv32_signed30 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed30(&s, x); + secp256k1_modinv32(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed30(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif } static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(USE_SCALAR_INV_BUILTIN) - secp256k1_scalar_inverse(r, x); -#elif 
defined(USE_SCALAR_INV_NUM) - unsigned char b[32]; - secp256k1_num n, m; - secp256k1_scalar t = *x; - secp256k1_scalar_get_b32(b, &t); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_scalar_order_get_num(&m); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - secp256k1_scalar_set_b32(r, b, NULL); - /* Verify that the inverse was computed correctly, without GMP code. */ - secp256k1_scalar_mul(&t, &t, r); - CHECK(secp256k1_scalar_is_one(&t)); -#else -#error "Please select scalar inverse implementation" + secp256k1_modinv32_signed30 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed30(&s, x); + secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed30(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); #endif } From aa9cc5218001f14f4312bde1058417d4b755fd11 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 11 Oct 2020 23:20:32 -0700 Subject: [PATCH 11/59] Improve field/scalar inverse tests Add a new run_inverse_tests that replaces all existing field/scalar inverse tests, and tests a few identities for fixed inputs, small numbers (-999...999), random inputs (structured and unstructured), as well as comparing with the output of secp256k1_fe_inv_all_var. --- src/tests.c | 225 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 164 insertions(+), 61 deletions(-) diff --git a/src/tests.c b/src/tests.c index 32d9340f0..6349c399e 100644 --- a/src/tests.c +++ b/src/tests.c @@ -1418,33 +1418,6 @@ void scalar_test(void) { } #endif - { - /* Test that scalar inverses are equal to the inverse of their number modulo the order. */ - if (!secp256k1_scalar_is_zero(&s)) { - secp256k1_scalar inv; -#ifndef USE_NUM_NONE - secp256k1_num invnum; - secp256k1_num invnum2; -#endif - secp256k1_scalar_inverse(&inv, &s); -#ifndef USE_NUM_NONE - secp256k1_num_mod_inverse(&invnum, &snum, &order); - secp256k1_scalar_get_num(&invnum2, &inv); - CHECK(secp256k1_num_eq(&invnum, &invnum2)); -#endif - secp256k1_scalar_mul(&inv, &inv, &s); - /* Multiplying a scalar with its inverse must result in one. */ - CHECK(secp256k1_scalar_is_one(&inv)); - secp256k1_scalar_inverse(&inv, &inv); - /* Inverting one must result in one. */ - CHECK(secp256k1_scalar_is_one(&inv)); -#ifndef USE_NUM_NONE - secp256k1_scalar_get_num(&invnum, &inv); - CHECK(secp256k1_num_is_one(&invnum)); -#endif - } - } - { /* Test commutativity of add. 
*/
         secp256k1_scalar r1, r2;
@@ -2275,13 +2248,6 @@ int check_fe_equal(const secp256k1_fe *a, const secp256k1_fe *b) {
     return secp256k1_fe_equal_var(&an, &bn);
 }
 
-int check_fe_inverse(const secp256k1_fe *a, const secp256k1_fe *ai) {
-    secp256k1_fe x;
-    secp256k1_fe one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1);
-    secp256k1_fe_mul(&x, a, ai);
-    return check_fe_equal(&x, &one);
-}
-
 void run_field_convert(void) {
     static const unsigned char b32[32] = {
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -2401,30 +2367,6 @@ void run_field_misc(void) {
     }
 }
 
-void run_field_inv(void) {
-    secp256k1_fe x, xi, xii;
-    int i;
-    for (i = 0; i < 10*count; i++) {
-        random_fe_non_zero(&x);
-        secp256k1_fe_inv(&xi, &x);
-        CHECK(check_fe_inverse(&x, &xi));
-        secp256k1_fe_inv(&xii, &xi);
-        CHECK(check_fe_equal(&x, &xii));
-    }
-}
-
-void run_field_inv_var(void) {
-    secp256k1_fe x, xi, xii;
-    int i;
-    for (i = 0; i < 10*count; i++) {
-        random_fe_non_zero(&x);
-        secp256k1_fe_inv_var(&xi, &x);
-        CHECK(check_fe_inverse(&x, &xi));
-        secp256k1_fe_inv_var(&xii, &xi);
-        CHECK(check_fe_equal(&x, &xii));
-    }
-}
-
 void run_sqr(void) {
     secp256k1_fe x, s;
 
@@ -2489,6 +2431,169 @@ void run_sqrt(void) {
     }
 }
 
+/***** FIELD/SCALAR INVERSE TESTS *****/
+
+static const secp256k1_scalar scalar_minus_one = SECP256K1_SCALAR_CONST(
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE,
+    0xBAAEDCE6, 0xAF48A03B, 0xBFD25E8C, 0xD0364140
+);
+
+static const secp256k1_fe fe_minus_one = SECP256K1_FE_CONST(
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFC2E
+);
+
+/* These tests test the following identities:
+ *
+ * for x==0: 1/x == 0
+ * for x!=0: x*(1/x) == 1
+ * for x!=0 and x!=1: 1/(1/x - 1) + 1 == -1/(x-1)
+ */
+
+void test_inverse_scalar(secp256k1_scalar* out, const secp256k1_scalar* x, int var)
+{
+    secp256k1_scalar l, r, t;
+
+    (var ? secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&l, x); /* l = 1/x */
+    if (out) *out = l;
+    if (secp256k1_scalar_is_zero(x)) {
+        CHECK(secp256k1_scalar_is_zero(&l));
+        return;
+    }
+    secp256k1_scalar_mul(&t, x, &l); /* t = x*(1/x) */
+    CHECK(secp256k1_scalar_is_one(&t)); /* x*(1/x) == 1 */
+    secp256k1_scalar_add(&r, x, &scalar_minus_one); /* r = x-1 */
+    if (secp256k1_scalar_is_zero(&r)) return;
+    (var ? secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&r, &r); /* r = 1/(x-1) */
+    secp256k1_scalar_add(&l, &scalar_minus_one, &l); /* l = 1/x-1 */
+    (var ? secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&l, &l); /* l = 1/(1/x-1) */
+    secp256k1_scalar_add(&l, &l, &secp256k1_scalar_one); /* l = 1/(1/x-1)+1 */
+    secp256k1_scalar_add(&l, &r, &l); /* l = 1/(1/x-1)+1 + 1/(x-1) */
+    CHECK(secp256k1_scalar_is_zero(&l)); /* l == 0 */
+}
+
+void test_inverse_field(secp256k1_fe* out, const secp256k1_fe* x, int var)
+{
+    secp256k1_fe l, r, t;
+
+    (var ? secp256k1_fe_inv_var : secp256k1_fe_inv)(&l, x); /* l = 1/x */
+    if (out) *out = l;
+    t = *x; /* t = x */
+    if (secp256k1_fe_normalizes_to_zero_var(&t)) {
+        CHECK(secp256k1_fe_normalizes_to_zero(&l));
+        return;
+    }
+    secp256k1_fe_mul(&t, x, &l); /* t = x*(1/x) */
+    secp256k1_fe_add(&t, &fe_minus_one); /* t = x*(1/x)-1 */
+    CHECK(secp256k1_fe_normalizes_to_zero(&t)); /* x*(1/x)-1 == 0 */
+    r = *x; /* r = x */
+    secp256k1_fe_add(&r, &fe_minus_one); /* r = x-1 */
+    if (secp256k1_fe_normalizes_to_zero_var(&r)) return;
+    (var ? secp256k1_fe_inv_var : secp256k1_fe_inv)(&r, &r); /* r = 1/(x-1) */
+    secp256k1_fe_add(&l, &fe_minus_one); /* l = 1/x-1 */
+    (var ? 
secp256k1_fe_inv_var : secp256k1_fe_inv)(&l, &l); /* l = 1/(1/x-1) */ + secp256k1_fe_add(&l, &secp256k1_fe_one); /* l = 1/(1/x-1)+1 */ + secp256k1_fe_add(&l, &r); /* l = 1/(1/x-1)+1 + 1/(x-1) */ + CHECK(secp256k1_fe_normalizes_to_zero_var(&l)); /* l == 0 */ +} + +void run_inverse_tests(void) +{ + /* Fixed test cases for field inverses: pairs of (x, 1/x) mod p. */ + static const secp256k1_fe fe_cases[][2] = { + /* 0 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), + SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0)}, + /* 1 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1), + SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1)}, + /* -1 */ + {SECP256K1_FE_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xfffffc2e), + SECP256K1_FE_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xfffffc2e)}, + /* 2 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 2), + SECP256K1_FE_CONST(0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7ffffe18)}, + /* 2**128 */ + {SECP256K1_FE_CONST(0, 0, 0, 1, 0, 0, 0, 0), + SECP256K1_FE_CONST(0xbcb223fe, 0xdc24a059, 0xd838091d, 0xd2253530, 0xffffffff, 0xffffffff, 0xffffffff, 0x434dd931)}, + /* Input known to need 637 divsteps */ + {SECP256K1_FE_CONST(0xe34e9c95, 0x6bee8a84, 0x0dcb632a, 0xdb8a1320, 0x66885408, 0x06f3f996, 0x7c11ca84, 0x19199ec3), + SECP256K1_FE_CONST(0xbd2cbd8f, 0x1c536828, 0x9bccda44, 0x2582ac0c, 0x870152b0, 0x8a3f09fb, 0x1aaadf92, 0x19b618e5)} + }; + /* Fixed test cases for scalar inverses: pairs of (x, 1/x) mod n. */ + static const secp256k1_scalar scalar_cases[][2] = { + /* 0 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0), + SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0)}, + /* 1 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1), + SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1)}, + /* -1 */ + {SECP256K1_SCALAR_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xbaaedce6, 0xaf48a03b, 0xbfd25e8c, 0xd0364140), + SECP256K1_SCALAR_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xbaaedce6, 0xaf48a03b, 0xbfd25e8c, 0xd0364140)}, + /* 2 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 2), + SECP256K1_SCALAR_CONST(0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x5d576e73, 0x57a4501d, 0xdfe92f46, 0x681b20a1)}, + /* 2**128 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 1, 0, 0, 0, 0), + SECP256K1_SCALAR_CONST(0x50a51ac8, 0x34b9ec24, 0x4b0dff66, 0x5588b13e, 0x9984d5b3, 0xcf80ef0f, 0xd6a23766, 0xa3ee9f22)}, + /* Input known to need 635 divsteps */ + {SECP256K1_SCALAR_CONST(0xcb9f1d35, 0xdd4416c2, 0xcd71bf3f, 0x6365da66, 0x3c9b3376, 0x8feb7ae9, 0x32a5ef60, 0x19199ec3), + SECP256K1_SCALAR_CONST(0x1d7c7bba, 0xf1893d53, 0xb834bd09, 0x36b411dc, 0x42c2e42f, 0xec72c428, 0x5e189791, 0x8e9bc708)} + }; + int i, var, testrand; + unsigned char b32[32]; + secp256k1_fe x_fe; + secp256k1_scalar x_scalar; + memset(b32, 0, sizeof(b32)); + /* Test fixed test cases through test_inverse_{scalar,field}, both ways. 
*/
+    for (i = 0; (size_t)i < sizeof(fe_cases)/sizeof(fe_cases[0]); ++i) {
+        for (var = 0; var <= 1; ++var) {
+            test_inverse_field(&x_fe, &fe_cases[i][0], var);
+            CHECK(check_fe_equal(&x_fe, &fe_cases[i][1]));
+            test_inverse_field(&x_fe, &fe_cases[i][1], var);
+            CHECK(check_fe_equal(&x_fe, &fe_cases[i][0]));
+        }
+    }
+    for (i = 0; (size_t)i < sizeof(scalar_cases)/sizeof(scalar_cases[0]); ++i) {
+        for (var = 0; var <= 1; ++var) {
+            test_inverse_scalar(&x_scalar, &scalar_cases[i][0], var);
+            CHECK(secp256k1_scalar_eq(&x_scalar, &scalar_cases[i][1]));
+            test_inverse_scalar(&x_scalar, &scalar_cases[i][1], var);
+            CHECK(secp256k1_scalar_eq(&x_scalar, &scalar_cases[i][0]));
+        }
+    }
+    /* Test inputs 0..999 and their respective negations. */
+    for (i = 0; i < 1000; ++i) {
+        b32[31] = i & 0xff;
+        b32[30] = (i >> 8) & 0xff;
+        secp256k1_scalar_set_b32(&x_scalar, b32, NULL);
+        secp256k1_fe_set_b32(&x_fe, b32);
+        for (var = 0; var <= 1; ++var) {
+            test_inverse_scalar(NULL, &x_scalar, var);
+            test_inverse_field(NULL, &x_fe, var);
+        }
+        secp256k1_scalar_negate(&x_scalar, &x_scalar);
+        secp256k1_fe_negate(&x_fe, &x_fe, 1);
+        for (var = 0; var <= 1; ++var) {
+            test_inverse_scalar(NULL, &x_scalar, var);
+            test_inverse_field(NULL, &x_fe, var);
+        }
+    }
+    /* test 128*count random inputs; half with testrand256_test, half with testrand256 */
+    for (testrand = 0; testrand <= 1; ++testrand) {
+        for (i = 0; i < 64 * count; ++i) {
+            (testrand ? secp256k1_testrand256_test : secp256k1_testrand256)(b32);
+            secp256k1_scalar_set_b32(&x_scalar, b32, NULL);
+            secp256k1_fe_set_b32(&x_fe, b32);
+            for (var = 0; var <= 1; ++var) {
+                test_inverse_scalar(NULL, &x_scalar, var);
+                test_inverse_field(NULL, &x_fe, var);
+            }
+        }
+    }
+}
+
 /***** GROUP TESTS *****/
 
 void ge_equals_ge(const secp256k1_ge *a, const secp256k1_ge *b) {
@@ -6068,8 +6173,8 @@ int main(int argc, char **argv) {
     run_rand_int();
 
     run_ctz_tests();
-
     run_modinv_tests();
+    run_inverse_tests();
 
     run_sha256_tests();
     run_hmac_sha256_tests();
@@ -6084,8 +6189,6 @@ int main(int argc, char **argv) {
     run_scalar_tests();
 
     /* field tests */
-    run_field_inv();
-    run_field_inv_var();
     run_field_misc();
     run_field_convert();
     run_sqr();

From 5437e7bdfbffddf69fdf7b4af7e997c78f5dafbf Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Sat, 23 Jan 2021 19:24:33 -0800
Subject: [PATCH 12/59] Remove unused scalar_sqr

---
 src/bench_internal.c | 10 ---
 src/scalar.h | 3 -
 src/scalar_4x64_impl.h | 166 -----------------------------------------
 src/scalar_8x32_impl.h | 89 ----------------------
 src/scalar_low_impl.h | 4 -
 src/tests.c | 16 +---
 6 files changed, 1 insertion(+), 287 deletions(-)

diff --git a/src/bench_internal.c b/src/bench_internal.c
index 7fa6882c1..7289d9430 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -99,15 +99,6 @@ void bench_scalar_negate(void* arg, int iters) {
     }
 }
 
-void bench_scalar_sqr(void* arg, int iters) {
-    int i;
-    bench_inv *data = (bench_inv*)arg;
-
-    for (i = 0; i < iters; i++) {
-        secp256k1_scalar_sqr(&data->scalar[0], &data->scalar[0]);
-    }
-}
-
 void bench_scalar_mul(void* arg, int iters) {
     int i;
     bench_inv *data = (bench_inv*)arg;
@@ -393,7 +384,6 @@ int main(int argc, char **argv) {
 
     if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100);
     if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100);
-    if (have_flag(argc, 
argv, "sqr")) run_benchmark("scalar_sqr", bench_scalar_sqr, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000); diff --git a/src/scalar.h b/src/scalar.h index 0b737f940..d7c42cba8 100644 --- a/src/scalar.h +++ b/src/scalar.h @@ -63,9 +63,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, * the low bits that were shifted off */ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n); -/** Compute the square of a scalar (modulo the group order). */ -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a); - /** Compute the inverse of a scalar (modulo the group order). */ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index ea33919bf..a1def26fc 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -214,28 +214,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { VERIFY_CHECK(c1 >= th); \ } -/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ -#define muladd2(a,b) { \ - uint64_t tl, th, th2, tl2; \ - { \ - uint128_t t = (uint128_t)a * b; \ - th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ - tl = t; \ - } \ - th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \ - c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ - tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \ - th2 += (tl2 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ - c0 += tl2; /* overflow is handled on the next line */ \ - th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ - c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ - c1 += th2; /* overflow is handled on the next line */ \ - c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ -} - /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. 
*/ #define sumadd(a) { \ unsigned int over; \ @@ -745,148 +723,10 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, c #endif } -static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) { -#ifdef USE_ASM_X86_64 - __asm__ __volatile__( - /* Preload */ - "movq 0(%%rdi), %%r11\n" - "movq 8(%%rdi), %%r12\n" - "movq 16(%%rdi), %%r13\n" - "movq 24(%%rdi), %%r14\n" - /* (rax,rdx) = a0 * a0 */ - "movq %%r11, %%rax\n" - "mulq %%r11\n" - /* Extract l0 */ - "movq %%rax, 0(%%rsi)\n" - /* (r8,r9,r10) = (rdx,0) */ - "movq %%rdx, %%r8\n" - "xorq %%r9, %%r9\n" - "xorq %%r10, %%r10\n" - /* (r8,r9,r10) += 2 * a0 * a1 */ - "movq %%r11, %%rax\n" - "mulq %%r12\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* Extract l1 */ - "movq %%r8, 8(%%rsi)\n" - "xorq %%r8, %%r8\n" - /* (r9,r10,r8) += 2 * a0 * a2 */ - "movq %%r11, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* (r9,r10,r8) += a1 * a1 */ - "movq %%r12, %%rax\n" - "mulq %%r12\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* Extract l2 */ - "movq %%r9, 16(%%rsi)\n" - "xorq %%r9, %%r9\n" - /* (r10,r8,r9) += 2 * a0 * a3 */ - "movq %%r11, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - /* (r10,r8,r9) += 2 * a1 * a2 */ - "movq %%r12, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - /* Extract l3 */ - "movq %%r10, 24(%%rsi)\n" - "xorq %%r10, %%r10\n" - /* (r8,r9,r10) += 2 * a1 * a3 */ - "movq %%r12, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* (r8,r9,r10) += a2 * a2 */ - "movq %%r13, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* Extract l4 */ - "movq %%r8, 32(%%rsi)\n" - "xorq %%r8, %%r8\n" - /* (r9,r10,r8) += 2 * a2 * a3 */ - "movq %%r13, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* Extract l5 */ - "movq %%r9, 40(%%rsi)\n" - /* (r10,r8) += a3 * a3 */ - "movq %%r14, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - /* Extract l6 */ - "movq %%r10, 48(%%rsi)\n" - /* Extract l7 */ - "movq %%r8, 56(%%rsi)\n" - : - : "S"(l), "D"(a->d) - : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory"); -#else - /* 160 bit accumulator. */ - uint64_t c0 = 0, c1 = 0; - uint32_t c2 = 0; - - /* l[0..7] = a[0..3] * b[0..3]. 
*/ - muladd_fast(a->d[0], a->d[0]); - extract_fast(l[0]); - muladd2(a->d[0], a->d[1]); - extract(l[1]); - muladd2(a->d[0], a->d[2]); - muladd(a->d[1], a->d[1]); - extract(l[2]); - muladd2(a->d[0], a->d[3]); - muladd2(a->d[1], a->d[2]); - extract(l[3]); - muladd2(a->d[1], a->d[3]); - muladd(a->d[2], a->d[2]); - extract(l[4]); - muladd2(a->d[2], a->d[3]); - extract(l[5]); - muladd_fast(a->d[3], a->d[3]); - extract_fast(l[6]); - VERIFY_CHECK(c1 == 0); - l[7] = c0; -#endif -} - #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast -#undef muladd2 #undef extract #undef extract_fast @@ -908,12 +748,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - uint64_t l[8]; - secp256k1_scalar_sqr_512(l, a); - secp256k1_scalar_reduce_512(r, l); -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index ccc2c2424..62c7ae715 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -293,28 +293,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { VERIFY_CHECK(c1 >= th); \ } -/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ -#define muladd2(a,b) { \ - uint32_t tl, th, th2, tl2; \ - { \ - uint64_t t = (uint64_t)a * b; \ - th = t >> 32; /* at most 0xFFFFFFFE */ \ - tl = t; \ - } \ - th2 = th + th; /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \ - c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ - tl2 = tl + tl; /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \ - th2 += (tl2 < tl); /* at most 0xFFFFFFFF */ \ - c0 += tl2; /* overflow is handled on the next line */ \ - th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ - c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ - c1 += th2; /* overflow is handled on the next line */ \ - c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ -} - /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */ #define sumadd(a) { \ unsigned int over; \ @@ -578,71 +556,10 @@ static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar *a, con l[15] = c0; } -static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar *a) { - /* 96 bit accumulator. */ - uint32_t c0 = 0, c1 = 0, c2 = 0; - - /* l[0..15] = a[0..7]^2. 
*/ - muladd_fast(a->d[0], a->d[0]); - extract_fast(l[0]); - muladd2(a->d[0], a->d[1]); - extract(l[1]); - muladd2(a->d[0], a->d[2]); - muladd(a->d[1], a->d[1]); - extract(l[2]); - muladd2(a->d[0], a->d[3]); - muladd2(a->d[1], a->d[2]); - extract(l[3]); - muladd2(a->d[0], a->d[4]); - muladd2(a->d[1], a->d[3]); - muladd(a->d[2], a->d[2]); - extract(l[4]); - muladd2(a->d[0], a->d[5]); - muladd2(a->d[1], a->d[4]); - muladd2(a->d[2], a->d[3]); - extract(l[5]); - muladd2(a->d[0], a->d[6]); - muladd2(a->d[1], a->d[5]); - muladd2(a->d[2], a->d[4]); - muladd(a->d[3], a->d[3]); - extract(l[6]); - muladd2(a->d[0], a->d[7]); - muladd2(a->d[1], a->d[6]); - muladd2(a->d[2], a->d[5]); - muladd2(a->d[3], a->d[4]); - extract(l[7]); - muladd2(a->d[1], a->d[7]); - muladd2(a->d[2], a->d[6]); - muladd2(a->d[3], a->d[5]); - muladd(a->d[4], a->d[4]); - extract(l[8]); - muladd2(a->d[2], a->d[7]); - muladd2(a->d[3], a->d[6]); - muladd2(a->d[4], a->d[5]); - extract(l[9]); - muladd2(a->d[3], a->d[7]); - muladd2(a->d[4], a->d[6]); - muladd(a->d[5], a->d[5]); - extract(l[10]); - muladd2(a->d[4], a->d[7]); - muladd2(a->d[5], a->d[6]); - extract(l[11]); - muladd2(a->d[5], a->d[7]); - muladd(a->d[6], a->d[6]); - extract(l[12]); - muladd2(a->d[6], a->d[7]); - extract(l[13]); - muladd_fast(a->d[7], a->d[7]); - extract_fast(l[14]); - VERIFY_CHECK(c1 == 0); - l[15] = c0; -} - #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast -#undef muladd2 #undef extract #undef extract_fast @@ -668,12 +585,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - uint32_t l[16]; - secp256k1_scalar_sqr_512(l, a); - secp256k1_scalar_reduce_512(r, l); -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; diff --git a/src/scalar_low_impl.h b/src/scalar_low_impl.h index eff270720..7176f0b2c 100644 --- a/src/scalar_low_impl.h +++ b/src/scalar_low_impl.h @@ -104,10 +104,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - *r = (*a * *a) % EXHAUSTIVE_TEST_ORDER; -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) { *r1 = *a; *r2 = 0; diff --git a/src/tests.c b/src/tests.c index 6349c399e..addeb8b66 100644 --- a/src/tests.c +++ b/src/tests.c @@ -793,7 +793,7 @@ void test_num_jacobi(void) { /** test with secp group order as order */ secp256k1_scalar_order_get_num(&order); random_scalar_order_test(&sqr); - secp256k1_scalar_sqr(&sqr, &sqr); + secp256k1_scalar_mul(&sqr, &sqr, &sqr); /* test residue */ secp256k1_scalar_get_num(&n, &sqr); CHECK(secp256k1_num_jacobi(&n, &order) == 1); @@ -1488,14 +1488,6 @@ void scalar_test(void) { CHECK(secp256k1_scalar_eq(&r1, &r2)); } - { - /* Test square. */ - secp256k1_scalar r1, r2; - secp256k1_scalar_sqr(&r1, &s1); - secp256k1_scalar_mul(&r2, &s1, &s1); - CHECK(secp256k1_scalar_eq(&r1, &r2)); - } - { /* Test multiplicative identity. 
*/
         secp256k1_scalar r1, v1;
@@ -2187,12 +2179,6 @@
                 CHECK(!secp256k1_scalar_check_overflow(&zz));
                 CHECK(secp256k1_scalar_eq(&one, &zz));
             }
-            secp256k1_scalar_mul(&z, &x, &x);
-            CHECK(!secp256k1_scalar_check_overflow(&z));
-            secp256k1_scalar_sqr(&zz, &x);
-            CHECK(!secp256k1_scalar_check_overflow(&zz));
-            CHECK(secp256k1_scalar_eq(&zz, &z));
-            CHECK(secp256k1_scalar_eq(&r2, &zz));
         }
     }
 }
 
From 20448b8d09a492afcfcae7721033c13a44a776fd Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Sun, 11 Oct 2020 15:56:17 -0700
Subject: [PATCH 13/59] Remove unused Jacobi symbol support

No exposed functions rely on Jacobi symbol computation anymore. Remove
it; it can always be brought back later if needed.

---
 src/bench_internal.c | 48 ++------------------
 src/field.h | 3 --
 src/field_impl.h | 25 -----------
 src/group.h | 9 ----
 src/group_impl.h | 22 +---------
 src/tests.c | 101 +++----------------------------------------
 6 files changed, 11 insertions(+), 197 deletions(-)

diff --git a/src/bench_internal.c b/src/bench_internal.c
index 7289d9430..82a3eb6a0 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -246,26 +246,6 @@ void bench_group_add_affine_var(void* arg, int iters) {
     }
 }
 
-void bench_group_jacobi_var(void* arg, int iters) {
-    int i, j = 0;
-    bench_inv *data = (bench_inv*)arg;
-
-    for (i = 0; i < iters; i++) {
-        j += secp256k1_gej_has_quad_y_var(&data->gej[0]);
-        /* Vary the Y and Z coordinates of the input (the X coordinate doesn't matter to
-           secp256k1_gej_has_quad_y_var). Note that the resulting coordinates will
-           generally not correspond to a point on the curve, but this is not a problem
-           for the code being benchmarked here. Adding and normalizing have less
-           overhead than EC operations (which could guarantee the point remains on the
-           curve). */
-        secp256k1_fe_add(&data->gej[0].y, &data->fe[1]);
-        secp256k1_fe_add(&data->gej[0].z, &data->fe[2]);
-        secp256k1_fe_normalize_var(&data->gej[0].y);
-        secp256k1_fe_normalize_var(&data->gej[0].z);
-    }
-    CHECK(j <= iters);
-}
-
 void bench_group_to_affine_var(void* arg, int iters) {
     int i;
     bench_inv *data = (bench_inv*)arg;
@@ -273,8 +253,10 @@ void bench_group_to_affine_var(void* arg, int iters) {
     for (i = 0; i < iters; ++i) {
         secp256k1_ge_set_gej_var(&data->ge[1], &data->gej[0]);
         /* Use the output affine X/Y coordinates to vary the input X/Y/Z coordinates.
-           Similar to bench_group_jacobi_var, this approach does not result in
-           coordinates of points on the curve. */
+           Note that the resulting coordinates will generally not correspond to a point
+           on the curve, but this is not a problem for the code being benchmarked here.
+           Adding and normalizing have less overhead than EC operations (which could
+           guarantee the point remains on the curve). 
*/ secp256k1_fe_add(&data->gej[0].x, &data->ge[1].y); secp256k1_fe_add(&data->gej[0].y, &data->fe[2]); secp256k1_fe_add(&data->gej[0].z, &data->ge[1].x); @@ -360,24 +342,6 @@ void bench_context_sign(void* arg, int iters) { } } -#ifndef USE_NUM_NONE -void bench_num_jacobi(void* arg, int iters) { - int i, j = 0; - bench_inv *data = (bench_inv*)arg; - secp256k1_num nx, na, norder; - - secp256k1_scalar_get_num(&nx, &data->scalar[0]); - secp256k1_scalar_order_get_num(&norder); - secp256k1_scalar_get_num(&na, &data->scalar[1]); - - for (i = 0; i < iters; i++) { - j += secp256k1_num_jacobi(&nx, &norder); - secp256k1_num_add(&nx, &nx, &na); - } - CHECK(j <= iters); -} -#endif - int main(int argc, char **argv) { bench_inv data; int iters = get_iters(20000); @@ -401,7 +365,6 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, iters*10); - if (have_flag(argc, argv, "group") || have_flag(argc, argv, "jacobi")) run_benchmark("group_jacobi_var", bench_group_jacobi_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters); @@ -414,8 +377,5 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "context") || have_flag(argc, argv, "verify")) run_benchmark("context_verify", bench_context_verify, bench_setup, NULL, &data, 10, 1 + iters/1000); if (have_flag(argc, argv, "context") || have_flag(argc, argv, "sign")) run_benchmark("context_sign", bench_context_sign, bench_setup, NULL, &data, 10, 1 + iters/100); -#ifndef USE_NUM_NONE - if (have_flag(argc, argv, "num") || have_flag(argc, argv, "jacobi")) run_benchmark("num_jacobi", bench_num_jacobi, bench_setup, NULL, &data, 10, iters*10); -#endif return 0; } diff --git a/src/field.h b/src/field.h index ee222ee5d..c58554b53 100644 --- a/src/field.h +++ b/src/field.h @@ -104,9 +104,6 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a); * itself. */ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a); -/** Checks whether a field element is a quadratic residue. */ -static int secp256k1_fe_is_quad_var(const secp256k1_fe *a); - /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be * at most 8. The output magnitude is 1 (but not guaranteed to be normalized). 
*/ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a); diff --git a/src/field_impl.h b/src/field_impl.h index 7b75e9860..70e2398cd 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -136,31 +136,6 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } -static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) { -#ifndef USE_NUM_NONE - unsigned char b[32]; - secp256k1_num n; - secp256k1_num m; - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - return secp256k1_num_jacobi(&n, &m) >= 0; -#else - secp256k1_fe r; - return secp256k1_fe_sqrt(&r, a); -#endif -} - static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); #endif /* SECP256K1_FIELD_IMPL_H */ diff --git a/src/group.h b/src/group.h index 40bf96122..e18c04071 100644 --- a/src/group.h +++ b/src/group.h @@ -43,12 +43,6 @@ typedef struct { /** Set a group element equal to the point with given X and Y coordinates */ static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const secp256k1_fe *y); -/** Set a group element (affine) equal to the point with the given X coordinate - * and a Y coordinate that is a quadratic residue modulo p. The return value - * is true iff a coordinate with the given X coordinate exists. - */ -static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x); - /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness * for Y. Return value indicates whether the result is valid. */ static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd); @@ -96,9 +90,6 @@ static void secp256k1_gej_neg(secp256k1_gej *r, const secp256k1_gej *a); /** Check whether a group element is the point at infinity. */ static int secp256k1_gej_is_infinity(const secp256k1_gej *a); -/** Check whether a group element's y coordinate is a quadratic residue. */ -static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a); - /** Set r equal to the double of a. Constant time. 
*/ static void secp256k1_gej_double(secp256k1_gej *r, const secp256k1_gej *a); diff --git a/src/group_impl.h b/src/group_impl.h index b7094c537..1324fadd9 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -207,18 +207,14 @@ static void secp256k1_ge_clear(secp256k1_ge *r) { secp256k1_fe_clear(&r->y); } -static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x) { +static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { secp256k1_fe x2, x3; r->x = *x; secp256k1_fe_sqr(&x2, x); secp256k1_fe_mul(&x3, x, &x2); r->infinity = 0; secp256k1_fe_add(&x3, &secp256k1_fe_const_b); - return secp256k1_fe_sqrt(&r->y, &x3); -} - -static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { - if (!secp256k1_ge_set_xquad(r, x)) { + if (!secp256k1_fe_sqrt(&r->y, &x3)) { return 0; } secp256k1_fe_normalize_var(&r->y); @@ -655,20 +651,6 @@ static void secp256k1_ge_mul_lambda(secp256k1_ge *r, const secp256k1_ge *a) { secp256k1_fe_mul(&r->x, &r->x, &beta); } -static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a) { - secp256k1_fe yz; - - if (a->infinity) { - return 0; - } - - /* We rely on the fact that the Jacobi symbol of 1 / a->z^3 is the same as - * that of a->z. Thus a->y / a->z^3 is a quadratic residue iff a->y * a->z - is */ - secp256k1_fe_mul(&yz, &a->y, &a->z); - return secp256k1_fe_is_quad_var(&yz); -} - static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) { #ifdef EXHAUSTIVE_TEST_ORDER secp256k1_gej out; diff --git a/src/tests.c b/src/tests.c index addeb8b66..ec0b6e0fb 100644 --- a/src/tests.c +++ b/src/tests.c @@ -750,74 +750,12 @@ void test_num_mod(void) { CHECK(secp256k1_num_is_zero(&n)); } -void test_num_jacobi(void) { - secp256k1_scalar sqr; - secp256k1_scalar small; - secp256k1_scalar five; /* five is not a quadratic residue */ - secp256k1_num order, n; - int i; - /* squares mod 5 are 1, 4 */ - const int jacobi5[10] = { 0, 1, -1, -1, 1, 0, 1, -1, -1, 1 }; - - /* check some small values with 5 as the order */ - secp256k1_scalar_set_int(&five, 5); - secp256k1_scalar_get_num(&order, &five); - for (i = 0; i < 10; ++i) { - secp256k1_scalar_set_int(&small, i); - secp256k1_scalar_get_num(&n, &small); - CHECK(secp256k1_num_jacobi(&n, &order) == jacobi5[i]); - } - - /** test large values with 5 as group order */ - secp256k1_scalar_get_num(&order, &five); - /* we first need a scalar which is not a multiple of 5 */ - do { - secp256k1_num fiven; - random_scalar_order_test(&sqr); - secp256k1_scalar_get_num(&fiven, &five); - secp256k1_scalar_get_num(&n, &sqr); - secp256k1_num_mod(&n, &fiven); - } while (secp256k1_num_is_zero(&n)); - /* next force it to be a residue. 2 is a nonresidue mod 5 so we can - * just multiply by two, i.e. 
add the number to itself */ - if (secp256k1_num_jacobi(&n, &order) == -1) { - secp256k1_num_add(&n, &n, &n); - } - - /* test residue */ - CHECK(secp256k1_num_jacobi(&n, &order) == 1); - /* test nonresidue */ - secp256k1_num_add(&n, &n, &n); - CHECK(secp256k1_num_jacobi(&n, &order) == -1); - - /** test with secp group order as order */ - secp256k1_scalar_order_get_num(&order); - random_scalar_order_test(&sqr); - secp256k1_scalar_mul(&sqr, &sqr, &sqr); - /* test residue */ - secp256k1_scalar_get_num(&n, &sqr); - CHECK(secp256k1_num_jacobi(&n, &order) == 1); - /* test nonresidue */ - secp256k1_scalar_mul(&sqr, &sqr, &five); - secp256k1_scalar_get_num(&n, &sqr); - CHECK(secp256k1_num_jacobi(&n, &order) == -1); - /* test multiple of the order*/ - CHECK(secp256k1_num_jacobi(&order, &order) == 0); - - /* check one less than the order */ - secp256k1_scalar_set_int(&small, 1); - secp256k1_scalar_get_num(&n, &small); - secp256k1_num_sub(&n, &order, &n); - CHECK(secp256k1_num_jacobi(&n, &order) == 1); /* sage confirms this is 1 */ -} - void run_num_smalltests(void) { int i; for (i = 0; i < 100*count; i++) { test_num_negate(); test_num_add_sub(); test_num_mod(); - test_num_jacobi(); } } #endif @@ -2959,64 +2897,35 @@ void run_ec_combine(void) { void test_group_decompress(const secp256k1_fe* x) { /* The input itself, normalized. */ secp256k1_fe fex = *x; - secp256k1_fe fez; - /* Results of set_xquad_var, set_xo_var(..., 0), set_xo_var(..., 1). */ - secp256k1_ge ge_quad, ge_even, ge_odd; - secp256k1_gej gej_quad; + /* Results of set_xo_var(..., 0), set_xo_var(..., 1). */ + secp256k1_ge ge_even, ge_odd; /* Return values of the above calls. */ - int res_quad, res_even, res_odd; + int res_even, res_odd; secp256k1_fe_normalize_var(&fex); - res_quad = secp256k1_ge_set_xquad(&ge_quad, &fex); res_even = secp256k1_ge_set_xo_var(&ge_even, &fex, 0); res_odd = secp256k1_ge_set_xo_var(&ge_odd, &fex, 1); - CHECK(res_quad == res_even); - CHECK(res_quad == res_odd); + CHECK(res_even == res_odd); - if (res_quad) { - secp256k1_fe_normalize_var(&ge_quad.x); + if (res_even) { secp256k1_fe_normalize_var(&ge_odd.x); secp256k1_fe_normalize_var(&ge_even.x); - secp256k1_fe_normalize_var(&ge_quad.y); secp256k1_fe_normalize_var(&ge_odd.y); secp256k1_fe_normalize_var(&ge_even.y); /* No infinity allowed. */ - CHECK(!ge_quad.infinity); CHECK(!ge_even.infinity); CHECK(!ge_odd.infinity); /* Check that the x coordinates check out. */ - CHECK(secp256k1_fe_equal_var(&ge_quad.x, x)); CHECK(secp256k1_fe_equal_var(&ge_even.x, x)); CHECK(secp256k1_fe_equal_var(&ge_odd.x, x)); - /* Check that the Y coordinate result in ge_quad is a square. */ - CHECK(secp256k1_fe_is_quad_var(&ge_quad.y)); - /* Check odd/even Y in ge_odd, ge_even. */ CHECK(secp256k1_fe_is_odd(&ge_odd.y)); CHECK(!secp256k1_fe_is_odd(&ge_even.y)); - - /* Check secp256k1_gej_has_quad_y_var. 
*/ - secp256k1_gej_set_ge(&gej_quad, &ge_quad); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); - do { - random_fe_test(&fez); - } while (secp256k1_fe_is_zero(&fez)); - secp256k1_gej_rescale(&gej_quad, &fez); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); - secp256k1_gej_neg(&gej_quad, &gej_quad); - CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); - do { - random_fe_test(&fez); - } while (secp256k1_fe_is_zero(&fez)); - secp256k1_gej_rescale(&gej_quad, &fez); - CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); - secp256k1_gej_neg(&gej_quad, &gej_quad); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); } } From 1f233b3fa05eb29a744487e0682d925055fb0d4c Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 11 Oct 2020 16:04:58 -0700 Subject: [PATCH 14/59] Remove num/gmp support The whole "num" API and its libgmp-based implementation are now unused. Remove them. --- .cirrus.yml | 17 +-- Makefile.am | 4 - build-aux/m4/bitcoin_secp.m4 | 13 -- ci/cirrus.sh | 2 +- ci/linux-debian.Dockerfile | 4 +- configure.ac | 58 ------- src/basic-config.h | 9 -- src/bench_ecmult.c | 1 - src/bench_internal.c | 1 - src/ecmult.h | 1 - src/field_impl.h | 1 - src/group.h | 1 - src/group_impl.h | 1 - src/num.h | 74 --------- src/num_gmp.h | 20 --- src/num_gmp_impl.h | 288 ----------------------------------- src/num_impl.h | 24 --- src/scalar.h | 9 -- src/scalar_impl.h | 28 ---- src/secp256k1.c | 1 - src/tests.c | 264 -------------------------------- 21 files changed, 6 insertions(+), 815 deletions(-) delete mode 100644 src/num.h delete mode 100644 src/num_gmp.h delete mode 100644 src/num_gmp_impl.h delete mode 100644 src/num_impl.h diff --git a/.cirrus.yml b/.cirrus.yml index 9399fbda4..506a86033 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,6 +1,5 @@ env: WIDEMUL: auto - BIGNUM: auto STATICPRECOMPUTATION: yes ECMULTGENPRECISION: auto ASM: no @@ -59,9 +58,8 @@ task: - env: {WIDEMUL: int128, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ECDH: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ASM: x86_64} - - env: {BIGNUM: no} - - env: {BIGNUM: no, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - - env: {BIGNUM: no, STATICPRECOMPUTATION: no} + - env: { RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} + - env: { STATICPRECOMPUTATION: no} - env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETEST: no, BENCH: no} - env: {CPPFLAGS: -DDETERMINISTIC} - env: {CFLAGS: -O0, CTIMETEST: no} @@ -69,7 +67,6 @@ task: CFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" LDFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" - BIGNUM: no ASM: x86_64 ECDH: yes RECOVERY: yes @@ -80,7 +77,6 @@ task: - env: { ECMULTGENPRECISION: 8 } - env: RUN_VALGRIND: yes - BIGNUM: no ASM: x86_64 ECDH: yes RECOVERY: yes @@ -115,12 +111,6 @@ task: CC: i686-linux-gnu-gcc - env: CC: clang --target=i686-pc-linux-gnu -isystem /usr/i686-linux-gnu/include - matrix: - - env: - BIGNUM: gmp - - env: - BIGNUM: no - << : *MERGE_BASE test_script: - ./ci/cirrus.sh << : *CAT_LOGS @@ -178,7 +168,7 @@ task: # If we haven't restored from cached (and just run brew install), this is a no-op. 
- brew link valgrind brew_script: - - brew install automake libtool gmp gcc@9 + - brew install automake libtool gcc@9 << : *MERGE_BASE test_script: - ./ci/cirrus.sh @@ -195,7 +185,6 @@ task: HOST: s390x-linux-gnu BUILD: WITH_VALGRIND: no - BIGNUM: no ECDH: yes RECOVERY: yes EXPERIMENTAL: yes diff --git a/Makefile.am b/Makefile.am index c399cff08..58c9635e5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,8 +14,6 @@ noinst_HEADERS += src/scalar_8x32_impl.h noinst_HEADERS += src/scalar_low_impl.h noinst_HEADERS += src/group.h noinst_HEADERS += src/group_impl.h -noinst_HEADERS += src/num_gmp.h -noinst_HEADERS += src/num_gmp_impl.h noinst_HEADERS += src/ecdsa.h noinst_HEADERS += src/ecdsa_impl.h noinst_HEADERS += src/eckey.h @@ -26,8 +24,6 @@ noinst_HEADERS += src/ecmult_const.h noinst_HEADERS += src/ecmult_const_impl.h noinst_HEADERS += src/ecmult_gen.h noinst_HEADERS += src/ecmult_gen_impl.h -noinst_HEADERS += src/num.h -noinst_HEADERS += src/num_impl.h noinst_HEADERS += src/field_10x26.h noinst_HEADERS += src/field_10x26_impl.h noinst_HEADERS += src/field_5x52.h diff --git a/build-aux/m4/bitcoin_secp.m4 b/build-aux/m4/bitcoin_secp.m4 index 7b48a5e58..e57888ca1 100644 --- a/build-aux/m4/bitcoin_secp.m4 +++ b/build-aux/m4/bitcoin_secp.m4 @@ -75,19 +75,6 @@ if test x"$has_libcrypto" = x"yes" && test x"$has_openssl_ec" = x; then fi ]) -dnl -AC_DEFUN([SECP_GMP_CHECK],[ -if test x"$has_gmp" != x"yes"; then - CPPFLAGS_TEMP="$CPPFLAGS" - CPPFLAGS="$GMP_CPPFLAGS $CPPFLAGS" - LIBS_TEMP="$LIBS" - LIBS="$GMP_LIBS $LIBS" - AC_CHECK_HEADER(gmp.h,[AC_CHECK_LIB(gmp, __gmpz_init,[has_gmp=yes; GMP_LIBS="$GMP_LIBS -lgmp"; AC_DEFINE(HAVE_LIBGMP,1,[Define this symbol if libgmp is installed])])]) - CPPFLAGS="$CPPFLAGS_TEMP" - LIBS="$LIBS_TEMP" -fi -]) - AC_DEFUN([SECP_VALGRIND_CHECK],[ if test x"$has_valgrind" != x"yes"; then CPPFLAGS_TEMP="$CPPFLAGS" diff --git a/ci/cirrus.sh b/ci/cirrus.sh index f223a91ca..f26ca98d1 100755 --- a/ci/cirrus.sh +++ b/ci/cirrus.sh @@ -14,7 +14,7 @@ valgrind --version || true ./configure \ --enable-experimental="$EXPERIMENTAL" \ - --with-test-override-wide-multiply="$WIDEMUL" --with-bignum="$BIGNUM" --with-asm="$ASM" \ + --with-test-override-wide-multiply="$WIDEMUL" --with-asm="$ASM" \ --enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-precision="$ECMULTGENPRECISION" \ --enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \ --enable-module-schnorrsig="$SCHNORRSIG" \ diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index 201ace4f6..5967cf8b3 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -8,6 +8,6 @@ RUN apt-get update RUN apt-get install --no-install-recommends --no-upgrade -y \ git ca-certificates \ make automake libtool pkg-config dpkg-dev valgrind qemu-user \ - gcc clang libc6-dbg libgmp-dev \ - gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libgmp-dev:i386 \ + gcc clang libc6-dbg \ + gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 \ gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x diff --git a/configure.ac b/configure.ac index fd15d3413..e84005edf 100644 --- a/configure.ac +++ b/configure.ac @@ -48,17 +48,12 @@ case $host_os in # in expected paths because they may conflict with system files. Ask # Homebrew where each one is located, then adjust paths accordingly. 
openssl_prefix=`$BREW --prefix openssl 2>/dev/null` - gmp_prefix=`$BREW --prefix gmp 2>/dev/null` valgrind_prefix=`$BREW --prefix valgrind 2>/dev/null` if test x$openssl_prefix != x; then PKG_CONFIG_PATH="$openssl_prefix/lib/pkgconfig:$PKG_CONFIG_PATH" export PKG_CONFIG_PATH CRYPTO_CPPFLAGS="-I$openssl_prefix/include" fi - if test x$gmp_prefix != x; then - GMP_CPPFLAGS="-I$gmp_prefix/include" - GMP_LIBS="-L$gmp_prefix/lib" - fi if test x$valgrind_prefix != x; then VALGRIND_CPPFLAGS="-I$valgrind_prefix/include" fi @@ -164,9 +159,6 @@ AC_ARG_ENABLE(external_default_callbacks, # Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default). AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto]) -AC_ARG_WITH([bignum], [AS_HELP_STRING([--with-bignum=gmp|no|auto], -[bignum implementation to use [default=auto]])],[req_bignum=$withval], [req_bignum=auto]) - AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto], [assembly optimizations to use (experimental: arm) [default=auto]])],[req_asm=$withval], [req_asm=auto]) @@ -245,32 +237,6 @@ else esac fi -if test x"$req_bignum" = x"auto"; then - SECP_GMP_CHECK - if test x"$has_gmp" = x"yes"; then - set_bignum=gmp - fi - - if test x"$set_bignum" = x; then - set_bignum=no - fi -else - set_bignum=$req_bignum - case $set_bignum in - gmp) - SECP_GMP_CHECK - if test x"$has_gmp" != x"yes"; then - AC_MSG_ERROR([gmp bignum explicitly requested but libgmp not available]) - fi - ;; - no) - ;; - *) - AC_MSG_ERROR([invalid bignum implementation selection]) - ;; - esac -fi - # Select assembly optimization use_external_asm=no @@ -308,24 +274,6 @@ auto) ;; esac -# Select bignum implementation -case $set_bignum in -gmp) - AC_DEFINE(HAVE_LIBGMP, 1, [Define this symbol if libgmp is installed]) - AC_DEFINE(USE_NUM_GMP, 1, [Define this symbol to use the gmp implementation for num]) - AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation]) - AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation]) - ;; -no) - AC_DEFINE(USE_NUM_NONE, 1, [Define this symbol to use no num implementation]) - AC_DEFINE(USE_FIELD_INV_BUILTIN, 1, [Define this symbol to use the native field inverse implementation]) - AC_DEFINE(USE_SCALAR_INV_BUILTIN, 1, [Define this symbol to use the native scalar inverse implementation]) - ;; -*) - AC_MSG_ERROR([invalid bignum implementation]) - ;; -esac - # Set ecmult window size if test x"$req_ecmult_window" = x"auto"; then set_ecmult_window=15 @@ -390,11 +338,6 @@ else enable_openssl_tests=no fi -if test x"$set_bignum" = x"gmp"; then - SECP_LIBS="$SECP_LIBS $GMP_LIBS" - SECP_INCLUDES="$SECP_INCLUDES $GMP_CPPFLAGS" -fi - if test x"$enable_valgrind" = x"yes"; then SECP_INCLUDES="$SECP_INCLUDES $VALGRIND_CPPFLAGS" fi @@ -571,7 +514,6 @@ echo " module extrakeys = $enable_module_extrakeys" echo " module schnorrsig = $enable_module_schnorrsig" echo echo " asm = $set_asm" -echo " bignum = $set_bignum" echo " ecmult window size = $set_ecmult_window" echo " ecmult gen prec. bits = $set_ecmult_gen_precision" # Hide test-only options unless they're used. 
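With the num/gmp backend removed, the built-in field and scalar inverse implementations become the only code paths (the USE_FIELD_INV_NUM / USE_SCALAR_INV_NUM selections above are gone). One visible consequence, as the tests.c hunk further below shows, is that the consistency check between the two scalar-inverse variants now runs unconditionally. Roughly (an illustrative sketch using the library's internal API, not part of the patch):

```
/* Illustration only: with the gmp path gone, the variable-time scalar
 * inverse must always agree with the constant-time one. */
static void check_inverse_agreement(const secp256k1_scalar *y) {
    secp256k1_scalar z, zv;
    if (secp256k1_scalar_is_zero(y)) return;
    secp256k1_scalar_inverse(&z, y);       /* constant time */
    secp256k1_scalar_inverse_var(&zv, y);  /* variable time */
    CHECK(secp256k1_scalar_eq(&zv, &z));
}
```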
diff --git a/src/basic-config.h b/src/basic-config.h index bb6b58259..e4b1b8b05 100644 --- a/src/basic-config.h +++ b/src/basic-config.h @@ -13,19 +13,10 @@ #undef USE_ECMULT_STATIC_PRECOMPUTATION #undef USE_EXTERNAL_ASM #undef USE_EXTERNAL_DEFAULT_CALLBACKS -#undef USE_FIELD_INV_BUILTIN -#undef USE_FIELD_INV_NUM -#undef USE_NUM_GMP -#undef USE_NUM_NONE -#undef USE_SCALAR_INV_BUILTIN -#undef USE_SCALAR_INV_NUM #undef USE_FORCE_WIDEMUL_INT64 #undef USE_FORCE_WIDEMUL_INT128 #undef ECMULT_WINDOW_SIZE -#define USE_NUM_NONE 1 -#define USE_FIELD_INV_BUILTIN 1 -#define USE_SCALAR_INV_BUILTIN 1 #define USE_WIDEMUL_64 1 #define ECMULT_WINDOW_SIZE 15 diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index 85b9e439e..204e85a5d 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -9,7 +9,6 @@ #include "util.h" #include "hash_impl.h" -#include "num_impl.h" #include "field_impl.h" #include "group_impl.h" #include "scalar_impl.h" diff --git a/src/bench_internal.c b/src/bench_internal.c index 82a3eb6a0..8e7ffcb0d 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -10,7 +10,6 @@ #include "assumptions.h" #include "util.h" #include "hash_impl.h" -#include "num_impl.h" #include "field_impl.h" #include "group_impl.h" #include "scalar_impl.h" diff --git a/src/ecmult.h b/src/ecmult.h index 7aa394a11..7ab617e20 100644 --- a/src/ecmult.h +++ b/src/ecmult.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_ECMULT_H #define SECP256K1_ECMULT_H -#include "num.h" #include "group.h" #include "scalar.h" #include "scratch.h" diff --git a/src/field_impl.h b/src/field_impl.h index 70e2398cd..374284a1f 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -12,7 +12,6 @@ #endif #include "util.h" -#include "num.h" #if defined(SECP256K1_WIDEMUL_INT128) #include "field_5x52_impl.h" diff --git a/src/group.h b/src/group.h index e18c04071..b9cd334da 100644 --- a/src/group.h +++ b/src/group.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_GROUP_H #define SECP256K1_GROUP_H -#include "num.h" #include "field.h" /** A group element of the secp256k1 curve, in affine coordinates. */ diff --git a/src/group_impl.h b/src/group_impl.h index 1324fadd9..19ebd8f44 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_GROUP_IMPL_H #define SECP256K1_GROUP_IMPL_H -#include "num.h" #include "field.h" #include "group.h" diff --git a/src/num.h b/src/num.h deleted file mode 100644 index 59a5cf2d7..000000000 --- a/src/num.h +++ /dev/null @@ -1,74 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_H -#define SECP256K1_NUM_H - -#ifndef USE_NUM_NONE - -#if defined HAVE_CONFIG_H -#include "libsecp256k1-config.h" -#endif - -#if defined(USE_NUM_GMP) -#include "num_gmp.h" -#else -#error "Please select num implementation" -#endif - -/** Copy a number. */ -static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a); - -/** Convert a number's absolute value to a binary big-endian string. - * There must be enough place. */ -static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a); - -/** Set a number to the value of a binary big-endian string. 
*/ -static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen); - -/** Compute a modular inverse. The input must be less than the modulus. */ -static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m); - -/** Compute the jacobi symbol (a|b). b must be positive and odd. */ -static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b); - -/** Compare the absolute value of two numbers. */ -static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b); - -/** Test whether two number are equal (including sign). */ -static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b); - -/** Add two (signed) numbers. */ -static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Subtract two (signed) numbers. */ -static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Multiply two (signed) numbers. */ -static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Replace a number by its remainder modulo m. M's sign is ignored. The result is a number between 0 and m-1, - even if r was negative. */ -static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m); - -/** Right-shift the passed number by bits bits. */ -static void secp256k1_num_shift(secp256k1_num *r, int bits); - -/** Check whether a number is zero. */ -static int secp256k1_num_is_zero(const secp256k1_num *a); - -/** Check whether a number is one. */ -static int secp256k1_num_is_one(const secp256k1_num *a); - -/** Check whether a number is strictly negative. */ -static int secp256k1_num_is_neg(const secp256k1_num *a); - -/** Change a number's sign. 
 */
-static void secp256k1_num_negate(secp256k1_num *r);
-
-#endif
-
-#endif /* SECP256K1_NUM_H */
diff --git a/src/num_gmp.h b/src/num_gmp.h
deleted file mode 100644
index cc6c51a5f..000000000
--- a/src/num_gmp.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/***********************************************************************
- * Copyright (c) 2013, 2014 Pieter Wuille                              *
- * Distributed under the MIT software license, see the accompanying   *
- * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
- ***********************************************************************/
-
-#ifndef SECP256K1_NUM_REPR_H
-#define SECP256K1_NUM_REPR_H
-
-#include <gmp.h>
-
-#define NUM_LIMBS ((256+GMP_NUMB_BITS-1)/GMP_NUMB_BITS)
-
-typedef struct {
-    mp_limb_t data[2*NUM_LIMBS];
-    int neg;
-    int limbs;
-} secp256k1_num;
-
-#endif /* SECP256K1_NUM_REPR_H */
diff --git a/src/num_gmp_impl.h b/src/num_gmp_impl.h
deleted file mode 100644
index c07947dd9..000000000
--- a/src/num_gmp_impl.h
+++ /dev/null
@@ -1,288 +0,0 @@
-/***********************************************************************
- * Copyright (c) 2013, 2014 Pieter Wuille                              *
- * Distributed under the MIT software license, see the accompanying   *
- * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
- ***********************************************************************/
-
-#ifndef SECP256K1_NUM_REPR_IMPL_H
-#define SECP256K1_NUM_REPR_IMPL_H
-
-#include <string.h>
-#include <stdlib.h>
-#include <gmp.h>
-
-#include "util.h"
-#include "num.h"
-
-#ifdef VERIFY
-static void secp256k1_num_sanity(const secp256k1_num *a) {
-    VERIFY_CHECK(a->limbs == 1 || (a->limbs > 1 && a->data[a->limbs-1] != 0));
-}
-#else
-#define secp256k1_num_sanity(a) do { } while(0)
-#endif
-
-static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a) {
-    *r = *a;
-}
-
-static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a) {
-    unsigned char tmp[65];
-    int len = 0;
-    int shift = 0;
-    if (a->limbs>1 || a->data[0] != 0) {
-        len = mpn_get_str(tmp, 256, (mp_limb_t*)a->data, a->limbs);
-    }
-    while (shift < len && tmp[shift] == 0) shift++;
-    VERIFY_CHECK(len-shift <= (int)rlen);
-    memset(r, 0, rlen - len + shift);
-    if (len > shift) {
-        memcpy(r + rlen - len + shift, tmp + shift, len - shift);
-    }
-    memset(tmp, 0, sizeof(tmp));
-}
-
-static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen) {
-    int len;
-    VERIFY_CHECK(alen > 0);
-    VERIFY_CHECK(alen <= 64);
-    len = mpn_set_str(r->data, a, alen, 256);
-    if (len == 0) {
-        r->data[0] = 0;
-        len = 1;
-    }
-    VERIFY_CHECK(len <= NUM_LIMBS*2);
-    r->limbs = len;
-    r->neg = 0;
-    while (r->limbs > 1 && r->data[r->limbs-1]==0) {
-        r->limbs--;
-    }
-}
-
-static void secp256k1_num_add_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
-    mp_limb_t c = mpn_add(r->data, a->data, a->limbs, b->data, b->limbs);
-    r->limbs = a->limbs;
-    if (c != 0) {
-        VERIFY_CHECK(r->limbs < 2*NUM_LIMBS);
-        r->data[r->limbs++] = c;
-    }
-}
-
-static void secp256k1_num_sub_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
-    mp_limb_t c = mpn_sub(r->data, a->data, a->limbs, b->data, b->limbs);
-    (void)c;
-    VERIFY_CHECK(c == 0);
-    r->limbs = a->limbs;
-    while (r->limbs > 1 && r->data[r->limbs-1]==0) {
-        r->limbs--;
-    }
-}
-
-static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m) {
-    secp256k1_num_sanity(r);
-    secp256k1_num_sanity(m);
-
-    if (r->limbs >= m->limbs) {
-        mp_limb_t t[2*NUM_LIMBS];
-        mpn_tdiv_qr(t, r->data, 0, r->data, r->limbs, m->data,
m->limbs); - memset(t, 0, sizeof(t)); - r->limbs = m->limbs; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } - } - - if (r->neg && (r->limbs > 1 || r->data[0] != 0)) { - secp256k1_num_sub_abs(r, m, r); - r->neg = 0; - } -} - -static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m) { - int i; - mp_limb_t g[NUM_LIMBS+1]; - mp_limb_t u[NUM_LIMBS+1]; - mp_limb_t v[NUM_LIMBS+1]; - mp_size_t sn; - mp_size_t gn; - secp256k1_num_sanity(a); - secp256k1_num_sanity(m); - - /** mpn_gcdext computes: (G,S) = gcdext(U,V), where - * * G = gcd(U,V) - * * G = U*S + V*T - * * U has equal or more limbs than V, and V has no padding - * If we set U to be (a padded version of) a, and V = m: - * G = a*S + m*T - * G = a*S mod m - * Assuming G=1: - * S = 1/a mod m - */ - VERIFY_CHECK(m->limbs <= NUM_LIMBS); - VERIFY_CHECK(m->data[m->limbs-1] != 0); - for (i = 0; i < m->limbs; i++) { - u[i] = (i < a->limbs) ? a->data[i] : 0; - v[i] = m->data[i]; - } - sn = NUM_LIMBS+1; - gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs); - (void)gn; - VERIFY_CHECK(gn == 1); - VERIFY_CHECK(g[0] == 1); - r->neg = a->neg ^ m->neg; - if (sn < 0) { - mpn_sub(r->data, m->data, m->limbs, r->data, -sn); - r->limbs = m->limbs; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } - } else { - r->limbs = sn; - } - memset(g, 0, sizeof(g)); - memset(u, 0, sizeof(u)); - memset(v, 0, sizeof(v)); -} - -static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b) { - int ret; - mpz_t ga, gb; - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - VERIFY_CHECK(!b->neg && (b->limbs > 0) && (b->data[0] & 1)); - - mpz_inits(ga, gb, NULL); - - mpz_import(gb, b->limbs, -1, sizeof(mp_limb_t), 0, 0, b->data); - mpz_import(ga, a->limbs, -1, sizeof(mp_limb_t), 0, 0, a->data); - if (a->neg) { - mpz_neg(ga, ga); - } - - ret = mpz_jacobi(ga, gb); - - mpz_clears(ga, gb, NULL); - - return ret; -} - -static int secp256k1_num_is_one(const secp256k1_num *a) { - return (a->limbs == 1 && a->data[0] == 1); -} - -static int secp256k1_num_is_zero(const secp256k1_num *a) { - return (a->limbs == 1 && a->data[0] == 0); -} - -static int secp256k1_num_is_neg(const secp256k1_num *a) { - return (a->limbs > 1 || a->data[0] != 0) && a->neg; -} - -static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b) { - if (a->limbs > b->limbs) { - return 1; - } - if (a->limbs < b->limbs) { - return -1; - } - return mpn_cmp(a->data, b->data, a->limbs); -} - -static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b) { - if (a->limbs > b->limbs) { - return 0; - } - if (a->limbs < b->limbs) { - return 0; - } - if ((a->neg && !secp256k1_num_is_zero(a)) != (b->neg && !secp256k1_num_is_zero(b))) { - return 0; - } - return mpn_cmp(a->data, b->data, a->limbs) == 0; -} - -static void secp256k1_num_subadd(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b, int bneg) { - if (!(b->neg ^ bneg ^ a->neg)) { /* a and b have the same sign */ - r->neg = a->neg; - if (a->limbs >= b->limbs) { - secp256k1_num_add_abs(r, a, b); - } else { - secp256k1_num_add_abs(r, b, a); - } - } else { - if (secp256k1_num_cmp(a, b) > 0) { - r->neg = a->neg; - secp256k1_num_sub_abs(r, a, b); - } else { - r->neg = b->neg ^ bneg; - secp256k1_num_sub_abs(r, b, a); - } - } -} - -static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - secp256k1_num_subadd(r, a, b, 0); -} - 
-static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - secp256k1_num_subadd(r, a, b, 1); -} - -static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - mp_limb_t tmp[2*NUM_LIMBS+1]; - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - - VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1); - if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) { - r->limbs = 1; - r->neg = 0; - r->data[0] = 0; - return; - } - if (a->limbs >= b->limbs) { - mpn_mul(tmp, a->data, a->limbs, b->data, b->limbs); - } else { - mpn_mul(tmp, b->data, b->limbs, a->data, a->limbs); - } - r->limbs = a->limbs + b->limbs; - if (r->limbs > 1 && tmp[r->limbs - 1]==0) { - r->limbs--; - } - VERIFY_CHECK(r->limbs <= 2*NUM_LIMBS); - mpn_copyi(r->data, tmp, r->limbs); - r->neg = a->neg ^ b->neg; - memset(tmp, 0, sizeof(tmp)); -} - -static void secp256k1_num_shift(secp256k1_num *r, int bits) { - if (bits % GMP_NUMB_BITS) { - /* Shift within limbs. */ - mpn_rshift(r->data, r->data, r->limbs, bits % GMP_NUMB_BITS); - } - if (bits >= GMP_NUMB_BITS) { - int i; - /* Shift full limbs. */ - for (i = 0; i < r->limbs; i++) { - int index = i + (bits / GMP_NUMB_BITS); - if (index < r->limbs && index < 2*NUM_LIMBS) { - r->data[i] = r->data[index]; - } else { - r->data[i] = 0; - } - } - } - while (r->limbs>1 && r->data[r->limbs-1]==0) { - r->limbs--; - } -} - -static void secp256k1_num_negate(secp256k1_num *r) { - r->neg ^= 1; -} - -#endif /* SECP256K1_NUM_REPR_IMPL_H */ diff --git a/src/num_impl.h b/src/num_impl.h deleted file mode 100644 index 880598efe..000000000 --- a/src/num_impl.h +++ /dev/null @@ -1,24 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_IMPL_H -#define SECP256K1_NUM_IMPL_H - -#if defined HAVE_CONFIG_H -#include "libsecp256k1-config.h" -#endif - -#include "num.h" - -#if defined(USE_NUM_GMP) -#include "num_gmp_impl.h" -#elif defined(USE_NUM_NONE) -/* Nothing. */ -#else -#error "Please select num implementation" -#endif - -#endif /* SECP256K1_NUM_IMPL_H */ diff --git a/src/scalar.h b/src/scalar.h index d7c42cba8..aaaa3d882 100644 --- a/src/scalar.h +++ b/src/scalar.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_SCALAR_H #define SECP256K1_SCALAR_H -#include "num.h" #include "util.h" #if defined HAVE_CONFIG_H @@ -88,14 +87,6 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a); * Returns -1 if the number was negated, 1 otherwise */ static int secp256k1_scalar_cond_negate(secp256k1_scalar *a, int flag); -#ifndef USE_NUM_NONE -/** Convert a scalar to a number. */ -static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a); - -/** Get the order of the group as a number. */ -static void secp256k1_scalar_order_get_num(secp256k1_num *r); -#endif - /** Compare two scalars. 
*/ static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b); diff --git a/src/scalar_impl.h b/src/scalar_impl.h index b328afdb9..e12447477 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -31,34 +31,6 @@ static const secp256k1_scalar secp256k1_scalar_one = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1); static const secp256k1_scalar secp256k1_scalar_zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0); -#ifndef USE_NUM_NONE -static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a) { - unsigned char c[32]; - secp256k1_scalar_get_b32(c, a); - secp256k1_num_set_bin(r, c, 32); -} - -/** secp256k1 curve order, see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h */ -static void secp256k1_scalar_order_get_num(secp256k1_num *r) { -#if defined(EXHAUSTIVE_TEST_ORDER) - static const unsigned char order[32] = { - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,EXHAUSTIVE_TEST_ORDER - }; -#else - static const unsigned char order[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE, - 0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B, - 0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41 - }; -#endif - secp256k1_num_set_bin(r, order, 32); -} -#endif - static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned char *bin) { int overflow; secp256k1_scalar_set_b32(r, bin, &overflow); diff --git a/src/secp256k1.c b/src/secp256k1.c index 4f56c27c8..aef3f99ac 100644 --- a/src/secp256k1.c +++ b/src/secp256k1.c @@ -9,7 +9,6 @@ #include "assumptions.h" #include "util.h" -#include "num_impl.h" #include "field_impl.h" #include "scalar_impl.h" #include "group_impl.h" diff --git a/src/tests.c b/src/tests.c index ec0b6e0fb..ba645bbe8 100644 --- a/src/tests.c +++ b/src/tests.c @@ -636,130 +636,6 @@ void run_rand_int(void) { } } -/***** NUM TESTS *****/ - -#ifndef USE_NUM_NONE -void random_num_negate(secp256k1_num *num) { - if (secp256k1_testrand_bits(1)) { - secp256k1_num_negate(num); - } -} - -void random_num_order_test(secp256k1_num *num) { - secp256k1_scalar sc; - random_scalar_order_test(&sc); - secp256k1_scalar_get_num(num, &sc); -} - -void random_num_order(secp256k1_num *num) { - secp256k1_scalar sc; - random_scalar_order(&sc); - secp256k1_scalar_get_num(num, &sc); -} - -void test_num_negate(void) { - secp256k1_num n1; - secp256k1_num n2; - random_num_order_test(&n1); /* n1 = R */ - random_num_negate(&n1); - secp256k1_num_copy(&n2, &n1); /* n2 = R */ - secp256k1_num_sub(&n1, &n2, &n1); /* n1 = n2-n1 = 0 */ - CHECK(secp256k1_num_is_zero(&n1)); - secp256k1_num_copy(&n1, &n2); /* n1 = R */ - secp256k1_num_negate(&n1); /* n1 = -R */ - CHECK(!secp256k1_num_is_zero(&n1)); - secp256k1_num_add(&n1, &n2, &n1); /* n1 = n2+n1 = 0 */ - CHECK(secp256k1_num_is_zero(&n1)); - secp256k1_num_copy(&n1, &n2); /* n1 = R */ - secp256k1_num_negate(&n1); /* n1 = -R */ - CHECK(secp256k1_num_is_neg(&n1) != secp256k1_num_is_neg(&n2)); - secp256k1_num_negate(&n1); /* n1 = R */ - CHECK(secp256k1_num_eq(&n1, &n2)); -} - -void test_num_add_sub(void) { - int i; - secp256k1_scalar s; - secp256k1_num n1; - secp256k1_num n2; - secp256k1_num n1p2, n2p1, n1m2, n2m1; - random_num_order_test(&n1); /* n1 = R1 */ - if (secp256k1_testrand_bits(1)) { - random_num_negate(&n1); - } - random_num_order_test(&n2); /* n2 = R2 */ - if (secp256k1_testrand_bits(1)) { - random_num_negate(&n2); - } - secp256k1_num_add(&n1p2, &n1, &n2); /* n1p2 = R1 + R2 */ - secp256k1_num_add(&n2p1, &n2, &n1); /* n2p1 = R2 + R1 */ - secp256k1_num_sub(&n1m2, 
&n1, &n2); /* n1m2 = R1 - R2 */ - secp256k1_num_sub(&n2m1, &n2, &n1); /* n2m1 = R2 - R1 */ - CHECK(secp256k1_num_eq(&n1p2, &n2p1)); - CHECK(!secp256k1_num_eq(&n1p2, &n1m2)); - secp256k1_num_negate(&n2m1); /* n2m1 = -R2 + R1 */ - CHECK(secp256k1_num_eq(&n2m1, &n1m2)); - CHECK(!secp256k1_num_eq(&n2m1, &n1)); - secp256k1_num_add(&n2m1, &n2m1, &n2); /* n2m1 = -R2 + R1 + R2 = R1 */ - CHECK(secp256k1_num_eq(&n2m1, &n1)); - CHECK(!secp256k1_num_eq(&n2p1, &n1)); - secp256k1_num_sub(&n2p1, &n2p1, &n2); /* n2p1 = R2 + R1 - R2 = R1 */ - CHECK(secp256k1_num_eq(&n2p1, &n1)); - - /* check is_one */ - secp256k1_scalar_set_int(&s, 1); - secp256k1_scalar_get_num(&n1, &s); - CHECK(secp256k1_num_is_one(&n1)); - /* check that 2^n + 1 is never 1 */ - secp256k1_scalar_get_num(&n2, &s); - for (i = 0; i < 250; ++i) { - secp256k1_num_add(&n1, &n1, &n1); /* n1 *= 2 */ - secp256k1_num_add(&n1p2, &n1, &n2); /* n1p2 = n1 + 1 */ - CHECK(!secp256k1_num_is_one(&n1p2)); - } -} - -void test_num_mod(void) { - int i; - secp256k1_scalar s; - secp256k1_num order, n; - - /* check that 0 mod anything is 0 */ - random_scalar_order_test(&s); - secp256k1_scalar_get_num(&order, &s); - secp256k1_scalar_set_int(&s, 0); - secp256k1_scalar_get_num(&n, &s); - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); - - /* check that anything mod 1 is 0 */ - secp256k1_scalar_set_int(&s, 1); - secp256k1_scalar_get_num(&order, &s); - secp256k1_scalar_get_num(&n, &s); - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); - - /* check that increasing the number past 2^256 does not break this */ - random_scalar_order_test(&s); - secp256k1_scalar_get_num(&n, &s); - /* multiply by 2^8, which'll test this case with high probability */ - for (i = 0; i < 8; ++i) { - secp256k1_num_add(&n, &n, &n); - } - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); -} - -void run_num_smalltests(void) { - int i; - for (i = 0; i < 100*count; i++) { - test_num_negate(); - test_num_add_sub(); - test_num_mod(); - } -} -#endif - /***** MODINV TESTS *****/ /* Compute the modular inverse of (odd) x mod 2^64. */ @@ -1202,10 +1078,6 @@ void scalar_test(void) { secp256k1_scalar s; secp256k1_scalar s1; secp256k1_scalar s2; -#ifndef USE_NUM_NONE - secp256k1_num snum, s1num, s2num; - secp256k1_num order, half_order; -#endif unsigned char c[32]; /* Set 's' to a random scalar, with value 'snum'. */ @@ -1218,16 +1090,6 @@ void scalar_test(void) { random_scalar_order_test(&s2); secp256k1_scalar_get_b32(c, &s2); -#ifndef USE_NUM_NONE - secp256k1_scalar_get_num(&snum, &s); - secp256k1_scalar_get_num(&s1num, &s1); - secp256k1_scalar_get_num(&s2num, &s2); - - secp256k1_scalar_order_get_num(&order); - half_order = order; - secp256k1_num_shift(&half_order, 1); -#endif - { int i; /* Test that fetching groups of 4 bits from a scalar and recursing n(i)=16*n(i-1)+p(i) reconstructs it. */ @@ -1267,80 +1129,6 @@ void scalar_test(void) { CHECK(secp256k1_scalar_eq(&n, &s)); } -#ifndef USE_NUM_NONE - { - /* Test that adding the scalars together is equal to adding their numbers together modulo the order. */ - secp256k1_num rnum; - secp256k1_num r2num; - secp256k1_scalar r; - secp256k1_num_add(&rnum, &snum, &s2num); - secp256k1_num_mod(&rnum, &order); - secp256k1_scalar_add(&r, &s, &s2); - secp256k1_scalar_get_num(&r2num, &r); - CHECK(secp256k1_num_eq(&rnum, &r2num)); - } - - { - /* Test that multiplying the scalars is equal to multiplying their numbers modulo the order. 
*/ - secp256k1_scalar r; - secp256k1_num r2num; - secp256k1_num rnum; - secp256k1_num_mul(&rnum, &snum, &s2num); - secp256k1_num_mod(&rnum, &order); - secp256k1_scalar_mul(&r, &s, &s2); - secp256k1_scalar_get_num(&r2num, &r); - CHECK(secp256k1_num_eq(&rnum, &r2num)); - /* The result can only be zero if at least one of the factors was zero. */ - CHECK(secp256k1_scalar_is_zero(&r) == (secp256k1_scalar_is_zero(&s) || secp256k1_scalar_is_zero(&s2))); - /* The results can only be equal to one of the factors if that factor was zero, or the other factor was one. */ - CHECK(secp256k1_num_eq(&rnum, &snum) == (secp256k1_scalar_is_zero(&s) || secp256k1_scalar_is_one(&s2))); - CHECK(secp256k1_num_eq(&rnum, &s2num) == (secp256k1_scalar_is_zero(&s2) || secp256k1_scalar_is_one(&s))); - } - - { - secp256k1_scalar neg; - secp256k1_num negnum; - secp256k1_num negnum2; - /* Check that comparison with zero matches comparison with zero on the number. */ - CHECK(secp256k1_num_is_zero(&snum) == secp256k1_scalar_is_zero(&s)); - /* Check that comparison with the half order is equal to testing for high scalar. */ - CHECK(secp256k1_scalar_is_high(&s) == (secp256k1_num_cmp(&snum, &half_order) > 0)); - secp256k1_scalar_negate(&neg, &s); - secp256k1_num_sub(&negnum, &order, &snum); - secp256k1_num_mod(&negnum, &order); - /* Check that comparison with the half order is equal to testing for high scalar after negation. */ - CHECK(secp256k1_scalar_is_high(&neg) == (secp256k1_num_cmp(&negnum, &half_order) > 0)); - /* Negating should change the high property, unless the value was already zero. */ - CHECK((secp256k1_scalar_is_high(&s) == secp256k1_scalar_is_high(&neg)) == secp256k1_scalar_is_zero(&s)); - secp256k1_scalar_get_num(&negnum2, &neg); - /* Negating a scalar should be equal to (order - n) mod order on the number. */ - CHECK(secp256k1_num_eq(&negnum, &negnum2)); - secp256k1_scalar_add(&neg, &neg, &s); - /* Adding a number to its negation should result in zero. */ - CHECK(secp256k1_scalar_is_zero(&neg)); - secp256k1_scalar_negate(&neg, &neg); - /* Negating zero should still result in zero. */ - CHECK(secp256k1_scalar_is_zero(&neg)); - } - - { - /* Test secp256k1_scalar_mul_shift_var. */ - secp256k1_scalar r; - secp256k1_num one; - secp256k1_num rnum; - secp256k1_num rnum2; - unsigned char cone[1] = {0x01}; - unsigned int shift = 256 + secp256k1_testrand_int(257); - secp256k1_scalar_mul_shift_var(&r, &s1, &s2, shift); - secp256k1_num_mul(&rnum, &s1num, &s2num); - secp256k1_num_shift(&rnum, shift - 1); - secp256k1_num_set_bin(&one, cone, 1); - secp256k1_num_add(&rnum, &rnum, &one); - secp256k1_num_shift(&rnum, 1); - secp256k1_scalar_get_num(&rnum2, &r); - CHECK(secp256k1_num_eq(&rnum, &rnum2)); - } - { /* test secp256k1_scalar_shr_int */ secp256k1_scalar r; @@ -1354,7 +1142,6 @@ void scalar_test(void) { CHECK(expected == low); } } -#endif { /* Test commutativity of add. */ @@ -1490,48 +1277,6 @@ void run_scalar_tests(void) { CHECK(secp256k1_scalar_is_zero(&o)); } -#ifndef USE_NUM_NONE - { - /* Test secp256k1_scalar_set_b32 boundary conditions */ - secp256k1_num order; - secp256k1_scalar scalar; - unsigned char bin[32]; - unsigned char bin_tmp[32]; - int overflow = 0; - /* 2^256-1 - order */ - static const secp256k1_scalar all_ones_minus_order = SECP256K1_SCALAR_CONST( - 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000001UL, - 0x45512319UL, 0x50B75FC4UL, 0x402DA173UL, 0x2FC9BEBEUL - ); - - /* A scalar set to 0s should be 0. 
*/ - memset(bin, 0, 32); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 0); - CHECK(secp256k1_scalar_is_zero(&scalar)); - - /* A scalar with value of the curve order should be 0. */ - secp256k1_scalar_order_get_num(&order); - secp256k1_num_get_bin(bin, 32, &order); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 1); - CHECK(secp256k1_scalar_is_zero(&scalar)); - - /* A scalar with value of the curve order minus one should not overflow. */ - bin[31] -= 1; - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 0); - secp256k1_scalar_get_b32(bin_tmp, &scalar); - CHECK(secp256k1_memcmp_var(bin, bin_tmp, 32) == 0); - - /* A scalar set to all 1s should overflow. */ - memset(bin, 0xFF, 32); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 1); - CHECK(secp256k1_scalar_eq(&scalar, &all_ones_minus_order)); - } -#endif - { /* Does check_overflow check catch all ones? */ static const secp256k1_scalar overflowed = SECP256K1_SCALAR_CONST( @@ -1554,9 +1299,7 @@ void run_scalar_tests(void) { secp256k1_scalar one; secp256k1_scalar r1; secp256k1_scalar r2; -#if defined(USE_SCALAR_INV_NUM) secp256k1_scalar zzv; -#endif int overflow; unsigned char chal[33][2][32] = { {{0xff, 0xff, 0x03, 0x07, 0x00, 0x00, 0x00, 0x00, @@ -2106,10 +1849,8 @@ void run_scalar_tests(void) { if (!secp256k1_scalar_is_zero(&y)) { secp256k1_scalar_inverse(&zz, &y); CHECK(!secp256k1_scalar_check_overflow(&zz)); -#if defined(USE_SCALAR_INV_NUM) secp256k1_scalar_inverse_var(&zzv, &y); CHECK(secp256k1_scalar_eq(&zzv, &zz)); -#endif secp256k1_scalar_mul(&z, &z, &zz); CHECK(!secp256k1_scalar_check_overflow(&z)); CHECK(secp256k1_scalar_eq(&x, &z)); @@ -6075,11 +5816,6 @@ int main(int argc, char **argv) { run_hmac_sha256_tests(); run_rfc6979_hmac_sha256_tests(); -#ifndef USE_NUM_NONE - /* num tests */ - run_num_smalltests(); -#endif - /* scalar tests */ run_scalar_tests(); From 9164a1b6582e2fc833c760a3403d26b9b0b3b7b3 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sat, 28 Nov 2020 15:58:22 -0800 Subject: [PATCH 15/59] Optimization: special-case zero modulus limbs in modinv64 Both the field and scalar modulus can be written in signed{30,62} notation with one or more zero limbs. Make use of this in the update_de function to avoid a few wide multiplications when that is the case. This doesn't appear to be a win in the 32-bit implementation, so only do it for the 64-bit one. --- src/modinv64_impl.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index 3ab21cdc0..281cdb911 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -338,22 +338,28 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */ cd += (int128_t)u * d1 + (int128_t)v * e1; ce += (int128_t)q * d1 + (int128_t)r * e1; - cd += (int128_t)modinfo->modulus.v[1] * md; - ce += (int128_t)modinfo->modulus.v[1] * me; + if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[1] * md; + ce += (int128_t)modinfo->modulus.v[1] * me; + } d->v[0] = (int64_t)cd & M62; cd >>= 62; e->v[0] = (int64_t)ce & M62; ce >>= 62; /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. 
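 */

For intuition about the zero limbs mentioned above (an illustrative aside, not part of the patch): in signed62 notation a value is v[0] + v[1]*2^62 + v[2]*2^124 + v[3]*2^186 + v[4]*2^248, and the modulus constants used by the 64-bit code elsewhere in this series look roughly like this (values quoted here only for illustration):

```
/* Illustration only. The field modulus p = 2^256 - 0x1000003D1 has
 * v[1], v[2] and v[3] all zero; the scalar order has v[3] zero. */
static const secp256k1_modinv64_signed62 field_modulus =
    {{-0x1000003D1LL, 0, 0, 0, 256}};
static const secp256k1_modinv64_signed62 scalar_order =
    {{0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256}};
```

With these values, all three special-cased multiplications below are skipped for field inversions, and the v[3] one for scalar inversions.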
     cd += (int128_t)u * d2 + (int128_t)v * e2;
     ce += (int128_t)q * d2 + (int128_t)r * e2;
-    cd += (int128_t)modinfo->modulus.v[2] * md;
-    ce += (int128_t)modinfo->modulus.v[2] * me;
+    if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */
+        cd += (int128_t)modinfo->modulus.v[2] * md;
+        ce += (int128_t)modinfo->modulus.v[2] * me;
+    }
     d->v[1] = (int64_t)cd & M62; cd >>= 62;
     e->v[1] = (int64_t)ce & M62; ce >>= 62;
     /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
     cd += (int128_t)u * d3 + (int128_t)v * e3;
     ce += (int128_t)q * d3 + (int128_t)r * e3;
-    cd += (int128_t)modinfo->modulus.v[3] * md;
-    ce += (int128_t)modinfo->modulus.v[3] * me;
+    if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */
+        cd += (int128_t)modinfo->modulus.v[3] * md;
+        ce += (int128_t)modinfo->modulus.v[3] * me;
+    }
     d->v[2] = (int64_t)cd & M62; cd >>= 62;
     e->v[2] = (int64_t)ce & M62; ce >>= 62;
     /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */

From b306935ac12bb24fd931d735b4dfc07f707e7447 Mon Sep 17 00:00:00 2001
From: Peter Dettman
Date: Tue, 15 Dec 2020 16:19:08 -0800
Subject: [PATCH 16/59] Optimization: use formulas instead of lookup tables for cancelling g bits

This only seems to be a win on 64-bit platforms, so only do it there.

Refactored by: Pieter Wuille
---
 src/modinv64_impl.h | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
index 281cdb911..15cda3d73 100644
--- a/src/modinv64_impl.h
+++ b/src/modinv64_impl.h
@@ -220,21 +220,6 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t
  * Implements the divsteps_n_matrix_var function from the explanation.
  */
 static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
-    /* inv256[i] = -(2*i+1)^-1 (mod 256) */
-    static const uint8_t inv256[128] = {
-        0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59,
-        0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31,
-        0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89,
-        0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61,
-        0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9,
-        0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91,
-        0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9,
-        0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1,
-        0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19,
-        0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1,
-        0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01
-    };
-
     /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */
     uint64_t u = 1, v = 0, q = 0, r = 1;
     uint64_t f = f0, g = g0, m;
@@ -265,17 +250,28 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint
             tmp = f; f = g; g = -tmp;
             tmp = u; u = q; q = -tmp;
             tmp = v; v = r; r = -tmp;
+            /* Use a formula to cancel out up to 6 bits of g. Also, no more than i can be cancelled
+             * out (as we'd be done before that point), and no more than eta+1 can be done as its
+             * sign will flip again once that happens. */
+            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
+            VERIFY_CHECK(limit > 0 && limit <= 62);
+            /* m is a mask for the bottom min(limit, 6) bits. */
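An aside for intuition (not part of the patch): the 6-bit formula can be verified exhaustively. For odd f, f is its own inverse modulo 8, and one Newton step lifts this to an inverse modulo 2^6, so f*(2 - f*f) is the inverse of f mod 64, and w = f*g*(f*f - 2) equals -g/f mod 64, making g + w*f divisible by 64. A standalone check:

```
#include <assert.h>
#include <stdint.h>

/* Illustration only: for every odd f and every g, w = f*g*(f*f - 2) mod 64
 * makes g + w*f divisible by 64, i.e. the formula cancels the bottom 6 bits
 * of g with no lookup table. Algebraically g + w*f = g*(f*f - 1)^2 mod 64,
 * and f*f - 1 is divisible by 8 whenever f is odd. */
int main(void) {
    uint64_t f, g;
    for (f = 1; f < 64; f += 2) {
        for (g = 0; g < 64; ++g) {
            uint64_t w = (f * g * (f * f - 2)) & 63U;
            assert(((g + w * f) & 63U) == 0);
        }
    }
    return 0;
}
```

The 4-bit branch below can be checked the same way with mask 15, using f + (((f + 1) & 4) << 1) as the inverse of f modulo 16.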
+            m = (UINT64_MAX >> (64 - limit)) & 63U;
+            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 6)
+             * bits. */
+            w = (f * g * (f * f - 2)) & m;
+        } else {
+            /* In this branch, use a simpler formula that only lets us cancel up to 4 bits of g, as
+             * eta tends to be smaller here. */
+            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
+            VERIFY_CHECK(limit > 0 && limit <= 62);
+            /* m is a mask for the bottom min(limit, 4) bits. */
+            m = (UINT64_MAX >> (64 - limit)) & 15U;
+            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 4)
+             * bits. */
+            w = f + (((f + 1) & 4) << 1);
+            w = (-w * g) & m;
         }
-        /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
-         * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
-         * can be done as its sign will flip once that happens. */
-        limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
-        /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */
-        VERIFY_CHECK(limit > 0 && limit <= 62);
-        m = (UINT64_MAX >> (64 - limit)) & 255U;
-        /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */
-        w = (g * inv256[(f >> 1) & 127]) & m;
-        /* Do so. */
         g += f * w;
         q += u * w;
         r += v * w;

From ebc1af700f9ec6e96586152b7090a2a6494308c3 Mon Sep 17 00:00:00 2001
From: Peter Dettman
Date: Tue, 15 Dec 2020 18:17:19 -0800
Subject: [PATCH 17/59] Optimization: track f,g limb count and pass to new variable-time update_fg_var

The magnitude of the f and g variables generally goes down as the
algorithm progresses. Make use of this by keeping track of how many
limbs are used, and when the number becomes small enough, use that
to reduce the complexity of arithmetic on them.

Refactored by: Pieter Wuille
---
 src/modinv32_impl.h | 159 +++++++++++++++++++++++++++++---------------
 src/modinv64_impl.h | 157 ++++++++++++++++++++++++++++---------------
 2 files changed, 207 insertions(+), 109 deletions(-)

diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
index 1da47bd22..aa7988c4b 100644
--- a/src/modinv32_impl.h
+++ b/src/modinv32_impl.h
@@ -24,25 +24,25 @@ static const secp256k1_modinv32_signed30 SECP256K1_SIGNED30_ONE = {{1}};

 /* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^30). */
-static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int32_t factor) {
+static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int alen, int32_t factor) {
     const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
     int64_t c = 0;
     int i;
     for (i = 0; i < 8; ++i) {
-        c += (int64_t)a->v[i] * factor;
+        if (i < alen) c += (int64_t)a->v[i] * factor;
         r->v[i] = (int32_t)c & M30; c >>= 30;
     }
-    c += (int64_t)a->v[8] * factor;
+    if (8 < alen) c += (int64_t)a->v[8] * factor;
     VERIFY_CHECK(c == (int32_t)c);
     r->v[8] = (int32_t)c;
 }

-/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. */
-static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, const secp256k1_modinv32_signed30 *b, int32_t factor) {
+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 9. */
+static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, int alen, const secp256k1_modinv32_signed30 *b, int32_t factor) {
     int i;
     secp256k1_modinv32_signed30 am, bm;
-    secp256k1_modinv32_mul_30(&am, a, 1); /* Normalize all but the top limb of a.
*/ - secp256k1_modinv32_mul_30(&bm, b, factor); + secp256k1_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */ + secp256k1_modinv32_mul_30(&bm, b, 9, factor); for (i = 0; i < 8; ++i) { /* Verify that all but the top limb of a and b are normalized. */ VERIFY_CHECK(am.v[i] >> 30 == 0); @@ -73,8 +73,8 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 VERIFY_CHECK(r->v[i] >= -M30); VERIFY_CHECK(r->v[i] <= M30); } - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 1) < 0); /* r < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ #endif /* In a first step, add the modulus if the input is negative, and then negate if requested. @@ -154,8 +154,8 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3 VERIFY_CHECK(r6 >> 30 == 0); VERIFY_CHECK(r7 >> 30 == 0); VERIFY_CHECK(r8 >> 30 == 0); - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 0) >= 0); /* r >= 0 */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 1) < 0); /* r < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 0) >= 0); /* r >= 0 */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ #endif } @@ -331,10 +331,10 @@ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp int64_t cd, ce; int i; #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, 1) < 0); /* d < modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */ VERIFY_CHECK((labs(u) + labs(v)) >= 0); /* |u|+|v| doesn't overflow */ VERIFY_CHECK((labs(q) + labs(r)) >= 0); /* |q|+|r| doesn't overflow */ VERIFY_CHECK((labs(u) + labs(v)) <= M30 + 1); /* |u|+|v| <= 2^30 */ @@ -375,10 +375,10 @@ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp d->v[8] = (int32_t)cd; e->v[8] = (int32_t)ce; #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, &modinfo->modulus, 1) < 0); /* d < modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + 
VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */
 #endif
 }

@@ -415,6 +415,42 @@ static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp
     g->v[8] = (int32_t)cg;
 }

+/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps.
+ *
+ * Version that operates on a variable number of limbs in f and g.
+ *
+ * This implements the update_fg function from the explanation in modinv64_impl.h.
+ */
+static void secp256k1_modinv32_update_fg_30_var(int len, secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) {
+    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
+    const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
+    int32_t fi, gi;
+    int64_t cf, cg;
+    int i;
+    VERIFY_CHECK(len > 0);
+    /* Start computing t*[f,g]. */
+    fi = f->v[0];
+    gi = g->v[0];
+    cf = (int64_t)u * fi + (int64_t)v * gi;
+    cg = (int64_t)q * fi + (int64_t)r * gi;
+    /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */
+    VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30;
+    VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30;
+    /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
+     * down by 30 bits). */
+    for (i = 1; i < len; ++i) {
+        fi = f->v[i];
+        gi = g->v[i];
+        cf += (int64_t)u * fi + (int64_t)v * gi;
+        cg += (int64_t)q * fi + (int64_t)r * gi;
+        f->v[i - 1] = (int32_t)cf & M30; cf >>= 30;
+        g->v[i - 1] = (int32_t)cg & M30; cg >>= 30;
+    }
+    /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
+    f->v[len - 1] = (int32_t)cf;
+    g->v[len - 1] = (int32_t)cg;
+}
+
 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */
 static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) {
     /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */
@@ -434,17 +470,17 @@ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_m
         secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo);
         /* Update f,g using that transition matrix.
*/ #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif secp256k1_modinv32_update_fg_30(&f, &g, &t); #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif } @@ -453,14 +489,14 @@ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_m * values i.e. +/- 1, and d now contains +/- the modular inverse. */ #ifdef VERIFY /* g == 0 */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &SECP256K1_SIGNED30_ONE, 0) == 0); + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &SECP256K1_SIGNED30_ONE, 0) == 0); /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, -1) == 0 || - secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, 1) == 0 || - (secp256k1_modinv32_mul_cmp_30(x, &SECP256K1_SIGNED30_ONE, 0) == 0 && - secp256k1_modinv32_mul_cmp_30(&d, &SECP256K1_SIGNED30_ONE, 0) == 0 && - (secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) == 0 || - secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) == 0))); + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, -1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, 1) == 0 || + (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + (secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) == 0))); #endif /* Optionally negate d, normalize to [0,modulus), and return it. */ @@ -478,9 +514,9 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 #ifdef VERIFY int i = 0; #endif - int j; + int j, len = 9; int32_t eta = -1; - int32_t cond; + int32_t cond, fn, gn; /* Do iterations of 30 divsteps each until g=0. */ while (1) { @@ -491,28 +527,41 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. 
*/ #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif - secp256k1_modinv32_update_fg_30(&f, &g, &t); + secp256k1_modinv32_update_fg_30_var(len, &f, &g, &t); /* If the bottom limb of g is 0, there is a chance g=0. */ if (g.v[0] == 0) { cond = 0; - /* Check if the other limbs are also 0. */ - for (j = 1; j < 9; ++j) { + /* Check if all other limbs are also 0. */ + for (j = 1; j < len; ++j) { cond |= g.v[j]; } /* If so, we're done. */ if (cond == 0) break; } + + /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */ + fn = f.v[len - 1]; + gn = g.v[len - 1]; + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */ + if (cond == 0) { + f.v[len - 2] |= (uint32_t)fn << 30; + g.v[len - 2] |= (uint32_t)gn << 30; + --len; + } #ifdef VERIFY VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif } @@ -520,18 +569,18 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/
#ifdef VERIFY
 /* g == 0 */
- VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, &SECP256K1_SIGNED30_ONE, 0) == 0);
+ VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &SECP256K1_SIGNED30_ONE, 0) == 0);
 /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */
- VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, -1) == 0 ||
- secp256k1_modinv32_mul_cmp_30(&f, &SECP256K1_SIGNED30_ONE, 1) == 0 ||
- (secp256k1_modinv32_mul_cmp_30(x, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
- secp256k1_modinv32_mul_cmp_30(&d, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
- (secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, 1) == 0 ||
- secp256k1_modinv32_mul_cmp_30(&f, &modinfo->modulus, -1) == 0)));
+ VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, -1) == 0 ||
+ secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, 1) == 0 ||
+ (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
+ secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 &&
+ (secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) == 0 ||
+ secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) == 0)));
#endif

 /* Optionally negate d, normalize to [0,modulus), and return it. */
- secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo);
+ secp256k1_modinv32_normalize_30(&d, f.v[len - 1], modinfo);
 *x = d;
 }

diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
index 15cda3d73..78505fa18 100644
--- a/src/modinv64_impl.h
+++ b/src/modinv64_impl.h
@@ -30,25 +30,25 @@ static int64_t secp256k1_modinv64_abs(int64_t v) {
 static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};

 /* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
-static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int64_t factor) {
+static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
 const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
 int128_t c = 0;
 int i;
 for (i = 0; i < 4; ++i) {
- c += (int128_t)a->v[i] * factor;
+ if (i < alen) c += (int128_t)a->v[i] * factor;
 r->v[i] = (int64_t)c & M62; c >>= 62;
 }
- c += (int128_t)a->v[4] * factor;
+ if (4 < alen) c += (int128_t)a->v[4] * factor;
 VERIFY_CHECK(c == (int64_t)c);
 r->v[4] = (int64_t)c;
 }

-/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. */
-static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, const secp256k1_modinv64_signed62 *b, int64_t factor) {
+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
+static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, int alen, const secp256k1_modinv64_signed62 *b, int64_t factor) {
 int i;
 secp256k1_modinv64_signed62 am, bm;
- secp256k1_modinv64_mul_62(&am, a, 1); /* Normalize all but the top limb of a. */
- secp256k1_modinv64_mul_62(&bm, b, factor);
+ secp256k1_modinv64_mul_62(&am, a, alen, 1); /* Normalize all but the top limb of a. */
+ secp256k1_modinv64_mul_62(&bm, b, 5, factor);
 for (i = 0; i < 4; ++i) {
 /* Verify that all but the top limb of a and b are normalized.
*/ VERIFY_CHECK(am.v[i] >> 62 == 0); @@ -78,8 +78,8 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6 VERIFY_CHECK(r->v[i] >= -M62); VERIFY_CHECK(r->v[i] <= M62); } - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 1) < 0); /* r < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */ #endif /* In a first step, add the modulus if the input is negative, and then negate if requested. @@ -131,8 +131,8 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6 VERIFY_CHECK(r2 >> 62 == 0); VERIFY_CHECK(r3 >> 62 == 0); VERIFY_CHECK(r4 >> 62 == 0); - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 0) >= 0); /* r >= 0 */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, &modinfo->modulus, 1) < 0); /* r < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 0) >= 0); /* r >= 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */ #endif } @@ -305,10 +305,10 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp int64_t md, me, sd, se; int128_t cd, ce; #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, 1) < 0); /* d < modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */ VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) >= 0); /* |u|+|v| doesn't overflow */ VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) >= 0); /* |q|+|r| doesn't overflow */ VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) <= M62 + 1); /* |u|+|v| <= 2^62 */ @@ -369,10 +369,10 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp d->v[4] = (int64_t)cd; e->v[4] = (int64_t)ce; #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, &modinfo->modulus, 1) < 0); /* d < modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */ #endif } @@ -417,6 +417,42 @@ static void 
secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp g->v[4] = (int64_t)cg; } +/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps. + * + * Version that operates on a variable number of limbs in f and g. + * + * This implements the update_fg function from the explanation. + */ +static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int64_t fi, gi; + int128_t cf, cg; + int i; + VERIFY_CHECK(len > 0); + /* Start computing t*[f,g]. */ + fi = f->v[0]; + gi = g->v[0]; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; + /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting + * down by 62 bits). */ + for (i = 1; i < len; ++i) { + fi = f->v[i]; + gi = g->v[i]; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; + f->v[i - 1] = (int64_t)cf & M62; cf >>= 62; + g->v[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */ + f->v[len - 1] = (int64_t)cf; + g->v[len - 1] = (int64_t)cg; +} + /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ @@ -436,17 +472,17 @@ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_m secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. 
*/ #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif secp256k1_modinv64_update_fg_62(&f, &g, &t); #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif } @@ -455,14 +491,14 @@ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_m * values i.e. +/- 1, and d now contains +/- the modular inverse. */ #ifdef VERIFY /* g == 0 */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &SECP256K1_SIGNED62_ONE, 0) == 0); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &SECP256K1_SIGNED62_ONE, 0) == 0); /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, -1) == 0 || - secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, 1) == 0 || - (secp256k1_modinv64_mul_cmp_62(x, &SECP256K1_SIGNED62_ONE, 0) == 0 && - secp256k1_modinv64_mul_cmp_62(&d, &SECP256K1_SIGNED62_ONE, 0) == 0 && - (secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) == 0 || - secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) == 0))); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) == 0))); #endif /* Optionally negate d, normalize to [0,modulus), and return it. */ @@ -477,12 +513,12 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; - int j; #ifdef VERIFY int i = 0; #endif + int j, len = 5; int64_t eta = -1; - int64_t cond; + int64_t cond, fn, gn; /* Do iterations of 62 divsteps each until g=0. 
*/ while (1) { @@ -493,28 +529,41 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */ #ifdef VERIFY - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif - secp256k1_modinv64_update_fg_62(&f, &g, &t); + secp256k1_modinv64_update_fg_62_var(len, &f, &g, &t); /* If the bottom limb of g is zero, there is a chance that g=0. */ if (g.v[0] == 0) { cond = 0; /* Check if the other limbs are also 0. */ - for (j = 1; j < 5; ++j) { + for (j = 1; j < len; ++j) { cond |= g.v[j]; } /* If so, we're done. */ if (cond == 0) break; } + + /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */ + fn = f.v[len - 1]; + gn = g.v[len - 1]; + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */ + if (cond == 0) { + f.v[len - 2] |= (uint64_t)fn << 62; + g.v[len - 2] |= (uint64_t)gn << 62; + --len; + } #ifdef VERIFY VERIFY_CHECK(++i < 12); /* We should never need more than 12*62 = 744 divsteps */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) > 0); /* f > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) <= 0); /* f <= modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, -1) > 0); /* g > -modulus */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &modinfo->modulus, 1) < 0); /* g < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ #endif } @@ -522,18 +571,18 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. 
*/ #ifdef VERIFY /* g == 0 */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, &SECP256K1_SIGNED62_ONE, 0) == 0); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &SECP256K1_SIGNED62_ONE, 0) == 0); /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ - VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, -1) == 0 || - secp256k1_modinv64_mul_cmp_62(&f, &SECP256K1_SIGNED62_ONE, 1) == 0 || - (secp256k1_modinv64_mul_cmp_62(x, &SECP256K1_SIGNED62_ONE, 0) == 0 && - secp256k1_modinv64_mul_cmp_62(&d, &SECP256K1_SIGNED62_ONE, 0) == 0 && - (secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, 1) == 0 || - secp256k1_modinv64_mul_cmp_62(&f, &modinfo->modulus, -1) == 0))); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) == 0))); #endif /* Optionally negate d, normalize to [0,modulus), and return it. */ - secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); + secp256k1_modinv64_normalize_62(&d, f.v[len - 1], modinfo); *x = d; } From 24ad04fc064e71abdf973e061c30eb1f3f78db39 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 22 Jan 2021 15:47:44 -0800 Subject: [PATCH 18/59] Make scalar_inverse{,_var} benchmark scale with SECP256K1_BENCH_ITERS --- src/bench_internal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bench_internal.c b/src/bench_internal.c index 8e7ffcb0d..73b8a24cc 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -349,8 +349,8 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, iters); - if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000); - if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, 2000); + if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, iters); + if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, iters*100); if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, iters*100); From 23c3fb629b905deebc4bcc9914bcfff7b9aedacd Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 12 Mar 2021 10:31:54 -0800 Subject: [PATCH 19/59] Make argument of 
fe_normalizes_to_zero{_var} const --- src/field.h | 11 +++++------ src/field_10x26_impl.h | 4 ++-- src/field_5x52_impl.h | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/field.h b/src/field.h index ee222ee5d..cf3bf10bb 100644 --- a/src/field.h +++ b/src/field.h @@ -43,13 +43,12 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r); /** Normalize a field element, without constant-time guarantee. */ static void secp256k1_fe_normalize_var(secp256k1_fe *r); -/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field - * implementation may optionally normalize the input, but this should not be relied upon. */ -static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r); +/** Verify whether a field element represents zero i.e. would normalize to a zero value. */ +static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r); -/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field - * implementation may optionally normalize the input, but this should not be relied upon. */ -static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r); +/** Verify whether a field element represents zero i.e. would normalize to a zero value, + * without constant-time guarantee. */ +static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r); /** Set a field element equal to a small integer. Resulting field element is normalized. */ static void secp256k1_fe_set_int(secp256k1_fe *r, int a); diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 62bffdc21..17de9a68d 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -181,7 +181,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) { #endif } -static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { +static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) { uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; @@ -210,7 +210,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { return (z0 == 0) | (z1 == 0x3FFFFFFUL); } -static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) { +static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) { uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; uint32_t z0, z1; uint32_t x; diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 3465ea324..dc6edcca9 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -161,7 +161,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) { #endif } -static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { +static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) { uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ @@ -184,7 +184,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL); } -static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) { +static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) { uint64_t t0, t1, t2, t3, t4; uint64_t z0, z1; uint64_t x; From 4504472269df06b8765b134d41f86619cdcdf8f6 Mon Sep 17 00:00:00 2001 From: William Bright Date: Sat, 20 Mar 2021 19:59:51 -0400 Subject: [PATCH 20/59] changed import to use brackets <> for openssl as they are not local to the project --- src/tests.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) 
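As an aside on the `fe_normalizes_to_zero{,_var}` functions made const above: a partially-reduced field element represents zero exactly when its raw value is either 0 or the modulus P, and the 10x26/5x52 implementations test both possibilities limb by limb with the bitwise accumulators z0 and z1. A minimal one-word sketch of the same idea (illustrative only, not part of any patch here; `toy_normalizes_to_zero` and the [0, 2*p) range assumption are made up for this sketch):

```c
#include <stdint.h>

/* Toy model of the z0/z1 trick: assume x is only partially reduced, i.e.
 * x lies in [0, 2*p). Then x is congruent to zero mod p iff its raw
 * representation is 0 or exactly p. The real field code accumulates both
 * conditions across limbs with bitwise masks instead of comparing whole
 * words, so no branch on secret data is needed. */
static int toy_normalizes_to_zero(uint64_t x, uint64_t p) {
    uint64_t z0 = x;      /* zero iff x == 0 */
    uint64_t z1 = x ^ p;  /* zero iff x == p */
    return (z0 == 0) | (z1 == 0);
}
```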
diff --git a/src/tests.c b/src/tests.c
index ba645bbe8..e0c8c8560 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -21,10 +21,10 @@
 #include "util.h"

 #ifdef ENABLE_OPENSSL_TESTS
-#include "openssl/bn.h"
-#include "openssl/ec.h"
-#include "openssl/ecdsa.h"
-#include "openssl/obj_mac.h"
+#include <openssl/bn.h>
+#include <openssl/ec.h>
+#include <openssl/ecdsa.h>
+#include <openssl/obj_mac.h>
 # if OPENSSL_VERSION_NUMBER < 0x10100000L
 void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **pr, const BIGNUM **ps) {*pr = sig->r; *ps = sig->s;}
 # endif

From 376ca366db0469f39b93af0af762090986ea75f2 Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Mon, 29 Mar 2021 16:33:36 -0700
Subject: [PATCH 21/59] Fix typo in explanation

---
 doc/safegcd_implementation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md
index 8346d22e5..efb61b06c 100644
--- a/doc/safegcd_implementation.md
+++ b/doc/safegcd_implementation.md
@@ -681,7 +681,7 @@ def update_de(d, e, t, M, Mi):
 cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N
 md -= (Mi*cd + md) % 2**N
 me -= (Mi*ce + me) % 2**N
- cd, ce = u*d + v*e + Mi*md, q*d + r*e + Mi*me
+ cd, ce = u*d + v*e + M*md, q*d + r*e + M*me
 return cd >> N, ce >> N
 ```

From 277b224b6aba942efbac4a6aae1054035a68d8dd Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Fri, 1 Jan 2021 11:15:10 -0800
Subject: [PATCH 22/59] Use modified divsteps with initial delta=1/2 for constant-time

Instead of using eta=-delta, use zeta=-(delta+1/2) to represent delta. This variant only needs at most 590 iterations for 256-bit inputs rather than 724 (by convex hull bounds analysis).
---
 doc/safegcd_implementation.md | 39 +++++++++++++++++++++----------
 src/modinv32_impl.h | 42 ++++++++++++++++-----------------
 src/modinv64_impl.h | 44 +++++++++++++++++------------------
 3 files changed, 70 insertions(+), 55 deletions(-)

diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md
index efb61b06c..3ae556f9a 100644
--- a/doc/safegcd_implementation.md
+++ b/doc/safegcd_implementation.md
@@ -244,8 +244,8 @@ def modinv(M, Mi, x):
 This means that in practice we'll always perform a multiple of *N* divsteps. This is not a
 problem because once *g=0*, further divsteps do not affect *f*, *g*, *d*, or *e* anymore (only
 *δ* keeps
-increasing). For variable time code such excess iterations will be mostly optimized away in
-section 6.
+increasing). For variable time code such excess iterations will be mostly optimized away in later
+sections.

 ## 4. Avoiding modulus operations

@@ -519,6 +519,20 @@ computation:
 g >>= 1
 ```

+A variant of divsteps with better worst-case performance can be used instead: starting *δ* at
+*1/2* instead of *1*. This reduces the worst case number of iterations to *590* for *256*-bit inputs
+(which can be shown using convex hull analysis). In this case, the substitution *ζ=-(δ+1/2)*
+is used instead to keep the variable integral. Incrementing *δ* by *1* still translates to
+decrementing *ζ* by *1*, but negating *δ* now corresponds to going from *ζ* to *-(ζ+1)*, or
+*~ζ*. Doing that conditionally based on *c3* is simply:
+
+```python
+ ...
+ c3 = c1 & c2
+ zeta ^= c3
+ ...
+```
+
 By replacing the loop in `divsteps_n_matrix` with a variant of the divstep code above (extended
 to also apply all *f* operations to *u*, *v* and all *g* operations to *q*, *r*), a constant-time
 version of `divsteps_n_matrix` is obtained. The full code will be in section 7.

@@ -535,7 +549,8 @@ other cases, it slows down calculations unnecessarily.
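Before the variable-time discussion resumes below, the ζ bookkeeping introduced by this patch can be sanity-checked in isolation. The following standalone C snippet is illustrative only and not part of the patch series: it assumes arithmetic right shift for signed types (as the library code itself does), and the name `two_delta` is made up here to keep the half-integer δ integral. It tracks δ next to ζ and asserts that the branchless update `zeta = (zeta ^ c3) - 1` matches the divstep rules δ → 1-δ (swap) and δ → 1+δ (no swap):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int64_t two_delta = 1;      /* delta = 1/2, stored as 2*delta so it stays integral */
    int64_t zeta = -1;          /* zeta = -(delta + 1/2) = -1 initially */
    uint64_t g = 0x9908b0dfu;   /* arbitrary bit pattern standing in for g's parity */
    int i;
    for (i = 0; i < 64; ++i) {
        int64_t c1 = zeta >> 63;         /* all ones iff zeta < 0, i.e. delta > 0 */
        int64_t c2 = -(int64_t)(g & 1);  /* all ones iff g is odd */
        int64_t c3 = c1 & c2;
        if (c3) {
            two_delta = 2 - two_delta;   /* swap case of the divstep: delta -> 1 - delta */
        } else {
            two_delta = 2 + two_delta;   /* no swap: delta -> 1 + delta */
        }
        zeta = (zeta ^ c3) - 1;          /* the branchless zeta update from the patch */
        assert(zeta == -(two_delta + 1) / 2); /* invariant: zeta == -(delta + 1/2) */
        g = (g >> 1) ^ (g << 13);        /* scramble the parity source */
    }
    printf("zeta stayed equal to -(delta + 1/2)\n");
    return 0;
}
```

Since ζ<0 is equivalent to δ>0 in this representation, a single sign mask drives both the conditional update of ζ and the conditional row operations on the transition matrix.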
In this section, we will construct a
 faster non-constant time `divsteps_n_matrix` function.

 To do so, first consider yet another way of writing the inner loop of divstep operations in
-`gcd` from section 1. This decomposition is also explained in the paper in section 8.2.
+`gcd` from section 1. This decomposition is also explained in the paper in section 8.2. We use
+the original version with initial *δ=1* and *η=-δ* here.

 ```python
 for _ in range(N):
@@ -643,24 +658,24 @@ All together we need the following functions:
 section 5, extended to handle *u*, *v*, *q*, *r*:

 ```python
-def divsteps_n_matrix(eta, f, g):
- """Compute eta and transition matrix t after N divsteps (multiplied by 2^N)."""
+def divsteps_n_matrix(zeta, f, g):
+ """Compute zeta and transition matrix t after N divsteps (multiplied by 2^N)."""
 u, v, q, r = 1, 0, 0, 1 # start with identity matrix
 for _ in range(N):
- c1 = eta >> 63
+ c1 = zeta >> 63
 # Compute x, y, z as conditionally-negated versions of f, u, v.
 x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1
 c2 = -(g & 1)
 # Conditionally add x, y, z to g, q, r.
 g, q, r = g + (x & c2), q + (y & c2), r + (z & c2)
 c1 &= c2 # reusing c1 here for the earlier c3 variable
- eta = (eta ^ c1) - (c1 + 1) # inlining the unconditional eta decrement here
+ zeta = (zeta ^ c1) - 1 # inlining the unconditional zeta decrement here
 # Conditionally add g, q, r to f, u, v.
 f, u, v = f + (g & c1), u + (q & c1), v + (r & c1)
 # When shifting g down, don't shift q, r, as we construct a transition matrix multiplied
 # by 2^N. Instead, shift f's coefficients u and v up.
 g, u, v = g >> 1, u << 1, v << 1
- return eta, (u, v, q, r)
+ return zeta, (u, v, q, r)
 ```

 - The functions to update *f* and *g*, and *d* and *e*, from section 2 and section 4, with the constant-time
@@ -702,15 +717,15 @@ def normalize(sign, v, M):
 return v
 ```

-- And finally the `modinv` function too, adapted to use *η* instead of *δ*, and using the fixed
+- And finally the `modinv` function too, adapted to use *ζ* instead of *δ*, and using the fixed
 iteration count from section 5:

 ```python
 def modinv(M, Mi, x):
 """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N."""
- eta, f, g, d, e = -1, M, x, 0, 1
- for _ in range((724 + N - 1) // N):
- eta, t = divsteps_n_matrix(-eta, f % 2**N, g % 2**N)
+ zeta, f, g, d, e = -1, M, x, 0, 1
+ for _ in range((590 + N - 1) // N):
+ zeta, t = divsteps_n_matrix(zeta, f % 2**N, g % 2**N)
 f, g = update_fg(f, g, t)
 d, e = update_de(d, e, t, M, Mi)
 return normalize(f, d, M)
 ```

diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
index aa7988c4b..661c5fc04 100644
--- a/src/modinv32_impl.h
+++ b/src/modinv32_impl.h
@@ -168,17 +168,17 @@ typedef struct {
 int32_t u, v, q, r;
 } secp256k1_modinv32_trans2x2;

-/* Compute the transition matrix and eta for 30 divsteps.
+/* Compute the transition matrix and zeta for 30 divsteps.
 *
- * Input: eta: initial eta
- * f0: bottom limb of initial f
- * g0: bottom limb of initial g
+ * Input: zeta: initial zeta
+ * f0: bottom limb of initial f
+ * g0: bottom limb of initial g
 * Output: t: transition matrix
- * Return: final eta
+ * Return: final zeta
 *
 * Implements the divsteps_n_matrix function from the explanation.
 */
-static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
+static int32_t secp256k1_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
 /* u,v,q,r are the elements of the transformation matrix being built up,
 * starting with the identity matrix.
Semantically they are signed integers * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This @@ -193,8 +193,8 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - /* Compute conditional masks for (eta < 0) and for (g & 1). */ - c1 = eta >> 31; + /* Compute conditional masks for (zeta < 0) and for (g & 1). */ + c1 = zeta >> 31; c2 = -(g & 1); /* Compute x,y,z, conditionally negated versions of f,u,v. */ x = (f ^ c1) - c1; @@ -204,10 +204,10 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g += x & c2; q += y & c2; r += z & c2; - /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ + /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */ c1 &= c2; - /* Conditionally negate eta, and unconditionally subtract 1. */ - eta = (eta ^ c1) - (c1 + 1); + /* Conditionally change zeta into -zeta-2 or zeta-1. */ + zeta = (zeta ^ c1) - 1; /* Conditionally add g,q,r to f,u,v. */ f += g & c1; u += q & c1; @@ -216,8 +216,8 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g >>= 1; u <<= 1; v <<= 1; - /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ - VERIFY_CHECK(eta >= -751 && eta <= 751); + /* Bounds on zeta that follow from the bounds on iteration count (max 20*30 divsteps). */ + VERIFY_CHECK(zeta >= -601 && zeta <= 601); } /* Return data in t and return value. */ t->u = (int32_t)u; @@ -229,7 +229,7 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t * will be divided out again). As each divstep's individual matrix has determinant 2, the * aggregate of 30 of them will have determinant 2^30. */ VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); - return eta; + return zeta; } /* Compute the transition matrix and eta for 30 divsteps (variable time). @@ -453,19 +453,19 @@ static void secp256k1_modinv32_update_fg_30_var(int len, secp256k1_modinv32_sign /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { - /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */ secp256k1_modinv32_signed30 d = {{0}}; secp256k1_modinv32_signed30 e = {{1}}; secp256k1_modinv32_signed30 f = modinfo->modulus; secp256k1_modinv32_signed30 g = *x; int i; - int32_t eta = -1; + int32_t zeta = -1; /* zeta = -(delta+1/2); delta is initially 1/2. */ - /* Do 25 iterations of 30 divsteps each = 750 divsteps. 724 suffices for 256-bit inputs. */ - for (i = 0; i < 25; ++i) { - /* Compute transition matrix and new eta after 30 divsteps. */ + /* Do 20 iterations of 30 divsteps each = 600 divsteps. 590 suffices for 256-bit inputs. */ + for (i = 0; i < 20; ++i) { + /* Compute transition matrix and new zeta after 30 divsteps. */ secp256k1_modinv32_trans2x2 t; - eta = secp256k1_modinv32_divsteps_30(eta, f.v[0], g.v[0], &t); + zeta = secp256k1_modinv32_divsteps_30(zeta, f.v[0], g.v[0], &t); /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. 
*/ @@ -515,7 +515,7 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256 int i = 0; #endif int j, len = 9; - int32_t eta = -1; + int32_t eta = -1; /* eta = -delta; delta is initially 1 (faster for the variable-time code) */ int32_t cond, fn, gn; /* Do iterations of 30 divsteps each until g=0. */ diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index 78505fa18..a88184361 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -145,17 +145,17 @@ typedef struct { int64_t u, v, q, r; } secp256k1_modinv64_trans2x2; -/* Compute the transition matrix and eta for 62 divsteps. +/* Compute the transition matrix and zeta for 62 divsteps (where zeta=-(delta+1/2)). * - * Input: eta: initial eta - * f0: bottom limb of initial f - * g0: bottom limb of initial g + * Input: zeta: initial zeta + * f0: bottom limb of initial f + * g0: bottom limb of initial g * Output: t: transition matrix - * Return: final eta + * Return: final zeta * * Implements the divsteps_n_matrix function from the explanation. */ -static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { +static int64_t secp256k1_modinv64_divsteps_62(int64_t zeta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { /* u,v,q,r are the elements of the transformation matrix being built up, * starting with the identity matrix. Semantically they are signed integers * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This @@ -170,8 +170,8 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ VERIFY_CHECK((u * f0 + v * g0) == f << i); VERIFY_CHECK((q * f0 + r * g0) == g << i); - /* Compute conditional masks for (eta < 0) and for (g & 1). */ - c1 = eta >> 63; + /* Compute conditional masks for (zeta < 0) and for (g & 1). */ + c1 = zeta >> 63; c2 = -(g & 1); /* Compute x,y,z, conditionally negated versions of f,u,v. */ x = (f ^ c1) - c1; @@ -181,10 +181,10 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g += x & c2; q += y & c2; r += z & c2; - /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ + /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */ c1 &= c2; - /* Conditionally negate eta, and unconditionally subtract 1. */ - eta = (eta ^ c1) - (c1 + 1); + /* Conditionally change zeta into -zeta-2 or zeta-1. */ + zeta = (zeta ^ c1) - 1; /* Conditionally add g,q,r to f,u,v. */ f += g & c1; u += q & c1; @@ -193,8 +193,8 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g >>= 1; u <<= 1; v <<= 1; - /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */ - VERIFY_CHECK(eta >= -745 && eta <= 745); + /* Bounds on zeta that follow from the bounds on iteration count (max 10*62 divsteps). */ + VERIFY_CHECK(zeta >= -621 && zeta <= 621); } /* Return data in t and return value. */ t->u = (int64_t)u; @@ -206,10 +206,10 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t * will be divided out again). As each divstep's individual matrix has determinant 2, the * aggregate of 62 of them will have determinant 2^62. */ VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62); - return eta; + return zeta; } -/* Compute the transition matrix and eta for 62 divsteps (variable time). +/* Compute the transition matrix and eta for 62 divsteps (variable time, eta=-delta). 
* * Input: eta: initial eta * f0: bottom limb of initial f @@ -455,19 +455,19 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { - /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */ secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; int i; - int64_t eta = -1; + int64_t zeta = -1; /* zeta = -(delta+1/2); delta starts at 1/2. */ - /* Do 12 iterations of 62 divsteps each = 744 divsteps. 724 suffices for 256-bit inputs. */ - for (i = 0; i < 12; ++i) { - /* Compute transition matrix and new eta after 62 divsteps. */ + /* Do 10 iterations of 62 divsteps each = 620 divsteps. 590 suffices for 256-bit inputs. */ + for (i = 0; i < 10; ++i) { + /* Compute transition matrix and new zeta after 62 divsteps. */ secp256k1_modinv64_trans2x2 t; - eta = secp256k1_modinv64_divsteps_62(eta, f.v[0], g.v[0], &t); + zeta = secp256k1_modinv64_divsteps_62(zeta, f.v[0], g.v[0], &t); /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */ @@ -517,7 +517,7 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256 int i = 0; #endif int j, len = 5; - int64_t eta = -1; + int64_t eta = -1; /* eta = -delta; delta is initially 1 */ int64_t cond, fn, gn; /* Do iterations of 62 divsteps each until g=0. */ From cd393ce2283e0e7234ea39a15c4931715f4dde1e Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 15 Jan 2021 15:20:39 -0800 Subject: [PATCH 23/59] Optimization: only do 59 hddivsteps per iteration instead of 62 --- src/modinv64_impl.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index a88184361..0743a9c82 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -145,7 +145,8 @@ typedef struct { int64_t u, v, q, r; } secp256k1_modinv64_trans2x2; -/* Compute the transition matrix and zeta for 62 divsteps (where zeta=-(delta+1/2)). +/* Compute the transition matrix and eta for 59 divsteps (where zeta=-(delta+1/2)). + * Note that the transformation matrix is scaled by 2^62 and not 2^59. * * Input: zeta: initial zeta * f0: bottom limb of initial f @@ -155,18 +156,19 @@ typedef struct { * * Implements the divsteps_n_matrix function from the explanation. */ -static int64_t secp256k1_modinv64_divsteps_62(int64_t zeta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { +static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { /* u,v,q,r are the elements of the transformation matrix being built up, - * starting with the identity matrix. Semantically they are signed integers + * starting with the identity matrix times 8 (because the caller expects + * a result scaled by 2^62). Semantically they are signed integers * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This * permits left shifting (which is UB for negative numbers). The range * being inside [-2^63,2^63) means that casting to signed works correctly. 
*/
- uint64_t u = 1, v = 0, q = 0, r = 1;
+ uint64_t u = 8, v = 0, q = 0, r = 8;
 uint64_t c1, c2, f = f0, g = g0, x, y, z;
 int i;
- for (i = 0; i < 62; ++i) {
+ for (i = 3; i < 62; ++i) {
 VERIFY_CHECK((f & 1) == 1); /* f must always be odd */
 VERIFY_CHECK((u * f0 + v * g0) == f << i);
 VERIFY_CHECK((q * f0 + r * g0) == g << i);
@@ -193,8 +195,8 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t zeta, uint64_t f0, uint64_
 g >>= 1;
 u <<= 1;
 v <<= 1;
- /* Bounds on zeta that follow from the bounds on iteration count (max 10*62 divsteps). */
- VERIFY_CHECK(zeta >= -621 && zeta <= 621);
+ /* Bounds on zeta that follow from the bounds on iteration count (max 10*59 divsteps). */
+ VERIFY_CHECK(zeta >= -591 && zeta <= 591);
 }
 /* Return data in t and return value. */
 t->u = (int64_t)u;
@@ -204,8 +206,10 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t zeta, uint64_t f0, uint64_
 /* The determinant of t must be a power of two. This guarantees that multiplication with t
 * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
 * will be divided out again). As each divstep's individual matrix has determinant 2, the
- * aggregate of 62 of them will have determinant 2^62. */
- VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62);
+ * aggregate of 59 of them will have determinant 2^59. Multiplying with the initial
+ * 8*identity (which has determinant 2^6) means the overall output has determinant
+ * 2^65. */
+ VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 65);
 return zeta;
 }

@@ -290,7 +294,7 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint
 return eta;
 }

-/* Compute (t/2^62) * [d, e] mod modulus, where t is a transition matrix for 62 divsteps.
+/* Compute (t/2^62) * [d, e] mod modulus, where t is a transition matrix scaled by 2^62.
 *
 * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range
 * (-2^62,2^62).
@@ -376,7 +380,7 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
 #endif
 }

-/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
+/* Compute (t/2^62) * [f, g], where t is a transition matrix scaled by 2^62.
 *
 * This implements the update_fg function from the explanation.
 */
@@ -463,11 +467,11 @@ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_m
 int i;
 int64_t zeta = -1; /* zeta = -(delta+1/2); delta starts at 1/2. */

- /* Do 10 iterations of 62 divsteps each = 620 divsteps. 590 suffices for 256-bit inputs. */
+ /* Do 10 iterations of 59 divsteps each = 590 divsteps. This suffices for 256-bit inputs. */
 for (i = 0; i < 10; ++i) {
- /* Compute transition matrix and new zeta after 62 divsteps. */
+ /* Compute transition matrix and new zeta after 59 divsteps. */
 secp256k1_modinv64_trans2x2 t;
- zeta = secp256k1_modinv64_divsteps_62(zeta, f.v[0], g.v[0], &t);
+ zeta = secp256k1_modinv64_divsteps_59(zeta, f.v[0], g.v[0], &t);
 /* Update d,e using that transition matrix. */
 secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo);
 /* Update f,g using that transition matrix.
*/ From be0609fd54af95a15b76cea150e6907d581318dd Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Thu, 25 Mar 2021 22:50:15 -0700 Subject: [PATCH 24/59] Add unit tests for edge cases with delta=1/2 variant of divsteps --- src/tests.c | 701 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 663 insertions(+), 38 deletions(-) diff --git a/src/tests.c b/src/tests.c index f19ab96e3..a14639430 100644 --- a/src/tests.c +++ b/src/tests.c @@ -914,6 +914,8 @@ int coprime(const uint16_t* a, const uint16_t* b) { void run_modinv_tests(void) { /* Fixed test cases. Each tuple is (input, modulus, output), each as 16x16 bits in LE order. */ static const uint16_t CASES[][3][16] = { + /* Test cases triggering edge cases in divsteps */ + /* Test case known to need 713 divsteps */ {{0x1513, 0x5389, 0x54e9, 0x2798, 0x1957, 0x66a0, 0x8057, 0x3477, 0x7784, 0x1052, 0x326a, 0x9331, 0x6506, 0xa95c, 0x91f3, 0xfb5e}, @@ -936,6 +938,236 @@ void run_modinv_tests(void) { 0x8a87, 0x1c9c, 0x51d7, 0x851c, 0xb9d8, 0x1fbe, 0xc241, 0xd4a3}, {0xcdb4, 0x275c, 0x7d22, 0xa906, 0x0173, 0xc054, 0x7fdf, 0x5005, 0x7fb8, 0x9059, 0xdf51, 0x99df, 0x2654, 0x8f6e, 0x070f, 0xb347}}, + /* example needing 713 divsteps; delta=-2..3 */ + {{0xe2e9, 0xee91, 0x4345, 0xe5ad, 0xf3ec, 0x8f42, 0x0364, 0xd5c9, + 0xff49, 0xbef5, 0x4544, 0x4c7c, 0xae4b, 0xfd9d, 0xb35b, 0xda9d}, + {0x36e7, 0x8cca, 0x2ed0, 0x47b3, 0xaca4, 0xb374, 0x7d2a, 0x0772, + 0x6bdb, 0xe0a7, 0x900b, 0xfe10, 0x788c, 0x6f22, 0xd909, 0xf298}, + {0xd8c6, 0xba39, 0x13ed, 0x198c, 0x16c8, 0xb837, 0xa5f2, 0x9797, + 0x0113, 0x882a, 0x15b5, 0x324c, 0xabee, 0xe465, 0x8170, 0x85ac}}, + /* example needing 713 divsteps; delta=-2..3 */ + {{0xd5b7, 0x2966, 0x040e, 0xf59a, 0x0387, 0xd96d, 0xbfbc, 0xd850, + 0x2d96, 0x872a, 0xad81, 0xc03c, 0xbb39, 0xb7fa, 0xd904, 0xef78}, + {0x6279, 0x4314, 0xfdd3, 0x1568, 0x0982, 0x4d13, 0x625f, 0x010c, + 0x22b1, 0x0cc3, 0xf22d, 0x5710, 0x1109, 0x5751, 0x7714, 0xfcf2}, + {0xdb13, 0x5817, 0x232e, 0xe456, 0xbbbc, 0x6fbe, 0x4572, 0xa358, + 0xc76d, 0x928e, 0x0162, 0x5314, 0x8325, 0x5683, 0xe21b, 0xda88}}, + /* example needing 713 divsteps; delta=-2..3 */ + {{0xa06f, 0x71ee, 0x3bac, 0x9ebb, 0xdeaa, 0x09ed, 0x1cf7, 0x9ec9, + 0x7158, 0x8b72, 0x5d53, 0x5479, 0x5c75, 0xbb66, 0x9125, 0xeccc}, + {0x2941, 0xd46c, 0x3cd4, 0x4a9d, 0x5c4a, 0x256b, 0xbd6c, 0x9b8e, + 0x8fe0, 0x8a14, 0xffe8, 0x2496, 0x618d, 0xa9d7, 0x5018, 0xfb29}, + {0x437c, 0xbd60, 0x7590, 0x94bb, 0x0095, 0xd35e, 0xd4fe, 0xd6da, + 0x0d4e, 0x5342, 0x4cd2, 0x169b, 0x661c, 0x1380, 0xed2d, 0x85c1}}, + /* example reaching delta=-64..65; 661 divsteps */ + {{0xfde4, 0x68d6, 0x6c48, 0x7f77, 0x1c78, 0x96de, 0x2fd9, 0xa6c2, + 0xbbb5, 0xd319, 0x69cf, 0xd4b3, 0xa321, 0xcda0, 0x172e, 0xe530}, + {0xd9e3, 0x0f60, 0x3d86, 0xeeab, 0x25ee, 0x9582, 0x2d50, 0xfe16, + 0xd4e2, 0xe3ba, 0x94e2, 0x9833, 0x6c5e, 0x8982, 0x13b6, 0xe598}, + {0xe675, 0xf55a, 0x10f6, 0xabde, 0x5113, 0xecaa, 0x61ae, 0xad9f, + 0x0c27, 0xef33, 0x62e5, 0x211d, 0x08fa, 0xa78d, 0xc675, 0x8bae}}, + /* example reaching delta=-64..65; 661 divsteps */ + {{0x21bf, 0x52d5, 0x8fd4, 0xaa18, 0x156a, 0x7247, 0xebb8, 0x5717, + 0x4eb5, 0x1421, 0xb58f, 0x3b0b, 0x5dff, 0xe533, 0xb369, 0xd28a}, + {0x9f6b, 0xe463, 0x2563, 0xc74d, 0x6d81, 0x636a, 0x8fc8, 0x7a94, + 0x9429, 0x1585, 0xf35e, 0x7ff5, 0xb64f, 0x9720, 0xba74, 0xe108}, + {0xa5ab, 0xea7b, 0xfe5e, 0x8a85, 0x13be, 0x7934, 0xe8a0, 0xa187, + 0x86b5, 0xe477, 0xb9a4, 0x75d7, 0x538f, 0xdd70, 0xc781, 0xb67d}}, + /* example reaching delta=-64..65; 661 divsteps */ + {{0xa41a, 0x3e8d, 0xf1f5, 0x9493, 0x868c, 0x5103, 
0x2725, 0x3ceb, + 0x6032, 0x3624, 0xdc6b, 0x9120, 0xbf4c, 0x8821, 0x91ad, 0xb31a}, + {0x5c0b, 0xdda5, 0x20f8, 0x32a1, 0xaf73, 0x6ec5, 0x4779, 0x43d6, + 0xd454, 0x9573, 0xbf84, 0x5a58, 0xe04e, 0x307e, 0xd1d5, 0xe230}, + {0xda15, 0xbcd6, 0x7180, 0xabd3, 0x04e6, 0x6986, 0xc0d7, 0x90bb, + 0x3a4d, 0x7c95, 0xaaab, 0x9ab3, 0xda34, 0xa7f6, 0x9636, 0x6273}}, + /* example doing 123 consecutive (f,g/2) steps; 615 divsteps */ + {{0xb4d6, 0xb38f, 0x00aa, 0xebda, 0xd4c2, 0x70b8, 0x9dad, 0x58ee, + 0x68f8, 0x48d3, 0xb5ff, 0xf422, 0x9e46, 0x2437, 0x18d0, 0xd9cc}, + {0x5c83, 0xfed7, 0x97f5, 0x3f07, 0xcaad, 0x95b1, 0xb4a4, 0xb005, + 0x23af, 0xdd27, 0x6c0d, 0x932c, 0xe2b2, 0xe3ae, 0xfb96, 0xdf67}, + {0x3105, 0x0127, 0xfd48, 0x039b, 0x35f1, 0xbc6f, 0x6c0a, 0xb572, + 0xe4df, 0xebad, 0x8edc, 0xb89d, 0x9555, 0x4c26, 0x1fef, 0x997c}}, + /* example doing 123 consecutive (f,g/2) steps; 614 divsteps */ + {{0x5138, 0xd474, 0x385f, 0xc964, 0x00f2, 0x6df7, 0x862d, 0xb185, + 0xb264, 0xe9e1, 0x466c, 0xf39e, 0xafaf, 0x5f41, 0x47e2, 0xc89d}, + {0x8607, 0x9c81, 0x46a2, 0x7dcc, 0xcb0c, 0x9325, 0xe149, 0x2bde, + 0x6632, 0x2869, 0xa261, 0xb163, 0xccee, 0x22ae, 0x91e0, 0xcfd5}, + {0x831c, 0xda22, 0xb080, 0xba7a, 0x26e2, 0x54b0, 0x073b, 0x5ea0, + 0xed4b, 0xcb3d, 0xbba1, 0xbec8, 0xf2ad, 0xae0d, 0x349b, 0x17d1}}, + /* example doing 123 consecutive (f,g/2) steps; 614 divsteps */ + {{0xe9a5, 0xb4ad, 0xd995, 0x9953, 0xcdff, 0x50d7, 0xf715, 0x9dc7, + 0x3e28, 0x15a9, 0x95a3, 0x8554, 0x5b5e, 0xad1d, 0x6d57, 0x3d50}, + {0x3ad9, 0xbd60, 0x5cc7, 0x6b91, 0xadeb, 0x71f6, 0x7cc4, 0xa58a, + 0x2cce, 0xf17c, 0x38c9, 0x97ed, 0x65fb, 0x3fa6, 0xa6bc, 0xeb24}, + {0xf96c, 0x1963, 0x8151, 0xa0cc, 0x299b, 0xf277, 0x001a, 0x16bb, + 0xfd2e, 0x532d, 0x0410, 0xe117, 0x6b00, 0x44ec, 0xca6a, 0x1745}}, + /* example doing 446 (f,g/2) steps; 523 divsteps */ + {{0x3758, 0xa56c, 0xe41e, 0x4e47, 0x0975, 0xa82b, 0x107c, 0x89cf, + 0x2093, 0x5a0c, 0xda37, 0xe007, 0x6074, 0x4f68, 0x2f5a, 0xbb8a}, + {0x4beb, 0xa40f, 0x2c42, 0xd9d6, 0x97e8, 0xca7c, 0xd395, 0x894f, + 0x1f50, 0x8067, 0xa233, 0xb850, 0x1746, 0x1706, 0xbcda, 0xdf32}, + {0x762a, 0xceda, 0x4c45, 0x1ca0, 0x8c37, 0xd8c5, 0xef57, 0x7a2c, + 0x6e98, 0xe38a, 0xc50e, 0x2ca9, 0xcb85, 0x24d5, 0xc29c, 0x61f6}}, + /* example doing 446 (f,g/2) steps; 523 divsteps */ + {{0x6f38, 0x74ad, 0x7332, 0x4073, 0x6521, 0xb876, 0xa370, 0xa6bd, + 0xcea5, 0xbd06, 0x969f, 0x77c6, 0x1e69, 0x7c49, 0x7d51, 0xb6e7}, + {0x3f27, 0x4be4, 0xd81e, 0x1396, 0xb21f, 0x92aa, 0x6dc3, 0x6283, + 0x6ada, 0x3ca2, 0xc1e5, 0x8b9b, 0xd705, 0x5598, 0x8ba1, 0xe087}, + {0x6a22, 0xe834, 0xbc8d, 0xcee9, 0x42fc, 0xfc77, 0x9c45, 0x1ca8, + 0xeb66, 0xed74, 0xaaf9, 0xe75f, 0xfe77, 0x46d2, 0x179b, 0xbf3e}}, + /* example doing 336 (f,(f+g)/2) steps; 693 divsteps */ + {{0x7ea7, 0x444e, 0x84ea, 0xc447, 0x7c1f, 0xab97, 0x3de6, 0x5878, + 0x4e8b, 0xc017, 0x03e0, 0xdc40, 0xbbd0, 0x74ce, 0x0169, 0x7ab5}, + {0x4023, 0x154f, 0xfbe4, 0x8195, 0xfda0, 0xef54, 0x9e9a, 0xc703, + 0x2803, 0xf760, 0x6302, 0xed5b, 0x7157, 0x6456, 0xdd7d, 0xf14b}, + {0xb6fb, 0xe3b3, 0x0733, 0xa77e, 0x44c5, 0x3003, 0xc937, 0xdd4d, + 0x5355, 0x14e9, 0x184e, 0xcefe, 0xe6b5, 0xf2e0, 0x0a28, 0x5b74}}, + /* example doing 336 (f,(f+g)/2) steps; 687 divsteps */ + {{0xa893, 0xb5f4, 0x1ede, 0xa316, 0x242c, 0xbdcc, 0xb017, 0x0836, + 0x3a37, 0x27fb, 0xfb85, 0x251e, 0xa189, 0xb15d, 0xa4b8, 0xc24c}, + {0xb0b7, 0x57ba, 0xbb6d, 0x9177, 0xc896, 0xc7f2, 0x43b4, 0x85a6, + 0xe6c4, 0xe50e, 0x3109, 0x7ca5, 0xd73d, 0x13ff, 0x0c3d, 0xcd62}, + {0x48ca, 0xdb34, 0xe347, 0x2cef, 0x4466, 0x10fb, 0x7ee1, 0x6344, + 0x4308, 0x966d, 
0xd4d1, 0xb099, 0x994f, 0xd025, 0x2187, 0x5866}}, + /* example doing 267 (g,(g-f)/2) steps; 678 divsteps */ + {{0x0775, 0x1754, 0x01f6, 0xdf37, 0xc0be, 0x8197, 0x072f, 0x6cf5, + 0x8b36, 0x8069, 0x5590, 0xb92d, 0x6084, 0x47a4, 0x23fe, 0xddd5}, + {0x8e1b, 0xda37, 0x27d9, 0x312e, 0x3a2f, 0xef6d, 0xd9eb, 0x8153, + 0xdcba, 0x9fa3, 0x9f80, 0xead5, 0x134d, 0x2ebb, 0x5ec0, 0xe032}, + {0x1cb6, 0x5a61, 0x1bed, 0x77d6, 0xd5d1, 0x7498, 0xef33, 0x2dd2, + 0x1089, 0xedbd, 0x6958, 0x16ae, 0x336c, 0x45e6, 0x4361, 0xbadc}}, + /* example doing 267 (g,(g-f)/2) steps; 676 divsteps */ + {{0x0207, 0xf948, 0xc430, 0xf36b, 0xf0a7, 0x5d36, 0x751f, 0x132c, + 0x6f25, 0xa630, 0xca1f, 0xc967, 0xaf9c, 0x34e7, 0xa38f, 0xbe9f}, + {0x5fb9, 0x7321, 0x6561, 0x5fed, 0x54ec, 0x9c3a, 0xee0e, 0x6717, + 0x49af, 0xb896, 0xf4f5, 0x451c, 0x722a, 0xf116, 0x64a9, 0xcf0b}, + {0xf4d7, 0xdb47, 0xfef2, 0x4806, 0x4cb8, 0x18c7, 0xd9a7, 0x4951, + 0x14d8, 0x5c3a, 0xd22d, 0xd7b2, 0x750c, 0x3de7, 0x8b4a, 0x19aa}}, + + /* Test cases triggering edge cases in divsteps variant starting with delta=1/2 */ + + /* example needing 590 divsteps; delta=-5/2..7/2 */ + {{0x9118, 0xb640, 0x53d7, 0x30ab, 0x2a23, 0xd907, 0x9323, 0x5b3a, + 0xb6d4, 0x538a, 0x7637, 0xfe97, 0xfd05, 0x3cc0, 0x453a, 0xfb7e}, + {0x6983, 0x4f75, 0x4ad1, 0x48ad, 0xb2d9, 0x521d, 0x3dbc, 0x9cc0, + 0x4b60, 0x0ac6, 0xd3be, 0x0fb6, 0xd305, 0x3895, 0x2da5, 0xfdf8}, + {0xcec1, 0x33ac, 0xa801, 0x8194, 0xe36c, 0x65ef, 0x103b, 0xca54, + 0xfa9b, 0xb41d, 0x9b52, 0xb6f7, 0xa611, 0x84aa, 0x3493, 0xbf54}}, + /* example needing 590 divsteps; delta=-3/2..5/2 */ + {{0xb5f2, 0x42d0, 0x35e8, 0x8ca0, 0x4b62, 0x6e1d, 0xbdf3, 0x890e, + 0x8c82, 0x23d8, 0xc79a, 0xc8e8, 0x789e, 0x353d, 0x9766, 0xea9d}, + {0x6fa1, 0xacba, 0x4b7a, 0x5de1, 0x95d0, 0xc845, 0xebbf, 0x6f5a, + 0x30cf, 0x52db, 0x69b7, 0xe278, 0x4b15, 0x8411, 0x2ab2, 0xf3e7}, + {0xf12c, 0x9d6d, 0x95fa, 0x1878, 0x9f13, 0x4fb5, 0x3c8b, 0xa451, + 0x7182, 0xc4b6, 0x7e2a, 0x7bb7, 0x6e0e, 0x5b68, 0xde55, 0x9927}}, + /* example needing 590 divsteps; delta=-3/2..5/2 */ + {{0x229c, 0x4ef8, 0x1e93, 0xe5dc, 0xcde5, 0x6d62, 0x263b, 0xad11, + 0xced0, 0x88ff, 0xae8e, 0x3183, 0x11d2, 0xa50b, 0x350d, 0xeb40}, + {0x3157, 0xe2ea, 0x8a02, 0x0aa3, 0x5ae1, 0xb26c, 0xea27, 0x6805, + 0x87e2, 0x9461, 0x37c1, 0x2f8d, 0x85d2, 0x77a8, 0xf805, 0xeec9}, + {0x6f4e, 0x2748, 0xf7e5, 0xd8d3, 0xabe2, 0x7270, 0xc4e0, 0xedc7, + 0xf196, 0x78ca, 0x9139, 0xd8af, 0x72c6, 0xaf2f, 0x85d2, 0x6cd3}}, + /* example needing 590 divsteps; delta=-5/2..7/2 */ + {{0xdce8, 0xf1fe, 0x6708, 0x021e, 0xf1ca, 0xd609, 0x5443, 0x85ce, + 0x7a05, 0x8f9c, 0x90c3, 0x52e7, 0x8e1d, 0x97b8, 0xc0bf, 0xf2a1}, + {0xbd3d, 0xed11, 0x1625, 0xb4c5, 0x844c, 0xa413, 0x2569, 0xb9ba, + 0xcd35, 0xff84, 0xcd6e, 0x7f0b, 0x7d5d, 0x10df, 0x3efe, 0xfbe5}, + {0xa9dd, 0xafef, 0xb1b7, 0x4c8d, 0x50e4, 0xafbf, 0x2d5a, 0xb27c, + 0x0653, 0x66b6, 0x5d36, 0x4694, 0x7e35, 0xc47c, 0x857f, 0x32c5}}, + /* example needing 590 divsteps; delta=-3/2..5/2 */ + {{0x7902, 0xc9f8, 0x926b, 0xaaeb, 0x90f8, 0x1c89, 0xcce3, 0x96b7, + 0x28b2, 0x87a2, 0x136d, 0x695a, 0xa8df, 0x9061, 0x9e31, 0xee82}, + {0xd3a9, 0x3c02, 0x818c, 0x6b81, 0x34b3, 0xebbb, 0xe2c8, 0x7712, + 0xbfd6, 0x8248, 0xa6f4, 0xba6f, 0x03bb, 0xfb54, 0x7575, 0xfe89}, + {0x8246, 0x0d63, 0x478e, 0xf946, 0xf393, 0x0451, 0x08c2, 0x5919, + 0x5fd6, 0x4c61, 0xbeb7, 0x9a15, 0x30e1, 0x55fc, 0x6a01, 0x3724}}, + /* example reaching delta=-127/2..129/2; 571 divsteps */ + {{0x3eff, 0x926a, 0x77f5, 0x1fff, 0x1a5b, 0xf3ef, 0xf64b, 0x8681, + 0xf800, 0xf9bc, 0x761d, 0xe268, 0x62b0, 0xa032, 0xba9c, 0xbe56}, + 
{0xb8f9, 0x00e7, 0x47b7, 0xdffc, 0xfd9d, 0x5abb, 0xa19b, 0x1868, + 0x31fd, 0x3b29, 0x3674, 0x5449, 0xf54d, 0x1d19, 0x6ac7, 0xff6f}, + {0xf1d7, 0x3551, 0x5682, 0x9adf, 0xe8aa, 0x19a5, 0x8340, 0x71db, + 0xb7ab, 0x4cfd, 0xf661, 0x632c, 0xc27e, 0xd3c6, 0xdf42, 0xd306}}, + /* example reaching delta=-127/2..129/2; 571 divsteps */ + {{0x0000, 0x0000, 0x0000, 0x0000, 0x3aff, 0x2ed7, 0xf2e0, 0xabc7, + 0x8aee, 0x166e, 0x7ed0, 0x9ac7, 0x714a, 0xb9c5, 0x4d58, 0xad6c}, + {0x9cf9, 0x47e2, 0xa421, 0xb277, 0xffc2, 0x2747, 0x6486, 0x94c1, + 0x1d99, 0xd49b, 0x1096, 0x991a, 0xe986, 0xae02, 0xe89b, 0xea36}, + {0x1fb4, 0x98d8, 0x19b7, 0x80e9, 0xcdac, 0xaa5a, 0xf1e6, 0x0074, + 0xe393, 0xed8b, 0x8d5c, 0xe17d, 0x81b3, 0xc16d, 0x54d3, 0x9be3}}, + /* example reaching delta=-127/2..129/2; 571 divsteps */ + {{0xd047, 0x7e36, 0x3157, 0x7ab6, 0xb4d9, 0x8dae, 0x7534, 0x4f5d, + 0x489e, 0xa8ab, 0x8a3d, 0xd52c, 0x62af, 0xa032, 0xba9c, 0xbe56}, + {0xb1f1, 0x737f, 0x5964, 0x5afb, 0x3712, 0x8ef9, 0x19f7, 0x9669, + 0x664d, 0x03ad, 0xc352, 0xf7a5, 0xf545, 0x1d19, 0x6ac7, 0xff6f}, + {0xa834, 0x5256, 0x27bc, 0x33bd, 0xba11, 0x5a7b, 0x791e, 0xe6c0, + 0x9ac4, 0x9370, 0x1130, 0x28b4, 0x2b2e, 0x231b, 0x082a, 0x796e}}, + /* example doing 123 consecutive (f,g/2) steps; 554 divsteps */ + {{0x6ab1, 0x6ea0, 0x1a99, 0xe0c2, 0xdd45, 0x645d, 0x8dbc, 0x466a, + 0xfa64, 0x4289, 0xd3f7, 0xfc8f, 0x2894, 0xe3c5, 0xa008, 0xcc14}, + {0xc75f, 0xc083, 0x4cc2, 0x64f2, 0x2aff, 0x4c12, 0x8461, 0xc4ae, + 0xbbfa, 0xb336, 0xe4b2, 0x3ac5, 0x2c22, 0xf56c, 0x5381, 0xe943}, + {0xcd80, 0x760d, 0x4395, 0xb3a6, 0xd497, 0xf583, 0x82bd, 0x1daa, + 0xbe92, 0x2613, 0xfdfb, 0x869b, 0x0425, 0xa333, 0x7056, 0xc9c5}}, + /* example doing 123 consecutive (f,g/2) steps; 554 divsteps */ + {{0x71d4, 0x64df, 0xec4f, 0x74d8, 0x7e0c, 0x40d3, 0x7073, 0x4cc8, + 0x2a2a, 0xb1ff, 0x8518, 0x6513, 0xb0ea, 0x640a, 0x62d9, 0xd5f4}, + {0xdc75, 0xd937, 0x3b13, 0x1d36, 0xdf83, 0xd034, 0x1c1c, 0x4332, + 0x4cc3, 0xeeec, 0x7d94, 0x6771, 0x3384, 0x74b0, 0x947d, 0xf2c4}, + {0x0a82, 0x37a4, 0x12d5, 0xec97, 0x972c, 0xe6bf, 0xc348, 0xa0a9, + 0xc50c, 0xdc7c, 0xae30, 0x19d1, 0x0fca, 0x35e1, 0xd6f6, 0x81ee}}, + /* example doing 123 consecutive (f,g/2) steps; 554 divsteps */ + {{0xa6b1, 0xabc5, 0x5bbc, 0x7f65, 0xdd32, 0xaa73, 0xf5a3, 0x1982, + 0xced4, 0xe949, 0x0fd6, 0x2bc4, 0x2bd7, 0xe3c5, 0xa008, 0xcc14}, + {0x4b5f, 0x8f96, 0xa375, 0xfbcf, 0x1c7d, 0xf1ec, 0x03f5, 0xb35d, + 0xb999, 0xdb1f, 0xc9a1, 0xb4c7, 0x1dd5, 0xf56c, 0x5381, 0xe943}, + {0xaa3d, 0x38b9, 0xf17d, 0xeed9, 0x9988, 0x69ee, 0xeb88, 0x1495, + 0x203f, 0x18c8, 0x82b7, 0xdcb2, 0x34a7, 0x6b00, 0x6998, 0x589a}}, + /* example doing 453 (f,g/2) steps; 514 divsteps */ + {{0xa478, 0xe60d, 0x3244, 0x60e6, 0xada3, 0xfe50, 0xb6b1, 0x2eae, + 0xd0ef, 0xa7b1, 0xef63, 0x05c0, 0xe213, 0x443e, 0x4427, 0x2448}, + {0x258f, 0xf9ef, 0xe02b, 0x92dd, 0xd7f3, 0x252b, 0xa503, 0x9089, + 0xedff, 0x96c1, 0xfe3a, 0x3a39, 0x198a, 0x981d, 0x0627, 0xedb7}, + {0x595a, 0x45be, 0x8fb0, 0x2265, 0xc210, 0x02b8, 0xdce9, 0xe241, + 0xcab6, 0xbf0d, 0x0049, 0x8d9a, 0x2f51, 0xae54, 0x5785, 0xb411}}, + /* example doing 453 (f,g/2) steps; 514 divsteps */ + {{0x48f0, 0x7db3, 0xdafe, 0x1c92, 0x5912, 0xe11a, 0xab52, 0xede1, + 0x3182, 0x8980, 0x5d2b, 0x9b5b, 0x8718, 0xda27, 0x1683, 0x1de2}, + {0x168f, 0x6f36, 0xce7a, 0xf435, 0x19d4, 0xda5e, 0x2351, 0x9af5, + 0xb003, 0x0ef5, 0x3b4c, 0xecec, 0xa9f0, 0x78e1, 0xdfef, 0xe823}, + {0x5f55, 0xfdcc, 0xb233, 0x2914, 0x84f0, 0x97d1, 0x9cf4, 0x2159, + 0xbf56, 0xb79c, 0x17a3, 0x7cef, 0xd5de, 0x34f0, 0x5311, 0x4c54}}, + /* example doing 510 (f,(f+g)/2) 
steps; 512 divsteps */ + {{0x2789, 0x2e04, 0x6e0e, 0xb6cd, 0xe4de, 0x4dbf, 0x228d, 0x7877, + 0xc335, 0x806b, 0x38cd, 0x8049, 0xa73b, 0xcfa2, 0x82f7, 0x9e19}, + {0xc08d, 0xb99d, 0xb8f3, 0x663d, 0xbbb3, 0x1284, 0x1485, 0x1d49, + 0xc98f, 0x9e78, 0x1588, 0x11e3, 0xd91a, 0xa2c7, 0xfff1, 0xc7b9}, + {0x1e1f, 0x411d, 0x7c49, 0x0d03, 0xe789, 0x2f8e, 0x5d55, 0xa95e, + 0x826e, 0x8de5, 0x52a0, 0x1abc, 0x4cd7, 0xd13a, 0x4395, 0x63e1}}, + /* example doing 510 (f,(f+g)/2) steps; 512 divsteps */ + {{0xd5a1, 0xf786, 0x555c, 0xb14b, 0x44ae, 0x535f, 0x4a49, 0xffc3, + 0xf497, 0x70d1, 0x57c8, 0xa933, 0xc85a, 0x1910, 0x75bf, 0x960b}, + {0xfe53, 0x5058, 0x496d, 0xfdff, 0x6fb8, 0x4100, 0x92bd, 0xe0c4, + 0xda89, 0xe0a4, 0x841b, 0x43d4, 0xa388, 0x957f, 0x99ca, 0x9abf}, + {0xe530, 0x05bc, 0xfeec, 0xfc7e, 0xbcd3, 0x1239, 0x54cb, 0x7042, + 0xbccb, 0x139e, 0x9076, 0x0203, 0x6068, 0x90c7, 0x1ddf, 0x488d}}, + /* example doing 228 (g,(g-f)/2) steps; 538 divsteps */ + {{0x9488, 0xe54b, 0x0e43, 0x81d2, 0x06e7, 0x4b66, 0x36d0, 0x53d6, + 0x2b68, 0x22ec, 0x3fa9, 0xc1a7, 0x9ad2, 0xa596, 0xb3ac, 0xdf42}, + {0xe31f, 0x0b28, 0x5f3b, 0xc1ff, 0x344c, 0xbf5f, 0xd2ec, 0x2936, + 0x9995, 0xdeb2, 0xae6c, 0x2852, 0xa2c6, 0xb306, 0x8120, 0xe305}, + {0xa56e, 0xfb98, 0x1537, 0x4d85, 0x619e, 0x866c, 0x3cd4, 0x779a, + 0xdd66, 0xa80d, 0xdc2f, 0xcae4, 0xc74c, 0x5175, 0xa65d, 0x605e}}, + /* example doing 228 (g,(g-f)/2) steps; 537 divsteps */ + {{0x8cd5, 0x376d, 0xd01b, 0x7176, 0x19ef, 0xcf09, 0x8403, 0x5e52, + 0x83c1, 0x44de, 0xb91e, 0xb33d, 0xe15c, 0x51e7, 0xbad8, 0x6359}, + {0x3b75, 0xf812, 0x5f9e, 0xa04e, 0x92d3, 0x226e, 0x540e, 0x7c9a, + 0x31c6, 0x46d2, 0x0b7b, 0xdb4a, 0xe662, 0x4950, 0x0265, 0xf76f}, + {0x09ed, 0x692f, 0xe8f1, 0x3482, 0xab54, 0x36b4, 0x8442, 0x6ae9, + 0x4329, 0x6505, 0x183b, 0x1c1d, 0x482d, 0x7d63, 0xb44f, 0xcc09}}, + + /* Test cases with the group order as modulus. */ + /* Test case with the group order as modulus, needing 635 divsteps. */ {{0x95ed, 0x6c01, 0xd113, 0x5ff1, 0xd7d0, 0x29cc, 0x5817, 0x6120, 0xca8e, 0xaad1, 0x25ae, 0x8e84, 0x9af6, 0x30bf, 0xf0ed, 0x1686}, @@ -943,6 +1175,59 @@ void run_modinv_tests(void) { 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, {0x1631, 0xbf4a, 0x286a, 0x2716, 0x469f, 0x2ac8, 0x1312, 0xe9bc, 0x04f4, 0x304b, 0x9931, 0x113b, 0xd932, 0xc8f4, 0x0d0d, 0x01a1}}, + /* example with group size as modulus needing 631 divsteps */ + {{0x85ed, 0xc284, 0x9608, 0x3c56, 0x19b6, 0xbb5b, 0x2850, 0xdab7, + 0xa7f5, 0xe9ab, 0x06a4, 0x5bbb, 0x1135, 0xa186, 0xc424, 0xc68b}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x8479, 0x450a, 0x8fa3, 0xde05, 0xb2f5, 0x7793, 0x7269, 0xbabb, + 0xc3b3, 0xd49b, 0x3377, 0x03c6, 0xe694, 0xc760, 0xd3cb, 0x2811}}, + /* example with group size as modulus needing 565 divsteps starting at delta=1/2 */ + {{0x8432, 0x5ceb, 0xa847, 0x6f1e, 0x51dd, 0x535a, 0x6ddc, 0x70ce, + 0x6e70, 0xc1f6, 0x18f2, 0x2a7e, 0xc8e7, 0x39f8, 0x7e96, 0xebbf}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x257e, 0x449f, 0x689f, 0x89aa, 0x3989, 0xb661, 0x376c, 0x1e32, + 0x654c, 0xee2e, 0xf4e2, 0x33c8, 0x3f2f, 0x9716, 0x6046, 0xcaa3}}, + /* Test case with the group size as modulus, needing 981 divsteps with + broken eta handling. 
*/ + {{0xfeb9, 0xb877, 0xee41, 0x7fa3, 0x87da, 0x94c4, 0x9d04, 0xc5ae, + 0x5708, 0x0994, 0xfc79, 0x0916, 0xbf32, 0x3ad8, 0xe11c, 0x5ca2}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0f12, 0x075e, 0xce1c, 0x6f92, 0xc80f, 0xca92, 0x9a04, 0x6126, + 0x4b6c, 0x57d6, 0xca31, 0x97f3, 0x1f99, 0xf4fd, 0xda4d, 0x42ce}}, + /* Test case with the group size as modulus, input = 0. */ + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the group size as modulus, input = 1. */ + {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the group size as modulus, input = 2. */ + {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x20a1, 0x681b, 0x2f46, 0xdfe9, 0x501d, 0x57a4, 0x6e73, 0x5d57, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}}, + /* Test case with the group size as modulus, input = group - 1. */ + {{0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + + /* Test cases with the field size as modulus. */ + /* Test case with the field size as modulus, needing 637 divsteps. 
*/ {{0x9ec3, 0x1919, 0xca84, 0x7c11, 0xf996, 0x06f3, 0x5408, 0x6688, 0x1320, 0xdb8a, 0x632a, 0x0dcb, 0x8a84, 0x6bee, 0x9c95, 0xe34e}, @@ -950,6 +1235,20 @@ void run_modinv_tests(void) { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, {0x18e5, 0x19b6, 0xdf92, 0x1aaa, 0x09fb, 0x8a3f, 0x52b0, 0x8701, 0xac0c, 0x2582, 0xda44, 0x9bcc, 0x6828, 0x1c53, 0xbd8f, 0xbd2c}}, + /* example with field size as modulus needing 637 divsteps */ + {{0xaec3, 0xa7cf, 0x2f2d, 0x0693, 0x5ad5, 0xa8ff, 0x7ec7, 0x30ff, + 0x0c8b, 0xc242, 0xcab2, 0x063a, 0xf86e, 0x6057, 0x9cbd, 0xf6d8}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0310, 0x579d, 0xcb38, 0x9030, 0x3ded, 0x9bb9, 0x1234, 0x63ce, + 0x0c63, 0x8e3d, 0xacfe, 0x3c20, 0xdc85, 0xf859, 0x919e, 0x1d45}}, + /* example with field size as modulus needing 564 divsteps starting at delta=1/2 */ + {{0x63ae, 0x8d10, 0x0071, 0xdb5c, 0xb454, 0x78d1, 0x744a, 0x5f8e, + 0xe4d8, 0x87b1, 0x8e62, 0x9590, 0xcede, 0xa070, 0x36b4, 0x7f6f}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xfdc8, 0xe8d5, 0xbe15, 0x9f86, 0xa5fe, 0xf18e, 0xa7ff, 0xd291, + 0xf4c2, 0x9c87, 0xf150, 0x073e, 0x69b8, 0xf7c4, 0xee4b, 0xc7e6}}, /* Test case with the field size as modulus, needing 935 divsteps with broken eta handling. */ {{0x1b37, 0xbdc3, 0x8bcd, 0x25e3, 0x1eae, 0x567d, 0x30b6, 0xf0d8, @@ -958,14 +1257,6 @@ void run_modinv_tests(void) { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, {0x1622, 0xe05b, 0xe880, 0x7de9, 0x3e45, 0xb682, 0xee6c, 0x67ed, 0xa179, 0x15db, 0x6b0d, 0xa656, 0x7ccb, 0x8ef7, 0xa2ff, 0xe279}}, - /* Test case with the group size as modulus, needing 981 divsteps with - broken eta handling. */ - {{0xfeb9, 0xb877, 0xee41, 0x7fa3, 0x87da, 0x94c4, 0x9d04, 0xc5ae, - 0x5708, 0x0994, 0xfc79, 0x0916, 0xbf32, 0x3ad8, 0xe11c, 0x5ca2}, - {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x0f12, 0x075e, 0xce1c, 0x6f92, 0xc80f, 0xca92, 0x9a04, 0x6126, - 0x4b6c, 0x57d6, 0xca31, 0x97f3, 0x1f99, 0xf4fd, 0xda4d, 0x42ce}}, /* Test case with the field size as modulus, input = 0. */ {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, @@ -994,34 +1285,219 @@ void run_modinv_tests(void) { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, {0xfc2e, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, - /* Test case with the group size as modulus, input = 0. */ - {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, - {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, - /* Test case with the group size as modulus, input = 1. 
*/ - {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, - {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, - /* Test case with the group size as modulus, input = 2. */ - {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, - {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x20a1, 0x681b, 0x2f46, 0xdfe9, 0x501d, 0x57a4, 0x6e73, 0x5d57, - 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}}, - /* Test case with the group size as modulus, input = group - 1. */ - {{0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, - {0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, - 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}} + + /* Selected from a large number of random inputs to reach small/large + * d/e values in various configurations. */ + {{0x3a08, 0x23e1, 0x4d8c, 0xe606, 0x3263, 0x67af, 0x9bf1, 0x9d70, + 0xf5fd, 0x12e4, 0x03c8, 0xb9ca, 0xe847, 0x8c5d, 0x6322, 0xbd30}, + {0x8359, 0x59dd, 0x1831, 0x7c1a, 0x1e83, 0xaee1, 0x770d, 0xcea8, + 0xfbb1, 0xeed6, 0x10b5, 0xe2c6, 0x36ea, 0xee17, 0xe32c, 0xffff}, + {0x1727, 0x0f36, 0x6f85, 0x5d0c, 0xca6c, 0x3072, 0x9628, 0x5842, + 0xcb44, 0x7c2b, 0xca4f, 0x62e5, 0x29b1, 0x6ffd, 0x9055, 0xc196}}, + {{0x905d, 0x41c8, 0xa2ff, 0x295b, 0x72bb, 0x4679, 0x6d01, 0x2c98, + 0xb3e0, 0xc537, 0xa310, 0xe07e, 0xe72f, 0x4999, 0x1148, 0xf65e}, + {0x5b41, 0x4239, 0x3c37, 0x5130, 0x30e3, 0xff35, 0xc51f, 0x1a43, + 0xdb23, 0x13cf, 0x9f49, 0xf70c, 0x5e70, 0xd411, 0x3005, 0xf8c6}, + {0xc30e, 0x68f0, 0x201a, 0xe10c, 0x864a, 0x6243, 0xe946, 0x43ae, + 0xf3f1, 0x52dc, 0x1f7f, 0x50d4, 0x2797, 0x064c, 0x5ca4, 0x90e3}}, + {{0xf1b5, 0xc6e5, 0xd2c4, 0xff95, 0x27c5, 0x0c92, 0x5d19, 0x7ae5, + 0x4fbe, 0x5438, 0x99e1, 0x880d, 0xd892, 0xa05c, 0x6ffd, 0x7eac}, + {0x2153, 0xcc9d, 0xfc6c, 0x8358, 0x49a1, 0x01e2, 0xcef0, 0x4969, + 0xd69a, 0x8cef, 0xf5b2, 0xfd95, 0xdcc2, 0x71f4, 0x6ae2, 0xceeb}, + {0x9b2e, 0xcdc6, 0x0a5c, 0x7317, 0x9084, 0xe228, 0x56cf, 0xd512, + 0x628a, 0xce21, 0x3473, 0x4e13, 0x8823, 0x1ed0, 0x34d0, 0xbfa3}}, + {{0x5bae, 0x53e5, 0x5f4d, 0x21ca, 0xb875, 0x8ecf, 0x9aa6, 0xbe3c, + 0x9f96, 0x7b82, 0x375d, 0x4d3e, 0x491c, 0xb1eb, 0x04c9, 0xb6c8}, + {0xfcfd, 0x10b7, 0x73b2, 0xd23b, 0xa357, 0x67da, 0x0d9f, 0x8702, + 0xa037, 0xff8e, 0x0e8b, 0x1801, 0x2c5c, 0x4e6e, 0x4558, 0xfff2}, + {0xc50f, 0x5654, 0x6713, 0x5ef5, 0xa7ce, 0xa647, 0xc832, 0x69ce, + 0x1d5c, 0x4310, 0x0746, 0x5a01, 0x96ea, 0xde4b, 0xa88b, 0x5543}}, + {{0xdc7f, 0x5e8c, 0x89d1, 0xb077, 0xd521, 0xcf90, 0x32fa, 0x5737, + 0x839e, 0x1464, 0x007c, 0x09c6, 0x9371, 0xe8ea, 0xc1cb, 0x75c4}, + {0xe3a3, 0x107f, 0xa82a, 0xa375, 0x4578, 0x60f4, 0x75c9, 0x5ee4, + 0x3fd7, 0x2736, 0x2871, 0xd3d2, 0x5f1d, 0x1abb, 0xa764, 0xffff}, + {0x45c6, 0x1f2e, 0xb14c, 0x84d7, 0x7bb7, 0x5a04, 0x0504, 0x3f33, + 0x5cc1, 0xb07a, 0x6a6c, 0x786f, 0x647f, 0xe1d7, 0x78a2, 0x4cf4}}, + {{0xc006, 0x356f, 0x8cd2, 0x967b, 0xb49e, 0x2d4e, 0x14bf, 0x4bcb, + 
0xddab, 0xd3f9, 0xa068, 0x2c1c, 0xd242, 0xa56d, 0xf2c7, 0x5f97}, + {0x465b, 0xb745, 0x0e0d, 0x69a9, 0x987d, 0xcb37, 0xf637, 0xb311, + 0xc4d6, 0x2ddb, 0xf68f, 0x2af9, 0x959d, 0x3f53, 0x98f2, 0xf640}, + {0xc0f2, 0x6bfb, 0xf5c3, 0x91c1, 0x6b05, 0x0825, 0x5ca0, 0x7df7, + 0x9d55, 0x6d9e, 0xfe94, 0x2ad9, 0xd9f0, 0xe68b, 0xa72b, 0xd1b2}}, + {{0x2279, 0x61ba, 0x5bc6, 0x136b, 0xf544, 0x717c, 0xafda, 0x02bd, + 0x79af, 0x1fad, 0xea09, 0x81bb, 0x932b, 0x32c9, 0xdf1d, 0xe576}, + {0x8215, 0x7817, 0xca82, 0x43b0, 0x9b06, 0xea65, 0x1291, 0x0621, + 0x0089, 0x46fe, 0xc5a6, 0xddd7, 0x8065, 0xc6a0, 0x214b, 0xfc64}, + {0x04bf, 0x6f2a, 0x86b2, 0x841a, 0x4a95, 0xc632, 0x97b7, 0x5821, + 0x2b18, 0x1bb0, 0x3e97, 0x935e, 0xcc7d, 0x066b, 0xd513, 0xc251}}, + {{0x76e8, 0x5bc2, 0x3eaa, 0x04fc, 0x9974, 0x92c1, 0x7c15, 0xfa89, + 0x1151, 0x36ee, 0x48b2, 0x049c, 0x5f16, 0xcee4, 0x925b, 0xe98e}, + {0x913f, 0x0a2d, 0xa185, 0x9fea, 0xda5a, 0x4025, 0x40d7, 0x7cfa, + 0x88ca, 0xbbe8, 0xb265, 0xb7e4, 0x6cb1, 0xed64, 0xc6f9, 0xffb5}, + {0x6ab1, 0x1a86, 0x5009, 0x152b, 0x1cc4, 0xe2c8, 0x960b, 0x19d0, + 0x3554, 0xc562, 0xd013, 0xcf91, 0x10e1, 0x7933, 0xe195, 0xcf49}}, + {{0x9cb5, 0xd2d7, 0xc6ed, 0xa818, 0xb495, 0x06ee, 0x0f4a, 0x06e3, + 0x4c5a, 0x80ce, 0xd49a, 0x4cd7, 0x7487, 0x92af, 0xe516, 0x676c}, + {0xd6e9, 0x6b85, 0x619a, 0xb52c, 0x20a0, 0x2f79, 0x3545, 0x1edd, + 0x5a6f, 0x8082, 0x9b80, 0xf8f8, 0xc78a, 0xd0a3, 0xadf4, 0xffff}, + {0x01c2, 0x2118, 0xef5e, 0xa877, 0x046a, 0xd2c2, 0x2ad5, 0x951c, + 0x8900, 0xa5c9, 0x8d0f, 0x6b61, 0x55d3, 0xd572, 0x48de, 0x9219}}, + {{0x5114, 0x0644, 0x23dd, 0x01d3, 0xc101, 0xa659, 0xea17, 0x640f, + 0xf767, 0x2644, 0x9cec, 0xd8ba, 0xd6da, 0x9156, 0x8aeb, 0x875a}, + {0xc1bf, 0xdae9, 0xe96b, 0xce77, 0xf7a1, 0x3e99, 0x5c2e, 0x973b, + 0xd048, 0x5bd0, 0x4e8a, 0xcb85, 0xce39, 0x37f5, 0x815d, 0xffff}, + {0x48cc, 0x35b6, 0x26d4, 0x2ea6, 0x50d6, 0xa2f9, 0x64b6, 0x03bf, + 0xd00c, 0xe057, 0x3343, 0xfb79, 0x3ce5, 0xf717, 0xc5af, 0xe185}}, + {{0x13ff, 0x6c76, 0x2077, 0x16e0, 0xd5ca, 0xf2ad, 0x8dba, 0x8f49, + 0x7887, 0x16f9, 0xb646, 0xfc87, 0xfa31, 0x5096, 0xf08c, 0x3fbe}, + {0x8139, 0x6fd7, 0xf6df, 0xa7bf, 0x6699, 0x5361, 0x6f65, 0x13c8, + 0xf4d1, 0xe28f, 0xc545, 0x0a8c, 0x5274, 0xb0a6, 0xffff, 0xffff}, + {0x22ca, 0x0cd6, 0xc1b5, 0xb064, 0x44a7, 0x297b, 0x495f, 0x34ac, + 0xfa95, 0xec62, 0xf08d, 0x621c, 0x66a6, 0xba94, 0x84c6, 0x8ee0}}, + {{0xaa30, 0x312e, 0x439c, 0x4e88, 0x2e2f, 0x32dc, 0xb880, 0xa28e, + 0xf795, 0xc910, 0xb406, 0x8dd7, 0xb187, 0xa5a5, 0x38f1, 0xe49e}, + {0xfb19, 0xf64a, 0xba6a, 0x8ec2, 0x7255, 0xce89, 0x2cf9, 0x9cba, + 0xe1fe, 0x50da, 0x1705, 0xac52, 0xe3d4, 0x4269, 0x0648, 0xfd77}, + {0xb4c8, 0x6e8a, 0x2b5f, 0x4c2d, 0x5a67, 0xa7bb, 0x7d6d, 0x5569, + 0xa0ea, 0x244a, 0xc0f2, 0xf73d, 0x58cf, 0xac7f, 0xd32b, 0x3018}}, + {{0xc953, 0x1ae1, 0xae46, 0x8709, 0x19c2, 0xa986, 0x9abe, 0x1611, + 0x0395, 0xd5ab, 0xf0f6, 0xb5b0, 0x5b2b, 0x0317, 0x80ba, 0x376d}, + {0xfe77, 0xbc03, 0xac2f, 0x9d00, 0xa175, 0x293d, 0x3b56, 0x0e3a, + 0x0a9c, 0xf40c, 0x690e, 0x1508, 0x95d4, 0xddc4, 0xe805, 0xffff}, + {0xb1ce, 0x0929, 0xa5fe, 0x4b50, 0x9d5d, 0x8187, 0x2557, 0x4376, + 0x11ba, 0xdcef, 0xc1f3, 0xd531, 0x1824, 0x93f6, 0xd81f, 0x8f83}}, + {{0xb8d2, 0xb900, 0x4a0c, 0x7188, 0xa5bf, 0x1b0b, 0x2ae5, 0xa35b, + 0x98e0, 0x610c, 0x86db, 0x2487, 0xa267, 0x002c, 0xebb6, 0xc5f4}, + {0x9cdd, 0x1c1b, 0x2f06, 0x43d1, 0xce47, 0xc334, 0x6e60, 0xc016, + 0x989e, 0x0ab2, 0x0cac, 0x1196, 0xe2d9, 0x2e04, 0xc62b, 0xffff}, + {0xdc36, 0x1f05, 0x6aa9, 0x7a20, 0x944f, 0x2fd3, 0xa553, 0xdb4f, + 0xbd5c, 0x3a75, 0x25d4, 0xe20e, 0xa387, 0x1410, 
0xdbb1, 0x1b60}}, + {{0x76b3, 0x2207, 0x4930, 0x5dd7, 0x65a0, 0xd55c, 0xb443, 0x53b7, + 0x5c22, 0x818a, 0xb2e7, 0x9de8, 0x9985, 0xed45, 0x33b1, 0x53e8}, + {0x7913, 0x44e1, 0xf15b, 0x5edd, 0x34f3, 0x4eba, 0x0758, 0x7104, + 0x32d9, 0x28f3, 0x4401, 0x85c5, 0xb695, 0xb899, 0xc0f2, 0xffff}, + {0x7f43, 0xd202, 0x24c9, 0x69f3, 0x74dc, 0x1a69, 0xeaee, 0x5405, + 0x1755, 0x4bb8, 0x04e3, 0x2fd2, 0xada8, 0x39eb, 0x5b4d, 0x96ca}}, + {{0x807b, 0x7112, 0xc088, 0xdafd, 0x02fa, 0x9d95, 0x5e42, 0xc033, + 0xde0a, 0xeecf, 0x8e90, 0x8da1, 0xb17e, 0x9a5b, 0x4c6d, 0x1914}, + {0x4871, 0xd1cb, 0x47d7, 0x327f, 0x09ec, 0x97bb, 0x2fae, 0xd346, + 0x6b78, 0x3707, 0xfeb2, 0xa6ab, 0x13df, 0x76b0, 0x8fb9, 0xffb3}, + {0x179e, 0xb63b, 0x4784, 0x231e, 0x9f42, 0x7f1a, 0xa3fb, 0xdd8c, + 0xd1eb, 0xb4c9, 0x8ca7, 0x018c, 0xf691, 0x576c, 0xa7d6, 0xce27}}, + {{0x5f45, 0x7c64, 0x083d, 0xedd5, 0x08a0, 0x0c64, 0x6c6f, 0xec3c, + 0xe2fb, 0x352c, 0x9303, 0x75e4, 0xb4e0, 0x8b09, 0xaca4, 0x7025}, + {0x1025, 0xb482, 0xfed5, 0xa678, 0x8966, 0x9359, 0x5329, 0x98bb, + 0x85b2, 0x73ba, 0x9982, 0x6fdc, 0xf190, 0xbe8c, 0xdc5c, 0xfd93}, + {0x83a2, 0x87a4, 0xa680, 0x52a1, 0x1ba1, 0x8848, 0x5db7, 0x9744, + 0x409c, 0x0745, 0x0e1e, 0x1cfc, 0x00cd, 0xf573, 0x2071, 0xccaa}}, + {{0xf61f, 0x63d4, 0x536c, 0x9eb9, 0x5ddd, 0xbb11, 0x9014, 0xe904, + 0xfe01, 0x6b45, 0x1858, 0xcb5b, 0x4c38, 0x43e1, 0x381d, 0x7f94}, + {0xf61f, 0x63d4, 0xd810, 0x7ca3, 0x8a04, 0x4b83, 0x11fc, 0xdf94, + 0x4169, 0xbd05, 0x608e, 0x7151, 0x4fbf, 0xb31a, 0x38a7, 0xa29b}, + {0xe621, 0xdfa5, 0x3d06, 0x1d03, 0x81e6, 0x00da, 0x53a6, 0x965e, + 0x93e5, 0x2164, 0x5b61, 0x59b8, 0xa629, 0x8d73, 0x699a, 0x6111}}, + {{0x4cc3, 0xd29e, 0xf4a3, 0x3428, 0x2048, 0xeec9, 0x5f50, 0x99a4, + 0x6de9, 0x05f2, 0x5aa9, 0x5fd2, 0x98b4, 0x1adc, 0x225f, 0x777f}, + {0xe649, 0x37da, 0x5ba6, 0x5765, 0x3f4a, 0x8a1c, 0x2e79, 0xf550, + 0x1a54, 0xcd1e, 0x7218, 0x3c3c, 0x6311, 0xfe28, 0x95fb, 0xed97}, + {0xe9b6, 0x0c47, 0x3f0e, 0x849b, 0x11f8, 0xe599, 0x5e4d, 0xd618, + 0xa06d, 0x33a0, 0x9a3e, 0x44db, 0xded8, 0x10f0, 0x94d2, 0x81fb}}, + {{0x2e59, 0x7025, 0xd413, 0x455a, 0x1ce3, 0xbd45, 0x7263, 0x27f7, + 0x23e3, 0x518e, 0xbe06, 0xc8c4, 0xe332, 0x4276, 0x68b4, 0xb166}, + {0x596f, 0x0cf6, 0xc8ec, 0x787b, 0x04c1, 0x473c, 0xd2b8, 0x8d54, + 0x9cdf, 0x77f2, 0xd3f3, 0x6735, 0x0638, 0xf80e, 0x9467, 0xc6aa}, + {0xc7e7, 0x1822, 0xb62a, 0xec0d, 0x89cd, 0x7846, 0xbfa2, 0x35d5, + 0xfa38, 0x870f, 0x494b, 0x1697, 0x8b17, 0xf904, 0x10b6, 0x9822}}, + {{0x6d5b, 0x1d4f, 0x0aaf, 0x807b, 0x35fb, 0x7ee8, 0x00c6, 0x059a, + 0xddf0, 0x1fb1, 0xc38a, 0xd78e, 0x2aa4, 0x79e7, 0xad28, 0xc3f1}, + {0xe3bb, 0x174e, 0xe0a8, 0x74b6, 0xbd5b, 0x35f6, 0x6d23, 0x6328, + 0xc11f, 0x83e1, 0xf928, 0xa918, 0x838e, 0xbf43, 0xe243, 0xfffb}, + {0x9cf2, 0x6b8b, 0x3476, 0x9d06, 0xdcf2, 0xdb8a, 0x89cd, 0x4857, + 0x75c2, 0xabb8, 0x490b, 0xc9bd, 0x890e, 0xe36e, 0xd552, 0xfffa}}, + {{0x2f09, 0x9d62, 0xa9fc, 0xf090, 0xd6d1, 0x9d1d, 0x1828, 0xe413, + 0xc92b, 0x3d5a, 0x1373, 0x368c, 0xbaf2, 0x2158, 0x71eb, 0x08a3}, + {0x2f09, 0x1d62, 0x4630, 0x0de1, 0x06dc, 0xf7f1, 0xc161, 0x1e92, + 0x7495, 0x97e4, 0x94b6, 0xa39e, 0x4f1b, 0x18f8, 0x7bd4, 0x0c4c}, + {0xeb3d, 0x723d, 0x0907, 0x525b, 0x463a, 0x49a8, 0xc6b8, 0xce7f, + 0x740c, 0x0d7d, 0xa83b, 0x457f, 0xae8e, 0xc6af, 0xd331, 0x0475}}, + {{0x6abd, 0xc7af, 0x3e4e, 0x95fd, 0x8fc4, 0xee25, 0x1f9c, 0x0afe, + 0x291d, 0xcde0, 0x48f4, 0xb2e8, 0xf7af, 0x8f8d, 0x0bd6, 0x078d}, + {0x4037, 0xbf0e, 0x2081, 0xf363, 0x13b2, 0x381e, 0xfb6e, 0x818e, + 0x27e4, 0x5662, 0x18b0, 0x0cd2, 0x81f5, 0x9415, 0x0d6c, 0xf9fb}, + {0xd205, 0x0981, 0x0498, 0x1f08, 
0xdb93, 0x1732, 0x0579, 0x1424, + 0xad95, 0x642f, 0x050c, 0x1d6d, 0xfc95, 0xfc4a, 0xd41b, 0x3521}}, + {{0xf23a, 0x4633, 0xaef4, 0x1a92, 0x3c8b, 0x1f09, 0x30f3, 0x4c56, + 0x2a2f, 0x4f62, 0xf5e4, 0x8329, 0x63cc, 0xb593, 0xec6a, 0xc428}, + {0x93a7, 0xfcf6, 0x606d, 0xd4b2, 0x2aad, 0x28b4, 0xc65b, 0x8998, + 0x4e08, 0xd178, 0x0900, 0xc82b, 0x7470, 0xa342, 0x7c0f, 0xffff}, + {0x315f, 0xf304, 0xeb7b, 0xe5c3, 0x1451, 0x6311, 0x8f37, 0x93a8, + 0x4a38, 0xa6c6, 0xe393, 0x1087, 0x6301, 0xd673, 0x4ec4, 0xffff}}, + {{0x892e, 0xeed0, 0x1165, 0xcbc1, 0x5545, 0xa280, 0x7243, 0x10c9, + 0x9536, 0x36af, 0xb3fc, 0x2d7c, 0xe8a5, 0x09d6, 0xe1d4, 0xe85d}, + {0xae09, 0xc28a, 0xd777, 0xbd80, 0x23d6, 0xf980, 0xeb7c, 0x4e0e, + 0xf7dc, 0x6475, 0xf10a, 0x2d33, 0x5dfd, 0x797a, 0x7f1c, 0xf71a}, + {0x4064, 0x8717, 0xd091, 0x80b0, 0x4527, 0x8442, 0xac8b, 0x9614, + 0xc633, 0x35f5, 0x7714, 0x2e83, 0x4aaa, 0xd2e4, 0x1acd, 0x0562}}, + {{0xdb64, 0x0937, 0x308b, 0x53b0, 0x00e8, 0xc77f, 0x2f30, 0x37f7, + 0x79ce, 0xeb7f, 0xde81, 0x9286, 0xafda, 0x0e62, 0xae00, 0x0067}, + {0x2cc7, 0xd362, 0xb161, 0x0557, 0x4ff2, 0xb9c8, 0x06fe, 0x5f2b, + 0xde33, 0x0190, 0x28c6, 0xb886, 0xee2b, 0x5a4e, 0x3289, 0x0185}, + {0x4215, 0x923e, 0xf34f, 0xb362, 0x88f8, 0xceec, 0xafdd, 0x7f42, + 0x0c57, 0x56b2, 0xa366, 0x6a08, 0x0826, 0xfb8f, 0x1b03, 0x0163}}, + {{0xa4ba, 0x8408, 0x810a, 0xdeba, 0x47a3, 0x853a, 0xeb64, 0x2f74, + 0x3039, 0x038c, 0x7fbb, 0x498e, 0xd1e9, 0x46fb, 0x5691, 0x32a4}, + {0xd749, 0xb49d, 0x20b7, 0x2af6, 0xd34a, 0xd2da, 0x0a10, 0xf781, + 0x58c9, 0x171f, 0x3cb6, 0x6337, 0x88cd, 0xcf1e, 0xb246, 0x7351}, + {0xf729, 0xcf0a, 0x96ea, 0x032c, 0x4a8f, 0x42fe, 0xbac8, 0xec65, + 0x1510, 0x0d75, 0x4c17, 0x8d29, 0xa03f, 0x8b7e, 0x2c49, 0x0000}}, + {{0x0fa4, 0x8e1c, 0x3788, 0xba3c, 0x8d52, 0xd89d, 0x12c8, 0xeced, + 0x9fe6, 0x9b88, 0xecf3, 0xe3c8, 0xac48, 0x76ed, 0xf23e, 0xda79}, + {0x1103, 0x227c, 0x5b00, 0x3fcf, 0xc5d0, 0x2d28, 0x8020, 0x4d1c, + 0xc6b9, 0x67f9, 0x6f39, 0x989a, 0xda53, 0x3847, 0xd416, 0xe0d0}, + {0xdd8e, 0xcf31, 0x3710, 0x7e44, 0xa511, 0x933c, 0x0cc3, 0x5145, + 0xf632, 0x5e1d, 0x038f, 0x5ce7, 0x7265, 0xda9d, 0xded6, 0x08f8}}, + {{0xe2c8, 0x91d5, 0xa5f5, 0x735f, 0x6b58, 0x56dc, 0xb39d, 0x5c4a, + 0x57d0, 0xa1c2, 0xd92f, 0x9ad4, 0xf7c4, 0x51dd, 0xaf5c, 0x0096}, + {0x1739, 0x7207, 0x7505, 0xbf35, 0x42de, 0x0a29, 0xa962, 0xdedf, + 0x53e8, 0x12bf, 0xcde7, 0xd8e2, 0x8d4d, 0x2c4b, 0xb1b1, 0x0628}, + {0x992d, 0xe3a7, 0xb422, 0xc198, 0x23ab, 0xa6ef, 0xb45d, 0x50da, + 0xa738, 0x014a, 0x2310, 0x85fb, 0x5fe8, 0x1b18, 0x1774, 0x03a7}}, + {{0x1f16, 0x2b09, 0x0236, 0xee90, 0xccf9, 0x9775, 0x8130, 0x4c91, + 0x9091, 0x310b, 0x6dc4, 0x86f6, 0xc2e8, 0xef60, 0xfc0e, 0xf3a4}, + {0x9f49, 0xac15, 0x02af, 0x110f, 0xc59d, 0x5677, 0xa1a9, 0x38d5, + 0x914f, 0xa909, 0x3a3a, 0x4a39, 0x3703, 0xea30, 0x73da, 0xffad}, + {0x15ed, 0xdd16, 0x83c7, 0x270a, 0x862f, 0xd8ad, 0xcaa1, 0x5f41, + 0x99a9, 0x3fc8, 0x7bb2, 0x360a, 0xb06d, 0xfadc, 0x1b36, 0xffa8}}, + {{0xc4e0, 0xb8fd, 0x5106, 0xe169, 0x754c, 0xa58c, 0xc413, 0x8224, + 0x5483, 0x63ec, 0xd477, 0x8473, 0x4778, 0x9281, 0x0000, 0x0000}, + {0x85e1, 0xff54, 0xb200, 0xe413, 0xf4f4, 0x4c0f, 0xfcec, 0xc183, + 0x60d3, 0x1b0c, 0x3834, 0x601c, 0x943c, 0xbe6e, 0x0002, 0x0000}, + {0xf4f8, 0xfd5e, 0x61ef, 0xece8, 0x9199, 0xe5c4, 0x05a6, 0xe6c3, + 0xc4ae, 0x8b28, 0x66b1, 0x8a95, 0x9ece, 0x8f4a, 0x0001, 0x0000}}, + {{0xeae9, 0xa1b4, 0xc6d8, 0x2411, 0x2b5a, 0x1dd0, 0x2dc9, 0xb57b, + 0x5ccd, 0x4957, 0xaf59, 0xa04b, 0x5f42, 0xab7c, 0x2826, 0x526f}, + {0xf407, 0x165a, 0xb724, 0x2f12, 0x2ea1, 0x470b, 0x4464, 0xbd35, + 0x606f, 0xd73e, 
0x50d3, 0x8a7f, 0x8029, 0x7ffc, 0xbe31, 0x6cfb}, + {0x8171, 0x1f4c, 0xced2, 0x9c99, 0x6d7e, 0x5a0f, 0xfefb, 0x59e3, + 0xa0c8, 0xabd9, 0xc4c5, 0x57d3, 0xbfa3, 0x4f11, 0x96a2, 0x5a7d}}, + {{0xe068, 0x4cc0, 0x8bcd, 0xc903, 0x9e52, 0xb3e1, 0xd745, 0x0995, + 0xdd8f, 0xf14b, 0xd2ac, 0xd65a, 0xda1d, 0xa742, 0xbac5, 0x474c}, + {0x7481, 0xf2ad, 0x9757, 0x2d82, 0xb683, 0xb16b, 0x0002, 0x7b60, + 0x8f0c, 0x2594, 0x8f64, 0x3b7a, 0x3552, 0x8d9d, 0xb9d7, 0x67eb}, + {0xcaab, 0xb9a1, 0xf966, 0xe311, 0x5b34, 0x0fa0, 0x6abc, 0x8134, + 0xab3d, 0x90f6, 0x1984, 0x9232, 0xec17, 0x74e5, 0x2ceb, 0x434e}}, + {{0x0fb1, 0x7a55, 0x1a5c, 0x53eb, 0xd7b3, 0x7a01, 0xca32, 0x31f6, + 0x3b74, 0x679e, 0x1501, 0x6c57, 0xdb20, 0x8b7c, 0xd7d0, 0x8097}, + {0xb127, 0xb20c, 0xe3a2, 0x96f3, 0xe0d8, 0xd50c, 0x14b4, 0x0b40, + 0x6eeb, 0xa258, 0x99db, 0x3c8c, 0x0f51, 0x4198, 0x3887, 0xffd0}, + {0x0273, 0x9f8c, 0x9669, 0xbbba, 0x1c49, 0x767c, 0xc2af, 0x59f0, + 0x1366, 0xd397, 0x63ac, 0x6fe8, 0x1a9a, 0x1259, 0x01d0, 0x0016}}, + {{0x7876, 0x2a35, 0xa24a, 0x433e, 0x5501, 0x573c, 0xd76d, 0xcb82, + 0x1334, 0xb4a6, 0xf290, 0xc797, 0xeae9, 0x2b83, 0x1e2b, 0x8b14}, + {0x3885, 0x8aef, 0x9dea, 0x2b8c, 0xdd7c, 0xd7cd, 0xb0cc, 0x05ee, + 0x361b, 0x3800, 0xb0d4, 0x4c23, 0xbd3f, 0x5180, 0x9783, 0xff80}, + {0xab36, 0x3104, 0xdae8, 0x0704, 0x4a28, 0x6714, 0x824b, 0x0051, + 0x8134, 0x1f6a, 0x712d, 0x1f03, 0x03b2, 0xecac, 0x377d, 0xfef9}} }; int i, j, ok; @@ -2183,7 +2659,91 @@ void run_inverse_tests(void) SECP256K1_FE_CONST(0xbcb223fe, 0xdc24a059, 0xd838091d, 0xd2253530, 0xffffffff, 0xffffffff, 0xffffffff, 0x434dd931)}, /* Input known to need 637 divsteps */ {SECP256K1_FE_CONST(0xe34e9c95, 0x6bee8a84, 0x0dcb632a, 0xdb8a1320, 0x66885408, 0x06f3f996, 0x7c11ca84, 0x19199ec3), - SECP256K1_FE_CONST(0xbd2cbd8f, 0x1c536828, 0x9bccda44, 0x2582ac0c, 0x870152b0, 0x8a3f09fb, 0x1aaadf92, 0x19b618e5)} + SECP256K1_FE_CONST(0xbd2cbd8f, 0x1c536828, 0x9bccda44, 0x2582ac0c, 0x870152b0, 0x8a3f09fb, 0x1aaadf92, 0x19b618e5)}, + /* Input known to need 567 divsteps starting with delta=1/2. */ + {SECP256K1_FE_CONST(0xf6bc3ba3, 0x636451c4, 0x3e46357d, 0x2c21d619, 0x0988e234, 0x15985661, 0x6672982b, 0xa7549bfc), + SECP256K1_FE_CONST(0xb024fdc7, 0x5547451e, 0x426c585f, 0xbd481425, 0x73df6b75, 0xeef6d9d0, 0x389d87d4, 0xfbb440ba)}, + /* Input known to need 566 divsteps starting with delta=1/2. 
*/ + {SECP256K1_FE_CONST(0xb595d81b, 0x2e3c1e2f, 0x482dbc65, 0xe4865af7, 0x9a0a50aa, 0x29f9e618, 0x6f87d7a5, 0x8d1063ae), + SECP256K1_FE_CONST(0xc983337c, 0x5d5c74e1, 0x49918330, 0x0b53afb5, 0xa0428a0b, 0xce6eef86, 0x059bd8ef, 0xe5b908de)}, + /* Set of 10 inputs accessing all 128 entries in the modinv32 divsteps_var table */ + {SECP256K1_FE_CONST(0x00000000, 0x00000000, 0xe0ff1f80, 0x1f000000, 0x00000000, 0x00000000, 0xfeff0100, 0x00000000), + SECP256K1_FE_CONST(0x9faf9316, 0x77e5049d, 0x0b5e7a1b, 0xef70b893, 0x18c9e30c, 0x045e7fd7, 0x29eddf8c, 0xd62e9e3d)}, + {SECP256K1_FE_CONST(0x621a538d, 0x511b2780, 0x35688252, 0x53f889a4, 0x6317c3ac, 0x32ba0a46, 0x6277c0d1, 0xccd31192), + SECP256K1_FE_CONST(0x38513b0c, 0x5eba856f, 0xe29e882e, 0x9b394d8c, 0x34bda011, 0xeaa66943, 0x6a841a4c, 0x6ae8bcff)}, + {SECP256K1_FE_CONST(0x00000200, 0xf0ffff1f, 0x00000000, 0x0000e0ff, 0xffffffff, 0xfffcffff, 0xffffffff, 0xffff0100), + SECP256K1_FE_CONST(0x5da42a52, 0x3640de9e, 0x13e64343, 0x0c7591b7, 0x6c1e3519, 0xf048c5b6, 0x0484217c, 0xedbf8b2f)}, + {SECP256K1_FE_CONST(0xd1343ef9, 0x4b952621, 0x7c52a2ee, 0x4ea1281b, 0x4ab46410, 0x9f26998d, 0xa686a8ff, 0x9f2103e8), + SECP256K1_FE_CONST(0x84044385, 0x9a4619bf, 0x74e35b6d, 0xa47e0c46, 0x6b7fb47d, 0x9ffab128, 0xb0775aa3, 0xcb318bd1)}, + {SECP256K1_FE_CONST(0xb27235d2, 0xc56a52be, 0x210db37a, 0xd50d23a4, 0xbe621bdd, 0x5df22c6a, 0xe926ba62, 0xd2e4e440), + SECP256K1_FE_CONST(0x67a26e54, 0x483a9d3c, 0xa568469e, 0xd258ab3d, 0xb9ec9981, 0xdca9b1bd, 0x8d2775fe, 0x53ae429b)}, + {SECP256K1_FE_CONST(0x00000000, 0x00000000, 0x00e0ffff, 0xffffff83, 0xffffffff, 0x3f00f00f, 0x000000e0, 0xffffffff), + SECP256K1_FE_CONST(0x310e10f8, 0x23bbfab0, 0xac94907d, 0x076c9a45, 0x8d357d7f, 0xc763bcee, 0x00d0e615, 0x5a6acef6)}, + {SECP256K1_FE_CONST(0xfeff0300, 0x001c0000, 0xf80700c0, 0x0ff0ffff, 0xffffffff, 0x0fffffff, 0xffff0100, 0x7f0000fe), + SECP256K1_FE_CONST(0x28e2fdb4, 0x0709168b, 0x86f598b0, 0x3453a370, 0x530cf21f, 0x32f978d5, 0x1d527a71, 0x59269b0c)}, + {SECP256K1_FE_CONST(0xc2591afa, 0x7bb98ef7, 0x090bb273, 0x85c14f87, 0xbb0b28e0, 0x54d3c453, 0x85c66753, 0xd5574d2f), + SECP256K1_FE_CONST(0xfdca70a2, 0x70ce627c, 0x95e66fae, 0x848a6dbb, 0x07ffb15c, 0x5f63a058, 0xba4140ed, 0x6113b503)}, + {SECP256K1_FE_CONST(0xf5475db3, 0xedc7b5a3, 0x411c047e, 0xeaeb452f, 0xc625828e, 0x1cf5ad27, 0x8eec1060, 0xc7d3e690), + SECP256K1_FE_CONST(0x5eb756c0, 0xf963f4b9, 0xdc6a215e, 0xec8cc2d8, 0x2e9dec01, 0xde5eb88d, 0x6aba7164, 0xaecb2c5a)}, + {SECP256K1_FE_CONST(0x00000000, 0x00f8ffff, 0xffffffff, 0x01000000, 0xe0ff1f00, 0x00000000, 0xffffff7f, 0x00000000), + SECP256K1_FE_CONST(0xe0d2e3d8, 0x49b6157d, 0xe54e88c2, 0x1a7f02ca, 0x7dd28167, 0xf1125d81, 0x7bfa444e, 0xbe110037)}, + /* Selection of randomly generated inputs that reach high/low d/e values in various configurations. 
*/ + {SECP256K1_FE_CONST(0x13cc08a4, 0xd8c41f0f, 0x179c3e67, 0x54c46c67, 0xc4109221, 0x09ab3b13, 0xe24d9be1, 0xffffe950), + SECP256K1_FE_CONST(0xb80c8006, 0xd16abaa7, 0xcabd71e5, 0xcf6714f4, 0x966dd3d0, 0x64767a2d, 0xe92c4441, 0x51008cd1)}, + {SECP256K1_FE_CONST(0xaa6db990, 0x95efbca1, 0x3cc6ff71, 0x0602e24a, 0xf49ff938, 0x99fffc16, 0x46f40993, 0xc6e72057), + SECP256K1_FE_CONST(0xd5d3dd69, 0xb0c195e5, 0x285f1d49, 0xe639e48c, 0x9223f8a9, 0xca1d731d, 0x9ca482f9, 0xa5b93e06)}, + {SECP256K1_FE_CONST(0x1c680eac, 0xaeabffd8, 0x9bdc4aee, 0x1781e3de, 0xa3b08108, 0x0015f2e0, 0x94449e1b, 0x2f67a058), + SECP256K1_FE_CONST(0x7f083f8d, 0x31254f29, 0x6510f475, 0x245c373d, 0xc5622590, 0x4b323393, 0x32ed1719, 0xc127444b)}, + {SECP256K1_FE_CONST(0x147d44b3, 0x012d83f8, 0xc160d386, 0x1a44a870, 0x9ba6be96, 0x8b962707, 0x267cbc1a, 0xb65b2f0a), + SECP256K1_FE_CONST(0x555554ff, 0x170aef1e, 0x50a43002, 0xe51fbd36, 0xafadb458, 0x7a8aded1, 0x0ca6cd33, 0x6ed9087c)}, + {SECP256K1_FE_CONST(0x12423796, 0x22f0fe61, 0xf9ca017c, 0x5384d107, 0xa1fbf3b2, 0x3b018013, 0x916a3c37, 0x4000b98c), + SECP256K1_FE_CONST(0x20257700, 0x08668f94, 0x1177e306, 0x136c01f5, 0x8ed1fbd2, 0x95ec4589, 0xae38edb9, 0xfd19b6d7)}, + {SECP256K1_FE_CONST(0xdcf2d030, 0x9ab42cb4, 0x93ffa181, 0xdcd23619, 0x39699b52, 0x08909a20, 0xb5a17695, 0x3a9dcf21), + SECP256K1_FE_CONST(0x1f701dea, 0xe211fb1f, 0x4f37180d, 0x63a0f51c, 0x29fe1e40, 0xa40b6142, 0x2e7b12eb, 0x982b06b6)}, + {SECP256K1_FE_CONST(0x79a851f6, 0xa6314ed3, 0xb35a55e6, 0xca1c7d7f, 0xe32369ea, 0xf902432e, 0x375308c5, 0xdfd5b600), + SECP256K1_FE_CONST(0xcaae00c5, 0xe6b43851, 0x9dabb737, 0x38cba42c, 0xa02c8549, 0x7895dcbf, 0xbd183d71, 0xafe4476a)}, + {SECP256K1_FE_CONST(0xede78fdd, 0xcfc92bf1, 0x4fec6c6c, 0xdb8d37e2, 0xfb66bc7b, 0x28701870, 0x7fa27c9a, 0x307196ec), + SECP256K1_FE_CONST(0x68193a6c, 0x9a8b87a7, 0x2a760c64, 0x13e473f6, 0x23ae7bed, 0x1de05422, 0x88865427, 0xa3418265)}, + {SECP256K1_FE_CONST(0xa40b2079, 0xb8f88e89, 0xa7617997, 0x89baf5ae, 0x174df343, 0x75138eae, 0x2711595d, 0x3fc3e66c), + SECP256K1_FE_CONST(0x9f99c6a5, 0x6d685267, 0xd4b87c37, 0x9d9c4576, 0x358c692b, 0x6bbae0ed, 0x3389c93d, 0x7fdd2655)}, + {SECP256K1_FE_CONST(0x7c74c6b6, 0xe98d9151, 0x72645cf1, 0x7f06e321, 0xcefee074, 0x15b2113a, 0x10a9be07, 0x08a45696), + SECP256K1_FE_CONST(0x8c919a88, 0x898bc1e0, 0x77f26f97, 0x12e655b7, 0x9ba0ac40, 0xe15bb19e, 0x8364cc3b, 0xe227a8ee)}, + {SECP256K1_FE_CONST(0x109ba1ce, 0xdafa6d4a, 0xa1cec2b2, 0xeb1069f4, 0xb7a79e5b, 0xec6eb99b, 0xaec5f643, 0xee0e723e), + SECP256K1_FE_CONST(0x93d13eb8, 0x4bb0bcf9, 0xe64f5a71, 0xdbe9f359, 0x7191401c, 0x6f057a4a, 0xa407fe1b, 0x7ecb65cc)}, + {SECP256K1_FE_CONST(0x3db076cd, 0xec74a5c9, 0xf61dd138, 0x90e23e06, 0xeeedd2d0, 0x74cbc4e0, 0x3dbe1e91, 0xded36a78), + SECP256K1_FE_CONST(0x3f07f966, 0x8e2a1e09, 0x706c71df, 0x02b5e9d5, 0xcb92ddbf, 0xcdd53010, 0x16545564, 0xe660b107)}, + {SECP256K1_FE_CONST(0xe31c73ed, 0xb4c4b82c, 0x02ae35f7, 0x4cdec153, 0x98b522fd, 0xf7d2460c, 0x6bf7c0f8, 0x4cf67b0d), + SECP256K1_FE_CONST(0x4b8f1faf, 0x94e8b070, 0x19af0ff6, 0xa319cd31, 0xdf0a7ffb, 0xefaba629, 0x59c50666, 0x1fe5b843)}, + {SECP256K1_FE_CONST(0x4c8b0e6e, 0x83392ab6, 0xc0e3e9f1, 0xbbd85497, 0x16698897, 0xf552d50d, 0x79652ddb, 0x12f99870), + SECP256K1_FE_CONST(0x56d5101f, 0xd23b7949, 0x17dc38d6, 0xf24022ef, 0xcf18e70a, 0x5cc34424, 0x438544c3, 0x62da4bca)}, + {SECP256K1_FE_CONST(0xb0e040e2, 0x40cc35da, 0x7dd5c611, 0x7fccb178, 0x28888137, 0xbc930358, 0xea2cbc90, 0x775417dc), + SECP256K1_FE_CONST(0xca37f0d4, 0x016dd7c8, 0xab3ae576, 0x96e08d69, 0x68ed9155, 0xa9b44270, 
0x900ae35d, 0x7c7800cd)}, + {SECP256K1_FE_CONST(0x8a32ea49, 0x7fbb0bae, 0x69724a9d, 0x8e2105b2, 0xbdf69178, 0x862577ef, 0x35055590, 0x667ddaef), + SECP256K1_FE_CONST(0xd02d7ead, 0xc5e190f0, 0x559c9d72, 0xdaef1ffc, 0x64f9f425, 0xf43645ea, 0x7341e08d, 0x11768e96)}, + {SECP256K1_FE_CONST(0xa3592d98, 0x9abe289d, 0x579ebea6, 0xbb0857a8, 0xe242ab73, 0x85f9a2ce, 0xb6998f0f, 0xbfffbfc6), + SECP256K1_FE_CONST(0x093c1533, 0x32032efa, 0x6aa46070, 0x0039599e, 0x589c35f4, 0xff525430, 0x7fe3777a, 0x44b43ddc)}, + {SECP256K1_FE_CONST(0x647178a3, 0x229e607b, 0xcc98521a, 0xcce3fdd9, 0x1e1bc9c9, 0x97fb7c6a, 0x61b961e0, 0x99b10709), + SECP256K1_FE_CONST(0x98217c13, 0xd51ddf78, 0x96310e77, 0xdaebd908, 0x602ca683, 0xcb46d07a, 0xa1fcf17e, 0xc8e2feb3)}, + {SECP256K1_FE_CONST(0x7334627c, 0x73f98968, 0x99464b4b, 0xf5964958, 0x1b95870d, 0xc658227e, 0x5e3235d8, 0xdcab5787), + SECP256K1_FE_CONST(0x000006fd, 0xc7e9dd94, 0x40ae367a, 0xe51d495c, 0x07603b9b, 0x2d088418, 0x6cc5c74c, 0x98514307)}, + {SECP256K1_FE_CONST(0x82e83876, 0x96c28938, 0xa50dd1c5, 0x605c3ad1, 0xc048637d, 0x7a50825f, 0x335ed01a, 0x00005760), + SECP256K1_FE_CONST(0xb0393f9f, 0x9f2aa55e, 0xf5607e2e, 0x5287d961, 0x60b3e704, 0xf3e16e80, 0xb4f9a3ea, 0xfec7f02d)}, + {SECP256K1_FE_CONST(0xc97b6cec, 0x3ee6b8dc, 0x98d24b58, 0x3c1970a1, 0xfe06297a, 0xae813529, 0xe76bb6bd, 0x771ae51d), + SECP256K1_FE_CONST(0x0507c702, 0xd407d097, 0x47ddeb06, 0xf6625419, 0x79f48f79, 0x7bf80d0b, 0xfc34b364, 0x253a5db1)}, + {SECP256K1_FE_CONST(0xd559af63, 0x77ea9bc4, 0x3cf1ad14, 0x5c7a4bbb, 0x10e7d18b, 0x7ce0dfac, 0x380bb19d, 0x0bb99bd3), + SECP256K1_FE_CONST(0x00196119, 0xb9b00d92, 0x34edfdb5, 0xbbdc42fc, 0xd2daa33a, 0x163356ca, 0xaa8754c8, 0xb0ec8b0b)}, + {SECP256K1_FE_CONST(0x8ddfa3dc, 0x52918da0, 0x640519dc, 0x0af8512a, 0xca2d33b2, 0xbde52514, 0xda9c0afc, 0xcb29fce4), + SECP256K1_FE_CONST(0xb3e4878d, 0x5cb69148, 0xcd54388b, 0xc23acce0, 0x62518ba8, 0xf09def92, 0x7b31e6aa, 0x6ba35b02)}, + {SECP256K1_FE_CONST(0xf8207492, 0xe3049f0a, 0x65285f2b, 0x0bfff996, 0x00ca112e, 0xc05da837, 0x546d41f9, 0x5194fb91), + SECP256K1_FE_CONST(0x7b7ee50b, 0xa8ed4bbd, 0xf6469930, 0x81419a5c, 0x071441c7, 0x290d046e, 0x3b82ea41, 0x611c5f95)}, + {SECP256K1_FE_CONST(0x050f7c80, 0x5bcd3c6b, 0x823cb724, 0x5ce74db7, 0xa4e39f5c, 0xbd8828d7, 0xfd4d3e07, 0x3ec2926a), + SECP256K1_FE_CONST(0x000d6730, 0xb0171314, 0x4764053d, 0xee157117, 0x48fd61da, 0xdea0b9db, 0x1d5e91c6, 0xbdc3f59e)}, + {SECP256K1_FE_CONST(0x3e3ea8eb, 0x05d760cf, 0x23009263, 0xb3cb3ac9, 0x088f6f0d, 0x3fc182a3, 0xbd57087c, 0xe67c62f9), + SECP256K1_FE_CONST(0xbe988716, 0xa29c1bf6, 0x4456aed6, 0xab1e4720, 0x49929305, 0x51043bf4, 0xebd833dd, 0xdd511e8b)}, + {SECP256K1_FE_CONST(0x6964d2a9, 0xa7fa6501, 0xa5959249, 0x142f4029, 0xea0c1b5f, 0x2f487ef6, 0x301ac80a, 0x768be5cd), + SECP256K1_FE_CONST(0x3918ffe4, 0x07492543, 0xed24d0b7, 0x3df95f8f, 0xaffd7cb4, 0x0de2191c, 0x9ec2f2ad, 0x2c0cb3c6)}, + {SECP256K1_FE_CONST(0x37c93520, 0xf6ddca57, 0x2b42fd5e, 0xb5c7e4de, 0x11b5b81c, 0xb95e91f3, 0x95c4d156, 0x39877ccb), + SECP256K1_FE_CONST(0x9a94b9b5, 0x57eb71ee, 0x4c975b8b, 0xac5262a8, 0x077b0595, 0xe12a6b1f, 0xd728edef, 0x1a6bf956)} }; /* Fixed test cases for scalar inverses: pairs of (x, 1/x) mod n. 
*/ static const secp256k1_scalar scalar_cases[][2] = { @@ -2204,7 +2764,72 @@ void run_inverse_tests(void) SECP256K1_SCALAR_CONST(0x50a51ac8, 0x34b9ec24, 0x4b0dff66, 0x5588b13e, 0x9984d5b3, 0xcf80ef0f, 0xd6a23766, 0xa3ee9f22)}, /* Input known to need 635 divsteps */ {SECP256K1_SCALAR_CONST(0xcb9f1d35, 0xdd4416c2, 0xcd71bf3f, 0x6365da66, 0x3c9b3376, 0x8feb7ae9, 0x32a5ef60, 0x19199ec3), - SECP256K1_SCALAR_CONST(0x1d7c7bba, 0xf1893d53, 0xb834bd09, 0x36b411dc, 0x42c2e42f, 0xec72c428, 0x5e189791, 0x8e9bc708)} + SECP256K1_SCALAR_CONST(0x1d7c7bba, 0xf1893d53, 0xb834bd09, 0x36b411dc, 0x42c2e42f, 0xec72c428, 0x5e189791, 0x8e9bc708)}, + /* Input known to need 566 divsteps starting with delta=1/2. */ + {SECP256K1_SCALAR_CONST(0x7e3c993d, 0xa4272488, 0xbc015b49, 0x2db54174, 0xd382083a, 0xebe6db35, 0x80f82eff, 0xcd132c72), + SECP256K1_SCALAR_CONST(0x086f34a0, 0x3e631f76, 0x77418f28, 0xcc84ac95, 0x6304439d, 0x365db268, 0x312c6ded, 0xd0b934f8)}, + /* Input known to need 565 divsteps starting with delta=1/2. */ + {SECP256K1_SCALAR_CONST(0xbad7e587, 0x3f307859, 0x60d93147, 0x8a18491e, 0xb38a9fd5, 0x254350d3, 0x4b1f0e4b, 0x7dd6edc4), + SECP256K1_SCALAR_CONST(0x89f2df26, 0x39e2b041, 0xf19bd876, 0xd039c8ac, 0xc2223add, 0x29c4943e, 0x6632d908, 0x515f467b)}, + /* Selection of randomly generated inputs that reach low/high d/e values in various configurations. */ + {SECP256K1_SCALAR_CONST(0x1950d757, 0xb37a5809, 0x435059bb, 0x0bb8997e, 0x07e1e3c8, 0x5e5d7d2c, 0x6a0ed8e3, 0xdbde180e), + SECP256K1_SCALAR_CONST(0xbf72af9b, 0x750309e2, 0x8dda230b, 0xfe432b93, 0x7e25e475, 0x4388251e, 0x633d894b, 0x3bcb6f8c)}, + {SECP256K1_SCALAR_CONST(0x9bccf4e7, 0xc5a515e3, 0x50637aa9, 0xbb65a13f, 0x391749a1, 0x62de7d4e, 0xf6d7eabb, 0x3cd10ce0), + SECP256K1_SCALAR_CONST(0xaf2d5623, 0xb6385a33, 0xcd0365be, 0x5e92a70d, 0x7f09179c, 0x3baaf30f, 0x8f9cc83b, 0x20092f67)}, + {SECP256K1_SCALAR_CONST(0x73a57111, 0xb242952a, 0x5c5dee59, 0xf3be2ace, 0xa30a7659, 0xa46e5f47, 0xd21267b1, 0x39e642c9), + SECP256K1_SCALAR_CONST(0xa711df07, 0xcbcf13ef, 0xd61cc6be, 0xbcd058ce, 0xb02cf157, 0x272d4a18, 0x86d0feb3, 0xcd5fa004)}, + {SECP256K1_SCALAR_CONST(0x04884963, 0xce0580b1, 0xba547030, 0x3c691db3, 0x9cd2c84f, 0x24c7cebd, 0x97ebfdba, 0x3e785ec2), + SECP256K1_SCALAR_CONST(0xaaaaaf14, 0xd7c99ba7, 0x517ce2c1, 0x78a28b4c, 0x3769a851, 0xe5c5a03d, 0x4cc28f33, 0x0ec4dc5d)}, + {SECP256K1_SCALAR_CONST(0x1679ed49, 0x21f537b1, 0x815cb8ae, 0x9efc511c, 0x5b9fa037, 0x0b0f275e, 0x6c985281, 0x6c4a9905), + SECP256K1_SCALAR_CONST(0xb14ac3d5, 0x62b52999, 0xef34ead1, 0xffca4998, 0x0294341a, 0x1f8172aa, 0xea1624f9, 0x302eea62)}, + {SECP256K1_SCALAR_CONST(0x626b37c0, 0xf0057c35, 0xee982f83, 0x452a1fd3, 0xea826506, 0x48b08a9d, 0x1d2c4799, 0x4ad5f6ec), + SECP256K1_SCALAR_CONST(0xe38643b7, 0x567bfc2f, 0x5d2f1c15, 0xe327239c, 0x07112443, 0x69509283, 0xfd98e77a, 0xdb71c1e8)}, + {SECP256K1_SCALAR_CONST(0x1850a3a7, 0x759efc56, 0x54f287b2, 0x14d1234b, 0xe263bbc9, 0xcf4d8927, 0xd5f85f27, 0x965bd816), + SECP256K1_SCALAR_CONST(0x3b071831, 0xcac9619a, 0xcceb0596, 0xf614d63b, 0x95d0db2f, 0xc6a00901, 0x8eaa2621, 0xabfa0009)}, + {SECP256K1_SCALAR_CONST(0x94ae5d06, 0xa27dc400, 0x487d72be, 0xaa51ebed, 0xe475b5c0, 0xea675ffc, 0xf4df627a, 0xdca4222f), + SECP256K1_SCALAR_CONST(0x01b412ed, 0xd7830956, 0x1532537e, 0xe5e3dc99, 0x8fd3930a, 0x54f8d067, 0x32ef5760, 0x594438a5)}, + {SECP256K1_SCALAR_CONST(0x1f24278a, 0xb5bfe374, 0xa328dbbc, 0xebe35f48, 0x6620e009, 0xd58bb1b4, 0xb5a6bf84, 0x8815f63a), + SECP256K1_SCALAR_CONST(0xfe928416, 0xca5ba2d3, 0xfde513da, 0x903a60c7, 0x9e58ad8a, 0x8783bee4, 
0x083a3843, 0xa608c914)}, + {SECP256K1_SCALAR_CONST(0xdc107d58, 0x274f6330, 0x67dba8bc, 0x26093111, 0x5201dfb8, 0x968ce3f5, 0xf34d1bd4, 0xf2146504), + SECP256K1_SCALAR_CONST(0x660cfa90, 0x13c3d93e, 0x7023b1e5, 0xedd09e71, 0x6d9c9d10, 0x7a3d2cdb, 0xdd08edc3, 0xaa78fcfb)}, + {SECP256K1_SCALAR_CONST(0x7cd1e905, 0xc6f02776, 0x2f551cc7, 0x5da61cff, 0x7da05389, 0x1119d5a4, 0x631c7442, 0x894fd4f7), + SECP256K1_SCALAR_CONST(0xff20862a, 0x9d3b1a37, 0x1628803b, 0x3004ccae, 0xaa23282a, 0xa89a1109, 0xd94ece5e, 0x181bdc46)}, + {SECP256K1_SCALAR_CONST(0x5b9dade8, 0x23d26c58, 0xcd12d818, 0x25b8ae97, 0x3dea04af, 0xf482c96b, 0xa062f254, 0x9e453640), + SECP256K1_SCALAR_CONST(0x50c38800, 0x15fa53f4, 0xbe1e5392, 0x5c9b120a, 0x262c22c7, 0x18fa0816, 0x5f2baab4, 0x8cb5db46)}, + {SECP256K1_SCALAR_CONST(0x11cdaeda, 0x969c464b, 0xef1f4ab0, 0x5b01d22e, 0x656fd098, 0x882bea84, 0x65cdbe7a, 0x0c19ff03), + SECP256K1_SCALAR_CONST(0x1968d0fa, 0xac46f103, 0xb55f1f72, 0xb3820bed, 0xec6b359a, 0x4b1ae0ad, 0x7e38e1fb, 0x295ccdfb)}, + {SECP256K1_SCALAR_CONST(0x2c351aa1, 0x26e91589, 0x194f8a1e, 0x06561f66, 0x0cb97b7f, 0x10914454, 0x134d1c03, 0x157266b4), + SECP256K1_SCALAR_CONST(0xbe49ada6, 0x92bd8711, 0x41b176c4, 0xa478ba95, 0x14883434, 0x9d1cd6f3, 0xcc4b847d, 0x22af80f5)}, + {SECP256K1_SCALAR_CONST(0x6ba07c6e, 0x13a60edb, 0x6247f5c3, 0x84b5fa56, 0x76fe3ec5, 0x80426395, 0xf65ec2ae, 0x623ba730), + SECP256K1_SCALAR_CONST(0x25ac23f7, 0x418cd747, 0x98376f9d, 0x4a11c7bf, 0x24c8ebfe, 0x4c8a8655, 0x345f4f52, 0x1c515595)}, + {SECP256K1_SCALAR_CONST(0x9397a712, 0x8abb6951, 0x2d4a3d54, 0x703b1c2a, 0x0661dca8, 0xd75c9b31, 0xaed4d24b, 0xd2ab2948), + SECP256K1_SCALAR_CONST(0xc52e8bef, 0xd55ce3eb, 0x1c897739, 0xeb9fb606, 0x36b9cd57, 0x18c51cc2, 0x6a87489e, 0xffd0dcf3)}, + {SECP256K1_SCALAR_CONST(0xe6a808cc, 0xeb437888, 0xe97798df, 0x4e224e44, 0x7e3b380a, 0x207c1653, 0x889f3212, 0xc6738b6f), + SECP256K1_SCALAR_CONST(0x31f9ae13, 0xd1e08b20, 0x757a2e5e, 0x5243a0eb, 0x8ae35f73, 0x19bb6122, 0xb910f26b, 0xda70aa55)}, + {SECP256K1_SCALAR_CONST(0xd0320548, 0xab0effe7, 0xa70779e0, 0x61a347a6, 0xb8c1e010, 0x9d5281f8, 0x2ee588a6, 0x80000000), + SECP256K1_SCALAR_CONST(0x1541897e, 0x78195c90, 0x7583dd9e, 0x728b6100, 0xbce8bc6d, 0x7a53b471, 0x5dcd9e45, 0x4425fcaf)}, + {SECP256K1_SCALAR_CONST(0x93d623f1, 0xd45b50b0, 0x796e9186, 0x9eac9407, 0xd30edc20, 0xef6304cf, 0x250494e7, 0xba503de9), + SECP256K1_SCALAR_CONST(0x7026d638, 0x1178b548, 0x92043952, 0x3c7fb47c, 0xcd3ea236, 0x31d82b01, 0x612fc387, 0x80b9b957)}, + {SECP256K1_SCALAR_CONST(0xf860ab39, 0x55f5d412, 0xa4d73bcc, 0x3b48bd90, 0xc248ffd3, 0x13ca10be, 0x8fba84cc, 0xdd28d6a3), + SECP256K1_SCALAR_CONST(0x5c32fc70, 0xe0b15d67, 0x76694700, 0xfe62be4d, 0xeacdb229, 0x7a4433d9, 0x52155cd0, 0x7649ab59)}, + {SECP256K1_SCALAR_CONST(0x4e41311c, 0x0800af58, 0x7a690a8e, 0xe175c9ba, 0x6981ab73, 0xac532ea8, 0x5c1f5e63, 0x6ac1f189), + SECP256K1_SCALAR_CONST(0xfffffff9, 0xd075982c, 0x7fbd3825, 0xc05038a2, 0x4533b91f, 0x94ec5f45, 0xb280b28f, 0x842324dc)}, + {SECP256K1_SCALAR_CONST(0x48e473bf, 0x3555eade, 0xad5d7089, 0x2424c4e4, 0x0a99397c, 0x2dc796d8, 0xb7a43a69, 0xd0364141), + SECP256K1_SCALAR_CONST(0x634976b2, 0xa0e47895, 0x1ec38593, 0x266d6fd0, 0x6f602644, 0x9bb762f1, 0x7180c704, 0xe23a4daa)}, + {SECP256K1_SCALAR_CONST(0xbe83878d, 0x3292fc54, 0x26e71c62, 0x556ccedc, 0x7cbb8810, 0x4032a720, 0x34ead589, 0xe4d6bd13), + SECP256K1_SCALAR_CONST(0x6cd150ad, 0x25e59d0f, 0x74cbae3d, 0x6377534a, 0x1e6562e8, 0xb71b9d18, 0xe1e5d712, 0x8480abb3)}, + {SECP256K1_SCALAR_CONST(0xcdddf2e5, 0xefc15f88, 0xc9ee06de, 0x8a846ca9, 
0x28561581, 0x68daa5fb, 0xd1cf3451, 0xeb1782d0), + SECP256K1_SCALAR_CONST(0xffffffd9, 0xed8d2af4, 0x993c865a, 0x23e9681a, 0x3ca3a3dc, 0xe6d5a46e, 0xbd86bd87, 0x61b55c70)}, + {SECP256K1_SCALAR_CONST(0xb6a18f1f, 0x04872df9, 0x08165ec4, 0x319ca19c, 0x6c0359ab, 0x1f7118fb, 0xc2ef8082, 0xca8b7785), + SECP256K1_SCALAR_CONST(0xff55b19b, 0x0f1ac78c, 0x0f0c88c2, 0x2358d5ad, 0x5f455e4e, 0x3330b72f, 0x274dc153, 0xffbf272b)}, + {SECP256K1_SCALAR_CONST(0xea4898e5, 0x30eba3e8, 0xcf0e5c3d, 0x06ec6844, 0x01e26fb6, 0x75636225, 0xc5d08f4c, 0x1decafa0), + SECP256K1_SCALAR_CONST(0xe5a014a8, 0xe3c4ec1e, 0xea4f9b32, 0xcfc7b386, 0x00630806, 0x12c08d02, 0x6407ccc2, 0xb067d90e)}, + {SECP256K1_SCALAR_CONST(0x70e9aea9, 0x7e933af0, 0x8a23bfab, 0x23e4b772, 0xff951863, 0x5ffcf47d, 0x6bebc918, 0x2ca58265), + SECP256K1_SCALAR_CONST(0xf4e00006, 0x81bc6441, 0x4eb6ec02, 0xc194a859, 0x80ad7c48, 0xba4e9afb, 0x8b6bdbe0, 0x989d8f77)}, + {SECP256K1_SCALAR_CONST(0x3c56c774, 0x46efe6f0, 0xe93618b8, 0xf9b5a846, 0xd247df61, 0x83b1e215, 0x06dc8bcc, 0xeefc1bf5), + SECP256K1_SCALAR_CONST(0xfff8937a, 0x2cd9586b, 0x43c25e57, 0xd1cefa7a, 0x9fb91ed3, 0x95b6533d, 0x8ad0de5b, 0xafb93f00)}, + {SECP256K1_SCALAR_CONST(0xfb5c2772, 0x5cb30e83, 0xe38264df, 0xe4e3ebf3, 0x392aa92e, 0xa68756a1, 0x51279ac5, 0xb50711a8), + SECP256K1_SCALAR_CONST(0x000013af, 0x1105bfe7, 0xa6bbd7fb, 0x3d638f99, 0x3b266b02, 0x072fb8bc, 0x39251130, 0x2e0fd0ea)} }; int i, var, testrand; unsigned char b32[32]; From a3aa2628c7b675814157556d774872755c9f1aba Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Thu, 15 Apr 2021 16:17:53 +0200 Subject: [PATCH 25/59] gen_context: Don't include basic-config.h Before this commit, gen_context.c both included libsecp256k1-config.h and basic-config.h: The former only to obtain ECMULT_GEN_PREC_BITS and the latter to obtain a basic working configuration to be able to use the library. This was inelegant and confusing: It meant that basic-config.h needs to #undef all the macros defined in libsecp256k1-config.h. Moreover, it meant that basic-config.h cannot define ECMULT_GEN_PREC_BITS, essentially making this file specific for use in gen_context.c. After this commit, gen_context.c include only libsecp256k1-config.h. basic-config.h is not necessary anymore for the modules used in gen_context.c because 79f1f7a made the preprocessor detect all the relevant config options. On the way, we remove an unused #define in basic-config.h. --- src/basic-config.h | 9 --------- src/gen_context.c | 5 +++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/basic-config.h b/src/basic-config.h index e4b1b8b05..1326bf7c0 100644 --- a/src/basic-config.h +++ b/src/basic-config.h @@ -9,15 +9,6 @@ #ifdef USE_BASIC_CONFIG -#undef USE_ASM_X86_64 -#undef USE_ECMULT_STATIC_PRECOMPUTATION -#undef USE_EXTERNAL_ASM -#undef USE_EXTERNAL_DEFAULT_CALLBACKS -#undef USE_FORCE_WIDEMUL_INT64 -#undef USE_FORCE_WIDEMUL_INT128 -#undef ECMULT_WINDOW_SIZE - -#define USE_WIDEMUL_64 1 #define ECMULT_WINDOW_SIZE 15 #endif /* USE_BASIC_CONFIG */ diff --git a/src/gen_context.c b/src/gen_context.c index 05e7dee17..024c55726 100644 --- a/src/gen_context.c +++ b/src/gen_context.c @@ -9,8 +9,9 @@ #if !defined(ECMULT_GEN_PREC_BITS) #include "libsecp256k1-config.h" #endif -#define USE_BASIC_CONFIG 1 -#include "basic-config.h" + +/* We can't require the precomputed tables when creating them. 
*/
+#undef USE_ECMULT_STATIC_PRECOMPUTATION
 
 #include "include/secp256k1.h"
 #include "assumptions.h"

From 07067967ee9dcc4af10fd3a565ffb846a2593e92 Mon Sep 17 00:00:00 2001
From: Aaron Voisine
Date: Sat, 10 Apr 2021 11:37:08 -0700
Subject: [PATCH 26/59] add ECMULT_GEN_PREC_BITS to basic-config.h

set ECMULT_GEN_PREC_BITS to the "auto" value of 4 in basic-config.h, so
libsecp can be used without autoconf
---
 src/basic-config.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/basic-config.h b/src/basic-config.h
index 1326bf7c0..6f7693cb8 100644
--- a/src/basic-config.h
+++ b/src/basic-config.h
@@ -10,6 +10,7 @@
 #ifdef USE_BASIC_CONFIG
 
 #define ECMULT_WINDOW_SIZE 15
+#define ECMULT_GEN_PREC_BITS 4
 
 #endif /* USE_BASIC_CONFIG */

From 0881633dfd0c530a915cf63be295f00841c94cc4 Mon Sep 17 00:00:00 2001
From: Jonas Nick
Date: Wed, 28 Apr 2021 16:57:49 +0000
Subject: [PATCH 27/59] secp256k1.h: clarify that by default arguments must be
 != NULL

The same file says that the illegal callback will only trigger for
violations that are explicitly mentioned, which is not true without this
commit because we often don't mention that an argument is not allowed to
be NULL.
---
 include/secp256k1.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/secp256k1.h b/include/secp256k1.h
index d368488af..e13f78374 100644
--- a/include/secp256k1.h
+++ b/include/secp256k1.h
@@ -7,7 +7,9 @@ extern "C" {
 
 #include <stddef.h>
 
-/* These rules specify the order of arguments in API calls:
+/* Unless explicitly stated all pointer arguments must not be NULL.
+ *
+ * The following rules specify the order of arguments in API calls:
  *
  * 1. Context pointers go first, followed by output arguments, combined
  *    output/input arguments, and finally input-only arguments.

From 4dc37bf81b55b9a3ffcf09f7a212436d25844710 Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Sat, 17 Apr 2021 10:57:16 -0700
Subject: [PATCH 28/59] Add mingw32-w64/wine CI build

---
 .cirrus.yml                | 21 +++++++++++++++++++++
 ci/cirrus.sh               | 12 +++++++++++-
 ci/linux-debian.Dockerfile |  6 +++++-
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 506a86033..ffae165bb 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -196,3 +196,24 @@ task:
     - rm /etc/ld.so.cache
     - ./ci/cirrus.sh
   << : *CAT_LOGS
+
+task:
+  name: "x86_64 (mingw32-w64): Windows (Debian stable, Wine)"
+  container:
+    dockerfile: ci/linux-debian.Dockerfile
+    cpu: 1
+    memory: 1G
+  env:
+    WINE_CMD: wine64-stable
+    HOST: x86_64-w64-mingw32
+    BUILD:
+    WITH_VALGRIND: no
+    ECDH: yes
+    RECOVERY: yes
+    EXPERIMENTAL: yes
+    SCHNORRSIG: yes
+    CTIMETEST: no
+  << : *MERGE_BASE
+  test_script:
+    - ./ci/cirrus.sh
+  << : *CAT_LOGS
diff --git a/ci/cirrus.sh b/ci/cirrus.sh
index f26ca98d1..dafab8ae3 100755
--- a/ci/cirrus.sh
+++ b/ci/cirrus.sh
@@ -25,7 +25,7 @@ valgrind --version || true
 make
 
 # Print information about binaries so that we can see that the architecture is correct
-file *tests || true
+file *tests* || true
 file bench_* || true
 file .libs/* || true
 
@@ -47,6 +47,12 @@ then
     $QEMU_CMD ./exhaustive_tests
 fi
 
+if [ -n "$WINE_CMD" ]
+then
+    $WINE_CMD ./tests 16
+    $WINE_CMD ./exhaustive_tests
+fi
+
 if [ "$BENCH" = "yes" ]
 then
     # Using the local `libtool` because on macOS the system's libtool has nothing to do with GNU libtool
@@ -59,6 +65,10 @@ then
     then
         EXEC="$EXEC valgrind --error-exitcode=42"
     fi
+    if [ -n "$WINE_CMD" ]
+    then
+        EXEC="$WINE_CMD"
+    fi
     # This limits the iterations in the benchmarks below to ITER iterations.
export SECP256K1_BENCH_ITERS="$ITERS" { diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index 5967cf8b3..ef2518ff0 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -10,4 +10,8 @@ RUN apt-get install --no-install-recommends --no-upgrade -y \ make automake libtool pkg-config dpkg-dev valgrind qemu-user \ gcc clang libc6-dbg \ gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 \ - gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x + gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \ + wine gcc-mingw-w64-x86-64 + +# Run a dummy command in wine to make it set up configuration +RUN wine64-stable xcopy || true From ae9e648526ceaf7cd97ba4dfe3c105db8e226c35 Mon Sep 17 00:00:00 2001 From: Gregory Maxwell Date: Sat, 1 May 2021 17:08:52 +0000 Subject: [PATCH 29/59] Define SECP256K1_BUILD in secp256k1.c directly. This avoids building without it and makes it safer to use a custom building environment. Test harnesses need to #include secp256k1.c first now. --- Makefile.am | 12 ++++++------ include/secp256k1.h | 11 +++++++++++ src/bench_ecmult.c | 4 ++-- src/bench_internal.c | 4 ++-- src/secp256k1.c | 6 ++++++ src/tests_exhaustive.c | 2 +- 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/Makefile.am b/Makefile.am index 58c9635e5..053e435fb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -68,7 +68,7 @@ endif endif libsecp256k1_la_SOURCES = src/secp256k1.c -libsecp256k1_la_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/include -I$(top_srcdir)/src $(SECP_INCLUDES) +libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/src $(SECP_INCLUDES) libsecp256k1_la_LIBADD = $(SECP_LIBS) $(COMMON_LIB) if VALGRIND_ENABLED @@ -81,22 +81,22 @@ noinst_PROGRAMS += bench_verify bench_sign bench_internal bench_ecmult bench_verify_SOURCES = src/bench_verify.c bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB) # SECP_TEST_INCLUDES are only used here for CRYPTO_CPPFLAGS -bench_verify_CPPFLAGS = -DSECP256K1_BUILD $(SECP_TEST_INCLUDES) +bench_verify_CPPFLAGS = $(SECP_TEST_INCLUDES) bench_sign_SOURCES = src/bench_sign.c bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_TEST_LIBS) $(COMMON_LIB) bench_internal_SOURCES = src/bench_internal.c bench_internal_LDADD = $(SECP_LIBS) $(COMMON_LIB) -bench_internal_CPPFLAGS = -DSECP256K1_BUILD $(SECP_INCLUDES) +bench_internal_CPPFLAGS = $(SECP_INCLUDES) bench_ecmult_SOURCES = src/bench_ecmult.c bench_ecmult_LDADD = $(SECP_LIBS) $(COMMON_LIB) -bench_ecmult_CPPFLAGS = -DSECP256K1_BUILD $(SECP_INCLUDES) +bench_ecmult_CPPFLAGS = $(SECP_INCLUDES) endif TESTS = if USE_TESTS noinst_PROGRAMS += tests tests_SOURCES = src/tests.c -tests_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/src -I$(top_srcdir)/include $(SECP_INCLUDES) $(SECP_TEST_INCLUDES) +tests_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/include $(SECP_INCLUDES) $(SECP_TEST_INCLUDES) if VALGRIND_ENABLED tests_CPPFLAGS += -DVALGRIND noinst_PROGRAMS += valgrind_ctime_test @@ -114,7 +114,7 @@ endif if USE_EXHAUSTIVE_TESTS noinst_PROGRAMS += exhaustive_tests exhaustive_tests_SOURCES = src/tests_exhaustive.c -exhaustive_tests_CPPFLAGS = -DSECP256K1_BUILD -I$(top_srcdir)/src $(SECP_INCLUDES) +exhaustive_tests_CPPFLAGS = -I$(top_srcdir)/src $(SECP_INCLUDES) if !ENABLE_COVERAGE exhaustive_tests_CPPFLAGS += -DVERIFY endif diff --git a/include/secp256k1.h b/include/secp256k1.h index d368488af..b616fc11e 100644 --- a/include/secp256k1.h +++ b/include/secp256k1.h @@ -127,6 +127,17 @@ typedef int (*secp256k1_nonce_function)( # 
define SECP256K1_INLINE inline
 # endif
 
+/** When this header is used at build-time the SECP256K1_BUILD define needs to be set
+ * to correctly set up export attributes and nullness checks. This is normally done
+ * by secp256k1.c but to guard against this header being included before secp256k1.c
+ * has had a chance to set the define (e.g. via test harnesses that just include
+ * secp256k1.c) we set SECP256K1_NO_BUILD when this header is processed without the
+ * BUILD define so this condition can be caught.
+ */
+#ifndef SECP256K1_BUILD
+# define SECP256K1_NO_BUILD
+#endif
+
 #ifndef SECP256K1_API
 # if defined(_WIN32)
 #  ifdef SECP256K1_BUILD
diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c
index 204e85a5d..a536ab2ae 100644
--- a/src/bench_ecmult.c
+++ b/src/bench_ecmult.c
@@ -5,8 +5,9 @@
  ***********************************************************************/
 #include <stdio.h>
 
-#include "include/secp256k1.h"
+#include "secp256k1.c"
+#include "include/secp256k1.h"
 #include "util.h"
 #include "hash_impl.h"
 #include "field_impl.h"
@@ -14,7 +15,6 @@
 #include "scalar_impl.h"
 #include "ecmult_impl.h"
 #include "bench.h"
-#include "secp256k1.c"
 
 #define POINTS 32768
 
diff --git a/src/bench_internal.c b/src/bench_internal.c
index 73b8a24cc..dfe99cf3a 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -5,8 +5,9 @@
  ***********************************************************************/
 #include <stdio.h>
 
-#include "include/secp256k1.h"
+#include "secp256k1.c"
+#include "include/secp256k1.h"
 #include "assumptions.h"
 #include "util.h"
 #include "hash_impl.h"
@@ -16,7 +17,6 @@
 #include "ecmult_const_impl.h"
 #include "ecmult_impl.h"
 #include "bench.h"
-#include "secp256k1.c"
 
 typedef struct {
     secp256k1_scalar scalar[2];
diff --git a/src/secp256k1.c b/src/secp256k1.c
index aef3f99ac..8d2d08722 100644
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@@ -4,6 +4,8 @@
  * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
  ***********************************************************************/
 
+#define SECP256K1_BUILD
+
 #include "include/secp256k1.h"
 #include "include/secp256k1_preallocated.h"
 
@@ -21,6 +23,10 @@
 #include "scratch_impl.h"
 #include "selftest.h"
 
+#ifdef SECP256K1_NO_BUILD
+# error "secp256k1.h processed without SECP256K1_BUILD defined while building secp256k1.c"
+#endif
+
 #if defined(VALGRIND)
 # include <valgrind/memcheck.h>
 #endif
diff --git a/src/tests_exhaustive.c b/src/tests_exhaustive.c
index 2bb538144..a8074d44f 100644
--- a/src/tests_exhaustive.c
+++ b/src/tests_exhaustive.c
@@ -20,10 +20,10 @@
 #define EXHAUSTIVE_TEST_ORDER 13
 #endif
 
+#include "secp256k1.c"
 #include "include/secp256k1.h"
 #include "assumptions.h"
 #include "group.h"
-#include "secp256k1.c"
 #include "testrand_impl.h"
 
 static int count = 2;

From ed5a199bed65bf084f34ce18d35807d31a1c75bb Mon Sep 17 00:00:00 2001
From: Tim Ruffing
Date: Sat, 1 May 2021 13:06:55 +0200
Subject: [PATCH 30/59] tests: fopen /dev/urandom in binary mode

This makes a difference with mingw builds on Wine, where the subsequent
fread() may abort early in the default text mode. The Microsoft C docs
say: "In text mode, CTRL+Z is interpreted as an EOF character on
input."
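For illustration, here is a minimal standalone sketch (not part of the
patch) of the failure mode: in text mode the Windows CRT treats a 0x1A
(CTRL+Z) byte in the input as end-of-file, so fread() on random data can
return a short count. The helper name and buffer size below are just
examples.

```
#include <stdio.h>

/* Returns 1 if all 16 bytes were read, 0 otherwise. With "r" (text
 * mode) on Windows/Wine, fread() may stop at the first 0x1A byte;
 * "rb" (binary mode) reads every byte verbatim. */
static int read_seed16(const char *path, unsigned char seed16[16]) {
    FILE *frand = fopen(path, "rb");
    size_t got = 0;
    if (frand != NULL) {
        got = fread(seed16, 1, 16, frand);
        fclose(frand);
    }
    return got == 16;
}
```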
---
 src/testrand_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/testrand_impl.h b/src/testrand_impl.h
index e643778f3..c8d30ef6a 100644
--- a/src/testrand_impl.h
+++ b/src/testrand_impl.h
@@ -127,7 +127,7 @@ static void secp256k1_testrand_init(const char* hexseed) {
             pos++;
         }
     } else {
-        FILE *frand = fopen("/dev/urandom", "r");
+        FILE *frand = fopen("/dev/urandom", "rb");
         if ((frand == NULL) || fread(&seed16, 1, sizeof(seed16), frand) != sizeof(seed16)) {
             uint64_t t = time(NULL) * (uint64_t)1337;
             fprintf(stderr, "WARNING: could not read 16 bytes from /dev/urandom; falling back to insecure PRNG\n");

From 99e2d5be0dba938b7701d157cba86252db9eb61c Mon Sep 17 00:00:00 2001
From: Gregory Maxwell
Date: Sun, 2 May 2021 20:02:12 +0000
Subject: [PATCH 31/59] Avoids a missing brace warning in
 schnorrsig/tests_impl.h on old compilers.

GCC 4.9.2, at least, emits "warning: missing braces around initializer"
without this.
---
 src/modules/schnorrsig/tests_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modules/schnorrsig/tests_impl.h b/src/modules/schnorrsig/tests_impl.h
index 338462fc9..6f960cf6a 100644
--- a/src/modules/schnorrsig/tests_impl.h
+++ b/src/modules/schnorrsig/tests_impl.h
@@ -103,7 +103,7 @@ void test_schnorrsig_api(void) {
     unsigned char sk3[32];
     unsigned char msg[32];
     secp256k1_keypair keypairs[3];
-    secp256k1_keypair invalid_keypair = { 0 };
+    secp256k1_keypair invalid_keypair = {{ 0 }};
     secp256k1_xonly_pubkey pk[3];
     secp256k1_xonly_pubkey zero_pk;
     unsigned char sig[64];

From 99f47c20ec41279075d6b3ae64c9c1a84b40a6f8 Mon Sep 17 00:00:00 2001
From: Tim Ruffing
Date: Mon, 3 May 2021 14:11:08 +0200
Subject: [PATCH 32/59] gen_context: Don't use external ASM because it
 complicates the build

Fixes #931.
---
 src/gen_context.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gen_context.c b/src/gen_context.c
index 024c55726..0fb361fb9 100644
--- a/src/gen_context.c
+++ b/src/gen_context.c
@@ -13,6 +13,11 @@
 /* We can't require the precomputed tables when creating them. */
 #undef USE_ECMULT_STATIC_PRECOMPUTATION
 
+/* In principle we could use external ASM, but this yields only a minor speedup in
+   build time and it's very complicated. In particular when cross-compiling, we'd
+   need to build the external ASM for the build and the host machine. */
+#undef USE_EXTERNAL_ASM
+
 #include "include/secp256k1.h"
 #include "assumptions.h"
 #include "util.h"

From 2161f31785e66e4e46471208610b5e3e98331849 Mon Sep 17 00:00:00 2001
From: Tim Ruffing
Date: Mon, 3 May 2021 13:05:33 +0200
Subject: [PATCH 33/59] Makefile.am: Honor config when building gen_context

This passes $(DEFS) (which should literally be "-DHAVE_CONFIG_H") to the
compiler when building gen_context. This currently has no effect because
gen_context.c does not check for this macro, but it's conceivable that it
may do so in the future.
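For illustration, the usual autoconf-style guard that $(DEFS) would
activate looks like the sketch below. This is hypothetical: gen_context.c
does not contain such a guard today, which is exactly why the flag
currently has no effect.

```
/* Hypothetical guard (not in the tree): include the generated config
 * only when autoconf is in use, signalled by -DHAVE_CONFIG_H in $(DEFS). */
#ifdef HAVE_CONFIG_H
#include "libsecp256k1-config.h"
#endif
```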
--- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 053e435fb..a0d78f90c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -129,7 +129,7 @@ CPPFLAGS_FOR_BUILD +=-I$(top_srcdir) -I$(builddir)/src gen_context_OBJECTS = gen_context.o gen_context_BIN = gen_context$(BUILD_EXEEXT) gen_%.o: src/gen_%.c src/libsecp256k1-config.h - $(CC_FOR_BUILD) $(CPPFLAGS_FOR_BUILD) $(CFLAGS_FOR_BUILD) -c $< -o $@ + $(CC_FOR_BUILD) $(DEFS) $(CPPFLAGS_FOR_BUILD) $(CFLAGS_FOR_BUILD) -c $< -o $@ $(gen_context_BIN): $(gen_context_OBJECTS) $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) $^ -o $@ From c8483520c9077905a1dc8b9adb88b6ea2a3bd9ef Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Mon, 3 May 2021 14:11:38 +0200 Subject: [PATCH 34/59] Makefile.am: Don't pass a variable twice --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index a0d78f90c..23b29281d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -101,7 +101,7 @@ if VALGRIND_ENABLED tests_CPPFLAGS += -DVALGRIND noinst_PROGRAMS += valgrind_ctime_test valgrind_ctime_test_SOURCES = src/valgrind_ctime_test.c -valgrind_ctime_test_LDADD = libsecp256k1.la $(SECP_LIBS) $(SECP_LIBS) $(COMMON_LIB) +valgrind_ctime_test_LDADD = libsecp256k1.la $(SECP_LIBS) $(COMMON_LIB) endif if !ENABLE_COVERAGE tests_CPPFLAGS += -DVERIFY From 7d65ed5214273275841f5aa272ad561df7ea7f21 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 2 May 2021 12:48:38 -0700 Subject: [PATCH 35/59] Add ARM32/ARM64 CI --- .cirrus.yml | 42 ++++++++++++++++++++++++++++++++++++++ ci/linux-debian.Dockerfile | 4 ++++ 2 files changed, 46 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index ffae165bb..124238375 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -197,6 +197,48 @@ task: - ./ci/cirrus.sh << : *CAT_LOGS +task: + name: "ARM32: Linux (Debian stable, QEMU)" + container: + dockerfile: ci/linux-debian.Dockerfile + cpu: 1 + memory: 1G + env: + QEMU_CMD: qemu-arm + HOST: arm-linux-gnueabihf + BUILD: + WITH_VALGRIND: no + ECDH: yes + RECOVERY: yes + EXPERIMENTAL: yes + SCHNORRSIG: yes + CTIMETEST: no + << : *MERGE_BASE + test_script: + - ./ci/cirrus.sh + << : *CAT_LOGS + +task: + name: "ARM64: Linux (Debian stable, QEMU)" + container: + dockerfile: ci/linux-debian.Dockerfile + cpu: 1 + memory: 1G + env: + QEMU_CMD: qemu-aarch64 + HOST: aarch64-linux-gnu + BUILD: + WITH_VALGRIND: no + ECDH: yes + RECOVERY: yes + EXPERIMENTAL: yes + SCHNORRSIG: yes + CTIMETEST: no + << : *MERGE_BASE + test_script: + - ./ci/cirrus.sh + << : *CAT_LOGS + task: name: "x86_64 (mingw32-w64): Windows (Debian stable, Wine)" container: diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index ef2518ff0..6559c5802 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -2,6 +2,8 @@ FROM debian:stable RUN dpkg --add-architecture i386 RUN dpkg --add-architecture s390x +RUN dpkg --add-architecture armhf +RUN dpkg --add-architecture arm64 RUN apt-get update # dkpg-dev: to make pkg-config work in cross-builds @@ -11,6 +13,8 @@ RUN apt-get install --no-install-recommends --no-upgrade -y \ gcc clang libc6-dbg \ gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 \ gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \ + gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6-dbg:armhf \ + gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 \ wine gcc-mingw-w64-x86-64 # Run a dummy command in wine to make it set up configuration From 
8bbad7a18e5dc5054b27ae44ea0c8dffe050f6bf Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Sun, 2 May 2021 13:39:22 -0700 Subject: [PATCH 36/59] Add asm build to ARM32 CI --- .cirrus.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 124238375..c82983526 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -213,6 +213,9 @@ task: EXPERIMENTAL: yes SCHNORRSIG: yes CTIMETEST: no + matrix: + - env: {} + - env: {ASM: arm} << : *MERGE_BASE test_script: - ./ci/cirrus.sh From dd6c3de322740a3054cf6a1994a38dc8f201b473 Mon Sep 17 00:00:00 2001 From: Russell O'Connor Date: Tue, 4 May 2021 14:59:47 -0400 Subject: [PATCH 37/59] Have secp256k1_ge_set_gej_var initialize all fields. Previous behaviour would not initialize r->x and r->y values in the case where infinity is passed in. --- src/group_impl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/group_impl.h b/src/group_impl.h index 19ebd8f44..1b69082c2 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -100,8 +100,8 @@ static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a) { static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) { secp256k1_fe z2, z3; - r->infinity = a->infinity; if (a->infinity) { + secp256k1_ge_set_infinity(r); return; } secp256k1_fe_inv_var(&a->z, &a->z); @@ -110,8 +110,7 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) { secp256k1_fe_mul(&a->x, &a->x, &z2); secp256k1_fe_mul(&a->y, &a->y, &z3); secp256k1_fe_set_int(&a->z, 1); - r->x = a->x; - r->y = a->y; + secp256k1_ge_set_xy(r, &a->x, &a->y); } static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a, size_t len) { From 31c0f6de413e521731ad0e63424431b3dd49cec8 Mon Sep 17 00:00:00 2001 From: Russell O'Connor Date: Tue, 4 May 2021 15:49:48 -0400 Subject: [PATCH 38/59] Have secp256k1_gej_double_var initialize all fields. Previous behaviour would not initialize r->x and r->y values in the case where infinity is passed in. --- src/group_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/group_impl.h b/src/group_impl.h index 1b69082c2..5ed45fda6 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -310,7 +310,7 @@ static void secp256k1_gej_double_var(secp256k1_gej *r, const secp256k1_gej *a, s * point will be gibberish (z = 0 but infinity = 0). */ if (a->infinity) { - r->infinity = 1; + secp256k1_gej_set_infinity(r); if (rzr != NULL) { secp256k1_fe_set_int(rzr, 1); } From 45b6468d7e3ed9849ed474c71e9a9479de1a77db Mon Sep 17 00:00:00 2001 From: Russell O'Connor Date: Tue, 4 May 2021 16:17:00 -0400 Subject: [PATCH 39/59] Have secp256k1_ge_set_all_gej_var initialize all fields. Previous behaviour would not initialize r->y values in the case where infinity is passed in. Furthermore, the previous behaviour wouldn't initialize anything in the case where all inputs were infinity. 
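To illustrate the previously uninitialized output (a minimal sketch against the internal group API shown in the diffs below, not a standalone program):

```c
/* Before these fixes, converting points at infinity left the affine x/y
 * fields indeterminate; reading or copying the whole secp256k1_ge (e.g.
 * under valgrind or MemorySanitizer) then touched uninitialized memory. */
secp256k1_gej inf[2];
secp256k1_ge out[2];
secp256k1_gej_set_infinity(&inf[0]);
secp256k1_gej_set_infinity(&inf[1]);
/* Previously wrote nothing at all when every input was infinity. */
secp256k1_ge_set_all_gej_var(out, inf, 2);
```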
--- src/group_impl.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/group_impl.h b/src/group_impl.h index 5ed45fda6..47aea32be 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -119,7 +119,9 @@ static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a size_t last_i = SIZE_MAX; for (i = 0; i < len; i++) { - if (!a[i].infinity) { + if (a[i].infinity) { + secp256k1_ge_set_infinity(&r[i]); + } else { /* Use destination's x coordinates as scratch space */ if (last_i == SIZE_MAX) { r[i].x = a[i].z; @@ -147,7 +149,6 @@ static void secp256k1_ge_set_all_gej_var(secp256k1_ge *r, const secp256k1_gej *a r[last_i].x = u; for (i = 0; i < len; i++) { - r[i].infinity = a[i].infinity; if (!a[i].infinity) { secp256k1_ge_set_gej_zinv(&r[i], &a[i], &r[i].x); } From 3c90bdda95aa4e79ff33bfbbbe91872417650ae9 Mon Sep 17 00:00:00 2001 From: William Bright Date: Tue, 4 May 2021 13:19:33 -0400 Subject: [PATCH 40/59] change local lib headers to be relative for those pointing at "include/" dir added relative paths to header files imported from src directory added include guards for contrib/ files when referring to secp256k1.h --- contrib/lax_der_parsing.c | 1 - contrib/lax_der_parsing.h | 2 ++ contrib/lax_der_privatekey_parsing.c | 1 - contrib/lax_der_privatekey_parsing.h | 2 ++ src/bench_ecdh.c | 4 ++-- src/bench_ecmult.c | 2 +- src/bench_internal.c | 2 +- src/bench_recover.c | 4 ++-- src/bench_schnorrsig.c | 4 ++-- src/bench_sign.c | 2 +- src/bench_verify.c | 2 +- src/gen_context.c | 2 +- src/modules/ecdh/main_impl.h | 4 ++-- src/modules/extrakeys/main_impl.h | 4 ++-- src/modules/extrakeys/tests_exhaustive_impl.h | 2 +- src/modules/extrakeys/tests_impl.h | 2 +- src/modules/recovery/main_impl.h | 2 +- src/modules/recovery/tests_exhaustive_impl.h | 2 +- src/modules/schnorrsig/main_impl.h | 6 +++--- src/modules/schnorrsig/tests_exhaustive_impl.h | 2 +- src/modules/schnorrsig/tests_impl.h | 2 +- src/secp256k1.c | 4 ++-- src/tests.c | 8 ++++---- src/tests_exhaustive.c | 3 +-- src/valgrind_ctime_test.c | 10 +++++----- 25 files changed, 40 insertions(+), 39 deletions(-) diff --git a/contrib/lax_der_parsing.c b/contrib/lax_der_parsing.c index c1627e37e..885a81716 100644 --- a/contrib/lax_der_parsing.c +++ b/contrib/lax_der_parsing.c @@ -5,7 +5,6 @@ ***********************************************************************/ #include -#include #include "lax_der_parsing.h" diff --git a/contrib/lax_der_parsing.h b/contrib/lax_der_parsing.h index 6b7255e28..5cb3222a5 100644 --- a/contrib/lax_der_parsing.h +++ b/contrib/lax_der_parsing.h @@ -51,7 +51,9 @@ #ifndef SECP256K1_CONTRIB_LAX_DER_PARSING_H #define SECP256K1_CONTRIB_LAX_DER_PARSING_H +#ifndef SECP256K1_H #include +#endif #ifdef __cplusplus extern "C" { diff --git a/contrib/lax_der_privatekey_parsing.c b/contrib/lax_der_privatekey_parsing.c index 429760fbb..372e84ea4 100644 --- a/contrib/lax_der_privatekey_parsing.c +++ b/contrib/lax_der_privatekey_parsing.c @@ -5,7 +5,6 @@ ***********************************************************************/ #include -#include #include "lax_der_privatekey_parsing.h" diff --git a/contrib/lax_der_privatekey_parsing.h b/contrib/lax_der_privatekey_parsing.h index 602c7c556..a397f0f50 100644 --- a/contrib/lax_der_privatekey_parsing.h +++ b/contrib/lax_der_privatekey_parsing.h @@ -28,7 +28,9 @@ #ifndef SECP256K1_CONTRIB_BER_PRIVATEKEY_H #define SECP256K1_CONTRIB_BER_PRIVATEKEY_H +#ifndef SECP256K1_H #include +#endif #ifdef __cplusplus extern "C" { diff --git a/src/bench_ecdh.c 
b/src/bench_ecdh.c index ab4b8f424..cb020d26b 100644 --- a/src/bench_ecdh.c +++ b/src/bench_ecdh.c @@ -6,8 +6,8 @@ #include -#include "include/secp256k1.h" -#include "include/secp256k1_ecdh.h" +#include "../include/secp256k1.h" +#include "../include/secp256k1_ecdh.h" #include "util.h" #include "bench.h" diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index a536ab2ae..68eff676e 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -6,8 +6,8 @@ #include #include "secp256k1.c" +#include "../include/secp256k1.h" -#include "include/secp256k1.h" #include "util.h" #include "hash_impl.h" #include "field_impl.h" diff --git a/src/bench_internal.c b/src/bench_internal.c index dfe99cf3a..161b1c4a4 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -6,8 +6,8 @@ #include #include "secp256k1.c" +#include "../include/secp256k1.h" -#include "include/secp256k1.h" #include "assumptions.h" #include "util.h" #include "hash_impl.h" diff --git a/src/bench_recover.c b/src/bench_recover.c index 3f6270ce8..4bcac19dc 100644 --- a/src/bench_recover.c +++ b/src/bench_recover.c @@ -4,8 +4,8 @@ * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ -#include "include/secp256k1.h" -#include "include/secp256k1_recovery.h" +#include "../include/secp256k1.h" +#include "../include/secp256k1_recovery.h" #include "util.h" #include "bench.h" diff --git a/src/bench_schnorrsig.c b/src/bench_schnorrsig.c index f7f591c41..dfea14414 100644 --- a/src/bench_schnorrsig.c +++ b/src/bench_schnorrsig.c @@ -8,8 +8,8 @@ #include -#include "include/secp256k1.h" -#include "include/secp256k1_schnorrsig.h" +#include "../include/secp256k1.h" +#include "../include/secp256k1_schnorrsig.h" #include "util.h" #include "bench.h" diff --git a/src/bench_sign.c b/src/bench_sign.c index 933f367c4..f659c18c9 100644 --- a/src/bench_sign.c +++ b/src/bench_sign.c @@ -4,7 +4,7 @@ * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ -#include "include/secp256k1.h" +#include "../include/secp256k1.h" #include "util.h" #include "bench.h" diff --git a/src/bench_verify.c b/src/bench_verify.c index c56aefd36..565ae4bee 100644 --- a/src/bench_verify.c +++ b/src/bench_verify.c @@ -7,7 +7,7 @@ #include #include -#include "include/secp256k1.h" +#include "../include/secp256k1.h" #include "util.h" #include "bench.h" diff --git a/src/gen_context.c b/src/gen_context.c index 0fb361fb9..8fab7aa49 100644 --- a/src/gen_context.c +++ b/src/gen_context.c @@ -18,7 +18,7 @@ need to build the external ASM for the build and the host machine. 
*/ #undef USE_EXTERNAL_ASM -#include "include/secp256k1.h" +#include "../include/secp256k1.h" #include "assumptions.h" #include "util.h" #include "field_impl.h" diff --git a/src/modules/ecdh/main_impl.h b/src/modules/ecdh/main_impl.h index 1ac67086b..5408c9de7 100644 --- a/src/modules/ecdh/main_impl.h +++ b/src/modules/ecdh/main_impl.h @@ -7,8 +7,8 @@ #ifndef SECP256K1_MODULE_ECDH_MAIN_H #define SECP256K1_MODULE_ECDH_MAIN_H -#include "include/secp256k1_ecdh.h" -#include "ecmult_const_impl.h" +#include "../../../include/secp256k1_ecdh.h" +#include "../../ecmult_const_impl.h" static int ecdh_hash_function_sha256(unsigned char *output, const unsigned char *x32, const unsigned char *y32, void *data) { unsigned char version = (y32[31] & 0x01) | 0x02; diff --git a/src/modules/extrakeys/main_impl.h b/src/modules/extrakeys/main_impl.h index 7390b2271..ac3932e4a 100644 --- a/src/modules/extrakeys/main_impl.h +++ b/src/modules/extrakeys/main_impl.h @@ -7,8 +7,8 @@ #ifndef SECP256K1_MODULE_EXTRAKEYS_MAIN_H #define SECP256K1_MODULE_EXTRAKEYS_MAIN_H -#include "include/secp256k1.h" -#include "include/secp256k1_extrakeys.h" +#include "../../../include/secp256k1.h" +#include "../../../include/secp256k1_extrakeys.h" static SECP256K1_INLINE int secp256k1_xonly_pubkey_load(const secp256k1_context* ctx, secp256k1_ge *ge, const secp256k1_xonly_pubkey *pubkey) { return secp256k1_pubkey_load(ctx, ge, (const secp256k1_pubkey *) pubkey); diff --git a/src/modules/extrakeys/tests_exhaustive_impl.h b/src/modules/extrakeys/tests_exhaustive_impl.h index 0aca4fb72..d4a2f5bdf 100644 --- a/src/modules/extrakeys/tests_exhaustive_impl.h +++ b/src/modules/extrakeys/tests_exhaustive_impl.h @@ -8,7 +8,7 @@ #define SECP256K1_MODULE_EXTRAKEYS_TESTS_EXHAUSTIVE_H #include "src/modules/extrakeys/main_impl.h" -#include "include/secp256k1_extrakeys.h" +#include "../../../include/secp256k1_extrakeys.h" static void test_exhaustive_extrakeys(const secp256k1_context *ctx, const secp256k1_ge* group) { secp256k1_keypair keypair[EXHAUSTIVE_TEST_ORDER - 1]; diff --git a/src/modules/extrakeys/tests_impl.h b/src/modules/extrakeys/tests_impl.h index 9473a7dd4..fcc16300b 100644 --- a/src/modules/extrakeys/tests_impl.h +++ b/src/modules/extrakeys/tests_impl.h @@ -7,7 +7,7 @@ #ifndef SECP256K1_MODULE_EXTRAKEYS_TESTS_H #define SECP256K1_MODULE_EXTRAKEYS_TESTS_H -#include "secp256k1_extrakeys.h" +#include "../../../include/secp256k1_extrakeys.h" static secp256k1_context* api_test_context(int flags, int *ecount) { secp256k1_context *ctx0 = secp256k1_context_create(flags); diff --git a/src/modules/recovery/main_impl.h b/src/modules/recovery/main_impl.h index 7a440a729..9e19f2a2d 100644 --- a/src/modules/recovery/main_impl.h +++ b/src/modules/recovery/main_impl.h @@ -7,7 +7,7 @@ #ifndef SECP256K1_MODULE_RECOVERY_MAIN_H #define SECP256K1_MODULE_RECOVERY_MAIN_H -#include "include/secp256k1_recovery.h" +#include "../../../include/secp256k1_recovery.h" static void secp256k1_ecdsa_recoverable_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, int* recid, const secp256k1_ecdsa_recoverable_signature* sig) { (void)ctx; diff --git a/src/modules/recovery/tests_exhaustive_impl.h b/src/modules/recovery/tests_exhaustive_impl.h index 0ba9409c6..590a972ed 100644 --- a/src/modules/recovery/tests_exhaustive_impl.h +++ b/src/modules/recovery/tests_exhaustive_impl.h @@ -8,7 +8,7 @@ #define SECP256K1_MODULE_RECOVERY_EXHAUSTIVE_TESTS_H #include "src/modules/recovery/main_impl.h" -#include "include/secp256k1_recovery.h" +#include 
"../../../include/secp256k1_recovery.h" void test_exhaustive_recovery_sign(const secp256k1_context *ctx, const secp256k1_ge *group) { int i, j, k; diff --git a/src/modules/schnorrsig/main_impl.h b/src/modules/schnorrsig/main_impl.h index 22e1b33a5..af503bf5e 100644 --- a/src/modules/schnorrsig/main_impl.h +++ b/src/modules/schnorrsig/main_impl.h @@ -7,9 +7,9 @@ #ifndef SECP256K1_MODULE_SCHNORRSIG_MAIN_H #define SECP256K1_MODULE_SCHNORRSIG_MAIN_H -#include "include/secp256k1.h" -#include "include/secp256k1_schnorrsig.h" -#include "hash.h" +#include "../../../include/secp256k1.h" +#include "../../../include/secp256k1_schnorrsig.h" +#include "../../hash.h" /* Initializes SHA256 with fixed midstate. This midstate was computed by applying * SHA256 to SHA256("BIP0340/nonce")||SHA256("BIP0340/nonce"). */ diff --git a/src/modules/schnorrsig/tests_exhaustive_impl.h b/src/modules/schnorrsig/tests_exhaustive_impl.h index b4a428729..affabd23a 100644 --- a/src/modules/schnorrsig/tests_exhaustive_impl.h +++ b/src/modules/schnorrsig/tests_exhaustive_impl.h @@ -7,7 +7,7 @@ #ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H #define SECP256K1_MODULE_SCHNORRSIG_TESTS_EXHAUSTIVE_H -#include "include/secp256k1_schnorrsig.h" +#include "../../../include/secp256k1_schnorrsig.h" #include "src/modules/schnorrsig/main_impl.h" static const unsigned char invalid_pubkey_bytes[][32] = { diff --git a/src/modules/schnorrsig/tests_impl.h b/src/modules/schnorrsig/tests_impl.h index 6f960cf6a..f4fa5b4d8 100644 --- a/src/modules/schnorrsig/tests_impl.h +++ b/src/modules/schnorrsig/tests_impl.h @@ -7,7 +7,7 @@ #ifndef SECP256K1_MODULE_SCHNORRSIG_TESTS_H #define SECP256K1_MODULE_SCHNORRSIG_TESTS_H -#include "secp256k1_schnorrsig.h" +#include "../../../include/secp256k1_schnorrsig.h" /* Checks that a bit flip in the n_flip-th argument (that has n_bytes many * bytes) changes the hash function diff --git a/src/secp256k1.c b/src/secp256k1.c index 8d2d08722..dddd1c903 100644 --- a/src/secp256k1.c +++ b/src/secp256k1.c @@ -6,8 +6,8 @@ #define SECP256K1_BUILD -#include "include/secp256k1.h" -#include "include/secp256k1_preallocated.h" +#include "../include/secp256k1.h" +#include "../include/secp256k1_preallocated.h" #include "assumptions.h" #include "util.h" diff --git a/src/tests.c b/src/tests.c index a14639430..ee34d9b3a 100644 --- a/src/tests.c +++ b/src/tests.c @@ -15,8 +15,8 @@ #include #include "secp256k1.c" -#include "include/secp256k1.h" -#include "include/secp256k1_preallocated.h" +#include "../include/secp256k1.h" +#include "../include/secp256k1_preallocated.h" #include "testrand_impl.h" #include "util.h" @@ -30,8 +30,8 @@ void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **pr, const BIGNUM **ps) # endif #endif -#include "contrib/lax_der_parsing.c" -#include "contrib/lax_der_privatekey_parsing.c" +#include "../contrib/lax_der_parsing.c" +#include "../contrib/lax_der_privatekey_parsing.c" #include "modinv32_impl.h" #ifdef SECP256K1_WIDEMUL_INT128 diff --git a/src/tests_exhaustive.c b/src/tests_exhaustive.c index a8074d44f..b7c782899 100644 --- a/src/tests_exhaustive.c +++ b/src/tests_exhaustive.c @@ -10,7 +10,6 @@ #include #include - #include #undef USE_ECMULT_STATIC_PRECOMPUTATION @@ -21,7 +20,7 @@ #endif #include "secp256k1.c" -#include "include/secp256k1.h" +#include "../include/secp256k1.h" #include "assumptions.h" #include "group.h" #include "testrand_impl.h" diff --git a/src/valgrind_ctime_test.c b/src/valgrind_ctime_test.c index cfca5a196..4ac0f011b 100644 --- a/src/valgrind_ctime_test.c +++ 
b/src/valgrind_ctime_test.c @@ -7,24 +7,24 @@ #include #include -#include "include/secp256k1.h" +#include "../include/secp256k1.h" #include "assumptions.h" #include "util.h" #ifdef ENABLE_MODULE_ECDH -# include "include/secp256k1_ecdh.h" +# include "../include/secp256k1_ecdh.h" #endif #ifdef ENABLE_MODULE_RECOVERY -# include "include/secp256k1_recovery.h" +# include "../include/secp256k1_recovery.h" #endif #ifdef ENABLE_MODULE_EXTRAKEYS -# include "include/secp256k1_extrakeys.h" +# include "../include/secp256k1_extrakeys.h" #endif #ifdef ENABLE_MODULE_SCHNORRSIG -#include "include/secp256k1_schnorrsig.h" +#include "../include/secp256k1_schnorrsig.h" #endif void run_tests(secp256k1_context *ctx, unsigned char *key); From 4a19668c37bc77d0165f4a1c0e626e321e9c4a09 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Wed, 5 May 2021 09:38:22 +0200 Subject: [PATCH 41/59] tests: Test secp256k1_ge_set_all_gej_var for all infinity inputs --- src/tests.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/tests.c b/src/tests.c index a14639430..407860001 100644 --- a/src/tests.c +++ b/src/tests.c @@ -3115,6 +3115,17 @@ void test_ge(void) { ge_equals_gej(&ge[i], &gej[i]); } + /* Test batch gej -> ge conversion with all infinities. */ + for (i = 0; i < 4 * runs + 1; i++) { + secp256k1_gej_set_infinity(&gej[i]); + } + /* batch convert */ + secp256k1_ge_set_all_gej_var(ge, gej, 4 * runs + 1); + /* check result */ + for (i = 0; i < 4 * runs + 1; i++) { + CHECK(secp256k1_ge_is_infinity(&ge[i])); + } + free(ge); free(gej); } From 14c9739a1fb485bb56dbe3447132a37bcbef4e22 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Wed, 5 May 2021 09:38:22 +0200 Subject: [PATCH 42/59] tests: Improve secp256k1_ge_set_all_gej_var for some infinity inputs --- src/tests.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tests.c b/src/tests.c index 407860001..2764bc6aa 100644 --- a/src/tests.c +++ b/src/tests.c @@ -3101,14 +3101,17 @@ void test_ge(void) { /* Test batch gej -> ge conversion with many infinities. */ for (i = 0; i < 4 * runs + 1; i++) { + int odd; random_group_element_test(&ge[i]); + odd = secp256k1_fe_is_odd(&ge[i].x); + CHECK(odd == 0 || odd == 1); /* randomly set half the points to infinity */ - if(secp256k1_fe_is_odd(&ge[i].x)) { + if (odd == i % 2) { secp256k1_ge_set_infinity(&ge[i]); } secp256k1_gej_set_ge(&gej[i], &ge[i]); } - /* batch invert */ + /* batch convert */ secp256k1_ge_set_all_gej_var(ge, gej, 4 * runs + 1); /* check result */ for (i = 0; i < 4 * runs + 1; i++) { From 22a9ea154a280987be7cf8322156c8738c41c3c5 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Thu, 6 May 2021 17:12:11 +0200 Subject: [PATCH 43/59] contrib: Explain explicit header guards They were added in #925 and deserve a comment. --- contrib/lax_der_parsing.h | 4 ++++ contrib/lax_der_privatekey_parsing.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/contrib/lax_der_parsing.h b/contrib/lax_der_parsing.h index 5cb3222a5..034a38e6a 100644 --- a/contrib/lax_der_parsing.h +++ b/contrib/lax_der_parsing.h @@ -51,6 +51,10 @@ #ifndef SECP256K1_CONTRIB_LAX_DER_PARSING_H #define SECP256K1_CONTRIB_LAX_DER_PARSING_H +/* #include secp256k1.h only when it hasn't been included yet. + This enables this file to be #included directly in other project + files (such as tests.c) without the need to set an explicit -I flag, + which would be necessary to locate secp256k1.h. 
*/ #ifndef SECP256K1_H #include #endif diff --git a/contrib/lax_der_privatekey_parsing.h b/contrib/lax_der_privatekey_parsing.h index a397f0f50..1a8ad8ae0 100644 --- a/contrib/lax_der_privatekey_parsing.h +++ b/contrib/lax_der_privatekey_parsing.h @@ -28,6 +28,10 @@ #ifndef SECP256K1_CONTRIB_BER_PRIVATEKEY_H #define SECP256K1_CONTRIB_BER_PRIVATEKEY_H +/* #include secp256k1.h only when it hasn't been included yet. + This enables this file to be #included directly in other project + files (such as tests.c) without the need to set an explicit -I flag, + which would be necessary to locate secp256k1.h. */ #ifndef SECP256K1_H #include #endif From 0d9561ae879848191a14bcc67db87cbfd44fb69a Mon Sep 17 00:00:00 2001 From: Andrew Poelstra Date: Sun, 22 Nov 2020 17:33:46 +0000 Subject: [PATCH 44/59] add `secp256k1_ec_pubkey_cmp` method --- include/secp256k1.h | 20 ++++++++++++++++-- src/secp256k1.c | 26 +++++++++++++++++++++++ src/tests.c | 50 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/include/secp256k1.h b/include/secp256k1.h index d368488af..54eff5501 100644 --- a/include/secp256k1.h +++ b/include/secp256k1.h @@ -61,8 +61,9 @@ typedef struct secp256k1_scratch_space_struct secp256k1_scratch_space; * The exact representation of data inside is implementation defined and not * guaranteed to be portable between different platforms or versions. It is * however guaranteed to be 64 bytes in size, and can be safely copied/moved. - * If you need to convert to a format suitable for storage, transmission, or - * comparison, use secp256k1_ec_pubkey_serialize and secp256k1_ec_pubkey_parse. + * If you need to convert to a format suitable for storage or transmission, + * use secp256k1_ec_pubkey_serialize and secp256k1_ec_pubkey_parse. To + * compare keys, use secp256k1_ec_pubkey_cmp. */ typedef struct { unsigned char data[64]; @@ -370,6 +371,21 @@ SECP256K1_API int secp256k1_ec_pubkey_serialize( unsigned int flags ) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4); +/** Compare two public keys using the lexicographic order of their compressed serializations * + * Returns: <0 if the first public key is less than the second + * >0 if the first public key is greater than the second + * 0 if the two public keys are equal + * Args: ctx: a secp256k1 context object. + * In: pubkey1: first public key to compare + * pubkey2: second public key to compare + */ +SECP256K1_API SECP256K1_WARN_UNUSED_RESULT int secp256k1_ec_pubkey_cmp( + const secp256k1_context* ctx, + const secp256k1_pubkey* pubkey1, + const secp256k1_pubkey* pubkey2 +) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); + /** Parse an ECDSA signature in compact (64 bytes) format. * * Returns: 1 when the signature could be parsed, 0 otherwise. diff --git a/src/secp256k1.c b/src/secp256k1.c index aef3f99ac..515bcb93d 100644 --- a/src/secp256k1.c +++ b/src/secp256k1.c @@ -316,6 +316,32 @@ int secp256k1_ec_pubkey_serialize(const secp256k1_context* ctx, unsigned char *o return ret; } +int secp256k1_ec_pubkey_cmp(const secp256k1_context* ctx, const secp256k1_pubkey* pubkey0, const secp256k1_pubkey* pubkey1) { + unsigned char out[2][33]; + const secp256k1_pubkey* pk[2]; + int i; + + VERIFY_CHECK(ctx != NULL); + pk[0] = pubkey0; pk[1] = pubkey1; + for (i = 0; i < 2; i++) { + size_t out_size = sizeof(out[i]); + /* If the public key is NULL or invalid, ec_pubkey_serialize will call + * the illegal_callback and return 0.
In that case we will serialize the + * key as all zeros which is less than any valid public key. This + * results in consistent comparisons even if NULL or invalid pubkeys are + * involved and prevents edge cases such as sorting algorithms that use + * this function and do not terminate as a result. */ + if (!secp256k1_ec_pubkey_serialize(ctx, out[i], &out_size, pk[i], SECP256K1_EC_COMPRESSED)) { + /* Note that ec_pubkey_serialize should already set the output to + * zero in that case, but it's not guaranteed by the API, we can't + * test it and writing a VERIFY_CHECK is more complex than + * explicitly memsetting (again). */ + memset(out[i], 0, sizeof(out[i])); + } + } + return secp256k1_memcmp_var(out[0], out[1], sizeof(out[0])); +} + static void secp256k1_ecdsa_signature_load(const secp256k1_context* ctx, secp256k1_scalar* r, secp256k1_scalar* s, const secp256k1_ecdsa_signature* sig) { (void)ctx; if (sizeof(secp256k1_scalar) == 32) { diff --git a/src/tests.c b/src/tests.c index f19ab96e3..c901143c6 100644 --- a/src/tests.c +++ b/src/tests.c @@ -4809,6 +4809,55 @@ void test_random_pubkeys(void) { } } +void run_pubkey_comparison(void) { + unsigned char pk1_ser[33] = { + 0x02, + 0x58, 0x84, 0xb3, 0xa2, 0x4b, 0x97, 0x37, 0x88, 0x92, 0x38, 0xa6, 0x26, 0x62, 0x52, 0x35, 0x11, + 0xd0, 0x9a, 0xa1, 0x1b, 0x80, 0x0b, 0x5e, 0x93, 0x80, 0x26, 0x11, 0xef, 0x67, 0x4b, 0xd9, 0x23 + }; + const unsigned char pk2_ser[33] = { + 0x02, + 0xde, 0x36, 0x0e, 0x87, 0x59, 0x8f, 0x3c, 0x01, 0x36, 0x2a, 0x2a, 0xb8, 0xc6, 0xf4, 0x5e, 0x4d, + 0xb2, 0xc2, 0xd5, 0x03, 0xa7, 0xf9, 0xf1, 0x4f, 0xa8, 0xfa, 0x95, 0xa8, 0xe9, 0x69, 0x76, 0x1c + }; + secp256k1_pubkey pk1; + secp256k1_pubkey pk2; + int32_t ecount = 0; + + CHECK(secp256k1_ec_pubkey_parse(ctx, &pk1, pk1_ser, sizeof(pk1_ser)) == 1); + CHECK(secp256k1_ec_pubkey_parse(ctx, &pk2, pk2_ser, sizeof(pk2_ser)) == 1); + + secp256k1_context_set_illegal_callback(ctx, counting_illegal_callback_fn, &ecount); + CHECK(secp256k1_ec_pubkey_cmp(ctx, NULL, &pk2) < 0); + CHECK(ecount == 1); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk1, NULL) > 0); + CHECK(ecount == 2); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk1, &pk2) < 0); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk2, &pk1) > 0); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk1, &pk1) == 0); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk2, &pk2) == 0); + CHECK(ecount == 2); + { + secp256k1_pubkey pk_tmp; + memset(&pk_tmp, 0, sizeof(pk_tmp)); /* illegal pubkey */ + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk_tmp, &pk2) < 0); + CHECK(ecount == 3); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk_tmp, &pk_tmp) == 0); + CHECK(ecount == 5); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk2, &pk_tmp) > 0); + CHECK(ecount == 6); + } + + secp256k1_context_set_illegal_callback(ctx, NULL, NULL); + + /* Make pk2 the same as pk1 but with 3 rather than 2. 
Note that in + * an uncompressed encoding, these would have the opposite ordering */ + pk1_ser[0] = 3; + CHECK(secp256k1_ec_pubkey_parse(ctx, &pk2, pk1_ser, sizeof(pk1_ser)) == 1); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk1, &pk2) < 0); + CHECK(secp256k1_ec_pubkey_cmp(ctx, &pk2, &pk1) > 0); +} + void run_random_pubkeys(void) { int i; for (i = 0; i < 10*count; i++) { @@ -5860,6 +5909,7 @@ int main(int argc, char **argv) { #endif /* ecdsa tests */ + run_pubkey_comparison(); run_random_pubkeys(); run_ecdsa_der_parse(); run_ecdsa_sign_verify(); From 6eceec6d566898a5c157630e47f95b260767026b Mon Sep 17 00:00:00 2001 From: Andrew Poelstra Date: Mon, 30 Nov 2020 18:42:32 +0000 Subject: [PATCH 45/59] add `secp256k1_xonly_pubkey_cmp` method --- include/secp256k1_extrakeys.h | 21 ++++++++++++++--- src/modules/extrakeys/main_impl.h | 26 ++++++++++++++++++++ src/modules/extrakeys/tests_impl.h | 38 ++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/include/secp256k1_extrakeys.h b/include/secp256k1_extrakeys.h index 6fc7b290f..0a37fb6b9 100644 --- a/include/secp256k1_extrakeys.h +++ b/include/secp256k1_extrakeys.h @@ -15,9 +15,9 @@ extern "C" { * The exact representation of data inside is implementation defined and not * guaranteed to be portable between different platforms or versions. It is * however guaranteed to be 64 bytes in size, and can be safely copied/moved. - * If you need to convert to a format suitable for storage, transmission, or - * comparison, use secp256k1_xonly_pubkey_serialize and - * secp256k1_xonly_pubkey_parse. + * If you need to convert to a format suitable for storage or transmission, + * use secp256k1_xonly_pubkey_serialize and secp256k1_xonly_pubkey_parse. To + * compare keys, use secp256k1_xonly_pubkey_cmp. */ typedef struct { unsigned char data[64]; @@ -67,6 +67,21 @@ SECP256K1_API int secp256k1_xonly_pubkey_serialize( const secp256k1_xonly_pubkey* pubkey ) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); +/** Compare two x-only public keys using lexicographic order + * + * Returns: <0 if the first public key is less than the second + * >0 if the first public key is greater than the second + * 0 if the two public keys are equal + * Args: ctx: a secp256k1 context object. + * In: pk1: first public key to compare + * pk2: second public key to compare + */ +SECP256K1_API int secp256k1_xonly_pubkey_cmp( + const secp256k1_context* ctx, + const secp256k1_xonly_pubkey* pk1, + const secp256k1_xonly_pubkey* pk2 +) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); + /** Converts a secp256k1_pubkey into a secp256k1_xonly_pubkey. * * Returns: 1 if the public key was successfully converted diff --git a/src/modules/extrakeys/main_impl.h b/src/modules/extrakeys/main_impl.h index 7390b2271..c052e29a8 100644 --- a/src/modules/extrakeys/main_impl.h +++ b/src/modules/extrakeys/main_impl.h @@ -55,6 +55,32 @@ int secp256k1_xonly_pubkey_serialize(const secp256k1_context* ctx, unsigned char return 1; } +int secp256k1_xonly_pubkey_cmp(const secp256k1_context* ctx, const secp256k1_xonly_pubkey* pk0, const secp256k1_xonly_pubkey* pk1) { + unsigned char out[2][32]; + const secp256k1_xonly_pubkey* pk[2]; + int i; + + VERIFY_CHECK(ctx != NULL); + pk[0] = pk0; pk[1] = pk1; + for (i = 0; i < 2; i++) { + /* If the public key is NULL or invalid, xonly_pubkey_serialize will + * call the illegal_callback and return 0.
In that case we will + * serialize the key as all zeros which is less than any valid public + * key. This results in consistent comparisons even if NULL or invalid + * pubkeys are involved and prevents edge cases such as sorting + * algorithms that use this function and do not terminate as a + * result. */ + if (!secp256k1_xonly_pubkey_serialize(ctx, out[i], pk[i])) { + /* Note that xonly_pubkey_serialize should already set the output to + * zero in that case, but it's not guaranteed by the API, we can't + * test it and writing a VERIFY_CHECK is more complex than + * explicitly memsetting (again). */ + memset(out[i], 0, sizeof(out[i])); + } + } + return secp256k1_memcmp_var(out[0], out[1], sizeof(out[1])); +} + /** Keeps a group element as is if it has an even Y and otherwise negates it. * y_parity is set to 0 in the former case and to 1 in the latter case. * Requires that the coordinates of r are normalized. */ diff --git a/src/modules/extrakeys/tests_impl.h b/src/modules/extrakeys/tests_impl.h index 9473a7dd4..25e42f68e 100644 --- a/src/modules/extrakeys/tests_impl.h +++ b/src/modules/extrakeys/tests_impl.h @@ -137,6 +137,43 @@ void test_xonly_pubkey(void) { secp256k1_context_destroy(verify); } +void test_xonly_pubkey_comparison(void) { + unsigned char pk1_ser[32] = { + 0x58, 0x84, 0xb3, 0xa2, 0x4b, 0x97, 0x37, 0x88, 0x92, 0x38, 0xa6, 0x26, 0x62, 0x52, 0x35, 0x11, + 0xd0, 0x9a, 0xa1, 0x1b, 0x80, 0x0b, 0x5e, 0x93, 0x80, 0x26, 0x11, 0xef, 0x67, 0x4b, 0xd9, 0x23 + }; + const unsigned char pk2_ser[32] = { + 0xde, 0x36, 0x0e, 0x87, 0x59, 0x8f, 0x3c, 0x01, 0x36, 0x2a, 0x2a, 0xb8, 0xc6, 0xf4, 0x5e, 0x4d, + 0xb2, 0xc2, 0xd5, 0x03, 0xa7, 0xf9, 0xf1, 0x4f, 0xa8, 0xfa, 0x95, 0xa8, 0xe9, 0x69, 0x76, 0x1c + }; + secp256k1_xonly_pubkey pk1; + secp256k1_xonly_pubkey pk2; + int ecount = 0; + secp256k1_context *none = api_test_context(SECP256K1_CONTEXT_NONE, &ecount); + + CHECK(secp256k1_xonly_pubkey_parse(none, &pk1, pk1_ser) == 1); + CHECK(secp256k1_xonly_pubkey_parse(none, &pk2, pk2_ser) == 1); + + CHECK(secp256k1_xonly_pubkey_cmp(none, NULL, &pk2) < 0); + CHECK(ecount == 1); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, NULL) > 0); + CHECK(ecount == 2); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk2) < 0); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk1) > 0); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk1) == 0); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk2) == 0); + CHECK(ecount == 2); + memset(&pk1, 0, sizeof(pk1)); /* illegal pubkey */ + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk2) < 0); + CHECK(ecount == 3); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk1, &pk1) == 0); + CHECK(ecount == 5); + CHECK(secp256k1_xonly_pubkey_cmp(none, &pk2, &pk1) > 0); + CHECK(ecount == 6); + + secp256k1_context_destroy(none); +} + void test_xonly_pubkey_tweak(void) { unsigned char zeros64[64] = { 0 }; unsigned char overflows[32]; @@ -540,6 +577,7 @@ void run_extrakeys_tests(void) { test_xonly_pubkey_tweak(); test_xonly_pubkey_tweak_check(); test_xonly_pubkey_tweak_recursive(); + test_xonly_pubkey_comparison(); /* keypair tests */ test_keypair(); From 09b3bb8648fec903e4ac2ec1d047503d5f0f48d7 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Wed, 12 May 2021 11:49:36 +0200 Subject: [PATCH 46/59] Clean up git tree This removes the unused `obj` directory. It also suggests in the README to create the "coverage" files in a separate directory and adds the coverage files to .gitignore.
readme: Improve instructions for coverage reports --- .gitignore | 8 ++++++++ README.md | 3 ++- obj/.gitignore | 0 3 files changed, 10 insertions(+), 1 deletion(-) delete mode 100644 obj/.gitignore diff --git a/.gitignore b/.gitignore index ccdef02b2..b62055a39 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,14 @@ libtool *~ *.log *.trs + +coverage/ +coverage.html +coverage.*.html +*.gcda +*.gcno +*.gcov + src/libsecp256k1-config.h src/libsecp256k1-config.h.in src/ecmult_static_context.h diff --git a/README.md b/README.md index 197a56fff..a7eb2b0e8 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,8 @@ To create a report, `gcovr` is recommended, as it includes branch coverage repor To create a HTML report with coloured and annotated source code: - $ gcovr --exclude 'src/bench*' --html --html-details -o coverage.html + $ mkdir -p coverage + $ gcovr --exclude 'src/bench*' --html --html-details -o coverage/coverage.html Reporting a vulnerability ------------ diff --git a/obj/.gitignore b/obj/.gitignore deleted file mode 100644 index e69de29bb..000000000 From de4157f13acc43d521e3133ff1d2e7d67484f0ac Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Mon, 9 Nov 2020 17:52:12 +0100 Subject: [PATCH 47/59] ci: Run ASan/LSan and reorganize sanitizer and Valgrind jobs --- .cirrus.yml | 64 +++++++++++++++++++++++++++----------- ci/linux-debian.Dockerfile | 5 +-- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index c82983526..6a3b4b2eb 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -63,27 +63,8 @@ task: - env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETEST: no, BENCH: no} - env: {CPPFLAGS: -DDETERMINISTIC} - env: {CFLAGS: -O0, CTIMETEST: no} - - env: - CFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" - LDFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" - UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" - ASM: x86_64 - ECDH: yes - RECOVERY: yes - EXPERIMENTAL: yes - SCHNORRSIG: yes - CTIMETEST: no - env: { ECMULTGENPRECISION: 2 } - env: { ECMULTGENPRECISION: 8 } - - env: - RUN_VALGRIND: yes - ASM: x86_64 - ECDH: yes - RECOVERY: yes - EXPERIMENTAL: yes - SCHNORRSIG: yes - EXTRAFLAGS: "--disable-openssl-tests" - BUILD: matrix: - env: CC: gcc @@ -262,3 +243,48 @@ task: test_script: - ./ci/cirrus.sh << : *CAT_LOGS + +# Sanitizers +task: + container: + dockerfile: ci/linux-debian.Dockerfile + cpu: 1 + memory: 1G + env: + ECDH: yes + RECOVERY: yes + EXPERIMENTAL: yes + SCHNORRSIG: yes + CTIMETEST: no + EXTRAFLAGS: "--disable-openssl-tests" + matrix: + - name: "Valgrind (memcheck)" + env: + RUN_VALGRIND: yes + - name: "UBSan, ASan, LSan" + env: + CFLAGS: "-fsanitize=undefined,address" + CFLAGS_FOR_BUILD: "-fsanitize=undefined,address" + UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" + ASAN_OPTIONS: "strict_string_checks=1:detect_stack_use_after_return=1:detect_leaks=1" + LSAN_OPTIONS: "use_unaligned=1" + # Try to cover many configurations with just a tiny matrix. 
+ matrix: + - env: + ASM: auto + STATICPRECOMPUTATION: yes + - env: + ASM: no + STATICPRECOMPUTATION: no + ECMULTGENPRECISION: 2 + matrix: + - env: + CC: clang + - env: + HOST: i686-linux-gnu + CC: i686-linux-gnu-gcc + << : *MERGE_BASE + test_script: + - ./ci/cirrus.sh + << : *CAT_LOGS + diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index 6559c5802..e06c81668 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -7,11 +7,12 @@ RUN dpkg --add-architecture arm64 RUN apt-get update # dkpg-dev: to make pkg-config work in cross-builds +# llvm: for llvm-symbolizer, which is used by clang's UBSan for symbolized stack traces RUN apt-get install --no-install-recommends --no-upgrade -y \ git ca-certificates \ make automake libtool pkg-config dpkg-dev valgrind qemu-user \ - gcc clang libc6-dbg \ - gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 \ + gcc clang llvm libc6-dbg \ + gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libubsan1:i386 libasan5:i386 \ gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \ gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6-dbg:armhf \ gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 \ From fcfcb97e74b55a107290d44c81c049d6168e954f Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Mon, 17 May 2021 17:19:01 +0200 Subject: [PATCH 48/59] ci: Simplify to use generic wrapper for QEMU, Valgrind, etc --- .cirrus.yml | 16 ++++++---------- ci/cirrus.sh | 37 +++++-------------------------------- 2 files changed, 11 insertions(+), 42 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 6a3b4b2eb..25579d98d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -5,7 +5,6 @@ env: ASM: no BUILD: check WITH_VALGRIND: yes - RUN_VALGRIND: no EXTRAFLAGS: HOST: ECDH: no @@ -162,9 +161,8 @@ task: cpu: 1 memory: 1G env: - QEMU_CMD: qemu-s390x + WRAPPER_CMD: qemu-s390x HOST: s390x-linux-gnu - BUILD: WITH_VALGRIND: no ECDH: yes RECOVERY: yes @@ -185,9 +183,8 @@ task: cpu: 1 memory: 1G env: - QEMU_CMD: qemu-arm + WRAPPER_CMD: qemu-arm HOST: arm-linux-gnueabihf - BUILD: WITH_VALGRIND: no ECDH: yes RECOVERY: yes @@ -209,9 +206,8 @@ task: cpu: 1 memory: 1G env: - QEMU_CMD: qemu-aarch64 + WRAPPER_CMD: qemu-aarch64 HOST: aarch64-linux-gnu - BUILD: WITH_VALGRIND: no ECDH: yes RECOVERY: yes @@ -230,9 +226,8 @@ task: cpu: 1 memory: 1G env: - WINE_CMD: wine64-stable + WRAPPER_CMD: wine64-stable HOST: x86_64-w64-mingw32 - BUILD: WITH_VALGRIND: no ECDH: yes RECOVERY: yes @@ -260,7 +255,8 @@ task: matrix: - name: "Valgrind (memcheck)" env: - RUN_VALGRIND: yes + # The `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html) + WRAPPER_CMD: "valgrind --error-exitcode=42" - name: "UBSan, ASan, LSan" env: CFLAGS: "-fsanitize=undefined,address" diff --git a/ci/cirrus.sh b/ci/cirrus.sh index dafab8ae3..060335eec 100755 --- a/ci/cirrus.sh +++ b/ci/cirrus.sh @@ -29,45 +29,18 @@ file *tests* || true file bench_* || true file .libs/* || true -if [ -n "$BUILD" ] -then - make "$BUILD" -fi - -if [ "$RUN_VALGRIND" = "yes" ] -then - # the `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html) - valgrind --error-exitcode=42 ./tests 16 - valgrind --error-exitcode=42 ./exhaustive_tests -fi - -if [ -n "$QEMU_CMD" ] -then - $QEMU_CMD ./tests 16 - $QEMU_CMD ./exhaustive_tests -fi +# This tells `make check` to wrap test invocations. 
+export LOG_COMPILER="$WRAPPER_CMD" + make "$BUILD" if [ "$BENCH" = "yes" ] then # Using the local `libtool` because on macOS the system's libtool has nothing to do with GNU libtool EXEC='./libtool --mode=execute' - if [ -n "$QEMU_CMD" ] - then - EXEC="$EXEC $QEMU_CMD" - fi - if [ "$RUN_VALGRIND" = "yes" ] - then - EXEC="$EXEC valgrind --error-exitcode=42" - fi - if [ -n "$WINE_CMD" ] + if [ -n "$WRAPPER_CMD" ] then - EXEC="$WINE_CMD" + EXEC="$EXEC $WRAPPER_CMD" fi # This limits the iterations in the benchmarks below to ITER iterations. export SECP256K1_BENCH_ITERS="$ITERS" { $EXEC ./bench_ecmult $EXEC ./bench_internal From 489ff5c20a1457d0e7d765c8f05856c50c4777a8 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Fri, 21 May 2021 11:24:30 +0200 Subject: [PATCH 49/59] tests: Treat empty SECP256K1_TEST_ITERS as if it were unset --- src/tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests.c b/src/tests.c index 9c2d6cbf8..6ceaba5e3 100644 --- a/src/tests.c +++ b/src/tests.c @@ -6471,7 +6471,7 @@ int main(int argc, char **argv) { count = strtol(argv[1], NULL, 0); } else { const char* env = getenv("SECP256K1_TEST_ITERS"); - if (env) { + if (env && strlen(env) > 0) { count = strtol(env, NULL, 0); } } From 02dcea1ad9441f857c7768e2b7d304bb19fd2a0c Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Mon, 17 May 2021 20:41:15 +0200 Subject: [PATCH 50/59] ci: Make test iterations configurable and tweak for sanitizer builds --- .cirrus.yml | 9 ++++++++- ci/cirrus.sh | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 25579d98d..1ba9315f0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,7 +13,8 @@ env: EXPERIMENTAL: no CTIMETEST: yes BENCH: yes - ITERS: 2 + TEST_ITERS: + BENCH_ITERS: 2 MAKEFLAGS: -j2 @@ -162,6 +163,7 @@ task: memory: 1G env: WRAPPER_CMD: qemu-s390x + TEST_ITERS: 16 HOST: s390x-linux-gnu WITH_VALGRIND: no ECDH: yes @@ -184,6 +186,7 @@ task: memory: 1G env: WRAPPER_CMD: qemu-arm + TEST_ITERS: 16 HOST: arm-linux-gnueabihf WITH_VALGRIND: no ECDH: yes @@ -207,6 +210,7 @@ task: memory: 1G env: WRAPPER_CMD: qemu-aarch64 + TEST_ITERS: 16 HOST: aarch64-linux-gnu WITH_VALGRIND: no ECDH: yes @@ -227,6 +231,7 @@ task: memory: 1G env: WRAPPER_CMD: wine64-stable + TEST_ITERS: 16 HOST: x86_64-w64-mingw32 WITH_VALGRIND: no ECDH: yes @@ -257,6 +262,7 @@ task: env: # The `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html) WRAPPER_CMD: "valgrind --error-exitcode=42" + TEST_ITERS: 16 - name: "UBSan, ASan, LSan" env: CFLAGS: "-fsanitize=undefined,address" CFLAGS_FOR_BUILD: "-fsanitize=undefined,address" UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" ASAN_OPTIONS: "strict_string_checks=1:detect_stack_use_after_return=1:detect_leaks=1" LSAN_OPTIONS: "use_unaligned=1" + TEST_ITERS: 32 # Try to cover many configurations with just a tiny matrix. matrix: - env: diff --git a/ci/cirrus.sh b/ci/cirrus.sh index 060335eec..27db1e677 100755 --- a/ci/cirrus.sh +++ b/ci/cirrus.sh @@ -32,6 +32,10 @@ file .libs/* || true # This tells `make check` to wrap test invocations. export LOG_COMPILER="$WRAPPER_CMD" +# This limits the iterations in the tests and benchmarks.
+export SECP256K1_TEST_ITERS="$TEST_ITERS" +export SECP256K1_BENCH_ITERS="$BENCH_ITERS" + make "$BUILD" if [ "$BENCH" = "yes" ] @@ -42,8 +46,6 @@ then then EXEC="$EXEC $WRAPPER_CMD" fi - # This limits the iterations in the benchmarks below to ITER iterations. - export SECP256K1_BENCH_ITERS="$ITERS" { $EXEC ./bench_ecmult $EXEC ./bench_internal From a35fdd3478f7556dfb9b83f32aaa319ccadff9a9 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Mon, 31 May 2021 18:11:27 +0200 Subject: [PATCH 51/59] ci: Run PRs on merge result even for i686 This line should have been added in c7f754fe4d5e032fd150c4b9b985855e9fcaa521. This mistake caused some i686 builds to fail when the PR was not rebased, see https://cirrus-ci.com/build/5156197872435200. --- .cirrus.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.cirrus.yml b/.cirrus.yml index 1ba9315f0..c65105bca 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -92,6 +92,7 @@ task: CC: i686-linux-gnu-gcc - env: CC: clang --target=i686-pc-linux-gnu -isystem /usr/i686-linux-gnu/include + << : *MERGE_BASE test_script: - ./ci/cirrus.sh << : *CAT_LOGS From 593e6bad9c5cda05dd72a5bd8266c4880113b4af Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Thu, 5 Sep 2019 17:25:37 +0000 Subject: [PATCH 52/59] Clean up ecmult_bench to make space for more benchmarks --- src/bench_ecmult.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index 68eff676e..636cbcb8b 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -28,19 +28,26 @@ typedef struct { secp256k1_gej* expected_output; secp256k1_ecmult_multi_func ecmult_multi; - /* Changes per test */ + /* Changes per benchmark */ size_t count; int includes_g; - /* Changes per test iteration */ + /* Changes per benchmark iteration, used to pick different scalars and pubkeys + * in each run. */ size_t offset1; size_t offset2; - /* Test output. */ + /* Benchmark output. */ secp256k1_gej* output; } bench_data; -static int bench_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) { +/* Hashes x into [0, POINTS) twice and stores the results in offset1 and offset2. */ +static void hash_into_offset(bench_data* data, size_t x) { + data->offset1 = (x * 0x537b7f6f + 0x8f66a481) % POINTS; + data->offset2 = (x * 0x7f6f537b + 0x6a1a8f49) % POINTS; +} + +static int bench_ecmult_multi_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) { bench_data* data = (bench_data*)arg; if (data->includes_g) ++idx; if (idx == 0) { @@ -53,7 +60,7 @@ static int bench_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, vo return 1; } -static void bench_ecmult(void* arg, int iters) { +static void bench_ecmult_multi(void* arg, int iters) { bench_data* data = (bench_data*)arg; int includes_g = data->includes_g; @@ -62,19 +69,18 @@ static void bench_ecmult(void* arg, int iters) { int iter; iters = iters / data->count; for (iter = 0; iter < iters; ++iter) { - data->ecmult_multi(&data->ctx->error_callback, &data->ctx->ecmult_ctx, data->scratch, &data->output[iter], data->includes_g ?
&data->scalars[data->offset1] : NULL, bench_ecmult_multi_callback, arg, count - includes_g); data->offset1 = (data->offset1 + count) % POINTS; data->offset2 = (data->offset2 + count - 1) % POINTS; } } -static void bench_ecmult_setup(void* arg) { +static void bench_ecmult_multi_setup(void* arg) { bench_data* data = (bench_data*)arg; - data->offset1 = (data->count * 0x537b7f6f + 0x8f66a481) % POINTS; - data->offset2 = (data->count * 0x7f6f537b + 0x6a1a8f49) % POINTS; + hash_into_offset(data, data->count); } -static void bench_ecmult_teardown(void* arg, int iters) { +static void bench_ecmult_multi_teardown(void* arg, int iters) { bench_data* data = (bench_data*)arg; int iter; iters = iters / data->count; @@ -102,7 +108,7 @@ static void generate_scalar(uint32_t num, secp256k1_scalar* scalar) { CHECK(!overflow); } -static void run_test(bench_data* data, size_t count, int includes_g, int num_iters) { +static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_g, int num_iters) { char str[32]; static const secp256k1_scalar zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0); size_t iters = 1 + num_iters / count; @@ -112,8 +118,7 @@ static void run_test(bench_data* data, size_t count, int includes_g, int num_ite data->includes_g = includes_g; /* Compute (the negation of) the expected results directly. */ - data->offset1 = (data->count * 0x537b7f6f + 0x8f66a481) % POINTS; - data->offset2 = (data->count * 0x7f6f537b + 0x6a1a8f49) % POINTS; + hash_into_offset(data, data->count); for (iter = 0; iter < iters; ++iter) { secp256k1_scalar tmp; secp256k1_scalar total = data->scalars[(data->offset1++) % POINTS]; @@ -127,8 +132,8 @@ static void run_test(bench_data* data, size_t count, int includes_g, int num_ite } /* Run the benchmark. */ - sprintf(str, includes_g ? "ecmult_%ig" : "ecmult_%i", (int)count); - run_benchmark(str, bench_ecmult, bench_ecmult_setup, bench_ecmult_teardown, data, 10, count * iters); + sprintf(str, includes_g ? "ecmult_multi %ig" : "ecmult_multi %i", (int)count); + run_benchmark(str, bench_ecmult_multi, bench_ecmult_multi_setup, bench_ecmult_multi_teardown, data, 10, count * iters); } int main(int argc, char **argv) { @@ -185,7 +190,7 @@ int main(int argc, char **argv) { free(pubkeys_gej); for (i = 1; i <= 8; ++i) { - run_test(&data, i, 1, iters); + run_ecmult_multi_bench(&data, i, 1, iters); } /* This is disabled with low count of iterations because the loop runs 77 times even with iters=1 @@ -194,7 +199,7 @@ int main(int argc, char **argv) { if (iters > 2) { for (p = 0; p <= 11; ++p) { for (i = 9; i <= 16; ++i) { - run_test(&data, i << p, 1, iters); + run_ecmult_multi_bench(&data, i << p, 1, iters); } } } From 2fe1b50df16c9f41ea77b151634d734b930eeddd Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Thu, 5 Sep 2019 17:26:12 +0000 Subject: [PATCH 53/59] Add ecmult_gen, ecmult_const and ecmult to benchmark --- src/bench_ecmult.c | 179 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 164 insertions(+), 15 deletions(-) diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index 636cbcb8b..505702056 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -18,12 +18,28 @@ #define POINTS 32768 +void help(char **argv) { + printf("Benchmark EC multiplication algorithms\n"); + printf("\n"); + printf("Usage: %s \n", argv[0]); + printf("The output shows the number of multiplied and summed points right after the\n"); + printf("function name. 
The letter 'g' indicates that one of the points is the generator.\n"); + printf("The benchmarks are divided by the number of points.\n"); + printf("\n"); + printf("default (ecmult_multi): picks pippenger_wnaf or strauss_wnaf depending on the\n"); + printf(" batch size\n"); + printf("pippenger_wnaf: for all batch sizes\n"); + printf("strauss_wnaf: for all batch sizes\n"); + printf("simple: multiply and sum each point individually\n"); +} + typedef struct { /* Setup once in advance */ secp256k1_context* ctx; secp256k1_scratch_space* scratch; secp256k1_scalar* scalars; secp256k1_ge* pubkeys; + secp256k1_gej* pubkeys_gej; secp256k1_scalar* seckeys; secp256k1_gej* expected_output; secp256k1_ecmult_multi_func ecmult_multi; @@ -47,6 +63,128 @@ static void hash_into_offset(bench_data* data, size_t x) { data->offset2 = (x * 0x7f6f537b + 0x6a1a8f49) % POINTS; } +/* Check correctness of the benchmark by computing + * sum(outputs) ?= (sum(scalars_gen) + sum(seckeys)*sum(scalars))*G */ +static void bench_ecmult_teardown_helper(bench_data* data, size_t* seckey_offset, size_t* scalar_offset, size_t* scalar_gen_offset, int iters) { + int i; + secp256k1_gej sum_output, tmp; + secp256k1_scalar sum_scalars; + + secp256k1_gej_set_infinity(&sum_output); + secp256k1_scalar_clear(&sum_scalars); + for (i = 0; i < iters; ++i) { + secp256k1_gej_add_var(&sum_output, &sum_output, &data->output[i], NULL); + if (scalar_gen_offset != NULL) { + secp256k1_scalar_add(&sum_scalars, &sum_scalars, &data->scalars[(*scalar_gen_offset+i) % POINTS]); + } + if (seckey_offset != NULL) { + secp256k1_scalar s = data->seckeys[(*seckey_offset+i) % POINTS]; + secp256k1_scalar_mul(&s, &s, &data->scalars[(*scalar_offset+i) % POINTS]); + secp256k1_scalar_add(&sum_scalars, &sum_scalars, &s); + } + } + secp256k1_ecmult_gen(&data->ctx->ecmult_gen_ctx, &tmp, &sum_scalars); + secp256k1_gej_neg(&tmp, &tmp); + secp256k1_gej_add_var(&tmp, &tmp, &sum_output, NULL); + CHECK(secp256k1_gej_is_infinity(&tmp)); +} + +static void bench_ecmult_setup(void* arg) { + bench_data* data = (bench_data*)arg; + /* Re-randomize offset to ensure that we're using different scalars and + * group elements in each run. 
*/ + hash_into_offset(data, data->offset1); +} + +static void bench_ecmult_gen(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + int i; + + for (i = 0; i < iters; ++i) { + secp256k1_ecmult_gen(&data->ctx->ecmult_gen_ctx, &data->output[i], &data->scalars[(data->offset1+i) % POINTS]); + } +} + +static void bench_ecmult_gen_teardown(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + bench_ecmult_teardown_helper(data, NULL, NULL, &data->offset1, iters); +} + +static void bench_ecmult_const(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + int i; + + for (i = 0; i < iters; ++i) { + secp256k1_ecmult_const(&data->output[i], &data->pubkeys[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], 256); + } +} + +static void bench_ecmult_const_teardown(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, NULL, iters); +} + +static void bench_ecmult_1(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + int i; + + for (i = 0; i < iters; ++i) { + secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], &data->pubkeys_gej[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], NULL); + } +} + +static void bench_ecmult_1_teardown(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, NULL, iters); +} + +static void bench_ecmult_1g(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + secp256k1_scalar zero; + int i; + + secp256k1_scalar_set_int(&zero, 0); + for (i = 0; i < iters; ++i) { + secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], NULL, &zero, &data->scalars[(data->offset1+i) % POINTS]); + } +} + +static void bench_ecmult_1g_teardown(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + bench_ecmult_teardown_helper(data, NULL, NULL, &data->offset1, iters); +} + +static void bench_ecmult_2g(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + int i; + + for (i = 0; i < iters/2; ++i) { + secp256k1_ecmult(&data->ctx->ecmult_ctx, &data->output[i], &data->pubkeys_gej[(data->offset1+i) % POINTS], &data->scalars[(data->offset2+i) % POINTS], &data->scalars[(data->offset1+i) % POINTS]); + } +} + +static void bench_ecmult_2g_teardown(void* arg, int iters) { + bench_data* data = (bench_data*)arg; + bench_ecmult_teardown_helper(data, &data->offset1, &data->offset2, &data->offset1, iters/2); +} + +static void run_ecmult_bench(bench_data* data, int iters) { + char str[32]; + sprintf(str, "ecmult_gen"); + run_benchmark(str, bench_ecmult_gen, bench_ecmult_setup, bench_ecmult_gen_teardown, data, 10, iters); + sprintf(str, "ecmult_const"); + run_benchmark(str, bench_ecmult_const, bench_ecmult_setup, bench_ecmult_const_teardown, data, 10, iters); + /* ecmult with non generator point */ + sprintf(str, "ecmult 1"); + run_benchmark(str, bench_ecmult_1, bench_ecmult_setup, bench_ecmult_1_teardown, data, 10, iters); + /* ecmult with generator point */ + sprintf(str, "ecmult 1g"); + run_benchmark(str, bench_ecmult_1g, bench_ecmult_setup, bench_ecmult_1g_teardown, data, 10, iters); + /* ecmult with generator and non-generator point. The reported time is per point. 
*/ + sprintf(str, "ecmult 2g"); + run_benchmark(str, bench_ecmult_2g, bench_ecmult_setup, bench_ecmult_2g_teardown, data, 10, 2*iters); +} + static int bench_ecmult_multi_callback(secp256k1_scalar* sc, secp256k1_ge* ge, size_t idx, void* arg) { bench_data* data = (bench_data*)arg; if (data->includes_g) ++idx; @@ -139,18 +277,19 @@ static void run_ecmult_multi_bench(bench_data* data, size_t count, int includes_ int main(int argc, char **argv) { bench_data data; int i, p; - secp256k1_gej* pubkeys_gej; size_t scratch_size; int iters = get_iters(10000); - data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN | SECP256K1_CONTEXT_VERIFY); - scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*16; - data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size); data.ecmult_multi = secp256k1_ecmult_multi_var; if (argc > 1) { - if(have_flag(argc, argv, "pippenger_wnaf")) { + if(have_flag(argc, argv, "-h") + || have_flag(argc, argv, "--help") + || have_flag(argc, argv, "help")) { + help(argv); + return 1; + } else if(have_flag(argc, argv, "pippenger_wnaf")) { printf("Using pippenger_wnaf:\n"); data.ecmult_multi = secp256k1_ecmult_pippenger_batch_single; } else if(have_flag(argc, argv, "strauss_wnaf")) { @@ -158,36 +297,45 @@ int main(int argc, char **argv) { data.ecmult_multi = secp256k1_ecmult_strauss_batch_single; } else if(have_flag(argc, argv, "simple")) { printf("Using simple algorithm:\n"); - data.ecmult_multi = secp256k1_ecmult_multi_var; - secp256k1_scratch_space_destroy(data.ctx, data.scratch); - data.scratch = NULL; } else { - fprintf(stderr, "%s: unrecognized argument '%s'.\n", argv[0], argv[1]); - fprintf(stderr, "Use 'pippenger_wnaf', 'strauss_wnaf', 'simple' or no argument to benchmark a combined algorithm.\n"); + fprintf(stderr, "%s: unrecognized argument '%s'.\n\n", argv[0], argv[1]); + help(argv); return 1; } } + data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN | SECP256K1_CONTEXT_VERIFY); + scratch_size = secp256k1_strauss_scratch_size(POINTS) + STRAUSS_SCRATCH_OBJECTS*16; + if (!have_flag(argc, argv, "simple")) { + data.scratch = secp256k1_scratch_space_create(data.ctx, scratch_size); + } else { + data.scratch = NULL; + } + /* Allocate stuff */ data.scalars = malloc(sizeof(secp256k1_scalar) * POINTS); data.seckeys = malloc(sizeof(secp256k1_scalar) * POINTS); data.pubkeys = malloc(sizeof(secp256k1_ge) * POINTS); + data.pubkeys_gej = malloc(sizeof(secp256k1_gej) * POINTS); data.expected_output = malloc(sizeof(secp256k1_gej) * (iters + 1)); data.output = malloc(sizeof(secp256k1_gej) * (iters + 1)); /* Generate a set of scalars, and private/public keypairs. 
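The "ecmult 2g" numbers are normalized per point: the benchmark body loops iter/2 times over calls that each handle two points (one arbitrary point plus the generator), while run_benchmark is handed 2*iters. Assuming run_benchmark divides the measured time by its iteration argument, as the other registrations here suggest, the accounting works out as in this small sketch (the timing value is made up):

```
#include <stdio.h>

int main(void) {
    int iters = 10000;          /* base iteration count */
    int iter_arg = 2 * iters;   /* what run_benchmark is given for "ecmult 2g" */
    int calls = iter_arg / 2;   /* loop bound inside bench_ecmult_2g */
    int points = 2 * calls;     /* each call handles one point plus G */
    double total_us = 123456.0; /* made-up measured time */

    /* points == iter_arg, so dividing by iter_arg reports time per point */
    printf("calls=%d points=%d per-point=%.4f us\n",
           calls, points, total_us / (double)iter_arg);
    return 0;
}
```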
*/ - pubkeys_gej = malloc(sizeof(secp256k1_gej) * POINTS); - secp256k1_gej_set_ge(&pubkeys_gej[0], &secp256k1_ge_const_g); + secp256k1_gej_set_ge(&data.pubkeys_gej[0], &secp256k1_ge_const_g); secp256k1_scalar_set_int(&data.seckeys[0], 1); for (i = 0; i < POINTS; ++i) { generate_scalar(i, &data.scalars[i]); if (i) { - secp256k1_gej_double_var(&pubkeys_gej[i], &pubkeys_gej[i - 1], NULL); + secp256k1_gej_double_var(&data.pubkeys_gej[i], &data.pubkeys_gej[i - 1], NULL); secp256k1_scalar_add(&data.seckeys[i], &data.seckeys[i - 1], &data.seckeys[i - 1]); } } - secp256k1_ge_set_all_gej_var(data.pubkeys, pubkeys_gej, POINTS); - free(pubkeys_gej); + secp256k1_ge_set_all_gej_var(data.pubkeys, data.pubkeys_gej, POINTS); + + + /* Initialize offset1 and offset2 */ + hash_into_offset(&data, 0); + run_ecmult_bench(&data, iters); for (i = 1; i <= 8; ++i) { run_ecmult_multi_bench(&data, i, 1, iters); @@ -210,6 +358,7 @@ int main(int argc, char **argv) { secp256k1_context_destroy(data.ctx); free(data.scalars); free(data.pubkeys); + free(data.pubkeys_gej); free(data.seckeys); free(data.output); free(data.expected_output); From 8f879c2887e166da2ec959ce78078f7b84ebfdf9 Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Tue, 29 Oct 2019 08:20:11 +0000 Subject: [PATCH 54/59] Fix array size in bench_ecmult --- src/bench_ecmult.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index 505702056..1d463f92d 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -232,7 +232,7 @@ static void bench_ecmult_multi_teardown(void* arg, int iters) { static void generate_scalar(uint32_t num, secp256k1_scalar* scalar) { secp256k1_sha256 sha256; - unsigned char c[11] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0}; + unsigned char c[10] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0}; unsigned char buf[32]; int overflow = 0; c[6] = num; From c58c4ea4707ec5934e49890db881914df3a341b4 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Tue, 8 Jun 2021 17:03:53 +0200 Subject: [PATCH 55/59] ci: Add ppc64le build --- .cirrus.yml | 21 +++++++++++++++++++++ ci/linux-debian.Dockerfile | 2 ++ 2 files changed, 23 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index c65105bca..6d63511e6 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -224,6 +224,27 @@ task: - ./ci/cirrus.sh << : *CAT_LOGS +task: + name: "ppc64le: Linux (Debian stable, QEMU)" + container: + dockerfile: ci/linux-debian.Dockerfile + cpu: 1 + memory: 1G + env: + WRAPPER_CMD: qemu-ppc64le + TEST_ITERS: 16 + HOST: powerpc64le-linux-gnu + WITH_VALGRIND: no + ECDH: yes + RECOVERY: yes + EXPERIMENTAL: yes + SCHNORRSIG: yes + CTIMETEST: no + << : *MERGE_BASE + test_script: + - ./ci/cirrus.sh + << : *CAT_LOGS + task: name: "x86_64 (mingw32-w64): Windows (Debian stable, Wine)" container: diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index e06c81668..6def91333 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -4,6 +4,7 @@ RUN dpkg --add-architecture i386 RUN dpkg --add-architecture s390x RUN dpkg --add-architecture armhf RUN dpkg --add-architecture arm64 +RUN dpkg --add-architecture ppc64el RUN apt-get update # dkpg-dev: to make pkg-config work in cross-builds @@ -16,6 +17,7 @@ RUN apt-get install --no-install-recommends --no-upgrade -y \ gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x \ gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6-dbg:armhf \ gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6-dbg:arm64 \ + gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross 
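The array-size fix in bench_ecmult above matters because generate_scalar presumably hashes sizeof(c) bytes: declared as c[11] with only ten initializers, the array carries an implicit trailing zero byte that would then be fed into the hash. A standalone illustration of the sizeof mismatch (not the library code itself):

```
#include <stdio.h>

int main(void) {
    /* Ten initializers but an 11-byte array: c_bug[10] is implicitly zero
     * and sizeof(c_bug) == 11, so any code hashing sizeof(c) bytes feeds
     * one byte more into the hash than the initializer list suggests. */
    unsigned char c_bug[11] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0};
    unsigned char c_fix[10] = {'e', 'c', 'm', 'u', 'l', 't', 0, 0, 0, 0};

    printf("sizeof(c_bug) = %zu, sizeof(c_fix) = %zu\n",
           sizeof(c_bug), sizeof(c_fix)); /* prints 11 and 10 */
    return 0;
}
```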
libc6-dbg:ppc64el \ wine gcc-mingw-w64-x86-64 # Run a dummy command in wine to make it set up configuration From d27e459861026ddaa376c9cb2acf93ad3c668ee3 Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Mon, 14 Jun 2021 19:54:41 +0000 Subject: [PATCH 56/59] Revert "Remove unused Jacobi symbol support" This reverts commit 20448b8d09a492afcfcae7721033c13a44a776fd. The removed functions secp256k1_ge_set_xquad and secp256k1_fe_is_quad_var are required for some modules in secp256k1-zkp. --- src/bench_internal.c | 27 +++++++++++++++++++++++---- src/field.h | 3 +++ src/field_impl.h | 5 +++++ src/group.h | 9 +++++++++ src/group_impl.h | 22 ++++++++++++++++++++-- src/tests.c | 39 ++++++++++++++++++++++++++++++++++----- 6 files changed, 94 insertions(+), 11 deletions(-) diff --git a/src/bench_internal.c b/src/bench_internal.c index 161b1c4a4..2be5e4506 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -245,6 +245,26 @@ void bench_group_add_affine_var(void* arg, int iters) { } } +void bench_group_jacobi_var(void* arg, int iters) { + int i, j = 0; + bench_inv *data = (bench_inv*)arg; + + for (i = 0; i < iters; i++) { + j += secp256k1_gej_has_quad_y_var(&data->gej[0]); + /* Vary the Y and Z coordinates of the input (the X coordinate doesn't matter to + secp256k1_gej_has_quad_y_var). Note that the resulting coordinates will + generally not correspond to a point on the curve, but this is not a problem + for the code being benchmarked here. Adding and normalizing have less + overhead than EC operations (which could guarantee the point remains on the + curve). */ + secp256k1_fe_add(&data->gej[0].y, &data->fe[1]); + secp256k1_fe_add(&data->gej[0].z, &data->fe[2]); + secp256k1_fe_normalize_var(&data->gej[0].y); + secp256k1_fe_normalize_var(&data->gej[0].z); + } + CHECK(j <= iters); +} + void bench_group_to_affine_var(void* arg, int iters) { int i; bench_inv *data = (bench_inv*)arg; @@ -252,10 +272,8 @@ void bench_group_to_affine_var(void* arg, int iters) { for (i = 0; i < iters; ++i) { secp256k1_ge_set_gej_var(&data->ge[1], &data->gej[0]); /* Use the output affine X/Y coordinates to vary the input X/Y/Z coordinates. - Note that the resulting coordinates will generally not correspond to a point - on the curve, but this is not a problem for the code being benchmarked here. - Adding and normalizing have less overhead than EC operations (which could - guarantee the point remains on the curve). */ + Similar to bench_group_jacobi_var, this approach does not result in + coordinates of points on the curve. 
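bench_group_jacobi_var folds each result into j and checks j afterwards. Besides acting as a sanity check, a plausible reason for this pattern is that it keeps the benchmarked call observable, so the optimizer cannot drop the loop body as dead code. A minimal sketch of the pattern with a stand-in predicate:

```
#include <stdio.h>

/* Stand-in for the function under benchmark. */
static int predicate(unsigned x) { return (x & 1u) == 0u; }

int main(void) {
    unsigned iters = 1000000, i, j = 0;

    for (i = 0; i < iters; i++) {
        /* Accumulating into j, and using j afterwards, keeps the call
         * observable so the optimizer cannot delete the loop body. */
        j += (unsigned)predicate(i);
    }
    /* The CHECK-style sanity bound from the benchmark. */
    if (j > iters) {
        fprintf(stderr, "sanity check failed\n");
        return 1;
    }
    printf("j = %u\n", j);
    return 0;
}
```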
*/ secp256k1_fe_add(&data->gej[0].x, &data->ge[1].y); secp256k1_fe_add(&data->gej[0].y, &data->fe[2]); secp256k1_fe_add(&data->gej[0].z, &data->ge[1].x); secp256k1_fe_normalize_var(&data->gej[0].x); secp256k1_fe_normalize_var(&data->gej[0].y); secp256k1_fe_normalize_var(&data->gej[0].z); @@ -364,6 +382,7 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, iters*10); + if (have_flag(argc, argv, "group") || have_flag(argc, argv, "jacobi")) run_benchmark("group_jacobi_var", bench_group_jacobi_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters); diff --git a/src/field.h b/src/field.h index 854aaebab..cf3bf10bb 100644 --- a/src/field.h +++ b/src/field.h @@ -103,6 +103,9 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a); * itself. */ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a); +/** Checks whether a field element is a quadratic residue. */ +static int secp256k1_fe_is_quad_var(const secp256k1_fe *a); + /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be * at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a); diff --git a/src/field_impl.h b/src/field_impl.h index 374284a1f..eb8b8e200 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -135,6 +135,11 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } +static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) { + secp256k1_fe r; + return secp256k1_fe_sqrt(&r, a); +} + static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); #endif /* SECP256K1_FIELD_IMPL_H */ diff --git a/src/group.h b/src/group.h index b9cd334da..2442e0cbe 100644 --- a/src/group.h +++ b/src/group.h @@ -42,6 +42,12 @@ typedef struct { /** Set a group element equal to the point with given X and Y coordinates */ static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const secp256k1_fe *y); +/** Set a group element (affine) equal to the point with the given X coordinate + * and a Y coordinate that is a quadratic residue modulo p. The return value + * is true iff a point with the given X coordinate exists. + */ +static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x); + /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness * for Y. Return value indicates whether the result is valid. */ static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd); @@ -89,6 +95,9 @@ static void secp256k1_gej_neg(secp256k1_gej *r, const secp256k1_gej *a); /** Check whether a group element is the point at infinity. */ static int secp256k1_gej_is_infinity(const secp256k1_gej *a); +/** Check whether a group element's y coordinate is a quadratic residue.
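secp256k1_fe_is_quad_var answers "is a a square mod p?" by simply attempting the square root and reporting success. secp256k1's field prime satisfies p = 3 (mod 4), so a candidate root is a^((p+1)/4), and squaring it back decides residuosity. A toy version of the same idea over p = 23 (also 3 mod 4; not the library's field code):

```
#include <stdio.h>
#include <stdint.h>

/* Modular exponentiation by repeated squaring. */
static uint64_t modpow(uint64_t b, uint64_t e, uint64_t m) {
    uint64_t r = 1;
    b %= m;
    while (e) {
        if (e & 1) r = (r * b) % m;
        b = (b * b) % m;
        e >>= 1;
    }
    return r;
}

/* Same idea as fe_is_quad_var: compute a candidate square root and
 * verify it by squaring.  For p = 3 (mod 4), a^((p+1)/4) is a root
 * of a whenever a is a quadratic residue. */
static int is_quad_var_toy(uint64_t a, uint64_t p) {
    uint64_t r = modpow(a, (p + 1) / 4, p);
    return (r * r) % p == a % p;
}

int main(void) {
    uint64_t p = 23; /* 23 = 3 (mod 4), like secp256k1's field prime */
    uint64_t a;
    for (a = 1; a < p; a++) {
        printf("%2u: %s\n", (unsigned)a,
               is_quad_var_toy(a, p) ? "QR" : "non-residue");
    }
    return 0;
}
```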
*/ +static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a); + /** Set r equal to the double of a. Constant time. */ static void secp256k1_gej_double(secp256k1_gej *r, const secp256k1_gej *a); diff --git a/src/group_impl.h b/src/group_impl.h index 47aea32be..aa7a0fbae 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -206,14 +206,18 @@ static void secp256k1_ge_clear(secp256k1_ge *r) { secp256k1_fe_clear(&r->y); } -static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { +static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x) { secp256k1_fe x2, x3; r->x = *x; secp256k1_fe_sqr(&x2, x); secp256k1_fe_mul(&x3, x, &x2); r->infinity = 0; secp256k1_fe_add(&x3, &secp256k1_fe_const_b); - if (!secp256k1_fe_sqrt(&r->y, &x3)) { + return secp256k1_fe_sqrt(&r->y, &x3); +} + +static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { + if (!secp256k1_ge_set_xquad(r, x)) { return 0; } secp256k1_fe_normalize_var(&r->y); @@ -650,6 +654,20 @@ static void secp256k1_ge_mul_lambda(secp256k1_ge *r, const secp256k1_ge *a) { secp256k1_fe_mul(&r->x, &r->x, &beta); } +static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a) { + secp256k1_fe yz; + + if (a->infinity) { + return 0; + } + + /* We rely on the fact that the Jacobi symbol of 1 / a->z^3 is the same as + * that of a->z. Thus a->y / a->z^3 is a quadratic residue iff a->y * a->z + * is. */ + secp256k1_fe_mul(&yz, &a->y, &a->z); + return secp256k1_fe_is_quad_var(&yz); +} + static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) { #ifdef EXHAUSTIVE_TEST_ORDER secp256k1_gej out; diff --git a/src/tests.c b/src/tests.c index ae781704e..0943d35b3 100644 --- a/src/tests.c +++ b/src/tests.c @@ -3510,35 +3510,64 @@ void run_ec_commit(void) { void test_group_decompress(const secp256k1_fe* x) { /* The input itself, normalized. */ secp256k1_fe fex = *x; - /* Results of set_xo_var(..., 0), set_xo_var(..., 1). */ - secp256k1_ge ge_even, ge_odd; + secp256k1_fe fez; + /* Results of set_xquad, set_xo_var(..., 0), set_xo_var(..., 1). */ + secp256k1_ge ge_quad, ge_even, ge_odd; + secp256k1_gej gej_quad; /* Return values of the above calls. */ - int res_even, res_odd; + int res_quad, res_even, res_odd; secp256k1_fe_normalize_var(&fex); + res_quad = secp256k1_ge_set_xquad(&ge_quad, &fex); res_even = secp256k1_ge_set_xo_var(&ge_even, &fex, 0); res_odd = secp256k1_ge_set_xo_var(&ge_odd, &fex, 1); - CHECK(res_even == res_odd); + CHECK(res_quad == res_even); + CHECK(res_quad == res_odd); - if (res_even) { + if (res_quad) { + secp256k1_fe_normalize_var(&ge_quad.x); secp256k1_fe_normalize_var(&ge_odd.x); secp256k1_fe_normalize_var(&ge_even.x); + secp256k1_fe_normalize_var(&ge_quad.y); secp256k1_fe_normalize_var(&ge_odd.y); secp256k1_fe_normalize_var(&ge_even.y); /* No infinity allowed. */ + CHECK(!ge_quad.infinity); CHECK(!ge_even.infinity); CHECK(!ge_odd.infinity); /* Check that the x coordinates check out. */ + CHECK(secp256k1_fe_equal_var(&ge_quad.x, x)); CHECK(secp256k1_fe_equal_var(&ge_even.x, x)); CHECK(secp256k1_fe_equal_var(&ge_odd.x, x)); + /* Check that the resulting Y coordinate in ge_quad is a square. */ + CHECK(secp256k1_fe_is_quad_var(&ge_quad.y)); + /* Check odd/even Y in ge_odd, ge_even. */ CHECK(secp256k1_fe_is_odd(&ge_odd.y)); CHECK(!secp256k1_fe_is_odd(&ge_even.y)); + + /* Check secp256k1_gej_has_quad_y_var.
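The identity in the comment above can be checked by brute force: the affine Y coordinate is y/z^3, the Legendre symbol chi is multiplicative, and chi(z)^-3 = chi(z) for nonzero z, so chi(y/z^3) = chi(y*z). A small exhaustive check over a toy prime using Euler's criterion:

```
#include <stdio.h>
#include <stdint.h>

static uint64_t modpow(uint64_t b, uint64_t e, uint64_t m) {
    uint64_t r = 1;
    b %= m;
    while (e) {
        if (e & 1) r = (r * b) % m;
        b = (b * b) % m;
        e >>= 1;
    }
    return r;
}

/* Euler's criterion: 1 if a is a QR mod p, p-1 otherwise (a != 0). */
static uint64_t chi(uint64_t a, uint64_t p) { return modpow(a, (p - 1) / 2, p); }

int main(void) {
    uint64_t p = 23, y, z;
    for (y = 1; y < p; y++) {
        for (z = 1; z < p; z++) {
            uint64_t zinv = modpow(z, p - 2, p);              /* z^-1 (Fermat) */
            uint64_t affine_y = (y * modpow(zinv, 3, p)) % p; /* y / z^3 */
            if (chi(affine_y, p) != chi((y * z) % p, p)) {
                printf("counterexample: y=%u z=%u\n", (unsigned)y, (unsigned)z);
                return 1;
            }
        }
    }
    printf("chi(y/z^3) == chi(y*z) for all nonzero y,z mod %u\n", (unsigned)p);
    return 0;
}
```

The same multiplicativity explains the rescale checks in the tests that follow: rescaling by u sends (y, z) to (y*u^3, z*u), so y*z picks up a factor u^4, a guaranteed square, while negating y flips the symbol.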
*/ + secp256k1_gej_set_ge(&gej_quad, &ge_quad); + CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); + do { + random_fe_test(&fez); + } while (secp256k1_fe_is_zero(&fez)); + secp256k1_gej_rescale(&gej_quad, &fez); + CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); + secp256k1_gej_neg(&gej_quad, &gej_quad); + CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); + do { + random_fe_test(&fez); + } while (secp256k1_fe_is_zero(&fez)); + secp256k1_gej_rescale(&gej_quad, &fez); + CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); + secp256k1_gej_neg(&gej_quad, &gej_quad); + CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); } } From b053e853d4f556499decb5c50af473f91996f46e Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Mon, 14 Jun 2021 20:16:38 +0000 Subject: [PATCH 57/59] ecdsa_adaptor: fix test case with invalid signature Previously the ECDSA signature had an overflowing s value, which, after the sync with upstream, results in a failing VERIFY_CHECK in the inversion function. However, normally parsed signatures shouldn't contain overflowing s values. --- src/modules/ecdsa_adaptor/main_impl.h | 11 +++++++++++ src/modules/ecdsa_adaptor/tests_impl.h | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/modules/ecdsa_adaptor/main_impl.h b/src/modules/ecdsa_adaptor/main_impl.h index 18e6132dd..ba0afedea 100644 --- a/src/modules/ecdsa_adaptor/main_impl.h +++ b/src/modules/ecdsa_adaptor/main_impl.h @@ -341,6 +341,17 @@ int secp256k1_ecdsa_adaptor_recover(const secp256k1_context* ctx, unsigned char * branch point. */ secp256k1_declassify(ctx, &enckey_expected_ge, sizeof(enckey_expected_ge)); if (!secp256k1_eckey_pubkey_serialize(&enckey_expected_ge, enckey_expected33, &size, SECP256K1_EC_COMPRESSED)) { + /* Unreachable from tests (and other VERIFY builds) and therefore this + * branch should be ignored in test coverage analysis. + * + * Proof: + * eckey_pubkey_serialize fails <=> deckey = 0 + * deckey = 0 <=> s^-1 = 0 or sp = 0 + * case 1: s^-1 = 0 impossible by the definition of multiplicative + * inverse and because the scalar_inverse implementation + * VERIFY_CHECKs that the inputs are valid scalars.
+ case 2: sp = 0 impossible because ecdsa_adaptor_sig_deserialize would have already failed + */ return 0; } if (!secp256k1_ec_pubkey_serialize(ctx, enckey33, &size, enckey, SECP256K1_EC_COMPRESSED)) { diff --git a/src/modules/ecdsa_adaptor/tests_impl.h b/src/modules/ecdsa_adaptor/tests_impl.h index 5a12bb745..a9d6b4f8e 100644 --- a/src/modules/ecdsa_adaptor/tests_impl.h +++ b/src/modules/ecdsa_adaptor/tests_impl.h @@ -1102,15 +1102,8 @@ void adaptor_tests(void) { } { /* Test key recover */ - secp256k1_ecdsa_signature sig_tmp; unsigned char decryption_key_tmp[32]; unsigned char adaptor_sig_tmp[162]; - const unsigned char order_le[32] = { - 0x41, 0x41, 0x36, 0xd0, 0x8c, 0x5e, 0xd2, 0xbf, - 0x3b, 0xa0, 0x48, 0xaf, 0xe6, 0xdc, 0xae, 0xba, - 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - }; CHECK(secp256k1_ecdsa_adaptor_recover(ctx, decryption_key_tmp, &sig, adaptor_sig, &enckey) == 1); CHECK(secp256k1_memcmp_var(deckey, decryption_key_tmp, sizeof(deckey)) == 0); @@ -1119,11 +1112,6 @@ memcpy(adaptor_sig_tmp, adaptor_sig, sizeof(adaptor_sig_tmp)); memset(&adaptor_sig_tmp[66], 0xFF, 32); CHECK(secp256k1_ecdsa_adaptor_recover(ctx, decryption_key_tmp, &sig, adaptor_sig_tmp, &enckey) == 0); - - /* Test failed enckey_expected serialization */ - memcpy(sig_tmp.data, sig.data, 32); - memcpy(&sig_tmp.data[32], order_le, 32); - CHECK(secp256k1_ecdsa_adaptor_recover(ctx, decryption_key_tmp, &sig_tmp, adaptor_sig, &enckey) == 0); } } From 7226cf215aaca80fcddcc5242c8ea11d2b35c85b Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Mon, 14 Jun 2021 20:57:40 +0000 Subject: [PATCH 58/59] ecdsa_adaptor: fix too small buffer in tests Also add a specific test for failing adaptor sig deserialization, because with the correctly sized buffer the existing bit-flip test no longer guarantees that deserialization fails.
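With adaptor_sig_tmp previously declared as 65 bytes, the sizeof-guarded memcpy in the diff below copied only 65 of the 162 signature bytes, and the verify call, which takes no length parameter, then read past the end of the stack buffer. A standalone sketch of the hazard, with a hypothetical consume() standing in for such a fixed-length consumer:

```
#include <stdio.h>
#include <string.h>

#define SIG_LEN 162

/* Hypothetical stand-in for a verifier with no length parameter:
 * the real API would unconditionally read SIG_LEN bytes from sig. */
static void consume(const unsigned char *sig, size_t have) {
    if (have < SIG_LEN) {
        printf("verifier would read %d bytes past the buffer\n",
               (int)(SIG_LEN - have));
    } else {
        printf("buffer large enough (%zu bytes)\n", have);
    }
    (void)sig;
}

int main(void) {
    unsigned char full[SIG_LEN] = {0};
    unsigned char tmp_bug[65];
    unsigned char tmp_fix[SIG_LEN];

    /* sizeof(dst) silently caps the copy at 65 bytes... */
    memcpy(tmp_bug, full, sizeof(tmp_bug));
    consume(tmp_bug, sizeof(tmp_bug)); /* out-of-bounds read in the real API */

    /* ...while the correctly sized buffer holds the whole signature. */
    memcpy(tmp_fix, full, sizeof(tmp_fix));
    consume(tmp_fix, sizeof(tmp_fix));
    return 0;
}
```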
--- src/modules/ecdsa_adaptor/tests_impl.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/modules/ecdsa_adaptor/tests_impl.h b/src/modules/ecdsa_adaptor/tests_impl.h index a9d6b4f8e..48fb33b73 100644 --- a/src/modules/ecdsa_adaptor/tests_impl.h +++ b/src/modules/ecdsa_adaptor/tests_impl.h @@ -1032,7 +1032,15 @@ void adaptor_tests(void) { CHECK(secp256k1_ecdsa_adaptor_verify(ctx, adaptor_sig, &enckey, msg, &enckey) == 0); CHECK(secp256k1_ecdsa_adaptor_verify(ctx, adaptor_sig, &pubkey, msg, &pubkey) == 0); { - unsigned char adaptor_sig_tmp[65]; + /* Test failed adaptor sig deserialization */ + unsigned char adaptor_sig_tmp[162]; + memset(&adaptor_sig_tmp, 0xFF, 162); + CHECK(secp256k1_ecdsa_adaptor_verify(ctx, adaptor_sig_tmp, &pubkey, msg, &enckey) == 0); + } + { + /* Test that any flipped bit in the adaptor signature will make + * verification fail */ + unsigned char adaptor_sig_tmp[162]; memcpy(adaptor_sig_tmp, adaptor_sig, sizeof(adaptor_sig_tmp)); rand_flip_bit(&adaptor_sig_tmp[1], sizeof(adaptor_sig_tmp) - 1); CHECK(secp256k1_ecdsa_adaptor_verify(ctx, adaptor_sig_tmp, &pubkey, msg, &enckey) == 0); From f09497ea3e07d7a730a6ff3479dca18b848ef729 Mon Sep 17 00:00:00 2001 From: Jonas Nick Date: Tue, 15 Jun 2021 11:42:58 +0000 Subject: [PATCH 59/59] CI: tweak cirrus.yml to prevent OOM and timeout with sanitizer/valgrind --- .cirrus.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index d157e994c..95805d948 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -288,10 +288,11 @@ task: # Sanitizers task: + timeout_in: 120m container: dockerfile: ci/linux-debian.Dockerfile cpu: 1 - memory: 1G + memory: 2G env: ECDH: yes RECOVERY: yes @@ -310,7 +311,7 @@ task: env: # The `--error-exitcode` is required to make the test fail if valgrind found errors, otherwise it'll return 0 (https://www.valgrind.org/docs/manual/manual-core.html) WRAPPER_CMD: "valgrind --error-exitcode=42" - TEST_ITERS: 16 + TEST_ITERS: 8 - name: "UBSan, ASan, LSan" env: CFLAGS: "-fsanitize=undefined,address"