From 4839f7bcd1bb8eb48347dd25a487659e92300f25 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Wed, 24 Apr 2024 21:21:13 -0400
Subject: [PATCH 01/24] feat(bw6): Fp6 as a direct extension using Montgomery-6

---
 .../emulated/fields_bw6761/{e3.go => e3}      |    0
 .../fields_bw6761/{e3_test.go => e3_test}     |    0
 std/algebra/emulated/fields_bw6761/e6         |  512 ++++++++
 std/algebra/emulated/fields_bw6761/e6.go      | 1043 ++++++++++++-----
 .../emulated/fields_bw6761/e6_pairing.go      |  176 ++-
 std/algebra/emulated/fields_bw6761/e6_test.go |   86 +-
 std/algebra/emulated/fields_bw6761/hints.go   |  111 +-
 std/algebra/emulated/sw_bw6761/pairing.go     |  108 +-
 8 files changed, 1441 insertions(+), 595 deletions(-)
 rename std/algebra/emulated/fields_bw6761/{e3.go => e3} (100%)
 rename std/algebra/emulated/fields_bw6761/{e3_test.go => e3_test} (100%)
 create mode 100644 std/algebra/emulated/fields_bw6761/e6

diff --git a/std/algebra/emulated/fields_bw6761/e3.go b/std/algebra/emulated/fields_bw6761/e3
similarity index 100%
rename from std/algebra/emulated/fields_bw6761/e3.go
rename to std/algebra/emulated/fields_bw6761/e3
diff --git a/std/algebra/emulated/fields_bw6761/e3_test.go b/std/algebra/emulated/fields_bw6761/e3_test
similarity index 100%
rename from std/algebra/emulated/fields_bw6761/e3_test.go
rename to std/algebra/emulated/fields_bw6761/e3_test
diff --git a/std/algebra/emulated/fields_bw6761/e6 b/std/algebra/emulated/fields_bw6761/e6
new file mode 100644
index 0000000000..be781adcb4
--- /dev/null
+++ b/std/algebra/emulated/fields_bw6761/e6
@@ -0,0 +1,512 @@
+package fields_bw6761
+
+import (
+	"math/big"
+
+	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
+	"github.com/consensys/gnark/frontend"
+	"github.com/consensys/gnark/std/math/emulated"
+)
+
+type E6 struct {
+	B0, B1 E3
+}
+
+type Ext6 struct {
+	*Ext3
+}
+
+func (e Ext6) Reduce(x *E6) *E6 {
+	var z E6
+	z.B0 = *e.Ext3.Reduce(&x.B0)
+	z.B1 = *e.Ext3.Reduce(&x.B1)
+	return &z
+}
+
+func NewExt6(api frontend.API) *Ext6 {
+	return &Ext6{Ext3: NewExt3(api)}
+}
+
+func (e Ext6) Zero() *E6 {
+	b0 := e.Ext3.Zero()
+	b1 := e.Ext3.Zero()
+	return &E6{
+		B0: *b0,
+		B1: *b1,
+	}
+}
+
+func (e Ext6) One() *E6 {
+	return &E6{
+		B0: *e.Ext3.One(),
+		B1: *e.Ext3.Zero(),
+	}
+}
+
+func (e Ext6) Add(x, y *E6) *E6 {
+	return &E6{
+		B0: *e.Ext3.Add(&x.B0, &y.B0),
+		B1: *e.Ext3.Add(&x.B1, &y.B1),
+	}
+}
+
+func (e Ext6) Sub(x, y *E6) *E6 {
+	return &E6{
+		B0: *e.Ext3.Sub(&x.B0, &y.B0),
+		B1: *e.Ext3.Sub(&x.B1, &y.B1),
+	}
+}
+
+func (e Ext6) Double(x *E6) *E6 {
+	return &E6{
+		B0: *e.Ext3.Double(&x.B0),
+		B1: *e.Ext3.Double(&x.B1),
+	}
+}
+
+func (e Ext6) Mul(x, y *E6) *E6 {
+	x = e.Reduce(x)
+	y = e.Reduce(y)
+
+	a := e.Ext3.Add(&x.B0, &x.B1)
+	b := e.Ext3.Add(&y.B0, &y.B1)
+	a = e.Ext3.Mul(a, b)
+	b = e.Ext3.Mul(&x.B0, &y.B0)
+	c := e.Ext3.Mul(&x.B1, &y.B1)
+	b1 := e.Ext3.Sub(a, b)
+	b1 = e.Ext3.Sub(b1, c)
+	b0 := e.Ext3.MulByNonResidue(c)
+	b0 = e.Ext3.Add(b0, b)
+
+	return &E6{
+		B0: *b0,
+		B1: *b1,
+	}
+}
+
+func (e Ext6) Square(x *E6) *E6 {
+
+	x = e.Reduce(x)
+	//Algorithm 22 from https://eprint.iacr.org/2010/354.pdf
+	c0 := e.Ext3.Sub(&x.B0, &x.B1)
+	c3 := e.Ext3.MulByNonResidue(&x.B1)
+	c3 = e.Ext3.Neg(c3)
+	c3 = e.Ext3.Add(&x.B0, c3)
+	c2 := e.Ext3.Mul(&x.B0, &x.B1)
+	c0 = e.Ext3.Mul(c0, c3)
+	c0 = e.Ext3.Add(c0, c2)
+	b1 := e.Ext3.Double(c2)
+	c2 = e.Ext3.MulByNonResidue(c2)
+	b0 := e.Ext3.Add(c0, c2)
+
+	return &E6{
+		B0: *b0,
+		B1: *b1,
+	}
+}
+
+// Karabina's compressed cyclotomic square SQR12345
+// https://eprint.iacr.org/2010/542.pdf
+// Sec. 5.6 with minor modifications to fit our tower
+func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
+	x = e.Reduce(x)
+
+	// h4 = -g4 + 3((g3+g5)(g1+c*g2)-g1g5-c*g3g2)
+	g1g5 := e.fp.Mul(&x.B0.A1, &x.B1.A2)
+	g3g2 := e.fp.Mul(&x.B1.A0, &x.B0.A2)
+	h4 := mulFpByNonResidue(e.fp, &x.B0.A2)
+	h4 = e.fp.Add(h4, &x.B0.A1)
+	t := e.fp.Add(&x.B1.A0, &x.B1.A2)
+	h4 = e.fp.Mul(h4, t)
+	h4 = e.fp.Sub(h4, g1g5)
+	t = e.fp.MulConst(g3g2, big.NewInt(4))
+	h4 = e.fp.Add(h4, t)
+	h4 = e.fp.MulConst(h4, big.NewInt(3))
+	h4 = e.fp.Sub(h4, &x.B1.A1)
+
+	// h3 = 2(g3+3c*g1g5)
+	h3 := mulFpByNonResidue(e.fp, g1g5)
+	h3 = e.fp.MulConst(h3, big.NewInt(3))
+	h3 = e.fp.Add(h3, &x.B1.A0)
+	h3 = e.fp.MulConst(h3, big.NewInt(2))
+
+	// h2 = 3((g1+g5)(g1+c*g5)-(c+1)*g1g5)-2g2
+	t = mulFpByNonResidue(e.fp, &x.B1.A2)
+	t = e.fp.Add(t, &x.B0.A1)
+	h2 := e.fp.Add(&x.B1.A2, &x.B0.A1)
+	h2 = e.fp.Mul(h2, t)
+	t = e.fp.MulConst(g1g5, big.NewInt(3))
+	h2 = e.fp.Add(h2, t)
+	h2 = e.fp.MulConst(h2, big.NewInt(3))
+	t = e.fp.MulConst(&x.B0.A2, big.NewInt(2))
+	h2 = e.fp.Sub(h2, t)
+
+	// h1 = 3((g3+g2)(g3+c*g2)-(c+1)*g3g2)-2g1
+	t = mulFpByNonResidue(e.fp, &x.B0.A2)
+	t = e.fp.Add(t, &x.B1.A0)
+	h1 := e.fp.Add(&x.B0.A2, &x.B1.A0)
+	h1 = e.fp.Mul(h1, t)
+	t = e.fp.MulConst(g3g2, big.NewInt(3))
+	h1 = e.fp.Add(h1, t)
+	h1 = e.fp.MulConst(h1, big.NewInt(3))
+	t = e.fp.MulConst(&x.B0.A1, big.NewInt(2))
+	h1 = e.fp.Sub(h1, t)
+
+	// h5 = 2(g5+3g3g2)
+	h5 := e.fp.MulConst(g3g2, big.NewInt(3))
+	h5 = e.fp.Add(h5, &x.B1.A2)
+	h5 = e.fp.MulConst(h5, big.NewInt(2))
+
+	return &E6{
+		B0: E3{
+			A0: x.B0.A0,
+			A1: *h1,
+			A2: *h2,
+		},
+		B1: E3{
+			A0: *h3,
+			A1: *h4,
+			A2: *h5,
+		},
+	}
+}
+
+// DecompressKarabina12345 decompresses Karabina's cyclotomic square result SQR12345
+func (e Ext6) DecompressKarabina12345(x *E6) *E6 {
+	x = e.Reduce(x)
+
+	// h0 = (2g4^2 + g3g5 - 3g2g1)*c + 1
+	t0 := e.fp.Mul(&x.B0.A1, &x.B0.A2)
+	t0 = e.fp.MulConst(t0, big.NewInt(3))
+	t1 := e.fp.Mul(&x.B1.A0, &x.B1.A2)
+	h0 := e.fp.Mul(&x.B1.A1, &x.B1.A1)
+	h0 = e.fp.MulConst(h0, big.NewInt(2))
+	h0 = e.fp.Add(h0, t1)
+	h0 = e.fp.Sub(t0, h0)
+	h0 = e.fp.MulConst(h0, big.NewInt(4))
+	h0 = e.fp.Add(h0, e.fp.One())
+
+	return &E6{
+		B0: E3{
+			A0: *h0,
+			A1: x.B0.A1,
+			A2: x.B0.A2,
+		},
+		B1: x.B1,
+	}
+}
+
+// Karabina's compressed cyclotomic square SQR2345
+// https://eprint.iacr.org/2010/542.pdf
+// Th. 3.2 with minor modifications to fit our tower
+func (e Ext6) CyclotomicSquareKarabina2345(x *E6) *E6 {
+	x = e.Reduce(x)
+	z := e.Copy(x)
+
+	var t [7]*baseEl
+
+	// t0 = g1²
+	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
+	// t1 = g5²
+	t[1] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
+	// t5 = g1 + g5
+	t[5] = e.fp.Add(&x.B0.A1, &x.B1.A2)
+	// t2 = (g1 + g5)²
+	t[2] = e.fp.Mul(t[5], t[5])
+
+	// t3 = g1² + g5²
+	t[3] = e.fp.Add(t[0], t[1])
+	// t5 = g1² + g5² - (g1 + g5)² = -2 * g1 * g5 (note the sign)
+	t[5] = e.fp.Sub(t[3], t[2])
+
+	// t6 = g3 + g2
+	t[6] = e.fp.Add(&x.B1.A0, &x.B0.A2)
+	// t3 = (g3 + g2)²
+	t[3] = e.fp.Mul(t[6], t[6])
+	// t2 = g3²
+	t[2] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
+
+	// t6 = 4 * t5 = -8 * g1 * g5 = 2 * nr * g1 * g5 (with nr = -4)
+	t[6] = e.fp.MulConst(t[5], big.NewInt(4))
+	// t5 = 4 * nr * g1 * g5 + 2 * g3
+	t[5] = e.fp.Add(t[6], &x.B1.A0)
+	t[5] = e.fp.MulConst(t[5], big.NewInt(2))
+	// z3 = 6 * nr * g1 * g5 + 2 * g3
+	z.B1.A0 = *e.fp.Add(t[5], t[6])
+
+	// t4 = nr * g5²
+	t[4] = mulFpByNonResidue(e.fp, t[1])
+	// t5 = nr * g5² + g1²
+	t[5] = e.fp.Add(t[0], t[4])
+	// t6 = nr * g5² + g1² - g2
+	t[6] = e.fp.Sub(t[5], &x.B0.A2)
+
+	// t1 = g2²
+	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
+
+	// t6 = 2 * nr * g5² + 2 * g1² - 2*g2
+	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
+	// z2 = 3 * nr * g5² + 3 * g1² - 2*g2
+	z.B0.A2 = *e.fp.Add(t[6], t[5])
+
+	// t4 = nr * g2²
+	t[4] = mulFpByNonResidue(e.fp, t[1])
+	// t5 = g3² + nr * g2²
+	t[5] = e.fp.Add(t[2], t[4])
+	// t6 = g3² + nr * g2² - g1
+	t[6] = e.fp.Sub(t[5], &x.B0.A1)
+	// t6 = 2 * g3² + 2 * nr * g2² - 2 * g1
+	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
+	// z1 = 3 * g3² + 3 * nr * g2² - 2 * g1
+	z.B0.A1 = *e.fp.Add(t[6], t[5])
+
+	// t0 = g2² + g3²
+	t[0] = e.fp.Add(t[2], t[1])
+	// t5 = 2 * g3 * g2
+	t[5] = e.fp.Sub(t[3], t[0])
+	// t6 = 2 * g3 * g2 + g5
+	t[6] = e.fp.Add(t[5], &x.B1.A2)
+	// t6 = 4 * g3 * g2 + 2 * g5
+	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
+	// z5 = 6 * g3 * g2 + 2 * g5
+	z.B1.A2 = *e.fp.Add(t[5], t[6])
+
+	return z
+}
+
+// DecompressKarabina2345 decompresses Karabina's cyclotomic square result SQR2345
+// if g3 != 0
+//
+//	g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3
+//
+// if g3 == 0
+//
+//	g4 = 2g1g5/g2
+//
+// if g3=g2=0 then g4=g5=g1=0 and g0=1 (x=1)
+// Theorem 3.1 is well-defined for all x in Gϕₙ\{1}
+func (e Ext6) DecompressKarabina2345(x *E6) *E6 {
+	// Coefficient naming: g0=B0.A0, g1=B0.A1, g2=B0.A2, g3=B1.A0, g4=B1.A1, g5=B1.A2.
+	x = e.Reduce(x)
+
+	var z E6
+	var t [3]*baseEl
+	var _t [2]*baseEl
+	one := e.fp.One()
+
+	// if g3 == 0
+	// t0 = 2 * g1 * g5
+	// t1 = g2
+	selector1 := e.fp.IsZero(&x.B1.A0)
+	_t[0] = e.fp.Mul(&x.B0.A1, &x.B1.A2)
+	_t[0] = e.fp.MulConst(_t[0], big.NewInt(2))
+	_t[1] = &x.B0.A2
+
+	// selector2: g2 == 0; isOne: g2 == g3 == 0, i.e. the compressed form of x = 1
+	selector2 := e.fp.IsZero(_t[1])
+	isOne := e.api.And(selector1, selector2)
+
+	// if g3 != 0
+	// t0 = E * g5^2 + 3 * g1^2 - 2 * g2
+	// t1 = 4 * g3
+	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
+	t[1] = e.fp.Sub(t[0], &x.B0.A2)
+	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
+	t[1] = e.fp.Add(t[1], t[0])
+	t[2] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
+	t[0] = mulFpByNonResidue(e.fp, t[2])
+	t[0] = e.fp.Add(t[0], t[1])
+	t[1] = e.fp.Add(&x.B1.A0, &x.B1.A0)
+	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
+
+	// g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3 or (2 * g1 * g5)/g2
+	t[0] = e.fp.Select(selector1, _t[0], t[0])
+	t[1] = e.fp.Select(selector1, _t[1], t[1])
+	// only when g2 == g3 == 0: divide by 1 (dummy g4, result is replaced by 1 below)
+	t[1] = e.fp.Select(isOne, one, t[1])
+
+	z.B1.A1 = *e.fp.Div(t[0], t[1])
+
+	// Rest of the computation for all cases
+	// t1 = g2 * g1
+	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A1)
+	// t2 = 2 * g4² - 3 * g2 * g1
+	t[2] = e.fp.Mul(&z.B1.A1, &z.B1.A1)
+	t[2] = e.fp.Sub(t[2], t[1])
+	t[2] = e.fp.MulConst(t[2], big.NewInt(2))
+	t[2] = e.fp.Sub(t[2], t[1])
+	// t1 = g3 * g5 (g3 can be 0)
+	t[1] = e.fp.Mul(&x.B1.A0, &x.B1.A2)
+	// g0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1
+	t[2] = e.fp.Add(t[2], t[1])
+
+	z.B0.A0 = *mulFpByNonResidue(e.fp, t[2])
+	z.B0.A0 = *e.fp.Add(&z.B0.A0, one)
+
+	z.B0.A1 = x.B0.A1
+	z.B0.A2 = x.B0.A2
+	z.B1.A0 = x.B1.A0
+	z.B1.A2 = x.B1.A2
+
+	return e.Select(isOne, e.One(), &z)
+}
+
+// Granger-Scott's cyclotomic square
+// https://eprint.iacr.org/2009/565.pdf, 3.2
+func (e Ext6) CyclotomicSquare(x *E6) *E6 {
+	// x=(x0,x1,x2,x3,x4,x5) in Fp⁶, stored as (B0.A0,B0.A1,B0.A2,B1.A0,B1.A1,B1.A2)
+	// cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0,
+	//					3*x2²*u + 3*x3² - 2*x1,
+	//					3*x5²*u + 3*x1² - 2*x2,
+	//					6*x1*x5*u + 2*x3,
+	//					6*x0*x4 + 2*x4,
+	//					6*x2*x3 + 2*x5)
+
+	x = e.Reduce(x)
+
+	var t [9]*baseEl
+
+	t[0] = e.fp.Mul(&x.B1.A1, &x.B1.A1)
+	t[1] = e.fp.Mul(&x.B0.A0, &x.B0.A0)
+	t[6] = e.fp.Add(&x.B1.A1, &x.B0.A0)
+	t[6] = e.fp.Mul(t[6], t[6])
+	t[6] = e.fp.Sub(t[6], t[0])
+	t[6] = e.fp.Sub(t[6], t[1]) // 2*x4*x0
+	t[2] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
+	t[3] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
+	t[7] = e.fp.Add(&x.B0.A2, &x.B1.A0)
+	t[7] = e.fp.Mul(t[7], t[7])
+	t[7] = e.fp.Sub(t[7], t[2])
+	t[7] = e.fp.Sub(t[7], t[3]) // 2*x2*x3
+	t[4] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
+	t[5] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
+	t[8] = e.fp.Add(&x.B1.A2, &x.B0.A1)
+	t[8] = e.fp.Mul(t[8], t[8])
+	t[8] = e.fp.Sub(t[8], t[4])
+	t[8] = e.fp.Sub(t[5], t[8]) // operands swapped on purpose: -2*x5*x1
+	t[8] = e.fp.MulConst(t[8], big.NewInt(4)) // 2*x5*x1*u
+
+	t[0] = mulFpByNonResidue(e.fp, t[0])
+	t[0] = e.fp.Add(t[0], t[1]) // x4²*u + x0²
+	t[2] = mulFpByNonResidue(e.fp, t[2])
+	t[2] = e.fp.Add(t[2], t[3]) // x2²*u + x3²
+	t[4] = mulFpByNonResidue(e.fp, t[4])
+	t[4] = e.fp.Add(t[4], t[5]) // x5²*u + x1²
+
+	var z E6
+	z.B0.A0 = *e.fp.Sub(t[0], &x.B0.A0)
+	z.B0.A0 = *e.fp.MulConst(&z.B0.A0, big.NewInt(2))
+	z.B0.A0 = *e.fp.Add(&z.B0.A0, t[0])
+	z.B0.A1 = *e.fp.Sub(t[2], &x.B0.A1)
+	z.B0.A1 = *e.fp.MulConst(&z.B0.A1, big.NewInt(2))
+	z.B0.A1 = *e.fp.Add(&z.B0.A1, t[2])
+	z.B0.A2 = *e.fp.Sub(t[4], &x.B0.A2)
+	z.B0.A2 = *e.fp.MulConst(&z.B0.A2, big.NewInt(2))
+	z.B0.A2 = *e.fp.Add(&z.B0.A2, t[4])
+
+	z.B1.A0 = *e.fp.Add(t[8], &x.B1.A0)
+	z.B1.A0 = *e.fp.MulConst(&z.B1.A0, big.NewInt(2))
+	z.B1.A0 = *e.fp.Add(&z.B1.A0, t[8])
+	z.B1.A1 = *e.fp.Add(t[6], &x.B1.A1)
+	z.B1.A1 = *e.fp.MulConst(&z.B1.A1, big.NewInt(2))
+	z.B1.A1 = *e.fp.Add(&z.B1.A1, t[6])
+	z.B1.A2 = *e.fp.Add(t[7], &x.B1.A2)
+	z.B1.A2 = *e.fp.Add(&z.B1.A2, &z.B1.A2)
+	z.B1.A2 = *e.fp.Add(&z.B1.A2, t[7])
+
+	return &z
+}
+
+func (e Ext6) Inverse(x *E6) *E6 {
+	res, err := e.fp.NewHint(inverseE6Hint, 6, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2)
+	if err != nil {
+		// err is non-nil only for invalid number of inputs
+		panic(err)
+	}
+
+	inv := E6{
+		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
+		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
+	}
+	one := e.One()
+
+	// 1 == inv * x
+	_one := e.Mul(&inv, x)
+	e.AssertIsEqual(one, _one)
+
+	return &inv
+
+}
+
+func (e Ext6) DivUnchecked(x, y *E6) *E6 {
+	res, err := e.fp.NewHint(divE6Hint, 12, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2, &y.B0.A0, &y.B0.A1, &y.B0.A2, &y.B1.A0, &y.B1.A1, &y.B1.A2)
+	if err != nil {
+		// err is non-nil only for invalid number of inputs
+		panic(err)
+	}
+
+	div := E6{
+		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
+		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
+	}
+
+	// x = div * y
+	_x := e.Mul(&div, y)
+	e.AssertIsEqual(x, _x)
+
+	return &div
+
+}
+
+func (e Ext6) Conjugate(x *E6) *E6 {
+	return &E6{
+		B0: x.B0,
+		B1: *e.Ext3.Neg(&x.B1),
+	}
+}
+
+func (e Ext6) AssertIsEqual(a, b *E6) {
+	e.Ext3.AssertIsEqual(&a.B0, &b.B0)
+	e.Ext3.AssertIsEqual(&a.B1, &b.B1)
+}
+
+func (e Ext6) Copy(x *E6) *E6 {
+	b0 := e.Ext3.Copy(&x.B0)
+	b1 := e.Ext3.Copy(&x.B1)
+	return &E6{
+		B0: *b0,
+		B1: *b1,
+	}
+}
+
+func FromE6(a *bw6761.E6) E6 {
+	return E6{
+		B0: FromE3(&a.B0),
+		B1: FromE3(&a.B1),
+	}
+}
+
+// Frobenius set z in E6 to Frobenius(x), return z
+func (e Ext6) Frobenius(x *E6) *E6 {
+	_frobA := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775648")
+	_frobB := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292650")
+	_frobC := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775649")
+	_frobAC := emulated.ValueOf[emulated.BW6761Fp]("-1")
+	_frobBC := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292651")
+	var z E6
+	z.B0.A0 = x.B0.A0
+	z.B0.A1 = *e.fp.Mul(&x.B0.A1, &_frobA)
+	z.B0.A2 = *e.fp.Mul(&x.B0.A2, &_frobB)
+
+	z.B1.A0 = *e.fp.Mul(&x.B1.A0, &_frobC)
+	z.B1.A1 = *e.fp.Mul(&x.B1.A1, &_frobAC)
+	z.B1.A2 = *e.fp.Mul(&x.B1.A2, &_frobBC)
+
+	return &z
+}
+
+func (e Ext6) Select(selector frontend.Variable, z1, z0 *E6) *E6 {
+	b0 := e.Ext3.Select(selector, &z1.B0, &z0.B0)
+	b1 := e.Ext3.Select(selector, &z1.B1, &z0.B1)
+	return &E6{B0: *b0, B1: *b1}
+}
diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 2a78781e09..5d73f5ba6b 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -8,172 +8,742 @@ import (
 	"github.com/consensys/gnark/std/math/emulated"
 )
 
+type curveF = emulated.Field[emulated.BW6761Fp]
+type baseEl = emulated.Element[emulated.BW6761Fp]
+
 type E6 struct {
-	B0, B1 E3
+	A0, A1, A2, A3, A4, A5 baseEl
 }
 
 type Ext6 struct {
-	*Ext3
+	api frontend.API
+	fp  *curveF
+}
+
+func NewExt6(api frontend.API) *Ext6 {
+	fp, err := emulated.NewField[emulated.BW6761Fp](api)
+	if err != nil {
+		panic(err)
+	}
+	return &Ext6{
+		api: api,
+		fp:  fp,
+	}
 }
 
 func (e Ext6) Reduce(x *E6) *E6 {
 	var z E6
-	z.B0 = *e.Ext3.Reduce(&x.B0)
-	z.B1 = *e.Ext3.Reduce(&x.B1)
+	z.A0 = *e.fp.Reduce(&x.A0)
+	z.A1 = *e.fp.Reduce(&x.A1)
+	z.A2 = *e.fp.Reduce(&x.A2)
+	z.A3 = *e.fp.Reduce(&x.A3)
+	z.A4 = *e.fp.Reduce(&x.A4)
+	z.A5 = *e.fp.Reduce(&x.A5)
 	return &z
 }
 
-func NewExt6(api frontend.API) *Ext6 {
-	return &Ext6{Ext3: NewExt3(api)}
-}
-
 func (e Ext6) Zero() *E6 {
-	b0 := e.Ext3.Zero()
-	b1 := e.Ext3.Zero()
+	zero := e.fp.Zero()
 	return &E6{
-		B0: *b0,
-		B1: *b1,
+		A0: *zero,
+		A1: *zero,
+		A2: *zero,
+		A3: *zero,
+		A4: *zero,
+		A5: *zero,
 	}
 }
 
 func (e Ext6) One() *E6 {
+	one := e.fp.One()
+	zero := e.fp.Zero()
 	return &E6{
-		B0: *e.Ext3.One(),
-		B1: *e.Ext3.Zero(),
+		A0: *one,
+		A1: *zero,
+		A2: *zero,
+		A3: *zero,
+		A4: *zero,
+		A5: *zero,
+	}
+}
+
+func (e Ext6) Neg(x *E6) *E6 {
+	a0 := e.fp.Neg(&x.A0)
+	a1 := e.fp.Neg(&x.A1)
+	a2 := e.fp.Neg(&x.A2)
+	a3 := e.fp.Neg(&x.A3)
+	a4 := e.fp.Neg(&x.A4)
+	a5 := e.fp.Neg(&x.A5)
+	return &E6{
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
 	}
 }
 
 func (e Ext6) Add(x, y *E6) *E6 {
+	a0 := e.fp.Add(&x.A0, &y.A0)
+	a1 := e.fp.Add(&x.A1, &y.A1)
+	a2 := e.fp.Add(&x.A2, &y.A2)
+	a3 := e.fp.Add(&x.A3, &y.A3)
+	a4 := e.fp.Add(&x.A4, &y.A4)
+	a5 := e.fp.Add(&x.A5, &y.A5)
 	return &E6{
-		B0: *e.Ext3.Add(&x.B0, &y.B0),
-		B1: *e.Ext3.Add(&x.B1, &y.B1),
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
 	}
 }
 
 func (e Ext6) Sub(x, y *E6) *E6 {
+	a0 := e.fp.Sub(&x.A0, &y.A0)
+	a1 := e.fp.Sub(&x.A1, &y.A1)
+	a2 := e.fp.Sub(&x.A2, &y.A2)
+	a3 := e.fp.Sub(&x.A3, &y.A3)
+	a4 := e.fp.Sub(&x.A4, &y.A4)
+	a5 := e.fp.Sub(&x.A5, &y.A5)
 	return &E6{
-		B0: *e.Ext3.Sub(&x.B0, &y.B0),
-		B1: *e.Ext3.Sub(&x.B1, &y.B1),
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
 	}
 }
 
 func (e Ext6) Double(x *E6) *E6 {
+	two := big.NewInt(2)
+	a0 := e.fp.MulConst(&x.A0, two)
+	a1 := e.fp.MulConst(&x.A1, two)
+	a2 := e.fp.MulConst(&x.A2, two)
+	a3 := e.fp.MulConst(&x.A3, two)
+	a4 := e.fp.MulConst(&x.A4, two)
+	a5 := e.fp.MulConst(&x.A5, two)
+	return &E6{
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
+	}
+}
+
+func (e Ext6) MulByElement(x *E6, y *baseEl) *E6 {
+	a0 := e.fp.Mul(&x.A0, y)
+	a1 := e.fp.Mul(&x.A1, y)
+	a2 := e.fp.Mul(&x.A2, y)
+	a3 := e.fp.Mul(&x.A3, y)
+	a4 := e.fp.Mul(&x.A4, y)
+	a5 := e.fp.Mul(&x.A5, y)
+	z := &E6{
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
+	}
+	return z
+}
+
+func (e Ext6) MulByConstElement(x *E6, y *big.Int) *E6 {
+	a0 := e.fp.MulConst(&x.A0, y)
+	a1 := e.fp.MulConst(&x.A1, y)
+	a2 := e.fp.MulConst(&x.A2, y)
+	a3 := e.fp.MulConst(&x.A3, y)
+	a4 := e.fp.MulConst(&x.A4, y)
+	a5 := e.fp.MulConst(&x.A5, y)
+	return &E6{
+		A0: *a0,
+		A1: *a1,
+		A2: *a2,
+		A3: *a3,
+		A4: *a4,
+		A5: *a5,
+	}
+}
+
+func (e Ext6) Conjugate(x *E6) *E6 {
 	return &E6{
-		B0: *e.Ext3.Double(&x.B0),
-		B1: *e.Ext3.Double(&x.B1),
+		A0: x.A0,
+		A1: *e.fp.Neg(&x.A1),
+		A2: x.A2,
+		A3: *e.fp.Neg(&x.A3),
+		A4: x.A4,
+		A5: *e.fp.Neg(&x.A5),
 	}
 }
 
+func mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
+
+	z := fp.Neg(x)
+	z = fp.MulConst(z, big.NewInt(4))
+	return z
+}
+
 func (e Ext6) Mul(x, y *E6) *E6 {
 	x = e.Reduce(x)
 	y = e.Reduce(y)
+	v := e.interpolationX6Mul(x, y)
+	return e.mulMontgomery6(v)
+	// return e.mulToomCook6(v)
+}
 
-	v0 := e.Ext3.Mul(&x.B0, &y.B0)
-	v1 := e.Ext3.Mul(&x.B1, &y.B1)
+func (e Ext6) interpolationX6Mul(x, y *E6) [18]*baseEl {
+	// We first compute the 17 base-field products vi of matching sums of the
+	// coefficients of x and y that mulMontgomery6 combines (v1 is not used):
+	//
+	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
+	//		v2 = (a0 + a1 + a3 + a4)(b0 + b1 + b3 + b4)
+	//		v3 = (a0 − a2 − a3 + a5)(b0 − b2 − b3 + b5)
+	//		v4 = (a0 − a2 − a5)(b0 − b2 − b5)
+	//		v5 = (a0 + a3 − a5)(b0 + b3 − b5)
+	//		v6 = (a0 + a1 + a2)(b0 + b1 + b2)
+	//		v7 = (a3 + a4 + a5)(b3 + b4 + b5)
+	//		v8 = (a2 + a3)(b2 + b3)
+	//		v9 = (a1 − a4)(b1 − b4)
+	//		v10 = (a1 + a2)(b1 + b2)
+	//		v11 = (a3 + a4)(b3 + b4)
+	//		v12 = (a0 + a1)(b0 + b1)
+	//		v13 = (a4 + a5)(b4 + b5)
+	//		v14 = a0b0
+	//		v15 = a1b1
+	//		v16 = a4b4
+	//		v17 = a5b5
+	_t0 := e.fp.Add(&x.A0, &x.A1)
+	t0 := e.fp.Add(_t0, &x.A2)
+	t1 := e.fp.Add(&x.A3, &x.A4)
+	t2 := e.fp.Add(_t0, t1)
+	t3 := e.fp.Add(t2, &x.A5)
+	t3 = e.fp.Add(t3, &x.A2)
+
+	_s0 := e.fp.Add(&y.A0, &y.A1)
+	s0 := e.fp.Add(_s0, &y.A2)
+	s1 := e.fp.Add(&y.A3, &y.A4)
+	s2 := e.fp.Add(_s0, s1)
+	s3 := e.fp.Add(s2, &y.A5)
+	s3 = e.fp.Add(s3, &y.A2)
+
+	v0 := e.fp.Mul(t3, s3)
+	v2 := e.fp.Mul(t2, s2)
+	v6 := e.fp.Mul(t0, s0)
+	t4 := e.fp.Add(t1, &x.A5)
+	s4 := e.fp.Add(s1, &y.A5)
+	v7 := e.fp.Mul(t4, s4)
+	v12 := e.fp.Mul(_t0, _s0)
+	v11 := e.fp.Mul(t1, s1)
+	t0 = e.fp.Add(&x.A2, &x.A3)
+	s0 = e.fp.Add(&y.A2, &y.A3)
+	v8 := e.fp.Mul(t0, s0)
+	_t0 = e.fp.Sub(&x.A1, &x.A4)
+	_s0 = e.fp.Sub(&y.A1, &y.A4)
+	v9 := e.fp.Mul(_t0, _s0)
+	t1 = e.fp.Add(&x.A1, &x.A2)
+	s1 = e.fp.Add(&y.A1, &y.A2)
+	v10 := e.fp.Mul(t1, s1)
+	t1 = e.fp.Add(&x.A4, &x.A5)
+	s1 = e.fp.Add(&y.A4, &y.A5)
+	v13 := e.fp.Mul(t1, s1)
+	v3 := e.fp.Add(&x.A0, &x.A5)
+	v3 = e.fp.Sub(v3, t0)
+	s1 = e.fp.Add(&y.A0, &y.A5)
+	s1 = e.fp.Sub(s1, s0)
+	v3 = e.fp.Mul(v3, s1)
+	t1 = e.fp.Add(&x.A2, &x.A5)
+	t2 = e.fp.Sub(&x.A0, t1)
+	s1 = e.fp.Add(&y.A2, &y.A5)
+	s2 = e.fp.Sub(&y.A0, s1)
+	v4 := e.fp.Mul(t2, s2)
+	t1 = e.fp.Add(&x.A0, &x.A3)
+	t1 = e.fp.Sub(t1, &x.A5)
+	s1 = e.fp.Add(&y.A0, &y.A3)
+	s1 = e.fp.Sub(s1, &y.A5)
+	v5 := e.fp.Mul(t1, s1)
+	v14 := e.fp.Mul(&x.A0, &y.A0)
+	v15 := e.fp.Mul(&x.A1, &y.A1)
+	v16 := e.fp.Mul(&x.A4, &y.A4)
+	v17 := e.fp.Mul(&x.A5, &y.A5)
+	v1 := e.fp.Zero() // placeholder so that the array index matches the vi numbering
+
+	return [18]*baseEl{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17}
+}
+
+func (e Ext6) interpolationX6Sq(x *E6) [18]*baseEl {
+	// We first compute the 17 base-field squares vi of sums of the
+	// coefficients of x that mulMontgomery6 combines (v1 is not used):
+	//
+	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)^2
+	//		v2 = (a0 + a1 + a3 + a4)^2
+	//		v3 = (a0 − a2 − a3 + a5)^2
+	//		v4 = (a0 − a2 − a5)^2
+	//		v5 = (a0 + a3 − a5)^2
+	//		v6 = (a0 + a1 + a2)^2
+	//		v7 = (a3 + a4 + a5)^2
+	//		v8 = (a2 + a3)^2
+	//		v9 = (a1 − a4)^2
+	//		v10 = (a1 + a2)^2
+	//		v11 = (a3 + a4)^2
+	//		v12 = (a0 + a1)^2
+	//		v13 = (a4 + a5)^2
+	//		v14 = a0^2
+	//		v15 = a1^2
+	//		v16 = a4^2
+	//		v17 = a5^2
+
+	_t0 := e.fp.Add(&x.A0, &x.A1)
+	t0 := e.fp.Add(_t0, &x.A2)
+	t1 := e.fp.Add(&x.A3, &x.A4)
+	t2 := e.fp.Add(_t0, t1)
+	t3 := e.fp.Add(t2, &x.A5)
+	t3 = e.fp.Add(t3, &x.A2)
+
+	v0 := e.fp.Mul(t3, t3)
+	v2 := e.fp.Mul(t2, t2)
+	v6 := e.fp.Mul(t0, t0)
+	t4 := e.fp.Add(t1, &x.A5)
+	v7 := e.fp.Mul(t4, t4)
+	v12 := e.fp.Mul(_t0, _t0)
+	v11 := e.fp.Mul(t1, t1)
+	t0 = e.fp.Add(&x.A2, &x.A3)
+	v8 := e.fp.Mul(t0, t0)
+	_t0 = e.fp.Sub(&x.A1, &x.A4)
+	v9 := e.fp.Mul(_t0, _t0)
+	t1 = e.fp.Add(&x.A1, &x.A2)
+	v10 := e.fp.Mul(t1, t1)
+	t1 = e.fp.Add(&x.A4, &x.A5)
+	v13 := e.fp.Mul(t1, t1)
+	v3 := e.fp.Add(&x.A0, &x.A5)
+	v3 = e.fp.Sub(v3, t0)
+	v3 = e.fp.Mul(v3, v3)
+	t1 = e.fp.Add(&x.A2, &x.A5)
+	t2 = e.fp.Sub(&x.A0, t1)
+	v4 := e.fp.Mul(t2, t2)
+	t1 = e.fp.Add(&x.A0, &x.A3)
+	t1 = e.fp.Sub(t1, &x.A5)
+	v5 := e.fp.Mul(t1, t1)
+	v14 := e.fp.Mul(&x.A0, &x.A0)
+	v15 := e.fp.Mul(&x.A1, &x.A1)
+	v16 := e.fp.Mul(&x.A4, &x.A4)
+	v17 := e.fp.Mul(&x.A5, &x.A5)
+	v1 := e.fp.Zero() // placeholder so that the array index matches the vi numbering
+
+	return [18]*baseEl{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17}
+}
 
-	b0 := e.Ext3.MulByNonResidue(v1)
-	b0 = e.Ext3.Add(b0, v0)
-	b1 := e.Ext3.Add(&x.B0, &x.B1)
-	tmp := e.Ext3.Add(&y.B0, &y.B1)
-	b1 = e.Ext3.Mul(b1, tmp)
-	tmp = e.Ext3.Add(v0, v1)
-	b1 = e.Ext3.Sub(b1, tmp)
+func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
+	// Then we compute the coefficients c0, c1, c2, c3, c4 and c5 in the direct
+	// sextic extension of the product x*y as follows:
+	//
+	// Ref.: Peter L. Montgomery. Five, six, and seven-term Karatsuba-like formulae. IEEE
+	// Transactions on Computers, 54(3):362–369, 2005.
+	//
+	// 	c0 = v14 + β(v0 − v2 + v4 + 2(v3+v5+v6-v12) + 3(v7+v15-v8-v10-v11) +
+	// 	4(v16-v13) − 5(v14+v17))
+	//
+	//  c1 = v12 − (v14 + v15) + β(v8 + v10 + v12 − (v3 + v5 + v6 + v15) +
+	//  2(v14 + v17 + v13 - v7) + 3(v11 - v16))
+	//
+	// 	c2 = 2v15 + v6 − (v10 + v12) + β(2v16 + v7 − (v11 + v13))
+	//
+	// 	c3 = v8 + v11 + v13 − (v3 + v4 + v7 + v16) + 3(v10 - v15) + 2(v12 + v14
+	// 	+ v17 - v6) + β(v13 − (v16 + v17))
+	//
+	// 	c4 = v2 + v3 + v4 + v7 + v15 + v9 − (v8 + v13) − 3v12 + 2(v6 − (v17 +
+	// 	v10 + v11 + v14)) + βv17
+	//
+	//  c5 = −(v3 + v4 + v5 + v9 + v15 + v16) + 2(v8 + v10 + v11 + v12 + v13 −
+	//  (v6 + v7)) + 3(v14 + v17)
+
+	c0 := e.fp.Sub(v[0], v[2])
+	c0 = e.fp.Add(c0, v[4])
+	s1 := e.fp.Add(v[3], v[5])
+	s1 = e.fp.Add(s1, v[6])
+	s1 = e.fp.Sub(s1, v[12])
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.Add(v[7], v[15])
+	s2 := e.fp.Add(v[8], v[10])
+	s2 = e.fp.Add(s2, v[11])
+	s1 = e.fp.Sub(s1, s2)
+	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.Sub(v[16], v[13])
+	s1 = e.fp.MulConst(s1, big.NewInt(4))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.Add(v[14], v[17])
+	s1 = e.fp.MulConst(s1, big.NewInt(5))
+	c0 = e.fp.Sub(c0, s1)
+	c0 = mulFpByNonResidue(e.fp, c0)
+	c0 = e.fp.Add(c0, v[14])
+
+	c1 := e.fp.Add(v[15], v[14])
+	c1 = e.fp.Sub(v[12], c1)
+	s2 = e.fp.Add(v[3], v[5])
+	s2 = e.fp.Add(s2, v[6])
+	s2 = e.fp.Add(s2, v[15])
+	s1 = e.fp.Add(v[10], v[8])
+	s1 = e.fp.Add(s1, v[12])
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.Add(v[14], v[17])
+	s2 = e.fp.Add(s2, v[13])
+	s2 = e.fp.Sub(s2, v[7])
+	s2 = e.fp.MulConst(s2, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.Sub(v[11], v[16])
+	s2 = e.fp.MulConst(s2, big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c1 = e.fp.Add(c1, s1)
+
+	c2 := e.fp.MulConst(v[15], big.NewInt(2))
+	c2 = e.fp.Add(c2, v[6])
+	s1 = e.fp.Add(v[10], v[12])
+	c2 = e.fp.Sub(c2, s1)
+	s2 = e.fp.Add(v[11], v[13])
+	s1 = e.fp.MulConst(v[16], big.NewInt(2))
+	s1 = e.fp.Add(s1, v[7])
+	s1 = e.fp.Sub(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c2 = e.fp.Add(c2, s1)
+
+	c3 := e.fp.Add(v[8], v[11])
+	c3 = e.fp.Add(c3, v[13])
+	s1 = e.fp.Add(v[3], v[4])
+	s1 = e.fp.Add(s1, v[7])
+	s1 = e.fp.Add(s1, v[16])
+	c3 = e.fp.Sub(c3, s1)
+	s1 = e.fp.Sub(v[10], v[15])
+	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.Add(v[12], v[14])
+	s1 = e.fp.Add(s1, v[17])
+	s1 = e.fp.Sub(s1, v[6])
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	c3 = e.fp.Add(c3, s1)
+	s2 = e.fp.Add(v[16], v[17])
+	s1 = e.fp.Sub(v[13], s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c3 = e.fp.Add(c3, s1)
+
+	c4 := e.fp.Add(v[2], v[3])
+	c4 = e.fp.Add(c4, v[4])
+	c4 = e.fp.Add(c4, v[7])
+	c4 = e.fp.Add(c4, v[15])
+	c4 = e.fp.Add(c4, v[9])
+	s1 = e.fp.Add(v[8], v[13])
+	c4 = e.fp.Sub(c4, s1)
+	s1 = e.fp.MulConst(v[12], big.NewInt(3))
+	c4 = e.fp.Sub(c4, s1)
+	s1 = e.fp.Add(v[10], v[17])
+	s1 = e.fp.Add(s1, v[11])
+	s1 = e.fp.Add(s1, v[14])
+	s1 = e.fp.Sub(v[6], s1)
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	c4 = e.fp.Add(c4, s1)
+	s1 = mulFpByNonResidue(e.fp, v[17])
+	c4 = e.fp.Add(c4, s1)
+
+	c5 := e.fp.Add(v[8], v[10])
+	c5 = e.fp.Add(c5, v[11])
+	c5 = e.fp.Add(c5, v[12])
+	c5 = e.fp.Add(c5, v[13])
+	s1 = e.fp.Add(v[6], v[7])
+	c5 = e.fp.Sub(c5, s1)
+	c5 = e.fp.MulConst(c5, big.NewInt(2))
+	s1 = e.fp.Add(v[14], v[17])
+	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.Add(v[3], v[4])
+	s1 = e.fp.Add(s1, v[5])
+	s1 = e.fp.Add(s1, v[9])
+	s1 = e.fp.Add(s1, v[15])
+	s1 = e.fp.Add(s1, v[16])
+	c5 = e.fp.Sub(c5, s1)
 
 	return &E6{
-		B0: *b0,
-		B1: *b1,
+		A0: *c0,
+		A1: *c1,
+		A2: *c2,
+		A3: *c3,
+		A4: *c4,
+		A5: *c5,
 	}
 }
 
 func (e Ext6) Square(x *E6) *E6 {
-	//Algorithm 22 from https://eprint.iacr.org/2010/354.pdf
 	x = e.Reduce(x)
-	c0 := e.Ext3.Sub(&x.B0, &x.B1)
-	c3 := &E3{
-		A0: x.B1.A2,
-		A1: x.B1.A0,
-		A2: x.B1.A1,
-	}
-	c3.A0 = *e.fp.MulConst(&c3.A0, big.NewInt(4))
-	c3.A0 = *e.fp.Add(&x.B0.A0, &c3.A0)
-	c3.A1 = *e.fp.Sub(&x.B0.A1, &c3.A1)
-	c3.A2 = *e.fp.Sub(&x.B0.A2, &c3.A2)
-	c2 := e.Ext3.Mul(&x.B0, &x.B1)
-	c0 = e.Ext3.Mul(c0, c3)
-	c0 = e.Ext3.Add(c0, c2)
-	b1 := e.Ext3.Double(c2)
-	c2 = e.Ext3.MulByNonResidue(c2)
-	b0 := e.Ext3.Add(c0, c2)
+	v := e.interpolationX6Sq(x)
+	return e.mulMontgomery6(v)
+	// return e.mulToomCook6(v)
+}
+
+/*
+func (e Ext6) MulToomCook6x(x, y *E6) *E6 {
+	//	Then we compute the product  362880*x*y to avoid divisions:
+	//
+	//		c0 = 362880v0 + β(−18900v0 + 14616v2 − 6552(v3 + v4) + 1512(v5 +
+	//		v6) − 126(v7 + v8) + 99066240v10)
+	//
+	//		c1 = −(72576v0 + 241920v2 + 120960v3 - 51840v4 - 34560v5 + 8640v6 +
+	//		6480v7 - 720v8 - 576v9 + 1045094400v10 + β(-3780v0 + 2016v2 -
+	//		3024v3 - 576v4 + 1296v5 + 54v6 - 306v7 + 6v8 + 30v9 - 54432000v10))
+	//
+	//		c2 = −516600v0 + 290304v2 − 36288(v3 + v4) + 4608(v5 + v6) − 324(v7
+	//		+ v8) + 209018880v10 + β(630v0 − 504v2 + 252(v3 + v4) − 72(v5 + v6)
+	//		+ 9(v7 + v8) − 10886400v10)
+	//
+	//		c3 = 103320v0 + 54096v2 + 154056v3 − 55656v4 − 47664v5 + 10764v6 +
+	//		9144v7 − 944v8 − 820v9 + 1487808000v10 + β(−126v0 + 84(v2 − v3) −
+	//		36(v4 + v5) + 9(v6 − v7) − (v8 + v9) − 1814400v10)
+	//
+	//		c4 = 171990v0 − 122976v2 + 42588(v3 + v4) − 6048(v5 + v6) + 63(v7 +
+	//		v8) − 297561600v10 + β(362880v10)
+	//
+	//		c5 = −34398v0 + 8316v2 + 14364v5 − 36036v3 + 3276v4 − 2079v6 −
+	//		2961v7 + 231v8 + 273v9 − 495331200v10.
+
+	t1 = e.fp.Add(v3, v4) // v3 + v4
+	t2 = e.fp.Add(v5, v6) // v5 + v6
+	t3 = e.fp.Add(v7, v8) // v7 + v8
+	t4 = e.fp.Add(v4, v5) // v4 + v5
+	// _t0 = e.fp.Add(v8, v9) // v8 + v9
+
+	c0 := e.fp.MulConst(t2, big.NewInt(1512))
+	s1 = e.fp.MulConst(t1, big.NewInt(6552))
+	c0 = e.fp.Sub(c0, s1)
+	s1 = e.fp.MulConst(v2, big.NewInt(14616))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(18900))
+	c0 = e.fp.Sub(c0, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(99066240))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(t3, big.NewInt(126))
+	c0 = e.fp.Sub(c0, s1)
+	c0 = mulFpByNonResidue(e.fp, c0)
+	s1 = e.fp.MulConst(v0, big.NewInt(362880))
+	c0 = e.fp.Add(c0, s1)
+
+	c1 := e.fp.MulConst(v0, big.NewInt(72576))
+	s1 = e.fp.MulConst(v2, big.NewInt(241920))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v3, big.NewInt(120960))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(51840))
+	c1 = e.fp.Sub(c1, s1)
+	s1 = e.fp.MulConst(v5, big.NewInt(34560))
+	c1 = e.fp.Sub(c1, s1)
+	s1 = e.fp.MulConst(v6, big.NewInt(8640))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v7, big.NewInt(6480))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v8, big.NewInt(720))
+	c1 = e.fp.Sub(c1, s1)
+	s1 = e.fp.MulConst(v9, big.NewInt(576))
+	c1 = e.fp.Sub(c1, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(1045094400))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(3780))
+	s2 = e.fp.MulConst(v2, big.NewInt(2016))
+	s1 = e.fp.Sub(s2, s1)
+	s2 = e.fp.MulConst(v3, big.NewInt(3024))
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.MulConst(v4, big.NewInt(576))
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.MulConst(v5, big.NewInt(1296))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v6, big.NewInt(54))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(306))
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.MulConst(v8, big.NewInt(6))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v9, big.NewInt(30))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(54432000))
+	s1 = e.fp.Sub(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c1 = e.fp.Add(c1, s1)
+	c1 = e.fp.Neg(c1)
+
+	c2 := e.fp.MulConst(v2, big.NewInt(290304))
+	s1 = e.fp.MulConst(t1, big.NewInt(36288))
+	c2 = e.fp.Sub(c2, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(516600))
+	c2 = e.fp.Sub(c2, s1)
+	s1 = e.fp.MulConst(t2, big.NewInt(4608))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.MulConst(t3, big.NewInt(324))
+	c2 = e.fp.Sub(c2, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(209018880))
+	c2 = e.fp.Add(c2, s1)
+	s2 = e.fp.MulConst(v0, big.NewInt(630))
+	s1 = e.fp.MulConst(v2, big.NewInt(504))
+	s1 = e.fp.Sub(s2, s1)
+	s2 = e.fp.MulConst(t1, big.NewInt(252))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(t2, big.NewInt(72))
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.MulConst(t3, big.NewInt(9))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(10886400))
+	s1 = e.fp.Sub(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c2 = e.fp.Add(c2, s1)
+
+	c3 := e.fp.MulConst(v0, big.NewInt(103320))
+	s1 = e.fp.MulConst(v2, big.NewInt(54096))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v3, big.NewInt(154056))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(55656))
+	c3 = e.fp.Sub(c3, s1)
+	s1 = e.fp.MulConst(v5, big.NewInt(47664))
+	c3 = e.fp.Sub(c3, s1)
+	s1 = e.fp.MulConst(v6, big.NewInt(10764))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v7, big.NewInt(9144))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v8, big.NewInt(944))
+	c3 = e.fp.Sub(c3, s1)
+	s1 = e.fp.MulConst(v9, big.NewInt(820))
+	c3 = e.fp.Sub(c3, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(1487808000))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(126))
+	s2 = e.fp.Sub(v2, v3)
+	s2 = e.fp.MulConst(s2, big.NewInt(84))
+	s1 = e.fp.Sub(s2, s1)
+	s2 = e.fp.Add(v4, v5)
+	s2 = e.fp.MulConst(s2, big.NewInt(36))
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.Sub(v6, v7)
+	s2 = e.fp.MulConst(s2, big.NewInt(9))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(1814400))
+	s2 = e.fp.Add(s2, v8)
+	s2 = e.fp.Add(s2, v9)
+	s1 = e.fp.Sub(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c3 = e.fp.Add(c3, s1)
+
+	c4 := e.fp.MulConst(v0, big.NewInt(171990))
+	s1 = e.fp.MulConst(v2, big.NewInt(122976))
+	c4 = e.fp.Sub(c4, s1)
+	s1 = e.fp.MulConst(t1, big.NewInt(42588))
+	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.MulConst(t2, big.NewInt(6048))
+	c4 = e.fp.Sub(c4, s1)
+	s1 = e.fp.MulConst(t3, big.NewInt(63))
+	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(297561600))
+	c4 = e.fp.Sub(c4, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(362880))
+	s1 = mulFpByNonResidue(e.fp, s1)
+	c4 = e.fp.Add(c4, s1)
+
+	c5 := e.fp.MulConst(v2, big.NewInt(8316))
+	s1 = e.fp.MulConst(v0, big.NewInt(34398))
+	c5 = e.fp.Sub(c5, s1)
+	s1 = e.fp.MulConst(v5, big.NewInt(14364))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v3, big.NewInt(36036))
+	c5 = e.fp.Sub(c5, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(3276))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v6, big.NewInt(2079))
+	c5 = e.fp.Sub(c5, s1)
+	s1 = e.fp.MulConst(v7, big.NewInt(2961))
+	c5 = e.fp.Sub(c5, s1)
+	s1 = e.fp.MulConst(v8, big.NewInt(231))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v9, big.NewInt(273))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(495331200))
+	c5 = e.fp.Sub(c5, s1)
+
+	inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
 
 	return &E6{
-		B0: *b0,
-		B1: *b1,
+		A0: *e.fp.Mul(&inv362880, c0),
+		A1: *e.fp.Mul(&inv362880, c1),
+		A2: *e.fp.Mul(&inv362880, c2),
+		A3: *e.fp.Mul(&inv362880, c3),
+		A4: *e.fp.Mul(&inv362880, c4),
+		A5: *e.fp.Mul(&inv362880, c5),
 	}
 }
+*/
 
 // Karabina's compressed cyclotomic square SQR12345
 // https://eprint.iacr.org/2010/542.pdf
 // Sec. 5.6 with minor modifications to fit our tower
+//
+//	gnark-crypto tower (aij = Bi.Aj):  a00 a01 a02 a10 a11 a12
+//	direct-extension coefficient:      A0  A2  A4  A1  A3  A5
 func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
 	x = e.Reduce(x)
 
 	// h4 = -g4 + 3((g3+g5)(g1+c*g2)-g1g5-c*g3g2)
-	g1g5 := e.fp.Mul(&x.B0.A1, &x.B1.A2)
-	g3g2 := e.fp.Mul(&x.B1.A0, &x.B0.A2)
-	h4 := mulFpByNonResidue(e.fp, &x.B0.A2)
-	h4 = e.fp.Add(h4, &x.B0.A1)
-	t := e.fp.Add(&x.B1.A0, &x.B1.A2)
+	g1g5 := e.fp.Mul(&x.A2, &x.A5)
+	g3g2 := e.fp.Mul(&x.A1, &x.A4)
+	h4 := mulFpByNonResidue(e.fp, &x.A4)
+	h4 = e.fp.Add(h4, &x.A2)
+	t := e.fp.Add(&x.A1, &x.A5)
 	h4 = e.fp.Mul(h4, t)
 	h4 = e.fp.Sub(h4, g1g5)
 	t = e.fp.MulConst(g3g2, big.NewInt(4))
 	h4 = e.fp.Add(h4, t)
 	h4 = e.fp.MulConst(h4, big.NewInt(3))
-	h4 = e.fp.Sub(h4, &x.B1.A1)
+	h4 = e.fp.Sub(h4, &x.A3)
 
 	// h3 = 2(g3+3c*g1g5)
 	h3 := mulFpByNonResidue(e.fp, g1g5)
 	h3 = e.fp.MulConst(h3, big.NewInt(3))
-	h3 = e.fp.Add(h3, &x.B1.A0)
+	h3 = e.fp.Add(h3, &x.A1)
 	h3 = e.fp.MulConst(h3, big.NewInt(2))
 
 	// h2 = 3((g1+g5)(g1+c*g5)-(c+1)*g1g5)-2g2
-	t = mulFpByNonResidue(e.fp, &x.B1.A2)
-	t = e.fp.Add(t, &x.B0.A1)
-	h2 := e.fp.Add(&x.B1.A2, &x.B0.A1)
+	t = mulFpByNonResidue(e.fp, &x.A5)
+	t = e.fp.Add(t, &x.A2)
+	h2 := e.fp.Add(&x.A5, &x.A2)
 	h2 = e.fp.Mul(h2, t)
 	t = e.fp.MulConst(g1g5, big.NewInt(3))
 	h2 = e.fp.Add(h2, t)
 	h2 = e.fp.MulConst(h2, big.NewInt(3))
-	t = e.fp.MulConst(&x.B0.A2, big.NewInt(2))
+	t = e.fp.MulConst(&x.A4, big.NewInt(2))
 	h2 = e.fp.Sub(h2, t)
 
 	// h1 = 3((g3+g2)(g3+c*g2)-(c+1)*g3g2)-2g1
-	t = mulFpByNonResidue(e.fp, &x.B0.A2)
-	t = e.fp.Add(t, &x.B1.A0)
-	h1 := e.fp.Add(&x.B0.A2, &x.B1.A0)
+	t = mulFpByNonResidue(e.fp, &x.A4)
+	t = e.fp.Add(t, &x.A1)
+	h1 := e.fp.Add(&x.A4, &x.A1)
 	h1 = e.fp.Mul(h1, t)
 	t = e.fp.MulConst(g3g2, big.NewInt(3))
 	h1 = e.fp.Add(h1, t)
 	h1 = e.fp.MulConst(h1, big.NewInt(3))
-	t = e.fp.MulConst(&x.B0.A1, big.NewInt(2))
+	t = e.fp.MulConst(&x.A2, big.NewInt(2))
 	h1 = e.fp.Sub(h1, t)
 
 	// h5 = 2(g5+3g3g2)
 	h5 := e.fp.MulConst(g3g2, big.NewInt(3))
-	h5 = e.fp.Add(h5, &x.B1.A2)
+	h5 = e.fp.Add(h5, &x.A5)
 	h5 = e.fp.MulConst(h5, big.NewInt(2))
 
 	return &E6{
-		B0: E3{
-			A0: x.B0.A0,
-			A1: *h1,
-			A2: *h2,
-		},
-		B1: E3{
-			A0: *h3,
-			A1: *h4,
-			A2: *h5,
-		},
+		A0: x.A0,
+		A1: *h3,
+		A2: *h1,
+		A3: *h4,
+		A4: *h2,
+		A5: *h5,
 	}
 }
 
@@ -182,181 +752,28 @@ func (e Ext6) DecompressKarabina12345(x *E6) *E6 {
 	x = e.Reduce(x)
 
 	// h0 = (2g4^2 + g3g5 - 3g2g1)*c + 1
-	t0 := e.fp.Mul(&x.B0.A1, &x.B0.A2)
+	t0 := e.fp.Mul(&x.A2, &x.A4)
 	t0 = e.fp.MulConst(t0, big.NewInt(3))
-	t1 := e.fp.Mul(&x.B1.A0, &x.B1.A2)
-	h0 := e.fp.Mul(&x.B1.A1, &x.B1.A1)
+	t1 := e.fp.Mul(&x.A1, &x.A5)
+	h0 := e.fp.Mul(&x.A3, &x.A3)
 	h0 = e.fp.MulConst(h0, big.NewInt(2))
 	h0 = e.fp.Add(h0, t1)
 	h0 = e.fp.Sub(t0, h0)
 	h0 = e.fp.MulConst(h0, big.NewInt(4))
 	h0 = e.fp.Add(h0, e.fp.One())
 
+	//	gnark-crypto tower (aij = Bi.Aj):  a00 a01 a02 a10 a11 a12
+	//	direct-extension coefficient:      A0  A2  A4  A1  A3  A5
 	return &E6{
-		B0: E3{
-			A0: *h0,
-			A1: x.B0.A1,
-			A2: x.B0.A2,
-		},
-		B1: x.B1,
+		A0: *h0,
+		A1: x.A1,
+		A2: x.A2,
+		A3: x.A3,
+		A4: x.A4,
+		A5: x.A5,
 	}
 }
 
-// Karabina's compressed cyclotomic square SQR2345
-// https://eprint.iacr.org/2010/542.pdf
-// Th. 3.2 with minor modifications to fit our tower
-func (e Ext6) CyclotomicSquareKarabina2345(x *E6) *E6 {
-	x = e.Reduce(x)
-	z := e.Copy(x)
-
-	var t [7]*baseEl
-
-	// t0 = g1²
-	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	// t1 = g5²
-	t[1] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	// t5 = g1 + g5
-	t[5] = e.fp.Add(&x.B0.A1, &x.B1.A2)
-	// t2 = (g1 + g5)²
-	t[2] = e.fp.Mul(t[5], t[5])
-
-	// t3 = g1² + g5²
-	t[3] = e.fp.Add(t[0], t[1])
-	// t5 = 2 * g1 * g5
-	t[5] = e.fp.Sub(t[3], t[2])
-
-	// t6 = g3 + g2
-	t[6] = e.fp.Add(&x.B1.A0, &x.B0.A2)
-	// t3 = (g3 + g2)²
-	t[3] = e.fp.Mul(t[6], t[6])
-	// t2 = g3²
-	t[2] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
-
-	// t6 = 2 * nr * g1 * g5
-	t[6] = e.fp.MulConst(t[5], big.NewInt(4))
-	// t5 = 4 * nr * g1 * g5 + 2 * g3
-	t[5] = e.fp.Add(t[6], &x.B1.A0)
-	t[5] = e.fp.MulConst(t[5], big.NewInt(2))
-	// z3 = 6 * nr * g1 * g5 + 2 * g3
-	z.B1.A0 = *e.fp.Add(t[5], t[6])
-
-	// t4 = nr * g5²
-	t[4] = mulFpByNonResidue(e.fp, t[1])
-	// t5 = nr * g5² + g1²
-	t[5] = e.fp.Add(t[0], t[4])
-	// t6 = nr * g5² + g1² - g2
-	t[6] = e.fp.Sub(t[5], &x.B0.A2)
-
-	// t1 = g2²
-	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
-
-	// t6 = 2 * nr * g5² + 2 * g1² - 2*g2
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z2 = 3 * nr * g5² + 3 * g1² - 2*g2
-	z.B0.A2 = *e.fp.Add(t[6], t[5])
-
-	// t4 = nr * g2²
-	t[4] = mulFpByNonResidue(e.fp, t[1])
-	// t5 = g3² + nr * g2²
-	t[5] = e.fp.Add(t[2], t[4])
-	// t6 = g3² + nr * g2² - g1
-	t[6] = e.fp.Sub(t[5], &x.B0.A1)
-	// t6 = 2 * g3² + 2 * nr * g2² - 2 * g1
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z1 = 3 * g3² + 3 * nr * g2² - 2 * g1
-	z.B0.A1 = *e.fp.Add(t[6], t[5])
-
-	// t0 = g2² + g3²
-	t[0] = e.fp.Add(t[2], t[1])
-	// t5 = 2 * g3 * g2
-	t[5] = e.fp.Sub(t[3], t[0])
-	// t6 = 2 * g3 * g2 + g5
-	t[6] = e.fp.Add(t[5], &x.B1.A2)
-	// t6 = 4 * g3 * g2 + 2 * g5
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z5 = 6 * g3 * g2 + 2 * g5
-	z.B1.A2 = *e.fp.Add(t[5], t[6])
-
-	return z
-}
-
-// DecompressKarabina2345 decompresses Karabina's cyclotomic square result SQR2345
-// if g3 != 0
-//
-//	g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3
-//
-// if g3 == 0
-//
-//	g4 = 2g1g5/g2
-//
-// if g3=g2=0 then g4=g5=g1=0 and g0=1 (x=1)
-// Theorem 3.1 is well-defined for all x in Gϕₙ\{1}
-func (e Ext6) DecompressKarabina2345(x *E6) *E6 {
-
-	x = e.Reduce(x)
-
-	var z E6
-
-	var t [3]*baseEl
-	var _t [2]*baseEl
-	one := e.fp.One()
-
-	// if g3 == 0
-	// t0 = 2 * g1 * g5
-	// t1 = g2
-	selector1 := e.fp.IsZero(&x.B1.A0)
-	_t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	_t[0] = e.fp.MulConst(_t[0], big.NewInt(2))
-	_t[1] = &x.B0.A2
-
-	// if g2 == g3 == 0
-	selector2 := e.fp.IsZero(_t[1])
-
-	// if g3 != 0
-	// t0 = E * g5^2 + 3 * g1^2 - 2 * g2
-	// t1 = 4 * g3
-	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	t[1] = e.fp.Sub(t[0], &x.B0.A2)
-	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
-	t[1] = e.fp.Add(t[1], t[0])
-	t[2] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	t[0] = mulFpByNonResidue(e.fp, t[2])
-	t[0] = e.fp.Add(t[0], t[1])
-	t[1] = e.fp.Add(&x.B1.A0, &x.B1.A0)
-	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
-
-	// g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3 or (2 * g1 * g5)/g2
-	t[0] = e.fp.Select(selector1, _t[0], t[0])
-	t[1] = e.fp.Select(selector1, _t[1], t[1])
-	// g4 = dummy value, continue
-	t[1] = e.fp.Select(selector2, one, t[1])
-
-	z.B1.A1 = *e.fp.Div(t[0], t[1])
-
-	// Rest of the computation for all cases
-	// t1 = g2 * g1
-	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A1)
-	// t2 = 2 * g4² - 3 * g2 * g1
-	t[2] = e.fp.Mul(&z.B1.A1, &z.B1.A1)
-	t[2] = e.fp.Sub(t[2], t[1])
-	t[2] = e.fp.MulConst(t[2], big.NewInt(2))
-	t[2] = e.fp.Sub(t[2], t[1])
-	// t1 = g3 * g5 (g3 can be 0)
-	t[1] = e.fp.Mul(&x.B1.A0, &x.B1.A2)
-	// g0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1
-	t[2] = e.fp.Add(t[2], t[1])
-
-	z.B0.A0 = *mulFpByNonResidue(e.fp, t[2])
-	z.B0.A0 = *e.fp.Add(&z.B0.A0, one)
-
-	z.B0.A1 = x.B0.A1
-	z.B0.A2 = x.B0.A2
-	z.B1.A0 = x.B1.A0
-	z.B1.A2 = x.B1.A2
-
-	return e.Select(e.api.And(selector1, selector2), e.One(), &z)
-}
-
 // Granger-Scott's cyclotomic square
 // https://eprint.iacr.org/2009/565.pdf, 3.2
 func (e Ext6) CyclotomicSquare(x *E6) *E6 {
@@ -372,21 +789,21 @@ func (e Ext6) CyclotomicSquare(x *E6) *E6 {
 
 	var t [9]*baseEl
 
-	t[0] = e.fp.Mul(&x.B1.A1, &x.B1.A1)
-	t[1] = e.fp.Mul(&x.B0.A0, &x.B0.A0)
-	t[6] = e.fp.Add(&x.B1.A1, &x.B0.A0)
+	t[0] = e.fp.Mul(&x.A3, &x.A3)
+	t[1] = e.fp.Mul(&x.A0, &x.A0)
+	t[6] = e.fp.Add(&x.A3, &x.A0)
 	t[6] = e.fp.Mul(t[6], t[6])
-	tmp := e.fp.Add(t[0], t[1])
-	t[6] = e.fp.Sub(t[6], tmp) // 2*x4*x0
-	t[2] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
-	t[3] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
-	t[7] = e.fp.Add(&x.B0.A2, &x.B1.A0)
+	t[6] = e.fp.Sub(t[6], t[0])
+	t[6] = e.fp.Sub(t[6], t[1]) // 2*x4*x0
+	t[2] = e.fp.Mul(&x.A4, &x.A4)
+	t[3] = e.fp.Mul(&x.A1, &x.A1)
+	t[7] = e.fp.Add(&x.A4, &x.A1)
 	t[7] = e.fp.Mul(t[7], t[7])
-	tmp = e.fp.Add(t[2], t[3])
-	t[7] = e.fp.Sub(t[7], tmp) // 2*x2*x3
-	t[4] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	t[5] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	t[8] = e.fp.Add(&x.B1.A2, &x.B0.A1)
+	t[7] = e.fp.Sub(t[7], t[2])
+	t[7] = e.fp.Sub(t[7], t[3]) // 2*x2*x3
+	t[4] = e.fp.Mul(&x.A5, &x.A5)
+	t[5] = e.fp.Mul(&x.A2, &x.A2)
+	t[8] = e.fp.Add(&x.A5, &x.A2)
 	t[8] = e.fp.Mul(t[8], t[8])
 	t[8] = e.fp.Sub(t[8], t[4])
 	t[8] = e.fp.Sub(t[5], t[8])
@@ -400,40 +817,37 @@ func (e Ext6) CyclotomicSquare(x *E6) *E6 {
 	t[4] = e.fp.Add(t[4], t[5]) // x5²*u + x1²
 
 	var z E6
-	z.B0.A0 = *e.fp.Sub(t[0], &x.B0.A0)
-	z.B0.A0 = *e.fp.MulConst(&z.B0.A0, big.NewInt(2))
-	z.B0.A0 = *e.fp.Add(&z.B0.A0, t[0])
-	z.B0.A1 = *e.fp.Sub(t[2], &x.B0.A1)
-	z.B0.A1 = *e.fp.MulConst(&z.B0.A1, big.NewInt(2))
-	z.B0.A1 = *e.fp.Add(&z.B0.A1, t[2])
-	z.B0.A2 = *e.fp.Sub(t[4], &x.B0.A2)
-	z.B0.A2 = *e.fp.MulConst(&z.B0.A2, big.NewInt(2))
-	z.B0.A2 = *e.fp.Add(&z.B0.A2, t[4])
-
-	z.B1.A0 = *e.fp.Add(t[8], &x.B1.A0)
-	z.B1.A0 = *e.fp.MulConst(&z.B1.A0, big.NewInt(2))
-	z.B1.A0 = *e.fp.Add(&z.B1.A0, t[8])
-	z.B1.A1 = *e.fp.Add(t[6], &x.B1.A1)
-	z.B1.A1 = *e.fp.MulConst(&z.B1.A1, big.NewInt(2))
-	z.B1.A1 = *e.fp.Add(&z.B1.A1, t[6])
-	z.B1.A2 = *e.fp.Add(t[7], &x.B1.A2)
-	z.B1.A2 = *e.fp.Add(&z.B1.A2, &z.B1.A2)
-	z.B1.A2 = *e.fp.Add(&z.B1.A2, t[7])
+	z.A0 = *e.fp.Sub(t[0], &x.A0)
+	z.A0 = *e.fp.MulConst(&z.A0, big.NewInt(2))
+	z.A0 = *e.fp.Add(&z.A0, t[0])
+	z.A2 = *e.fp.Sub(t[2], &x.A2)
+	z.A2 = *e.fp.MulConst(&z.A2, big.NewInt(2))
+	z.A2 = *e.fp.Add(&z.A2, t[2])
+	z.A4 = *e.fp.Sub(t[4], &x.A4)
+	z.A4 = *e.fp.MulConst(&z.A4, big.NewInt(2))
+	z.A4 = *e.fp.Add(&z.A4, t[4])
+
+	z.A1 = *e.fp.Add(t[8], &x.A1)
+	z.A1 = *e.fp.MulConst(&z.A1, big.NewInt(2))
+	z.A1 = *e.fp.Add(&z.A1, t[8])
+	z.A3 = *e.fp.Add(t[6], &x.A3)
+	z.A3 = *e.fp.MulConst(&z.A3, big.NewInt(2))
+	z.A3 = *e.fp.Add(&z.A3, t[6])
+	z.A5 = *e.fp.Add(t[7], &x.A5)
+	z.A5 = *e.fp.Add(&z.A5, &z.A5)
+	z.A5 = *e.fp.Add(&z.A5, t[7])
 
 	return &z
 }
 
 func (e Ext6) Inverse(x *E6) *E6 {
-	res, err := e.fp.NewHint(inverseE6Hint, 6, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2)
+	res, err := e.fp.NewHint(inverseE6Hint, 6, &x.A0, &x.A1, &x.A2, &x.A3, &x.A4, &x.A5)
 	if err != nil {
 		// err is non-nil only for invalid number of inputs
 		panic(err)
 	}
 
-	inv := E6{
-		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
-		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
-	}
+	inv := E6{A0: *res[0], A1: *res[1], A2: *res[2], A3: *res[3], A4: *res[4], A5: *res[5]}
 	one := e.One()
 
 	// 1 == inv * x
@@ -445,16 +859,13 @@ func (e Ext6) Inverse(x *E6) *E6 {
 }
 
 func (e Ext6) DivUnchecked(x, y *E6) *E6 {
-	res, err := e.fp.NewHint(divE6Hint, 12, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2, &y.B0.A0, &y.B0.A1, &y.B0.A2, &y.B1.A0, &y.B1.A1, &y.B1.A2)
+	res, err := e.fp.NewHint(divE6Hint, 12, &x.A0, &x.A1, &x.A2, &x.A3, &x.A4, &x.A5, &y.A0, &y.A1, &y.A2, &y.A3, &y.A4, &y.A5)
 	if err != nil {
 		// err is non-nil only for invalid number of inputs
 		panic(err)
 	}
 
-	div := E6{
-		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
-		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
-	}
+	div := E6{A0: *res[0], A1: *res[1], A2: *res[2], A3: *res[3], A4: *res[4], A5: *res[5]}
 
 	// x = div * y
 	_x := e.Mul(&div, y)
@@ -464,34 +875,53 @@ func (e Ext6) DivUnchecked(x, y *E6) *E6 {
 
 }
 
-func (e Ext6) Conjugate(x *E6) *E6 {
-	return &E6{
-		B0: x.B0,
-		B1: *e.Ext3.Neg(&x.B1),
-	}
-}
-
 func (e Ext6) AssertIsEqual(a, b *E6) {
-	e.Ext3.AssertIsEqual(&a.B0, &b.B0)
-	e.Ext3.AssertIsEqual(&a.B1, &b.B1)
+	e.fp.AssertIsEqual(&a.A0, &b.A0)
+	e.fp.AssertIsEqual(&a.A1, &b.A1)
+	e.fp.AssertIsEqual(&a.A2, &b.A2)
+	e.fp.AssertIsEqual(&a.A3, &b.A3)
+	e.fp.AssertIsEqual(&a.A4, &b.A4)
+	e.fp.AssertIsEqual(&a.A5, &b.A5)
+
 }
 
 func (e Ext6) Copy(x *E6) *E6 {
-	b0 := e.Ext3.Copy(&x.B0)
-	b1 := e.Ext3.Copy(&x.B1)
 	return &E6{
-		B0: *b0,
-		B1: *b1,
+		A0: x.A0,
+		A1: x.A1,
+		A2: x.A2,
+		A3: x.A3,
+		A4: x.A4,
+		A5: x.A5,
 	}
 }
 
 func FromE6(a *bw6761.E6) E6 {
+	// gnark-crypto uses a quadratic over cubic sextic extension of Fp.
+	// The two towers are isomorphic and the coefficients are permuted as follows:
+	// 		a00 a01 a02 a10 a11 a12
+	// 		A0  A2  A4  A1  A3  A5
 	return E6{
-		B0: FromE3(&a.B0),
-		B1: FromE3(&a.B1),
+		A0: emulated.ValueOf[emulated.BW6761Fp](a.B0.A0),
+		A1: emulated.ValueOf[emulated.BW6761Fp](a.B1.A0),
+		A2: emulated.ValueOf[emulated.BW6761Fp](a.B0.A1),
+		A3: emulated.ValueOf[emulated.BW6761Fp](a.B1.A1),
+		A4: emulated.ValueOf[emulated.BW6761Fp](a.B0.A2),
+		A5: emulated.ValueOf[emulated.BW6761Fp](a.B1.A2),
 	}
 }
 
+func (e Ext6) Select(selector frontend.Variable, z1, z0 *E6) *E6 {
+	a0 := e.fp.Select(selector, &z1.A0, &z0.A0)
+	a1 := e.fp.Select(selector, &z1.A1, &z0.A1)
+	a2 := e.fp.Select(selector, &z1.A2, &z0.A2)
+	a3 := e.fp.Select(selector, &z1.A3, &z0.A3)
+	a4 := e.fp.Select(selector, &z1.A4, &z0.A4)
+	a5 := e.fp.Select(selector, &z1.A5, &z0.A5)
+
+	return &E6{A0: *a0, A1: *a1, A2: *a2, A3: *a3, A4: *a4, A5: *a5}
+}
+
 // Frobenius set z in E6 to Frobenius(x), return z
 func (e Ext6) Frobenius(x *E6) *E6 {
 	_frobA := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775648")
@@ -500,19 +930,12 @@ func (e Ext6) Frobenius(x *E6) *E6 {
 	_frobAC := emulated.ValueOf[emulated.BW6761Fp]("-1")
 	_frobBC := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292651")
 	var z E6
-	z.B0.A0 = x.B0.A0
-	z.B0.A1 = *e.fp.Mul(&x.B0.A1, &_frobA)
-	z.B0.A2 = *e.fp.Mul(&x.B0.A2, &_frobB)
-
-	z.B1.A0 = *e.fp.Mul(&x.B1.A0, &_frobC)
-	z.B1.A1 = *e.fp.Mul(&x.B1.A1, &_frobAC)
-	z.B1.A2 = *e.fp.Mul(&x.B1.A2, &_frobBC)
+	z.A0 = x.A0
+	z.A2 = *e.fp.Mul(&x.A2, &_frobA)
+	z.A4 = *e.fp.Mul(&x.A4, &_frobB)
+	z.A1 = *e.fp.Mul(&x.A1, &_frobC)
+	z.A3 = *e.fp.Mul(&x.A3, &_frobAC)
+	z.A5 = *e.fp.Mul(&x.A5, &_frobBC)
 
 	return &z
 }
-
-func (e Ext6) Select(selector frontend.Variable, z1, z0 *E6) *E6 {
-	b0 := e.Ext3.Select(selector, &z1.B0, &z0.B0)
-	b1 := e.Ext3.Select(selector, &z1.B1, &z0.B1)
-	return &E6{B0: *b0, B1: *b1}
-}
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 8e5f677b61..12aa698ae3 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -2,8 +2,7 @@ package fields_bw6761
 
 import (
 	"math/big"
-
-	"github.com/consensys/gnark/std/math/emulated"
+	// "github.com/consensys/gnark/std/math/emulated"
 )
 
 func (e Ext6) nSquareKarabina12345(z *E6, n int) *E6 {
@@ -130,58 +129,154 @@ func (e Ext6) ExpC2(z *E6) *E6 {
 	return result
 }
 
-// MulBy014 multiplies z by an E6 sparse element of the form
+// MulBy023 multiplies x by an E6 sparse element of the form
 //
-//	E6{
-//		B0: E3{A0: c0, A1: c1, A2: 0},
-//		B1: E3{A0: 0,  A1: 1,  A2: 0},
-//	}
-func (e *Ext6) MulBy014(z *E6, c0, c1 *baseEl) *E6 {
-	z = e.Reduce(z)
+//	E6{A0: c0, A1: 0, A2: c1, A3: 1,  A4: 0,  A5: 0}
+func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
+	x = e.Reduce(x)
+	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)(c0 + c1 + 1)
+	//		v2 = (a0 + a1 + a3 + a4)(c0 + 1)
+	//		v3 = (a0 − a2 − a3 + a5)(c0 − c1 − 1)
+	//		v4 = (a0 − a2 − a5)(c0 − c1)
+	//		v5 = (a0 + a3 − a5)(c0 + 1)
+	//		v6 = (a0 + a1 + a2)(c0 + c1)
+	//		v7 = (a3 + a4 + a5)
+	//		v8 = (a2 + a3)(c1 + 1)
+	//		v10 = (a1 + a2)c1
+	//		v11 = (a3 + a4)
+	//		v12 = (a0 + a1)c0
+	//		v14 = a0c0
 
-	a := e.MulBy01(&z.B0, c0, c1)
-
-	var b E3
-	// Mul by E3{0, 1, 0}
-	b.A0 = *e.fp.MulConst(&z.B1.A2, big.NewInt(4))
-	b.A2 = *e.fp.Neg(&z.B1.A1)
-	b.A1 = *e.fp.Neg(&z.B1.A0)
+	_t0 := e.fp.Add(&x.A0, &x.A1)
+	t0 := e.fp.Add(_t0, &x.A2)
+	t1 := e.fp.Add(&x.A3, &x.A4)
+	t2 := e.fp.Add(_t0, t1)
+	t3 := e.fp.Add(t2, &x.A5)
+	t3 = e.fp.Add(t3, &x.A2)
 
+	s0 := e.fp.Add(c0, c1)
 	one := e.fp.One()
-	d := e.fp.Add(c1, one)
-
-	zC1 := e.Ext3.Add(&z.B1, &z.B0)
-	zC1 = e.Ext3.MulBy01(zC1, c0, d)
-	zC1 = e.Ext3.Sub(zC1, a)
-	zC1 = e.Ext3.Add(zC1, &b)
-	zC0 := &E3{
-		A0: *e.fp.MulConst(&b.A2, big.NewInt(4)),
-		A1: *e.fp.Neg(&b.A0),
-		A2: *e.fp.Neg(&b.A1),
-	}
+	s2 := e.fp.Add(c0, one)
+	s3 := e.fp.Add(s2, c1)
+
+	v0 := e.fp.Mul(t3, s3)
+	v2 := e.fp.Mul(t2, s2)
+	v6 := e.fp.Mul(t0, s0)
+	t4 := e.fp.Add(t1, &x.A5)
+	v7 := t4
+	v12 := e.fp.Mul(_t0, c0)
+	v11 := t1
+	t0 = e.fp.Add(&x.A2, &x.A3)
+	s0 = e.fp.Add(c1, one)
+	v8 := e.fp.Mul(t0, s0)
+	t1 = e.fp.Add(&x.A1, &x.A2)
+	v10 := e.fp.Mul(t1, c1)
+	v3 := e.fp.Add(&x.A0, &x.A5)
+	v3 = e.fp.Sub(v3, t0)
+	s1 := e.fp.Sub(c0, s0)
+	v3 = e.fp.Mul(v3, s1)
+	t1 = e.fp.Add(&x.A2, &x.A5)
+	t2 = e.fp.Sub(&x.A0, t1)
+	s2 = e.fp.Sub(c0, c1)
+	v4 := e.fp.Mul(t2, s2)
+	t1 = e.fp.Add(&x.A0, &x.A3)
+	t1 = e.fp.Sub(t1, &x.A5)
+	s1 = e.fp.Add(c0, one)
+	v5 := e.fp.Mul(t1, s1)
+	v14 := e.fp.Mul(&x.A0, c0)
+
+	z0 := e.fp.Sub(v0, v2)
+	z0 = e.fp.Add(z0, v4)
+	s1 = e.fp.Add(v3, v5)
+	s1 = e.fp.Add(s1, v6)
+	s1 = e.fp.Sub(s1, v12)
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	z0 = e.fp.Add(z0, s1)
+	s2 = e.fp.Add(v8, v10)
+	s2 = e.fp.Add(s2, v11)
+	s1 = e.fp.Sub(v7, s2)
+	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	z0 = e.fp.Add(z0, s1)
+	s1 = e.fp.MulConst(v14, big.NewInt(5))
+	z0 = e.fp.Sub(z0, s1)
+	z0 = mulFpByNonResidue(e.fp, z0)
+	z0 = e.fp.Add(z0, v14)
 
-	zC0 = e.Ext3.Add(zC0, a)
+	z1 := e.fp.Sub(v12, v14)
+	s2 = e.fp.Add(v3, v5)
+	s2 = e.fp.Add(s2, v6)
+	s1 = e.fp.Add(v10, v8)
+	s1 = e.fp.Add(s1, v12)
+	s1 = e.fp.Sub(s1, s2)
+	s2 = e.fp.Sub(v14, v7)
+	s2 = e.fp.MulConst(s2, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v11, big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	z1 = e.fp.Add(z1, s1)
+
+	z2 := v6
+	s1 = e.fp.Add(v10, v12)
+	z2 = e.fp.Sub(z2, s1)
+	s1 = e.fp.Sub(v7, v11)
+	s1 = mulFpByNonResidue(e.fp, s1)
+	z2 = e.fp.Add(z2, s1)
+
+	z3 := e.fp.Add(v8, v11)
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s1, v7)
+	z3 = e.fp.Sub(z3, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(3))
+	z3 = e.fp.Add(z3, s1)
+	s1 = e.fp.Add(v12, v14)
+	s1 = e.fp.Sub(s1, v6)
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	z3 = e.fp.Add(z3, s1)
+
+	z4 := e.fp.Add(v2, v3)
+	z4 = e.fp.Add(z4, v4)
+	z4 = e.fp.Add(z4, v7)
+	z4 = e.fp.Sub(z4, v8)
+	s1 = e.fp.MulConst(v12, big.NewInt(3))
+	z4 = e.fp.Sub(z4, s1)
+	s1 = e.fp.Add(v10, v11)
+	s1 = e.fp.Add(s1, v14)
+	s1 = e.fp.Sub(v6, s1)
+	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	z4 = e.fp.Add(z4, s1)
+
+	z5 := e.fp.Add(v8, v10)
+	z5 = e.fp.Add(z5, v11)
+	z5 = e.fp.Add(z5, v12)
+	s1 = e.fp.Add(v6, v7)
+	z5 = e.fp.Sub(z5, s1)
+	z5 = e.fp.MulConst(z5, big.NewInt(2))
+	s1 = e.fp.MulConst(v14, big.NewInt(3))
+	z5 = e.fp.Add(z5, s1)
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s1, v5)
+	z5 = e.fp.Sub(z5, s1)
 
 	return &E6{
-		B0: *zC0,
-		B1: *zC1,
+		A0: *z0,
+		A1: *z1,
+		A2: *z2,
+		A3: *z3,
+		A4: *z4,
+		A5: *z5,
 	}
 }
 
-//	multiplies two E6 sparse element of the form:
+/*
+//	Mul023By023 multiplies two E6 sparse elements of the form:
 //
-//	E6{
-//		B0: E3{A0: c0, A1: c1, A2: 0},
-//		B1: E3{A0: 0,  A1: 1,  A2: 0},
-//	}
+//	E6{A0: c0, A1: 0, A2: c1, A3: 1,  A4: 0,  A5: 0}
 //
 // and
 //
-//	E6{
-//		B0: E3{A0: d0, A1: d1, A2: 0},
-//		B1: E3{A0: 0,  A1: 1,  A2: 0},
-//	}
-func (e Ext6) Mul014By014(d0, d1, c0, c1 *baseEl) [5]*baseEl {
+//	E6{A0: d0, A1: 0, A2: d1, A3: 1,  A4: 0,  A5: 0}
+func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 	x0 := e.fp.Mul(c0, d0)
 	x1 := e.fp.Mul(c1, d1)
 	x04 := e.fp.Add(c0, d0)
@@ -267,3 +362,4 @@ func (e *Ext6) Mul01245By014(x [5]*baseEl, d0, d1 *baseEl) *E6 {
 		B1: *z1,
 	}
 }
+*/
diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index ecf3104517..789629d963 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -1,12 +1,15 @@
 package fields_bw6761
 
 import (
+	"fmt"
 	"testing"
 
 	"github.com/consensys/gnark-crypto/ecc"
 	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark/frontend"
+	"github.com/consensys/gnark/frontend/cs/scs"
+	"github.com/consensys/gnark/profile"
 	"github.com/consensys/gnark/std/math/emulated"
 	"github.com/consensys/gnark/test"
 )
@@ -245,62 +248,6 @@ func TestConjugateFp6(t *testing.T) {
 	assert.NoError(err)
 }
 
-type e6CyclotomicSquareKarabina2345 struct {
-	A, B E6
-}
-
-func (circuit *e6CyclotomicSquareKarabina2345) Define(api frontend.API) error {
-	e := NewExt6(api)
-	expected := e.CyclotomicSquareKarabina2345(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestCyclotomicSquareKarabina2345Fp6(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E6
-	_, _ = a.SetRandom()
-	b.Set(&a)
-	b.CyclotomicSquareCompressed(&a)
-
-	witness := e6CyclotomicSquareKarabina2345{
-		A: FromE6(&a),
-		B: FromE6(&b),
-	}
-
-	err := test.IsSolved(&e6CyclotomicSquareKarabina2345{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e6DecompressKarabina2345 struct {
-	A, B E6
-}
-
-func (circuit *e6DecompressKarabina2345) Define(api frontend.API) error {
-	e := NewExt6(api)
-	expected := e.DecompressKarabina2345(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestDecompressKarabina2345Fp6(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E6
-	_, _ = a.SetRandom()
-	b.Set(&a)
-	a.DecompressKarabina(&a)
-
-	witness := e6DecompressKarabina2345{
-		A: FromE6(&b),
-		B: FromE6(&a),
-	}
-
-	err := test.IsSolved(&e6DecompressKarabina2345{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
 type e6CyclotomicSquare struct {
 	A, B E6
 }
@@ -365,20 +312,20 @@ func TestExptFp6(t *testing.T) {
 	assert.NoError(err)
 }
 
-type e6MulBy014 struct {
+type e6MulBy023 struct {
 	A    E6 `gnark:",public"`
 	W    E6
 	B, C baseEl
 }
 
-func (circuit *e6MulBy014) Define(api frontend.API) error {
+func (circuit *e6MulBy023) Define(api frontend.API) error {
 	e := NewExt6(api)
-	res := e.MulBy014(&circuit.A, &circuit.B, &circuit.C)
+	res := e.MulBy023(&circuit.A, &circuit.B, &circuit.C)
 	e.AssertIsEqual(res, &circuit.W)
 	return nil
 }
 
-func TestFp6MulBy014(t *testing.T) {
+func TestFp6MulBy023(t *testing.T) {
 
 	assert := test.NewAssert(t)
 	// witness values
@@ -391,14 +338,29 @@ func TestFp6MulBy014(t *testing.T) {
 	w.Set(&a)
 	w.MulBy014(&b, &c, &one)
 
-	witness := e6MulBy014{
+	witness := e6MulBy023{
 		A: FromE6(&a),
 		B: emulated.ValueOf[emulated.BW6761Fp](&b),
 		C: emulated.ValueOf[emulated.BW6761Fp](&c),
 		W: FromE6(&w),
 	}
 
-	err := test.IsSolved(&e6MulBy014{}, &witness, ecc.BN254.ScalarField())
+	err := test.IsSolved(&e6MulBy023{}, &witness, ecc.BN254.ScalarField())
 	assert.NoError(err)
+}
+
+func BenchmarkMulMontgomery6(b *testing.B) {
+	var c e6Mul
+	p := profile.Start()
+	_, _ = frontend.Compile(ecc.BN254.ScalarField(), scs.NewBuilder, &c)
+	p.Stop()
+	fmt.Println("Fp6 Mul (Montgomery-6): ", p.NbConstraints())
+}
 
+func BenchmarkSqMontgomery6(b *testing.B) {
+	var c e6Square
+	p := profile.Start()
+	_, _ = frontend.Compile(ecc.BN254.ScalarField(), scs.NewBuilder, &c)
+	p.Stop()
+	fmt.Println("Fp6 Square (Montgomery-6): ", p.NbConstraints())
 }
diff --git a/std/algebra/emulated/fields_bw6761/hints.go b/std/algebra/emulated/fields_bw6761/hints.go
index 879ea22206..cf983c51c7 100644
--- a/std/algebra/emulated/fields_bw6761/hints.go
+++ b/std/algebra/emulated/fields_bw6761/hints.go
@@ -4,7 +4,6 @@ import (
 	"math/big"
 
 	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
-	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark/constraint/solver"
 	"github.com/consensys/gnark/std/math/emulated"
 )
@@ -16,100 +15,30 @@ func init() {
 // GetHints returns all hint functions used in the package.
 func GetHints() []solver.Hint {
 	return []solver.Hint{
-		// E3
-		divE3Hint,
-		inverseE3Hint,
-		divE3By6Hint,
-		// E6
 		divE6Hint,
 		inverseE6Hint,
 	}
 }
 
-// E3
-func inverseE3Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error {
-	return emulated.UnwrapHint(nativeInputs, nativeOutputs,
-		func(mod *big.Int, inputs, outputs []*big.Int) error {
-			var a, c bw6761.E3
-
-			a.A0.SetBigInt(inputs[0])
-			a.A1.SetBigInt(inputs[1])
-			a.A2.SetBigInt(inputs[2])
-
-			c.Inverse(&a)
-
-			c.A0.BigInt(outputs[0])
-			c.A1.BigInt(outputs[1])
-			c.A2.BigInt(outputs[2])
-
-			return nil
-		})
-}
-
-func divE3By6Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error {
-	return emulated.UnwrapHint(nativeInputs, nativeOutputs,
-		func(mod *big.Int, inputs, outputs []*big.Int) error {
-			var a, c bw6761.E3
-
-			a.A0.SetBigInt(inputs[0])
-			a.A1.SetBigInt(inputs[1])
-			a.A2.SetBigInt(inputs[2])
-
-			var sixInv fp.Element
-			sixInv.SetString("6")
-			sixInv.Inverse(&sixInv)
-			c.MulByElement(&a, &sixInv)
-
-			c.A0.BigInt(outputs[0])
-			c.A1.BigInt(outputs[1])
-			c.A2.BigInt(outputs[2])
-
-			return nil
-		})
-}
-
-func divE3Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error {
-	return emulated.UnwrapHint(nativeInputs, nativeOutputs,
-		func(mod *big.Int, inputs, outputs []*big.Int) error {
-			var a, b, c bw6761.E3
-
-			a.A0.SetBigInt(inputs[0])
-			a.A1.SetBigInt(inputs[1])
-			a.A2.SetBigInt(inputs[2])
-			b.A0.SetBigInt(inputs[3])
-			b.A1.SetBigInt(inputs[4])
-			b.A2.SetBigInt(inputs[5])
-
-			c.Inverse(&b).Mul(&c, &a)
-
-			c.A0.BigInt(outputs[0])
-			c.A1.BigInt(outputs[1])
-			c.A2.BigInt(outputs[2])
-
-			return nil
-		})
-}
-
-// E6
 func inverseE6Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error {
 	return emulated.UnwrapHint(nativeInputs, nativeOutputs,
 		func(mod *big.Int, inputs, outputs []*big.Int) error {
 			var a, c bw6761.E6
 
 			a.B0.A0.SetBigInt(inputs[0])
-			a.B0.A1.SetBigInt(inputs[1])
-			a.B0.A2.SetBigInt(inputs[2])
-			a.B1.A0.SetBigInt(inputs[3])
-			a.B1.A1.SetBigInt(inputs[4])
+			a.B0.A1.SetBigInt(inputs[2])
+			a.B0.A2.SetBigInt(inputs[4])
+			a.B1.A0.SetBigInt(inputs[1])
+			a.B1.A1.SetBigInt(inputs[3])
 			a.B1.A2.SetBigInt(inputs[5])
 
 			c.Inverse(&a)
 
 			c.B0.A0.BigInt(outputs[0])
-			c.B0.A1.BigInt(outputs[1])
-			c.B0.A2.BigInt(outputs[2])
-			c.B1.A0.BigInt(outputs[3])
-			c.B1.A1.BigInt(outputs[4])
+			c.B0.A1.BigInt(outputs[2])
+			c.B0.A2.BigInt(outputs[4])
+			c.B1.A0.BigInt(outputs[1])
+			c.B1.A1.BigInt(outputs[3])
 			c.B1.A2.BigInt(outputs[5])
 
 			return nil
@@ -122,25 +51,25 @@ func divE6Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error
 			var a, b, c bw6761.E6
 
 			a.B0.A0.SetBigInt(inputs[0])
-			a.B0.A1.SetBigInt(inputs[1])
-			a.B0.A2.SetBigInt(inputs[2])
-			a.B1.A0.SetBigInt(inputs[3])
-			a.B1.A1.SetBigInt(inputs[4])
+			a.B0.A1.SetBigInt(inputs[2])
+			a.B0.A2.SetBigInt(inputs[4])
+			a.B1.A0.SetBigInt(inputs[1])
+			a.B1.A1.SetBigInt(inputs[3])
 			a.B1.A2.SetBigInt(inputs[5])
 			b.B0.A0.SetBigInt(inputs[6])
-			b.B0.A1.SetBigInt(inputs[7])
-			b.B0.A2.SetBigInt(inputs[8])
-			b.B1.A0.SetBigInt(inputs[9])
-			b.B1.A1.SetBigInt(inputs[10])
+			b.B0.A1.SetBigInt(inputs[8])
+			b.B0.A2.SetBigInt(inputs[10])
+			b.B1.A0.SetBigInt(inputs[7])
+			b.B1.A1.SetBigInt(inputs[9])
 			b.B1.A2.SetBigInt(inputs[11])
 
 			c.Inverse(&b).Mul(&c, &a)
 
 			c.B0.A0.BigInt(outputs[0])
-			c.B0.A1.BigInt(outputs[1])
-			c.B0.A2.BigInt(outputs[2])
-			c.B1.A0.BigInt(outputs[3])
-			c.B1.A1.BigInt(outputs[4])
+			c.B0.A1.BigInt(outputs[2])
+			c.B0.A2.BigInt(outputs[4])
+			c.B1.A0.BigInt(outputs[1])
+			c.B1.A1.BigInt(outputs[3])
 			c.B1.A2.BigInt(outputs[5])
 
 			return nil
diff --git a/std/algebra/emulated/sw_bw6761/pairing.go b/std/algebra/emulated/sw_bw6761/pairing.go
index d5b0ac4fa7..3ab4b9f98f 100644
--- a/std/algebra/emulated/sw_bw6761/pairing.go
+++ b/std/algebra/emulated/sw_bw6761/pairing.go
@@ -26,16 +26,12 @@ type GTEl = fields_bw6761.E6
 
 func NewGTEl(v bw6761.GT) GTEl {
 	return GTEl{
-		B0: fields_bw6761.E3{
-			A0: emulated.ValueOf[BaseField](v.B0.A0),
-			A1: emulated.ValueOf[BaseField](v.B0.A1),
-			A2: emulated.ValueOf[BaseField](v.B0.A2),
-		},
-		B1: fields_bw6761.E3{
-			A0: emulated.ValueOf[BaseField](v.B1.A0),
-			A1: emulated.ValueOf[BaseField](v.B1.A1),
-			A2: emulated.ValueOf[BaseField](v.B1.A2),
-		},
+		A0: emulated.ValueOf[BaseField](v.B0.A0),
+		A1: emulated.ValueOf[BaseField](v.B1.A0),
+		A2: emulated.ValueOf[BaseField](v.B0.A1),
+		A3: emulated.ValueOf[BaseField](v.B1.A1),
+		A4: emulated.ValueOf[BaseField](v.B0.A2),
+		A5: emulated.ValueOf[BaseField](v.B1.A2),
 	}
 }
 
@@ -294,99 +290,27 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 	}
 
 	// f_{x₀+1+λ(x₀³-x₀²-x₀),Q}(P), Q is known in advance
-	var prodLines [5]*emulated.Element[BaseField]
 	result := pr.Ext6.One()
 
-	// i = 188
-	// k = 0
-	result = &fields_bw6761.E6{
-		B0: fields_bw6761.E3{
-			A0: *pr.curveF.Mul(&lines[0][0][188].R1, yInv[0]),
-			A1: *pr.curveF.Mul(&lines[0][0][188].R0, xNegOverY[0]),
-			A2: result.B0.A2,
-		},
-		B1: fields_bw6761.E3{
-			A0: result.B1.A0,
-			A1: *pr.curveF.One(),
-			A2: result.B1.A2,
-		},
-	}
-
-	if n >= 2 {
-		// k = 1, separately to avoid MulBy014 (res × ℓ)
-		// (res is also a line at this point, so we use Mul014By014 ℓ × ℓ)
-		prodLines = pr.Mul014By014(
-			pr.curveF.Mul(&lines[1][0][188].R1, yInv[1]),
-			pr.curveF.Mul(&lines[1][0][188].R0, xNegOverY[1]),
-			&result.B0.A0,
-			&result.B0.A1,
-		)
-		result = &fields_bw6761.E6{
-			B0: fields_bw6761.E3{
-				A0: *prodLines[0],
-				A1: *prodLines[1],
-				A2: *prodLines[2],
-			},
-			B1: fields_bw6761.E3{
-				A0: result.B1.A0,
-				A1: *prodLines[3],
-				A2: *prodLines[4],
-			},
-		}
-	}
-
-	if n >= 3 {
-		// k = 2, separately to avoid MulBy014 (res × ℓ)
-		// (res has a zero E2 element, so we use Mul01245By014)
-		result = pr.Mul01245By014(
-			prodLines,
-			pr.curveF.Mul(&lines[2][0][188].R1, yInv[2]),
-			pr.curveF.Mul(&lines[2][0][188].R0, xNegOverY[2]),
-		)
-
-		// k >= 3
-		for k := 3; k < n; k++ {
-			result = pr.MulBy014(result,
-				pr.curveF.Mul(&lines[k][0][188].R1, yInv[k]),
-				pr.curveF.Mul(&lines[k][0][188].R0, xNegOverY[k]),
-			)
-		}
-	}
-
-	for i := 187; i >= 0; i-- {
+	for i := 188; i >= 0; i-- {
 		// mutualize the square among n Miller loops
 		// (∏ᵢfᵢ)²
 		result = pr.Square(result)
 
-		if i > 0 && loopCounter2[i]*3+loopCounter1[i] != 0 {
-			for k := 0; k < n; k++ {
-				prodLines = pr.Mul014By014(
+		for k := 0; k < n; k++ {
+			result = pr.MulBy023(result,
+				pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
+				pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
+			)
+			if i > 0 && loopCounter2[i]*3+loopCounter1[i] != 0 {
+				result = pr.MulBy023(result,
 					pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
 					pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
+				)
+				result = pr.MulBy023(result,
 					pr.curveF.Mul(&lines[k][1][i].R1, yInv[k]),
 					pr.curveF.Mul(&lines[k][1][i].R0, xNegOverY[k]),
 				)
-				result = pr.MulBy01245(result, prodLines)
-			}
-		} else {
-			// if number of lines is odd, mul last line by res
-			// works for n=1 as well
-			if n%2 != 0 {
-				// ℓ × res
-				result = pr.MulBy014(result,
-					pr.curveF.Mul(&lines[n-1][0][i].R1, yInv[n-1]),
-					pr.curveF.Mul(&lines[n-1][0][i].R0, xNegOverY[n-1]),
-				)
-			}
-			// mul lines 2-by-2
-			for k := 1; k < n; k += 2 {
-				prodLines = pr.Mul014By014(
-					pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
-					pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
-					pr.curveF.Mul(&lines[k-1][0][i].R1, yInv[k-1]),
-					pr.curveF.Mul(&lines[k-1][0][i].R0, xNegOverY[k-1]),
-				)
-				result = pr.MulBy01245(result, prodLines)
 			}
 		}
 	}

From ad255d3220ea55ac37b69e9ebf5989448d63a858 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Wed, 24 Apr 2024 21:31:38 -0400
Subject: [PATCH 02/24] fix(bw6): pairing using direct sextic extension

---
 std/algebra/emulated/sw_bw6761/pairing.go | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/std/algebra/emulated/sw_bw6761/pairing.go b/std/algebra/emulated/sw_bw6761/pairing.go
index 3ab4b9f98f..42a6487df0 100644
--- a/std/algebra/emulated/sw_bw6761/pairing.go
+++ b/std/algebra/emulated/sw_bw6761/pairing.go
@@ -302,11 +302,10 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 				pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
 				pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
 			)
-			if i > 0 && loopCounter2[i]*3+loopCounter1[i] != 0 {
-				result = pr.MulBy023(result,
-					pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
-					pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
-				)
+		}
+
+		if i > 0 && loopCounter2[i]*3+loopCounter1[i] != 0 {
+			for k := 0; k < n; k++ {
 				result = pr.MulBy023(result,
 					pr.curveF.Mul(&lines[k][1][i].R1, yInv[k]),
 					pr.curveF.Mul(&lines[k][1][i].R0, xNegOverY[k]),

From 931edcd7708744961416555feca89dbbdcbc0d5b Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Thu, 25 Apr 2024 13:06:12 -0400
Subject: [PATCH 03/24] perf(bw6): use Karabina12345 instead of GS for small
 sizes too

---
 std/algebra/emulated/fields_bw6761/e3         | 506 -----------------
 std/algebra/emulated/fields_bw6761/e3_test    | 410 --------------
 std/algebra/emulated/fields_bw6761/e6         | 512 ------------------
 std/algebra/emulated/fields_bw6761/e6.go      |   3 -
 .../emulated/fields_bw6761/e6_pairing.go      |  24 +-
 5 files changed, 11 insertions(+), 1444 deletions(-)
 delete mode 100644 std/algebra/emulated/fields_bw6761/e3
 delete mode 100644 std/algebra/emulated/fields_bw6761/e3_test
 delete mode 100644 std/algebra/emulated/fields_bw6761/e6

diff --git a/std/algebra/emulated/fields_bw6761/e3 b/std/algebra/emulated/fields_bw6761/e3
deleted file mode 100644
index b4ed19bddf..0000000000
--- a/std/algebra/emulated/fields_bw6761/e3
+++ /dev/null
@@ -1,506 +0,0 @@
-package fields_bw6761
-
-import (
-	"math/big"
-
-	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
-	"github.com/consensys/gnark/frontend"
-	"github.com/consensys/gnark/std/math/emulated"
-)
-
-type curveF = emulated.Field[emulated.BW6761Fp]
-type baseEl = emulated.Element[emulated.BW6761Fp]
-
-type E3 struct {
-	A0, A1, A2 baseEl
-}
-
-type Ext3 struct {
-	api frontend.API
-	fp  *curveF
-}
-
-func NewExt3(api frontend.API) *Ext3 {
-	fp, err := emulated.NewField[emulated.BW6761Fp](api)
-	if err != nil {
-		panic(err)
-	}
-	return &Ext3{
-		api: api,
-		fp:  fp,
-	}
-}
-
-func (e Ext3) Reduce(x *E3) *E3 {
-	var z E3
-	z.A0 = *e.fp.Reduce(&x.A0)
-	z.A1 = *e.fp.Reduce(&x.A1)
-	z.A2 = *e.fp.Reduce(&x.A2)
-	return &z
-}
-
-func (e Ext3) Zero() *E3 {
-	zero := e.fp.Zero()
-	return &E3{
-		A0: *zero,
-		A1: *zero,
-		A2: *zero,
-	}
-}
-
-func (e Ext3) One() *E3 {
-	one := e.fp.One()
-	zero := e.fp.Zero()
-	return &E3{
-		A0: *one,
-		A1: *zero,
-		A2: *zero,
-	}
-}
-
-func (e Ext3) Neg(x *E3) *E3 {
-	a0 := e.fp.Neg(&x.A0)
-	a1 := e.fp.Neg(&x.A1)
-	a2 := e.fp.Neg(&x.A2)
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-func (e Ext3) Add(x, y *E3) *E3 {
-	a0 := e.fp.Add(&x.A0, &y.A0)
-	a1 := e.fp.Add(&x.A1, &y.A1)
-	a2 := e.fp.Add(&x.A2, &y.A2)
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-func (e Ext3) Sub(x, y *E3) *E3 {
-	a0 := e.fp.Sub(&x.A0, &y.A0)
-	a1 := e.fp.Sub(&x.A1, &y.A1)
-	a2 := e.fp.Sub(&x.A2, &y.A2)
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-func (e Ext3) Double(x *E3) *E3 {
-	two := big.NewInt(2)
-	a0 := e.fp.MulConst(&x.A0, two)
-	a1 := e.fp.MulConst(&x.A1, two)
-	a2 := e.fp.MulConst(&x.A2, two)
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-func mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
-
-	z := fp.Neg(x)
-	z = fp.MulConst(z, big.NewInt(4))
-	return z
-}
-
-func (e Ext3) Conjugate(x *E3) *E3 {
-	a1 := e.fp.Neg(&x.A1)
-	return &E3{
-		A0: x.A0,
-		A1: *a1,
-		A2: x.A2,
-	}
-}
-
-func (e Ext3) MulByElement(x *E3, y *baseEl) *E3 {
-	a0 := e.fp.Mul(&x.A0, y)
-	a1 := e.fp.Mul(&x.A1, y)
-	a2 := e.fp.Mul(&x.A2, y)
-	z := &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-	return z
-}
-
-func (e Ext3) MulByConstElement(x *E3, y *big.Int) *E3 {
-	a0 := e.fp.MulConst(&x.A0, y)
-	a1 := e.fp.MulConst(&x.A1, y)
-	a2 := e.fp.MulConst(&x.A2, y)
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-// MulBy01 multiplication by sparse element (c0,c1,0)
-func (e Ext3) MulBy01(z *E3, c0, c1 *baseEl) *E3 {
-
-	a := e.fp.Mul(&z.A0, c0)
-	b := e.fp.Mul(&z.A1, c1)
-
-	tmp := e.fp.Add(&z.A1, &z.A2)
-	t0 := e.fp.Mul(c1, tmp)
-	t0 = e.fp.Sub(b, t0)
-	t0 = e.fp.MulConst(t0, big.NewInt(4))
-	t0 = e.fp.Add(t0, a)
-
-	// for t2, schoolbook is faster than karatsuba
-	// c2 = a0b2 + a1b1 + a2b0,
-	// c2 = a2b0 + b ∵ b2 = 0, b = a1b1
-	t2 := e.fp.Mul(&z.A2, c0)
-	t2 = e.fp.Add(t2, b)
-
-	t1 := e.fp.Add(c0, c1)
-	tmp = e.fp.Add(&z.A0, &z.A1)
-	t1 = e.fp.Mul(t1, tmp)
-	tmp = e.fp.Add(b, a)
-	t1 = e.fp.Sub(t1, tmp)
-
-	return &E3{
-		A0: *t0,
-		A1: *t1,
-		A2: *t2,
-	}
-}
-
-// MulBy1 multiplication of E6 by sparse element (0, c1, 0)
-func (e Ext3) MulBy1(z *E3, c1 *baseEl) *E3 {
-
-	b := e.fp.Mul(&z.A1, c1)
-	tmp := e.fp.Add(&z.A1, &z.A2)
-	t0 := e.fp.Mul(c1, tmp)
-	t0 = e.fp.Sub(b, t0)
-	t0 = e.fp.MulConst(t0, big.NewInt(4))
-	tmp = e.fp.Add(&z.A0, &z.A1)
-	t1 := e.fp.Mul(c1, tmp)
-	t1 = e.fp.Sub(t1, b)
-
-	return &E3{
-		A0: *t0,
-		A1: *t1,
-		A2: *b,
-	}
-}
-
-// MulBy12 multiplication by sparse element (0,b1,b2)
-func (e Ext3) MulBy12(x *E3, b1, b2 *baseEl) *E3 {
-	t1 := e.fp.Mul(&x.A1, b1)
-	t2 := e.fp.Mul(&x.A2, b2)
-	c0 := e.fp.Add(&x.A1, &x.A2)
-	tmp := e.fp.Add(b1, b2)
-	c0 = e.fp.Mul(c0, tmp)
-	c0 = e.fp.Sub(c0, t1)
-	c0 = e.fp.Sub(t2, c0)
-	c0 = e.fp.MulConst(c0, big.NewInt(4))
-	c1 := e.fp.Add(&x.A0, &x.A1)
-	c1 = e.fp.Mul(c1, b1)
-	c1 = e.fp.Sub(c1, t1)
-	tmp = mulFpByNonResidue(e.fp, t2)
-	c1 = e.fp.Add(c1, tmp)
-	tmp = e.fp.Add(&x.A0, &x.A2)
-	c2 := e.fp.Mul(b2, tmp)
-	c2 = e.fp.Sub(c2, t2)
-	c2 = e.fp.Add(c2, t1)
-	return &E3{
-		A0: *c0,
-		A1: *c1,
-		A2: *c2,
-	}
-}
-
-// Mul01By01 multiplies two E3 sparse element of the form:
-//
-//	E3{
-//		A0: c0,
-//		A1: c1,
-//		A2: 0,
-//	}
-//
-// and
-//
-//	E3{
-//		A0: d0,
-//		A1: d1,
-//		A2: 0,
-//	}
-func (e Ext3) Mul01By01(c0, c1, d0, d1 *baseEl) *E3 {
-	a := e.fp.Mul(d0, c0)
-	b := e.fp.Mul(d1, c1)
-	t1 := e.fp.Add(c0, c1)
-	tmp := e.fp.Add(d0, d1)
-	t1 = e.fp.Mul(t1, tmp)
-	tmp = e.fp.Add(b, a)
-	t1 = e.fp.Sub(t1, tmp)
-	return &E3{
-		A0: *a,
-		A1: *t1,
-		A2: *b,
-	}
-}
-
-func (e Ext3) Mul(x, y *E3) *E3 {
-	return e.MulKaratsuba(x, y)
-}
-
-func (e Ext3) MulKaratsuba(x, y *E3) *E3 {
-	// Algorithm 13 from https://eprint.iacr.org/2010/354.pdf
-	t0 := e.fp.Mul(&x.A0, &y.A0)
-	t1 := e.fp.Mul(&x.A1, &y.A1)
-	t2 := e.fp.Mul(&x.A2, &y.A2)
-
-	c0 := e.fp.Add(&x.A1, &x.A2)
-	tmp := e.fp.Add(&y.A1, &y.A2)
-	c0 = e.fp.Mul(c0, tmp)
-	tmp = e.fp.Add(t2, t1)
-	c0 = e.fp.Sub(c0, tmp)
-	c0 = mulFpByNonResidue(e.fp, c0)
-	c0 = e.fp.Add(c0, t0)
-
-	c1 := e.fp.Add(&x.A0, &x.A1)
-	tmp = e.fp.Add(&y.A0, &y.A1)
-	c1 = e.fp.Mul(c1, tmp)
-	tmp = e.fp.MulConst(t2, big.NewInt(4))
-	tmp = e.fp.Add(tmp, t1)
-	tmp = e.fp.Add(tmp, t0)
-	c1 = e.fp.Sub(c1, tmp)
-
-	c2 := e.fp.Add(&y.A0, &y.A2)
-	tmp = e.fp.Add(&x.A0, &x.A2)
-	c2 = e.fp.Mul(c2, tmp)
-	c2 = e.fp.Add(c2, t1)
-	tmp = e.fp.Add(t2, t0)
-	c2 = e.fp.Sub(c2, tmp)
-
-	return &E3{
-		A0: *c0,
-		A1: *c1,
-		A2: *c2,
-	}
-}
-
-func (e Ext3) MulToomCook3(x, y *E3) *E3 {
-	// Toom-Cook-3x:
-	// We start by computing five interpolation points – these are evaluations of
-	// the product x(u)y(u) with u ∈ {0, ±1, 2, ∞}:
-	//
-	// v0 = x(0)y(0) = x.A0 * y.A0
-	// v1 = x(1)y(1) = (x.A0 + x.A1 + x.A2)(y.A0 + y.A1 + y.A2)
-	// v2 = x(−1)y(−1) = (x.A0 − x.A1 + x.A2)(y.A0 − y.A1 + y.A2)
-	// v3 = x(2)y(2) = (x.A0 + 2x.A1 + 4x.A2)(y.A0 + 2y.A1 + 4y.A2)
-	// v4 = x(∞)y(∞) = x.A2 * y.A2
-	//
-	// Then the interpolation is performed as:
-	//
-	// a0 = v0 + β((1/2)v0 − (1/2)v1 − (1/6)v2 + (1/6)v3 − 2v4)
-	// a1 = −(1/2)v0 + v1 − (1/3)v2 − (1/6)v3 + 2v4 + βv4
-	// a2 = −v0 + (1/2)v1 + (1/2)v2 − v4
-	//
-	// where is β=-4 the cubic non-residue (mulFpByNonResidue).
-	//
-	// In-circuit, we compute 6*x*y as
-	// a0 = 6v0 - β(3(v1 - v0 + 4v4) + v2 - v3)
-	// a1 = -(3v0 + 2v2 + v3) + 6(v1 + 2v4 + βv4)
-	// a2 = 3(v1 + v2 - 2(v0 + v4))
-	//
-	// and then divide a0, a1 and a2 by 6 using a hint.
-	//
-	// This costs 5M + 22A.
-
-	two := big.NewInt(2)
-	three := big.NewInt(3)
-	four := big.NewInt(4)
-	six := big.NewInt(6)
-
-	v0 := e.fp.Mul(&x.A0, &y.A0)
-	t1 := e.fp.Add(&x.A0, &x.A2)
-	t2 := e.fp.Add(&y.A0, &y.A2)
-	t3 := e.fp.Add(t2, &y.A1)
-	v1 := e.fp.Add(t1, &x.A1)
-	v1 = e.fp.Mul(v1, t3)
-	t3 = e.fp.Sub(t2, &y.A1)
-	v2 := e.fp.Sub(t1, &x.A1)
-	v2 = e.fp.Mul(v2, t3)
-	t1 = e.fp.MulConst(&x.A1, two)
-	t2 = e.fp.MulConst(&x.A2, four)
-	v3 := e.fp.Add(t1, t2)
-	v3 = e.fp.Add(v3, &x.A0)
-	t1 = e.fp.MulConst(&y.A1, two)
-	t2 = e.fp.MulConst(&y.A2, four)
-	t3 = e.fp.Add(t1, t2)
-	t3 = e.fp.Add(t3, &y.A0)
-	v3 = e.fp.Mul(v3, t3)
-	v4 := e.fp.Mul(&x.A2, &y.A2)
-
-	a0 := e.fp.Sub(v1, v0)
-	t1 = e.fp.MulConst(v4, four)
-	a0 = e.fp.Add(a0, t1)
-	a0 = e.fp.MulConst(a0, three)
-	a0 = e.fp.Sub(a0, v3)
-	a0 = e.fp.Add(a0, v2)
-	a0 = e.fp.MulConst(a0, four)
-	t1 = e.fp.MulConst(v0, six)
-	a0 = e.fp.Add(a0, t1)
-
-	t1 = e.fp.MulConst(v0, three)
-	t2 = e.fp.MulConst(v2, two)
-	t1 = e.fp.Add(t1, t2)
-	t1 = e.fp.Add(t1, v3)
-	a1 := e.fp.MulConst(v4, two)
-	a1 = e.fp.Sub(v1, a1)
-	a1 = e.fp.MulConst(a1, six)
-	a1 = e.fp.Sub(a1, t1)
-
-	t1 = e.fp.Add(v0, v4)
-	t1 = e.fp.MulConst(t1, two)
-	a2 := e.fp.Add(v1, v2)
-	a2 = e.fp.Sub(a2, t1)
-	a2 = e.fp.MulConst(a2, three)
-
-	return e.divE3By6(
-		&E3{A0: *a0, A1: *a1, A2: *a2},
-	)
-}
-
-func (e Ext3) Square(x *E3) *E3 {
-	// Chung-Hasan (SQR2)
-	// Algorithm 16 from https://eprint.iacr.org/2010/354.pdf
-
-	c6 := e.fp.MulConst(&x.A1, big.NewInt(2))
-	c4 := e.fp.Mul(&x.A0, c6) // x.A0 * xA1 * 2
-	c5 := e.fp.Mul(&x.A2, &x.A2)
-	c1 := mulFpByNonResidue(e.fp, c5)
-	c1 = e.fp.Add(c1, c4)
-	c2 := e.fp.Sub(c4, c5)
-
-	c3 := e.fp.Mul(&x.A0, &x.A0)
-	c4 = e.fp.Sub(&x.A0, &x.A1)
-	c4 = e.fp.Add(c4, &x.A2)
-	c5 = e.fp.Mul(c6, &x.A2) // x.A1 * xA2 * 2
-	c4 = e.fp.Mul(c4, c4)
-	c0 := mulFpByNonResidue(e.fp, c5)
-	c4 = e.fp.Add(c4, c5)
-	c4 = e.fp.Sub(c4, c3)
-
-	a0 := e.fp.Add(c0, c3)
-	a1 := c1
-	a2 := e.fp.Add(c2, c4)
-
-	return &E3{
-		A0: *a0,
-		A1: *a1,
-		A2: *a2,
-	}
-}
-
-func (e Ext3) Inverse(x *E3) *E3 {
-	res, err := e.fp.NewHint(inverseE3Hint, 3, &x.A0, &x.A1, &x.A2)
-	if err != nil {
-		// err is non-nil only for invalid number of inputs
-		panic(err)
-	}
-
-	inv := E3{
-		A0: *res[0],
-		A1: *res[1],
-		A2: *res[2],
-	}
-	one := e.One()
-
-	// 1 == inv * x
-	_one := e.Mul(&inv, x)
-	e.AssertIsEqual(one, _one)
-
-	return &inv
-
-}
-
-func (e Ext3) DivUnchecked(x, y *E3) *E3 {
-	res, err := e.fp.NewHint(divE3Hint, 6, &x.A0, &x.A1, &x.A2, &y.A0, &y.A1, &y.A2)
-	if err != nil {
-		// err is non-nil only for invalid number of inputs
-		panic(err)
-	}
-
-	div := E3{
-		A0: *res[0],
-		A1: *res[1],
-		A2: *res[2],
-	}
-
-	// x = div * y
-	_x := e.Mul(&div, y)
-	e.AssertIsEqual(x, _x)
-
-	return &div
-
-}
-
-func (e Ext3) divE3By6(x *E3) *E3 {
-	res, err := e.fp.NewHint(divE3By6Hint, 3, &x.A0, &x.A1, &x.A2)
-	if err != nil {
-		// err is non-nil only for invalid number of inputs
-		panic(err)
-	}
-
-	y := E3{
-		A0: *res[0],
-		A1: *res[1],
-		A2: *res[2],
-	}
-
-	// x == 6 * y
-	_x := e.MulByConstElement(&y, big.NewInt(6))
-	e.AssertIsEqual(x, _x)
-
-	return &y
-}
-
-// MulByNonResidue mul x by (0,1,0)
-func (e Ext3) MulByNonResidue(x *E3) *E3 {
-	z := &E3{
-		A0: x.A2,
-		A1: x.A0,
-		A2: x.A1,
-	}
-	z.A0 = *mulFpByNonResidue(e.fp, &z.A0)
-	return z
-}
-
-func (e Ext3) AssertIsEqual(a, b *E3) {
-	e.fp.AssertIsEqual(&a.A0, &b.A0)
-	e.fp.AssertIsEqual(&a.A1, &b.A1)
-	e.fp.AssertIsEqual(&a.A2, &b.A2)
-}
-
-func (e Ext3) Copy(x *E3) *E3 {
-	return &E3{
-		A0: x.A0,
-		A1: x.A1,
-		A2: x.A2,
-	}
-}
-
-func FromE3(a *bw6761.E3) E3 {
-	return E3{
-		A0: emulated.ValueOf[emulated.BW6761Fp](a.A0),
-		A1: emulated.ValueOf[emulated.BW6761Fp](a.A1),
-		A2: emulated.ValueOf[emulated.BW6761Fp](a.A2),
-	}
-}
-
-func (e Ext3) Select(selector frontend.Variable, z1, z0 *E3) *E3 {
-	a0 := e.fp.Select(selector, &z1.A0, &z0.A0)
-	a1 := e.fp.Select(selector, &z1.A1, &z0.A1)
-	a2 := e.fp.Select(selector, &z1.A2, &z0.A2)
-	return &E3{A0: *a0, A1: *a1, A2: *a2}
-}
diff --git a/std/algebra/emulated/fields_bw6761/e3_test b/std/algebra/emulated/fields_bw6761/e3_test
deleted file mode 100644
index 1893258437..0000000000
--- a/std/algebra/emulated/fields_bw6761/e3_test
+++ /dev/null
@@ -1,410 +0,0 @@
-package fields_bw6761
-
-import (
-	"testing"
-
-	"github.com/consensys/gnark-crypto/ecc"
-	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
-	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
-	"github.com/consensys/gnark/frontend"
-	"github.com/consensys/gnark/std/math/emulated"
-	"github.com/consensys/gnark/test"
-)
-
-type e3Add struct {
-	A, B, C E3
-}
-
-func (circuit *e3Add) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Add(&circuit.A, &circuit.B)
-	e.AssertIsEqual(expected, &circuit.C)
-	return nil
-}
-
-func TestAddFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b, c bw6761.E3
-	_, _ = a.SetRandom()
-	_, _ = b.SetRandom()
-	c.Add(&a, &b)
-
-	witness := e3Add{
-		A: FromE3(&a),
-		B: FromE3(&b),
-		C: FromE3(&c),
-	}
-
-	err := test.IsSolved(&e3Add{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Sub struct {
-	A, B, C E3
-}
-
-func (circuit *e3Sub) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Sub(&circuit.A, &circuit.B)
-	e.AssertIsEqual(expected, &circuit.C)
-	return nil
-}
-
-func TestSubFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b, c bw6761.E3
-	_, _ = a.SetRandom()
-	_, _ = b.SetRandom()
-	c.Sub(&a, &b)
-
-	witness := e3Sub{
-		A: FromE3(&a),
-		B: FromE3(&b),
-		C: FromE3(&c),
-	}
-
-	err := test.IsSolved(&e3Sub{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Neg struct {
-	A, B E3
-}
-
-func (circuit *e3Neg) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Neg(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestNegFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Neg(&a)
-
-	witness := e3Neg{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3Neg{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Double struct {
-	A, B E3
-}
-
-func (circuit *e3Double) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Double(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestDoubleFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Double(&a)
-
-	witness := e3Double{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3Double{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Mul struct {
-	A, B, C E3
-}
-
-func (circuit *e3Mul) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Mul(&circuit.A, &circuit.B)
-	e.AssertIsEqual(expected, &circuit.C)
-	return nil
-}
-
-func TestMulFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b, c bw6761.E3
-	_, _ = a.SetRandom()
-	_, _ = b.SetRandom()
-	c.Mul(&a, &b)
-
-	witness := e3Mul{
-		A: FromE3(&a),
-		B: FromE3(&b),
-		C: FromE3(&c),
-	}
-
-	err := test.IsSolved(&e3Mul{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Mul01By01 struct {
-	A0, A1 baseEl
-	B0, B1 baseEl
-	C      E3
-}
-
-func (circuit *e3Mul01By01) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Mul01By01(&circuit.A0, &circuit.A1, &circuit.B0, &circuit.B1)
-	e.AssertIsEqual(expected, &circuit.C)
-
-	return nil
-}
-
-func TestMul01By01(t *testing.T) {
-
-	// we test our new E3.Mul01By01 against E3.MulBy01
-	assert := test.NewAssert(t)
-	// witness values
-	var a, c bw6761.E3
-	var A0, A1, B0, B1 fp.Element
-	A0.SetRandom()
-	A1.SetRandom()
-	B0.SetRandom()
-	B1.SetRandom()
-	// build a 01 sparse E3 with,
-	// first two elements as A1 and A2,
-	// and the third as 0
-	a.A0 = A0
-	a.A1 = A1
-	a.A2.SetZero()
-	c.Set(&a)
-	c.MulBy01(&B0, &B1)
-
-	witness := e3Mul01By01{
-		A0: emulated.ValueOf[emulated.BW6761Fp](A0),
-		A1: emulated.ValueOf[emulated.BW6761Fp](A1),
-		B0: emulated.ValueOf[emulated.BW6761Fp](B0),
-		B1: emulated.ValueOf[emulated.BW6761Fp](B1),
-		C:  FromE3(&c),
-	}
-
-	err := test.IsSolved(&e3Mul01By01{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-
-}
-
-type e3MulByNonResidue struct {
-	A, B E3
-}
-
-func (circuit *e3MulByNonResidue) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.MulByNonResidue(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestMulByNonResidueFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Set(&a)
-	b.MulByNonResidue(&a)
-
-	witness := e3MulByNonResidue{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3MulByNonResidue{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3MulByElement struct {
-	A E3
-	Y baseEl
-	B E3
-}
-
-func (circuit *e3MulByElement) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.MulByElement(&circuit.A, &circuit.Y)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestMulByElementFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	var y fp.Element
-	y.SetRandom()
-	b.Set(&a)
-	b.MulByElement(&a, &y)
-
-	witness := e3MulByElement{
-		A: FromE3(&a),
-		Y: emulated.ValueOf[emulated.BW6761Fp](y),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3MulByElement{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3MulBy01 struct {
-	A      E3
-	C0, C1 baseEl
-	B      E3
-}
-
-func (circuit *e3MulBy01) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.MulBy01(&circuit.A, &circuit.C0, &circuit.C1)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestMulBy01Fp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	var c0, c1 fp.Element
-	c0.SetRandom()
-	c1.SetRandom()
-	b.Set(&a)
-	b.MulBy01(&c0, &c1)
-
-	witness := e3MulBy01{
-		A:  FromE3(&a),
-		C0: emulated.ValueOf[emulated.BW6761Fp](c0),
-		C1: emulated.ValueOf[emulated.BW6761Fp](c1),
-		B:  FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3MulBy01{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Square struct {
-	A, B E3
-}
-
-func (circuit *e3Square) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Square(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestSquareFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Square(&a)
-
-	witness := e3Square{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3Square{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Inverse struct {
-	A, B E3
-}
-
-func (circuit *e3Inverse) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Inverse(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestInverseFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Inverse(&a)
-
-	witness := e3Inverse{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	// add=50605 equals=769 fromBinary=0 mul=50315 sub=558 toBinary=0
-	err := test.IsSolved(&e3Inverse{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
-type e3Div struct {
-	A, B, C E3
-}
-
-func (circuit *e3Div) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.DivUnchecked(&circuit.A, &circuit.B)
-	e.AssertIsEqual(expected, &circuit.C)
-	return nil
-}
-
-func TestDivFp3(t *testing.T) {
-
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b, c bw6761.E3
-	_, _ = a.SetRandom()
-	_, _ = b.SetRandom()
-	c.Inverse(&b)
-	c.Mul(&a, &c)
-
-	witness := e3Div{
-		A: FromE3(&a),
-		B: FromE3(&b),
-		C: FromE3(&c),
-	}
-
-	err := test.IsSolved(&e3Div{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-
-}
-
-type e3Conjugate struct {
-	A, B E3
-}
-
-func (circuit *e3Conjugate) Define(api frontend.API) error {
-	e := NewExt3(api)
-	expected := e.Conjugate(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestConjugateFp3(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E3
-	_, _ = a.SetRandom()
-	b.Conjugate(&a)
-
-	witness := e3Conjugate{
-		A: FromE3(&a),
-		B: FromE3(&b),
-	}
-
-	err := test.IsSolved(&e3Conjugate{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
diff --git a/std/algebra/emulated/fields_bw6761/e6 b/std/algebra/emulated/fields_bw6761/e6
deleted file mode 100644
index be781adcb4..0000000000
--- a/std/algebra/emulated/fields_bw6761/e6
+++ /dev/null
@@ -1,512 +0,0 @@
-package fields_bw6761
-
-import (
-	"math/big"
-
-	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
-	"github.com/consensys/gnark/frontend"
-	"github.com/consensys/gnark/std/math/emulated"
-)
-
-type E6 struct {
-	B0, B1 E3
-}
-
-type Ext6 struct {
-	*Ext3
-}
-
-func (e Ext6) Reduce(x *E6) *E6 {
-	var z E6
-	z.B0 = *e.Ext3.Reduce(&x.B0)
-	z.B1 = *e.Ext3.Reduce(&x.B1)
-	return &z
-}
-
-func NewExt6(api frontend.API) *Ext6 {
-	return &Ext6{Ext3: NewExt3(api)}
-}
-
-func (e Ext6) Zero() *E6 {
-	b0 := e.Ext3.Zero()
-	b1 := e.Ext3.Zero()
-	return &E6{
-		B0: *b0,
-		B1: *b1,
-	}
-}
-
-func (e Ext6) One() *E6 {
-	return &E6{
-		B0: *e.Ext3.One(),
-		B1: *e.Ext3.Zero(),
-	}
-}
-
-func (e Ext6) Add(x, y *E6) *E6 {
-	return &E6{
-		B0: *e.Ext3.Add(&x.B0, &y.B0),
-		B1: *e.Ext3.Add(&x.B1, &y.B1),
-	}
-}
-
-func (e Ext6) Sub(x, y *E6) *E6 {
-	return &E6{
-		B0: *e.Ext3.Sub(&x.B0, &y.B0),
-		B1: *e.Ext3.Sub(&x.B1, &y.B1),
-	}
-}
-
-func (e Ext6) Double(x *E6) *E6 {
-	return &E6{
-		B0: *e.Ext3.Double(&x.B0),
-		B1: *e.Ext3.Double(&x.B1),
-	}
-}
-
-func (e Ext6) Mul(x, y *E6) *E6 {
-	x = e.Reduce(x)
-	y = e.Reduce(y)
-
-	a := e.Ext3.Add(&x.B0, &x.B1)
-	b := e.Ext3.Add(&y.B0, &y.B1)
-	a = e.Ext3.Mul(a, b)
-	b = e.Ext3.Mul(&x.B0, &y.B0)
-	c := e.Ext3.Mul(&x.B1, &y.B1)
-	b1 := e.Ext3.Sub(a, b)
-	b1 = e.Ext3.Sub(b1, c)
-	b0 := e.Ext3.MulByNonResidue(c)
-	b0 = e.Ext3.Add(b0, b)
-
-	return &E6{
-		B0: *b0,
-		B1: *b1,
-	}
-}
-
-func (e Ext6) Square(x *E6) *E6 {
-
-	x = e.Reduce(x)
-	//Algorithm 22 from https://eprint.iacr.org/2010/354.pdf
-	c0 := e.Ext3.Sub(&x.B0, &x.B1)
-	c3 := e.Ext3.MulByNonResidue(&x.B1)
-	c3 = e.Ext3.Neg(c3)
-	c3 = e.Ext3.Add(&x.B0, c3)
-	c2 := e.Ext3.Mul(&x.B0, &x.B1)
-	c0 = e.Ext3.Mul(c0, c3)
-	c0 = e.Ext3.Add(c0, c2)
-	b1 := e.Ext3.Double(c2)
-	c2 = e.Ext3.MulByNonResidue(c2)
-	b0 := e.Ext3.Add(c0, c2)
-
-	return &E6{
-		B0: *b0,
-		B1: *b1,
-	}
-}
-
-// Karabina's compressed cyclotomic square SQR12345
-// https://eprint.iacr.org/2010/542.pdf
-// Sec. 5.6 with minor modifications to fit our tower
-func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
-	x = e.Reduce(x)
-
-	// h4 = -g4 + 3((g3+g5)(g1+c*g2)-g1g5-c*g3g2)
-	g1g5 := e.fp.Mul(&x.B0.A1, &x.B1.A2)
-	g3g2 := e.fp.Mul(&x.B1.A0, &x.B0.A2)
-	h4 := mulFpByNonResidue(e.fp, &x.B0.A2)
-	h4 = e.fp.Add(h4, &x.B0.A1)
-	t := e.fp.Add(&x.B1.A0, &x.B1.A2)
-	h4 = e.fp.Mul(h4, t)
-	h4 = e.fp.Sub(h4, g1g5)
-	t = e.fp.MulConst(g3g2, big.NewInt(4))
-	h4 = e.fp.Add(h4, t)
-	h4 = e.fp.MulConst(h4, big.NewInt(3))
-	h4 = e.fp.Sub(h4, &x.B1.A1)
-
-	// h3 = 2(g3+3c*g1g5)
-	h3 := mulFpByNonResidue(e.fp, g1g5)
-	h3 = e.fp.MulConst(h3, big.NewInt(3))
-	h3 = e.fp.Add(h3, &x.B1.A0)
-	h3 = e.fp.MulConst(h3, big.NewInt(2))
-
-	// h2 = 3((g1+g5)(g1+c*g5)-(c+1)*g1g5)-2g2
-	t = mulFpByNonResidue(e.fp, &x.B1.A2)
-	t = e.fp.Add(t, &x.B0.A1)
-	h2 := e.fp.Add(&x.B1.A2, &x.B0.A1)
-	h2 = e.fp.Mul(h2, t)
-	t = e.fp.MulConst(g1g5, big.NewInt(3))
-	h2 = e.fp.Add(h2, t)
-	h2 = e.fp.MulConst(h2, big.NewInt(3))
-	t = e.fp.MulConst(&x.B0.A2, big.NewInt(2))
-	h2 = e.fp.Sub(h2, t)
-
-	// h1 = 3((g3+g2)(g3+c*g2)-(c+1)*g3g2)-2g1
-	t = mulFpByNonResidue(e.fp, &x.B0.A2)
-	t = e.fp.Add(t, &x.B1.A0)
-	h1 := e.fp.Add(&x.B0.A2, &x.B1.A0)
-	h1 = e.fp.Mul(h1, t)
-	t = e.fp.MulConst(g3g2, big.NewInt(3))
-	h1 = e.fp.Add(h1, t)
-	h1 = e.fp.MulConst(h1, big.NewInt(3))
-	t = e.fp.MulConst(&x.B0.A1, big.NewInt(2))
-	h1 = e.fp.Sub(h1, t)
-
-	// h5 = 2(g5+3g3g2)
-	h5 := e.fp.MulConst(g3g2, big.NewInt(3))
-	h5 = e.fp.Add(h5, &x.B1.A2)
-	h5 = e.fp.MulConst(h5, big.NewInt(2))
-
-	return &E6{
-		B0: E3{
-			A0: x.B0.A0,
-			A1: *h1,
-			A2: *h2,
-		},
-		B1: E3{
-			A0: *h3,
-			A1: *h4,
-			A2: *h5,
-		},
-	}
-}
-
-// DecompressKarabina12345 decompresses Karabina's cyclotomic square result SQR12345
-func (e Ext6) DecompressKarabina12345(x *E6) *E6 {
-	x = e.Reduce(x)
-
-	// h0 = (2g4^2 + g3g5 - 3g2g1)*c + 1
-	t0 := e.fp.Mul(&x.B0.A1, &x.B0.A2)
-	t0 = e.fp.MulConst(t0, big.NewInt(3))
-	t1 := e.fp.Mul(&x.B1.A0, &x.B1.A2)
-	h0 := e.fp.Mul(&x.B1.A1, &x.B1.A1)
-	h0 = e.fp.MulConst(h0, big.NewInt(2))
-	h0 = e.fp.Add(h0, t1)
-	h0 = e.fp.Sub(t0, h0)
-	h0 = e.fp.MulConst(h0, big.NewInt(4))
-	h0 = e.fp.Add(h0, e.fp.One())
-
-	return &E6{
-		B0: E3{
-			A0: *h0,
-			A1: x.B0.A1,
-			A2: x.B0.A2,
-		},
-		B1: x.B1,
-	}
-}
-
-// Karabina's compressed cyclotomic square SQR2345
-// https://eprint.iacr.org/2010/542.pdf
-// Th. 3.2 with minor modifications to fit our tower
-func (e Ext6) CyclotomicSquareKarabina2345(x *E6) *E6 {
-	x = e.Reduce(x)
-	z := e.Copy(x)
-
-	var t [7]*baseEl
-
-	// t0 = g1²
-	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	// t1 = g5²
-	t[1] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	// t5 = g1 + g5
-	t[5] = e.fp.Add(&x.B0.A1, &x.B1.A2)
-	// t2 = (g1 + g5)²
-	t[2] = e.fp.Mul(t[5], t[5])
-
-	// t3 = g1² + g5²
-	t[3] = e.fp.Add(t[0], t[1])
-	// t5 = 2 * g1 * g5
-	t[5] = e.fp.Sub(t[3], t[2])
-
-	// t6 = g3 + g2
-	t[6] = e.fp.Add(&x.B1.A0, &x.B0.A2)
-	// t3 = (g3 + g2)²
-	t[3] = e.fp.Mul(t[6], t[6])
-	// t2 = g3²
-	t[2] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
-
-	// t6 = 2 * nr * g1 * g5
-	t[6] = e.fp.MulConst(t[5], big.NewInt(4))
-	// t5 = 4 * nr * g1 * g5 + 2 * g3
-	t[5] = e.fp.Add(t[6], &x.B1.A0)
-	t[5] = e.fp.MulConst(t[5], big.NewInt(2))
-	// z3 = 6 * nr * g1 * g5 + 2 * g3
-	z.B1.A0 = *e.fp.Add(t[5], t[6])
-
-	// t4 = nr * g5²
-	t[4] = mulFpByNonResidue(e.fp, t[1])
-	// t5 = nr * g5² + g1²
-	t[5] = e.fp.Add(t[0], t[4])
-	// t6 = nr * g5² + g1² - g2
-	t[6] = e.fp.Sub(t[5], &x.B0.A2)
-
-	// t1 = g2²
-	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
-
-	// t6 = 2 * nr * g5² + 2 * g1² - 2*g2
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z2 = 3 * nr * g5² + 3 * g1² - 2*g2
-	z.B0.A2 = *e.fp.Add(t[6], t[5])
-
-	// t4 = nr * g2²
-	t[4] = mulFpByNonResidue(e.fp, t[1])
-	// t5 = g3² + nr * g2²
-	t[5] = e.fp.Add(t[2], t[4])
-	// t6 = g3² + nr * g2² - g1
-	t[6] = e.fp.Sub(t[5], &x.B0.A1)
-	// t6 = 2 * g3² + 2 * nr * g2² - 2 * g1
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z1 = 3 * g3² + 3 * nr * g2² - 2 * g1
-	z.B0.A1 = *e.fp.Add(t[6], t[5])
-
-	// t0 = g2² + g3²
-	t[0] = e.fp.Add(t[2], t[1])
-	// t5 = 2 * g3 * g2
-	t[5] = e.fp.Sub(t[3], t[0])
-	// t6 = 2 * g3 * g2 + g5
-	t[6] = e.fp.Add(t[5], &x.B1.A2)
-	// t6 = 4 * g3 * g2 + 2 * g5
-	t[6] = e.fp.MulConst(t[6], big.NewInt(2))
-	// z5 = 6 * g3 * g2 + 2 * g5
-	z.B1.A2 = *e.fp.Add(t[5], t[6])
-
-	return z
-}
-
-// DecompressKarabina2345 decompresses Karabina's cyclotomic square result SQR2345
-// if g3 != 0
-//
-//	g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3
-//
-// if g3 == 0
-//
-//	g4 = 2g1g5/g2
-//
-// if g3=g2=0 then g4=g5=g1=0 and g0=1 (x=1)
-// Theorem 3.1 is well-defined for all x in Gϕₙ\{1}
-func (e Ext6) DecompressKarabina2345(x *E6) *E6 {
-
-	x = e.Reduce(x)
-
-	var z E6
-
-	var t [3]*baseEl
-	var _t [2]*baseEl
-	one := e.fp.One()
-
-	// if g3 == 0
-	// t0 = 2 * g1 * g5
-	// t1 = g2
-	selector1 := e.fp.IsZero(&x.B1.A0)
-	_t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	_t[0] = e.fp.MulConst(_t[0], big.NewInt(2))
-	_t[1] = &x.B0.A2
-
-	// if g2 == g3 == 0
-	selector2 := e.fp.IsZero(_t[1])
-
-	// if g3 != 0
-	// t0 = E * g5^2 + 3 * g1^2 - 2 * g2
-	// t1 = 4 * g3
-	t[0] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	t[1] = e.fp.Sub(t[0], &x.B0.A2)
-	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
-	t[1] = e.fp.Add(t[1], t[0])
-	t[2] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	t[0] = mulFpByNonResidue(e.fp, t[2])
-	t[0] = e.fp.Add(t[0], t[1])
-	t[1] = e.fp.Add(&x.B1.A0, &x.B1.A0)
-	t[1] = e.fp.MulConst(t[1], big.NewInt(2))
-
-	// g4 = (E * g5^2 + 3 * g1^2 - 2 * g2)/4g3 or (2 * g1 * g5)/g2
-	t[0] = e.fp.Select(selector1, _t[0], t[0])
-	t[1] = e.fp.Select(selector1, _t[1], t[1])
-	// g4 = dummy value, continue
-	t[1] = e.fp.Select(selector2, one, t[1])
-
-	z.B1.A1 = *e.fp.Div(t[0], t[1])
-
-	// Rest of the computation for all cases
-	// t1 = g2 * g1
-	t[1] = e.fp.Mul(&x.B0.A2, &x.B0.A1)
-	// t2 = 2 * g4² - 3 * g2 * g1
-	t[2] = e.fp.Mul(&z.B1.A1, &z.B1.A1)
-	t[2] = e.fp.Sub(t[2], t[1])
-	t[2] = e.fp.MulConst(t[2], big.NewInt(2))
-	t[2] = e.fp.Sub(t[2], t[1])
-	// t1 = g3 * g5 (g3 can be 0)
-	t[1] = e.fp.Mul(&x.B1.A0, &x.B1.A2)
-	// g0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1
-	t[2] = e.fp.Add(t[2], t[1])
-
-	z.B0.A0 = *mulFpByNonResidue(e.fp, t[2])
-	z.B0.A0 = *e.fp.Add(&z.B0.A0, one)
-
-	z.B0.A1 = x.B0.A1
-	z.B0.A2 = x.B0.A2
-	z.B1.A0 = x.B1.A0
-	z.B1.A2 = x.B1.A2
-
-	return e.Select(e.api.And(selector1, selector2), e.One(), &z)
-}
-
-// Granger-Scott's cyclotomic square
-// https://eprint.iacr.org/2009/565.pdf, 3.2
-func (e Ext6) CyclotomicSquare(x *E6) *E6 {
-	// x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶
-	// cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0,
-	//					3*x2²*u + 3*x3² - 2*x1,
-	//					3*x5²*u + 3*x1² - 2*x2,
-	//					6*x1*x5*u + 2*x3,
-	//					6*x0*x4 + 2*x4,
-	//					6*x2*x3 + 2*x5)
-
-	x = e.Reduce(x)
-
-	var t [9]*baseEl
-
-	t[0] = e.fp.Mul(&x.B1.A1, &x.B1.A1)
-	t[1] = e.fp.Mul(&x.B0.A0, &x.B0.A0)
-	t[6] = e.fp.Add(&x.B1.A1, &x.B0.A0)
-	t[6] = e.fp.Mul(t[6], t[6])
-	t[6] = e.fp.Sub(t[6], t[0])
-	t[6] = e.fp.Sub(t[6], t[1]) // 2*x4*x0
-	t[2] = e.fp.Mul(&x.B0.A2, &x.B0.A2)
-	t[3] = e.fp.Mul(&x.B1.A0, &x.B1.A0)
-	t[7] = e.fp.Add(&x.B0.A2, &x.B1.A0)
-	t[7] = e.fp.Mul(t[7], t[7])
-	t[7] = e.fp.Sub(t[7], t[2])
-	t[7] = e.fp.Sub(t[7], t[3]) // 2*x2*x3
-	t[4] = e.fp.Mul(&x.B1.A2, &x.B1.A2)
-	t[5] = e.fp.Mul(&x.B0.A1, &x.B0.A1)
-	t[8] = e.fp.Add(&x.B1.A2, &x.B0.A1)
-	t[8] = e.fp.Mul(t[8], t[8])
-	t[8] = e.fp.Sub(t[8], t[4])
-	t[8] = e.fp.Sub(t[5], t[8])
-	t[8] = e.fp.MulConst(t[8], big.NewInt(4)) // 2*x5*x1*u
-
-	t[0] = mulFpByNonResidue(e.fp, t[0])
-	t[0] = e.fp.Add(t[0], t[1]) // x4²*u + x0²
-	t[2] = mulFpByNonResidue(e.fp, t[2])
-	t[2] = e.fp.Add(t[2], t[3]) // x2²*u + x3²
-	t[4] = mulFpByNonResidue(e.fp, t[4])
-	t[4] = e.fp.Add(t[4], t[5]) // x5²*u + x1²
-
-	var z E6
-	z.B0.A0 = *e.fp.Sub(t[0], &x.B0.A0)
-	z.B0.A0 = *e.fp.MulConst(&z.B0.A0, big.NewInt(2))
-	z.B0.A0 = *e.fp.Add(&z.B0.A0, t[0])
-	z.B0.A1 = *e.fp.Sub(t[2], &x.B0.A1)
-	z.B0.A1 = *e.fp.MulConst(&z.B0.A1, big.NewInt(2))
-	z.B0.A1 = *e.fp.Add(&z.B0.A1, t[2])
-	z.B0.A2 = *e.fp.Sub(t[4], &x.B0.A2)
-	z.B0.A2 = *e.fp.MulConst(&z.B0.A2, big.NewInt(2))
-	z.B0.A2 = *e.fp.Add(&z.B0.A2, t[4])
-
-	z.B1.A0 = *e.fp.Add(t[8], &x.B1.A0)
-	z.B1.A0 = *e.fp.MulConst(&z.B1.A0, big.NewInt(2))
-	z.B1.A0 = *e.fp.Add(&z.B1.A0, t[8])
-	z.B1.A1 = *e.fp.Add(t[6], &x.B1.A1)
-	z.B1.A1 = *e.fp.MulConst(&z.B1.A1, big.NewInt(2))
-	z.B1.A1 = *e.fp.Add(&z.B1.A1, t[6])
-	z.B1.A2 = *e.fp.Add(t[7], &x.B1.A2)
-	z.B1.A2 = *e.fp.Add(&z.B1.A2, &z.B1.A2)
-	z.B1.A2 = *e.fp.Add(&z.B1.A2, t[7])
-
-	return &z
-}
-
-func (e Ext6) Inverse(x *E6) *E6 {
-	res, err := e.fp.NewHint(inverseE6Hint, 6, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2)
-	if err != nil {
-		// err is non-nil only for invalid number of inputs
-		panic(err)
-	}
-
-	inv := E6{
-		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
-		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
-	}
-	one := e.One()
-
-	// 1 == inv * x
-	_one := e.Mul(&inv, x)
-	e.AssertIsEqual(one, _one)
-
-	return &inv
-
-}
-
-func (e Ext6) DivUnchecked(x, y *E6) *E6 {
-	res, err := e.fp.NewHint(divE6Hint, 12, &x.B0.A0, &x.B0.A1, &x.B0.A2, &x.B1.A0, &x.B1.A1, &x.B1.A2, &y.B0.A0, &y.B0.A1, &y.B0.A2, &y.B1.A0, &y.B1.A1, &y.B1.A2)
-	if err != nil {
-		// err is non-nil only for invalid number of inputs
-		panic(err)
-	}
-
-	div := E6{
-		B0: E3{A0: *res[0], A1: *res[1], A2: *res[2]},
-		B1: E3{A0: *res[3], A1: *res[4], A2: *res[5]},
-	}
-
-	// x = div * y
-	_x := e.Mul(&div, y)
-	e.AssertIsEqual(x, _x)
-
-	return &div
-
-}
-
-func (e Ext6) Conjugate(x *E6) *E6 {
-	return &E6{
-		B0: x.B0,
-		B1: *e.Ext3.Neg(&x.B1),
-	}
-}
-
-func (e Ext6) AssertIsEqual(a, b *E6) {
-	e.Ext3.AssertIsEqual(&a.B0, &b.B0)
-	e.Ext3.AssertIsEqual(&a.B1, &b.B1)
-}
-
-func (e Ext6) Copy(x *E6) *E6 {
-	b0 := e.Ext3.Copy(&x.B0)
-	b1 := e.Ext3.Copy(&x.B1)
-	return &E6{
-		B0: *b0,
-		B1: *b1,
-	}
-}
-
-func FromE6(a *bw6761.E6) E6 {
-	return E6{
-		B0: FromE3(&a.B0),
-		B1: FromE3(&a.B1),
-	}
-}
-
-// Frobenius set z in E6 to Frobenius(x), return z
-func (e Ext6) Frobenius(x *E6) *E6 {
-	_frobA := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775648")
-	_frobB := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292650")
-	_frobC := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775649")
-	_frobAC := emulated.ValueOf[emulated.BW6761Fp]("-1")
-	_frobBC := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292651")
-	var z E6
-	z.B0.A0 = x.B0.A0
-	z.B0.A1 = *e.fp.Mul(&x.B0.A1, &_frobA)
-	z.B0.A2 = *e.fp.Mul(&x.B0.A2, &_frobB)
-
-	z.B1.A0 = *e.fp.Mul(&x.B1.A0, &_frobC)
-	z.B1.A1 = *e.fp.Mul(&x.B1.A1, &_frobAC)
-	z.B1.A2 = *e.fp.Mul(&x.B1.A2, &_frobBC)
-
-	return &z
-}
-
-func (e Ext6) Select(selector frontend.Variable, z1, z0 *E6) *E6 {
-	b0 := e.Ext3.Select(selector, &z1.B0, &z0.B0)
-	b1 := e.Ext3.Select(selector, &z1.B1, &z0.B1)
-	return &E6{B0: *b0, B1: *b1}
-}
diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 5d73f5ba6b..2b9fcb057f 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -685,9 +685,6 @@ func (e Ext6) MulToomCook6x(x, y *E6) *E6 {
 // Karabina's compressed cyclotomic square SQR12345
 // https://eprint.iacr.org/2010/542.pdf
 // Sec. 5.6 with minor modifications to fit our tower
-//
-//	a00 a01 a02 a10 a11 a12
-//	A0  A2  A4  A1  A3  A5
 func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
 	x = e.Reduce(x)
 
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 12aa698ae3..e989bb338a 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -25,7 +25,7 @@ func (e Ext6) ExpX0Minus1(z *E6) *E6 {
 	result = e.Mul(result, z33)
 	result = e.nSquareKarabina12345(result, 4)
 	result = e.Mul(result, z)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 1)
 	result = e.Mul(result, z)
 	result = e.nSquareKarabina12345(result, 46)
 
@@ -38,11 +38,11 @@ func (e Ext6) ExpX0Minus1Square(z *E6) *E6 {
 	z = e.Reduce(z)
 	result := e.Copy(z)
 	result = e.nSquareKarabina12345(result, 3)
-	t0 := e.CyclotomicSquare(result)
+	t0 := e.nSquareKarabina12345(result, 1)
 	t2 := e.Mul(z, t0)
 	result = e.Mul(result, t2)
 	t0 = e.Mul(z, result)
-	t1 := e.CyclotomicSquare(t0)
+	t1 := e.nSquareKarabina12345(t0, 1)
 	t1 = e.Mul(t2, t1)
 	t3 := e.nSquareKarabina12345(t1, 7)
 	t2 = e.Mul(t2, t3)
@@ -64,7 +64,7 @@ func (e Ext6) ExpX0Minus1Square(z *E6) *E6 {
 func (e Ext6) ExpX0Plus1(z *E6) *E6 {
 	z = e.Reduce(z)
 	result := e.Copy(z)
-	t := e.CyclotomicSquare(result)
+	t := e.nSquareKarabina12345(result, 1)
 	result = e.nSquareKarabina12345(t, 4)
 	result = e.Mul(result, z)
 	z33 := e.Copy(result)
@@ -72,7 +72,7 @@ func (e Ext6) ExpX0Plus1(z *E6) *E6 {
 	result = e.Mul(result, z33)
 	result = e.nSquareKarabina12345(result, 4)
 	result = e.Mul(result, z)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 1)
 	result = e.Mul(result, z)
 	result = e.nSquareKarabina12345(result, 46)
 	result = e.Mul(result, t)
@@ -85,10 +85,9 @@ func (e Ext6) ExpX0Plus1(z *E6) *E6 {
 func (e Ext6) ExptMinus1Div3(z *E6) *E6 {
 	z = e.Reduce(z)
 	result := e.Copy(z)
-	result = e.CyclotomicSquare(result)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 2)
 	result = e.Mul(result, z)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 1)
 	result = e.Mul(result, z)
 	t0 := e.nSquareKarabina12345(result, 7)
 	result = e.Mul(result, t0)
@@ -105,10 +104,9 @@ func (e Ext6) ExptMinus1Div3(z *E6) *E6 {
 func (e Ext6) ExpC1(z *E6) *E6 {
 	z = e.Reduce(z)
 	result := e.Copy(z)
-	result = e.CyclotomicSquare(result)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 2)
 	result = e.Mul(result, z)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 1)
 	result = e.Mul(result, z)
 
 	return result
@@ -119,11 +117,11 @@ func (e Ext6) ExpC1(z *E6) *E6 {
 // C2 = (ht**2+3*hy**2)/4 = 103
 func (e Ext6) ExpC2(z *E6) *E6 {
 	z = e.Reduce(z)
-	result := e.CyclotomicSquare(z)
+	result := e.nSquareKarabina12345(z, 1)
 	result = e.Mul(result, z)
 	t0 := e.nSquareKarabina12345(result, 4)
 	result = e.Mul(result, t0)
-	result = e.CyclotomicSquare(result)
+	result = e.nSquareKarabina12345(result, 1)
 	result = e.Mul(result, z)
 
 	return result

From e2b48164dfed60d47e81d0f9afe015e4975f0e98 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Thu, 25 Apr 2024 13:47:51 -0400
Subject: [PATCH 04/24] perf(bw6): optimize pairing with new tower

---
 std/algebra/emulated/fields_bw6761/e6.go      | 66 -------------
 .../emulated/fields_bw6761/e6_pairing.go      |  4 +-
 std/algebra/emulated/sw_bw6761/pairing.go     | 98 ++++++++++++++++---
 3 files changed, 89 insertions(+), 79 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 2b9fcb057f..feedf80c5d 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -771,72 +771,6 @@ func (e Ext6) DecompressKarabina12345(x *E6) *E6 {
 	}
 }
 
-// Granger-Scott's cyclotomic square
-// https://eprint.iacr.org/2009/565.pdf, 3.2
-func (e Ext6) CyclotomicSquare(x *E6) *E6 {
-	// x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶
-	// cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0,
-	//					3*x2²*u + 3*x3² - 2*x1,
-	//					3*x5²*u + 3*x1² - 2*x2,
-	//					6*x1*x5*u + 2*x3,
-	//					6*x0*x4 + 2*x4,
-	//					6*x2*x3 + 2*x5)
-
-	x = e.Reduce(x)
-
-	var t [9]*baseEl
-
-	t[0] = e.fp.Mul(&x.A3, &x.A3)
-	t[1] = e.fp.Mul(&x.A0, &x.A0)
-	t[6] = e.fp.Add(&x.A3, &x.A0)
-	t[6] = e.fp.Mul(t[6], t[6])
-	t[6] = e.fp.Sub(t[6], t[0])
-	t[6] = e.fp.Sub(t[6], t[1]) // 2*x4*x0
-	t[2] = e.fp.Mul(&x.A4, &x.A4)
-	t[3] = e.fp.Mul(&x.A1, &x.A1)
-	t[7] = e.fp.Add(&x.A4, &x.A1)
-	t[7] = e.fp.Mul(t[7], t[7])
-	t[7] = e.fp.Sub(t[7], t[2])
-	t[7] = e.fp.Sub(t[7], t[3]) // 2*x2*x3
-	t[4] = e.fp.Mul(&x.A5, &x.A5)
-	t[5] = e.fp.Mul(&x.A2, &x.A2)
-	t[8] = e.fp.Add(&x.A5, &x.A2)
-	t[8] = e.fp.Mul(t[8], t[8])
-	t[8] = e.fp.Sub(t[8], t[4])
-	t[8] = e.fp.Sub(t[5], t[8])
-	t[8] = e.fp.MulConst(t[8], big.NewInt(4)) // 2*x5*x1*u
-
-	t[0] = mulFpByNonResidue(e.fp, t[0])
-	t[0] = e.fp.Add(t[0], t[1]) // x4²*u + x0²
-	t[2] = mulFpByNonResidue(e.fp, t[2])
-	t[2] = e.fp.Add(t[2], t[3]) // x2²*u + x3²
-	t[4] = mulFpByNonResidue(e.fp, t[4])
-	t[4] = e.fp.Add(t[4], t[5]) // x5²*u + x1²
-
-	var z E6
-	z.A0 = *e.fp.Sub(t[0], &x.A0)
-	z.A0 = *e.fp.MulConst(&z.A0, big.NewInt(2))
-	z.A0 = *e.fp.Add(&z.A0, t[0])
-	z.A2 = *e.fp.Sub(t[2], &x.A2)
-	z.A2 = *e.fp.MulConst(&z.A2, big.NewInt(2))
-	z.A2 = *e.fp.Add(&z.A2, t[2])
-	z.A4 = *e.fp.Sub(t[4], &x.A4)
-	z.A4 = *e.fp.MulConst(&z.A4, big.NewInt(2))
-	z.A4 = *e.fp.Add(&z.A4, t[4])
-
-	z.A1 = *e.fp.Add(t[8], &x.A1)
-	z.A1 = *e.fp.MulConst(&z.A1, big.NewInt(2))
-	z.A1 = *e.fp.Add(&z.A1, t[8])
-	z.A3 = *e.fp.Add(t[6], &x.A3)
-	z.A3 = *e.fp.MulConst(&z.A3, big.NewInt(2))
-	z.A3 = *e.fp.Add(&z.A3, t[6])
-	z.A5 = *e.fp.Add(t[7], &x.A5)
-	z.A5 = *e.fp.Add(&z.A5, &z.A5)
-	z.A5 = *e.fp.Add(&z.A5, t[7])
-
-	return &z
-}
-
 func (e Ext6) Inverse(x *E6) *E6 {
 	res, err := e.fp.NewHint(inverseE6Hint, 6, &x.A0, &x.A1, &x.A2, &x.A3, &x.A4, &x.A5)
 	if err != nil {
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index e989bb338a..93716a2845 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -1,8 +1,8 @@
 package fields_bw6761
 
 import (
+	"github.com/consensys/gnark/std/math/emulated"
 	"math/big"
-	// "github.com/consensys/gnark/std/math/emulated"
 )
 
 func (e Ext6) nSquareKarabina12345(z *E6, n int) *E6 {
@@ -266,7 +266,6 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	}
 }
 
-/*
 //	Mul023By023 multiplies two E6 sparse element of the form:
 //
 //	E6{A0: c0, A1: 0, A2: c1, A3: 1,  A4: 0,  A5: 0}
@@ -291,6 +290,7 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 	return [5]*baseEl{zC0B0, x01, x1, x04, x14}
 }
 
+/*
 // MulBy01245 multiplies z by an E6 sparse element of the form
 //
 //	E6{
diff --git a/std/algebra/emulated/sw_bw6761/pairing.go b/std/algebra/emulated/sw_bw6761/pairing.go
index 42a6487df0..b07aa0a7bb 100644
--- a/std/algebra/emulated/sw_bw6761/pairing.go
+++ b/std/algebra/emulated/sw_bw6761/pairing.go
@@ -96,7 +96,8 @@ func (pr Pairing) FinalExponentiation(z *GTEl) *GTEl {
 	a = pr.Mul(a, pr.Frobenius(result))
 	b := pr.ExpX0Plus1(a)
 	b = pr.Mul(b, pr.Conjugate(result))
-	t := pr.CyclotomicSquare(a)
+	t := pr.CyclotomicSquareKarabina12345(a)
+	t = pr.DecompressKarabina12345(t)
 	a = pr.Mul(a, t)
 	c := pr.ExptMinus1Div3(b)
 	d := pr.ExpX0Minus1(c)
@@ -112,7 +113,8 @@ func (pr Pairing) FinalExponentiation(z *GTEl) *GTEl {
 	i = pr.Mul(i, pr.Conjugate(f))
 	j := pr.ExpC1(h)
 	j = pr.Mul(j, e)
-	k := pr.CyclotomicSquare(j)
+	k := pr.CyclotomicSquareKarabina12345(j)
+	k = pr.DecompressKarabina12345(k)
 	k = pr.Mul(k, j)
 	k = pr.Mul(k, b)
 	t = pr.ExpC2(i)
@@ -290,26 +292,100 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 	}
 
 	// f_{x₀+1+λ(x₀³-x₀²-x₀),Q}(P), Q is known in advance
+	var prodLines [5]*emulated.Element[BaseField]
 	result := pr.Ext6.One()
 
-	for i := 188; i >= 0; i-- {
+	// i = 188
+	// k = 0
+	result = &fields_bw6761.E6{
+		A0: *pr.curveF.Mul(&lines[0][0][188].R1, yInv[0]),
+		A1: result.A1,
+		A2: *pr.curveF.Mul(&lines[0][0][188].R0, xNegOverY[0]),
+		A3: *pr.curveF.One(),
+		A4: result.A4,
+		A5: result.A5,
+	}
+
+	if n >= 2 {
+		// k = 1, separately to avoid MulBy023 (res × ℓ)
+		// (res is also a line at this point, so we use Mul023By023 ℓ × ℓ)
+		prodLines = pr.Mul023By023(
+			pr.curveF.Mul(&lines[1][0][188].R1, yInv[1]),
+			pr.curveF.Mul(&lines[1][0][188].R0, xNegOverY[1]),
+			&result.A0,
+			&result.A2,
+		)
+		result = &fields_bw6761.E6{
+			A0: *prodLines[0],
+			A1: result.A1,
+			A2: *prodLines[1],
+			A3: *prodLines[3],
+			A4: *prodLines[2],
+			A5: *prodLines[4],
+		}
+	}
+
+	for k := 2; k < n; k++ {
+		result = pr.MulBy023(result,
+			pr.curveF.Mul(&lines[k][0][188].R1, yInv[k]),
+			pr.curveF.Mul(&lines[k][0][188].R0, xNegOverY[k]),
+		)
+	}
+
+	for i := 187; i >= 0; i-- {
 		// mutualize the square among n Miller loops
 		// (∏ᵢfᵢ)²
 		result = pr.Square(result)
 
-		for k := 0; k < n; k++ {
-			result = pr.MulBy023(result,
-				pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
-				pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
-			)
-		}
-
 		if i > 0 && loopCounter2[i]*3+loopCounter1[i] != 0 {
 			for k := 0; k < n; k++ {
-				result = pr.MulBy023(result,
+				prodLines = pr.Mul023By023(
+					pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
+					pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
 					pr.curveF.Mul(&lines[k][1][i].R1, yInv[k]),
 					pr.curveF.Mul(&lines[k][1][i].R0, xNegOverY[k]),
 				)
+				result = pr.Mul(
+					result,
+					&fields_bw6761.E6{
+						A0: *prodLines[0],
+						A1: *pr.curveF.Zero(),
+						A2: *prodLines[1],
+						A3: *prodLines[3],
+						A4: *prodLines[2],
+						A5: *prodLines[4],
+					},
+				)
+			}
+		} else {
+			// if number of lines is odd, mul last line by res
+			// works for n=1 as well
+			if n%2 != 0 {
+				// ℓ × res
+				result = pr.MulBy023(result,
+					pr.curveF.Mul(&lines[n-1][0][i].R1, yInv[n-1]),
+					pr.curveF.Mul(&lines[n-1][0][i].R0, xNegOverY[n-1]),
+				)
+			}
+			// mul lines 2-by-2
+			for k := 1; k < n; k += 2 {
+				prodLines = pr.Mul023By023(
+					pr.curveF.Mul(&lines[k][0][i].R1, yInv[k]),
+					pr.curveF.Mul(&lines[k][0][i].R0, xNegOverY[k]),
+					pr.curveF.Mul(&lines[k-1][0][i].R1, yInv[k-1]),
+					pr.curveF.Mul(&lines[k-1][0][i].R0, xNegOverY[k-1]),
+				)
+				result = pr.Mul(
+					result,
+					&fields_bw6761.E6{
+						A0: *prodLines[0],
+						A1: *pr.curveF.Zero(),
+						A2: *prodLines[1],
+						A3: *prodLines[3],
+						A4: *prodLines[2],
+						A5: *prodLines[4],
+					},
+				)
 			}
 		}
 	}

From 650f8791f4fc55f4f4dec2c06de2f3732d59448f Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Thu, 25 Apr 2024 18:26:36 -0400
Subject: [PATCH 05/24] perf(bw6): optimize Montgomery-6 mul

---
 std/algebra/emulated/fields_bw6761/e6.go      | 170 ++++++++++--------
 std/algebra/emulated/fields_bw6761/e6_test.go |  28 ---
 2 files changed, 96 insertions(+), 102 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index feedf80c5d..4022a07109 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -362,110 +362,132 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	//  c5 = −(v3 + v4 + v5 + v9 + v15 + v16) + 2(v8 + v10 + v11 + v12 + v13 −
 	//  (v6 + v7)) + 3(v14 + v17)
 
-	c0 := e.fp.Sub(v[0], v[2])
-	c0 = e.fp.Add(c0, v[4])
-	s1 := e.fp.Add(v[3], v[5])
-	s1 = e.fp.Add(s1, v[6])
-	s1 = e.fp.Sub(s1, v[12])
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	c0 := e.fp.MulConst(v[2], big.NewInt(4))
+	s1 := e.fp.Add(v[8], v[10])
+	s1 = e.fp.Add(s1, v[11])
+	s1 = e.fp.MulConst(s1, big.NewInt(12))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.Add(v[7], v[15])
-	s2 := e.fp.Add(v[8], v[10])
-	s2 = e.fp.Add(s2, v[11])
-	s1 = e.fp.Sub(s1, s2)
-	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	s1 = e.fp.MulConst(v[12], big.NewInt(8))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.Sub(v[16], v[13])
-	s1 = e.fp.MulConst(s1, big.NewInt(4))
+	s1 = e.fp.MulConst(v[13], big.NewInt(16))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.Add(v[14], v[17])
-	s1 = e.fp.MulConst(s1, big.NewInt(5))
-	c0 = e.fp.Sub(c0, s1)
-	c0 = mulFpByNonResidue(e.fp, c0)
-	c0 = e.fp.Add(c0, v[14])
-
-	c1 := e.fp.Add(v[15], v[14])
-	c1 = e.fp.Sub(v[12], c1)
-	s2 = e.fp.Add(v[3], v[5])
-	s2 = e.fp.Add(s2, v[6])
-	s2 = e.fp.Add(s2, v[15])
-	s1 = e.fp.Add(v[10], v[8])
-	s1 = e.fp.Add(s1, v[12])
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.Add(v[14], v[17])
-	s2 = e.fp.Add(s2, v[13])
-	s2 = e.fp.Sub(s2, v[7])
-	s2 = e.fp.MulConst(s2, big.NewInt(2))
+	s1 = e.fp.MulConst(v[14], big.NewInt(21))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v[17], big.NewInt(20))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v[15], big.NewInt(12))
+	s2 := e.fp.MulConst(v[16], big.NewInt(16))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.Sub(v[11], v[16])
-	s2 = e.fp.MulConst(s2, big.NewInt(3))
+	s2 = e.fp.MulConst(v[0], big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
+	s2 = e.fp.MulConst(v[3], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[4], big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[5], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[6], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[7], big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
+	c0 = e.fp.Sub(c0, s1)
+
+	c1 := e.fp.Add(v[3], v[5])
+	c1 = e.fp.Add(c1, v[6])
+	c1 = e.fp.MulConst(c1, big.NewInt(4))
+	s1 = e.fp.MulConst(v[7], big.NewInt(8))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v[16], big.NewInt(12))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v[15], big.NewInt(3))
 	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v[12], big.NewInt(3))
+	s2 = e.fp.MulConst(v[14], big.NewInt(9))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[8], big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[10], big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[11], big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[13], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[17], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	c1 = e.fp.Sub(c1, s1)
 
 	c2 := e.fp.MulConst(v[15], big.NewInt(2))
 	c2 = e.fp.Add(c2, v[6])
+	s1 = e.fp.MulConst(v[11], big.NewInt(4))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.MulConst(v[13], big.NewInt(4))
+	c2 = e.fp.Add(c2, s1)
 	s1 = e.fp.Add(v[10], v[12])
+	s2 = e.fp.MulConst(v[7], big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[16], big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
-	s2 = e.fp.Add(v[11], v[13])
-	s1 = e.fp.MulConst(v[16], big.NewInt(2))
-	s1 = e.fp.Add(s1, v[7])
-	s1 = e.fp.Sub(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	c2 = e.fp.Add(c2, s1)
 
 	c3 := e.fp.Add(v[8], v[11])
-	c3 = e.fp.Add(c3, v[13])
-	s1 = e.fp.Add(v[3], v[4])
-	s1 = e.fp.Add(s1, v[7])
-	s1 = e.fp.Add(s1, v[16])
-	c3 = e.fp.Sub(c3, s1)
-	s1 = e.fp.Sub(v[10], v[15])
-	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	s1 = e.fp.MulConst(v[10], big.NewInt(3))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.Add(v[12], v[14])
-	s1 = e.fp.Add(s1, v[17])
-	s1 = e.fp.Sub(s1, v[6])
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	s1 = e.fp.MulConst(v[12], big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
-	s2 = e.fp.Add(v[16], v[17])
-	s1 = e.fp.Sub(v[13], s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
+	s1 = e.fp.MulConst(v[14], big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v[16], big.NewInt(3))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v[17], big.NewInt(6))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.Add(v[3], v[4])
+	s1 = e.fp.Add(s1, v[7])
+	s2 = e.fp.MulConst(v[6], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[13], big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[15], big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	c3 = e.fp.Sub(c3, s1)
 
 	c4 := e.fp.Add(v[2], v[3])
-	c4 = e.fp.Add(c4, v[4])
-	c4 = e.fp.Add(c4, v[7])
 	c4 = e.fp.Add(c4, v[15])
 	c4 = e.fp.Add(c4, v[9])
-	s1 = e.fp.Add(v[8], v[13])
-	c4 = e.fp.Sub(c4, s1)
-	s1 = e.fp.MulConst(v[12], big.NewInt(3))
-	c4 = e.fp.Sub(c4, s1)
-	s1 = e.fp.Add(v[10], v[17])
-	s1 = e.fp.Add(s1, v[11])
-	s1 = e.fp.Add(s1, v[14])
-	s1 = e.fp.Sub(v[6], s1)
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
-	c4 = e.fp.Add(c4, s1)
-	s1 = mulFpByNonResidue(e.fp, v[17])
+	c4 = e.fp.Add(c4, v[7])
+	c4 = e.fp.Add(c4, v[4])
+	s1 = e.fp.MulConst(v[6], big.NewInt(2))
 	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.Add(v[13], v[8])
+	s2 = e.fp.MulConst(v[10], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[11], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[12], big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[14], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[17], big.NewInt(6))
+	s1 = e.fp.Add(s1, s2)
+	c4 = e.fp.Sub(c4, s1)
 
 	c5 := e.fp.Add(v[8], v[10])
 	c5 = e.fp.Add(c5, v[11])
 	c5 = e.fp.Add(c5, v[12])
 	c5 = e.fp.Add(c5, v[13])
-	s1 = e.fp.Add(v[6], v[7])
-	c5 = e.fp.Sub(c5, s1)
 	c5 = e.fp.MulConst(c5, big.NewInt(2))
-	s1 = e.fp.Add(v[14], v[17])
-	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	s1 = e.fp.MulConst(v[14], big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.Add(v[3], v[4])
+	s1 = e.fp.MulConst(v[17], big.NewInt(3))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.Add(v[15], v[16])
+	s1 = e.fp.Add(s1, v[3])
+	s1 = e.fp.Add(s1, v[4])
 	s1 = e.fp.Add(s1, v[5])
 	s1 = e.fp.Add(s1, v[9])
-	s1 = e.fp.Add(s1, v[15])
-	s1 = e.fp.Add(s1, v[16])
+	s2 = e.fp.MulConst(v[6], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v[7], big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
 	c5 = e.fp.Sub(c5, s1)
 
 	return &E6{
diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index 789629d963..5f38dcd4b1 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -248,34 +248,6 @@ func TestConjugateFp6(t *testing.T) {
 	assert.NoError(err)
 }
 
-type e6CyclotomicSquare struct {
-	A, B E6
-}
-
-func (circuit *e6CyclotomicSquare) Define(api frontend.API) error {
-	e := NewExt6(api)
-	expected := e.CyclotomicSquare(&circuit.A)
-	e.AssertIsEqual(expected, &circuit.B)
-	return nil
-}
-
-func TestCyclotomicSquareFp6(t *testing.T) {
-	assert := test.NewAssert(t)
-	// witness values
-	var a, b bw6761.E6
-	_, _ = a.SetRandom()
-	b.Set(&a)
-	b.CyclotomicSquare(&a)
-
-	witness := e6CyclotomicSquare{
-		A: FromE6(&a),
-		B: FromE6(&b),
-	}
-
-	err := test.IsSolved(&e6CyclotomicSquare{}, &witness, ecc.BN254.ScalarField())
-	assert.NoError(err)
-}
-
 type e6Expt struct {
 	A, B E6
 }

From 37e3874b37b588fa754118c9fd5c4eb8d560bf1a Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Thu, 25 Apr 2024 19:00:29 -0400
Subject: [PATCH 06/24] perf(bw6): optimize specialized Montgomery-6 mul

---
 .../emulated/fields_bw6761/e6_pairing.go      | 298 ++++++++++++++----
 1 file changed, 235 insertions(+), 63 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 93716a2845..2932bf167c 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -183,77 +183,91 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	v5 := e.fp.Mul(t1, s1)
 	v14 := e.fp.Mul(&x.A0, c0)
 
-	z0 := e.fp.Sub(v0, v2)
-	z0 = e.fp.Add(z0, v4)
-	s1 = e.fp.Add(v3, v5)
-	s1 = e.fp.Add(s1, v6)
-	s1 = e.fp.Sub(s1, v12)
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	z0 := e.fp.MulConst(v2, big.NewInt(4))
+	s1 = e.fp.Add(v8, v10)
+	s1 = e.fp.Add(s1, v11)
+	s1 = e.fp.MulConst(s1, big.NewInt(12))
 	z0 = e.fp.Add(z0, s1)
-	s2 = e.fp.Add(v8, v10)
-	s2 = e.fp.Add(s2, v11)
-	s1 = e.fp.Sub(v7, s2)
-	s1 = e.fp.MulConst(s1, big.NewInt(3))
+	s1 = e.fp.MulConst(v12, big.NewInt(8))
 	z0 = e.fp.Add(z0, s1)
-	s1 = e.fp.MulConst(v14, big.NewInt(5))
+	s1 = e.fp.MulConst(v14, big.NewInt(21))
+	z0 = e.fp.Add(z0, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(4))
+	s2 = e.fp.MulConst(v3, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v4, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v5, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v6, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
 	z0 = e.fp.Sub(z0, s1)
-	z0 = mulFpByNonResidue(e.fp, z0)
-	z0 = e.fp.Add(z0, v14)
 
-	z1 := e.fp.Sub(v12, v14)
-	s2 = e.fp.Add(v3, v5)
-	s2 = e.fp.Add(s2, v6)
-	s1 = e.fp.Add(v10, v8)
-	s1 = e.fp.Add(s1, v12)
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.Sub(v14, v7)
-	s2 = e.fp.MulConst(s2, big.NewInt(2))
+	z1 := e.fp.Add(v3, v5)
+	z1 = e.fp.Add(z1, v6)
+	z1 = e.fp.MulConst(z1, big.NewInt(4))
+	s1 = e.fp.MulConst(v7, big.NewInt(8))
+	z1 = e.fp.Add(z1, s1)
+	s1 = e.fp.MulConst(v12, big.NewInt(3))
+	s2 = e.fp.MulConst(v14, big.NewInt(9))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v11, big.NewInt(3))
+	s2 = e.fp.MulConst(v8, big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	z1 = e.fp.Add(z1, s1)
+	s2 = e.fp.MulConst(v10, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v11, big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
+	z1 = e.fp.Sub(z1, s1)
 
-	z2 := v6
+	s1 = e.fp.MulConst(v11, big.NewInt(4))
+	z2 := e.fp.Add(v6, s1)
 	s1 = e.fp.Add(v10, v12)
+	s2 = e.fp.MulConst(v7, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
 	z2 = e.fp.Sub(z2, s1)
-	s1 = e.fp.Sub(v7, v11)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	z2 = e.fp.Add(z2, s1)
 
 	z3 := e.fp.Add(v8, v11)
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s1, v7)
-	z3 = e.fp.Sub(z3, s1)
 	s1 = e.fp.MulConst(v10, big.NewInt(3))
 	z3 = e.fp.Add(z3, s1)
-	s1 = e.fp.Add(v12, v14)
-	s1 = e.fp.Sub(s1, v6)
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	s1 = e.fp.MulConst(v12, big.NewInt(2))
+	z3 = e.fp.Add(z3, s1)
+	s1 = e.fp.MulConst(v14, big.NewInt(2))
 	z3 = e.fp.Add(z3, s1)
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s1, v7)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	z3 = e.fp.Sub(z3, s1)
 
 	z4 := e.fp.Add(v2, v3)
-	z4 = e.fp.Add(z4, v4)
 	z4 = e.fp.Add(z4, v7)
-	z4 = e.fp.Sub(z4, v8)
-	s1 = e.fp.MulConst(v12, big.NewInt(3))
-	z4 = e.fp.Sub(z4, s1)
-	s1 = e.fp.Add(v10, v11)
-	s1 = e.fp.Add(s1, v14)
-	s1 = e.fp.Sub(v6, s1)
-	s1 = e.fp.MulConst(s1, big.NewInt(2))
+	z4 = e.fp.Add(z4, v4)
+	s1 = e.fp.MulConst(v6, big.NewInt(2))
 	z4 = e.fp.Add(z4, s1)
+	s2 = e.fp.MulConst(v10, big.NewInt(2))
+	s1 = e.fp.Add(v8, s2)
+	s2 = e.fp.MulConst(v11, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v12, big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v14, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	z4 = e.fp.Sub(z4, s1)
 
 	z5 := e.fp.Add(v8, v10)
 	z5 = e.fp.Add(z5, v11)
 	z5 = e.fp.Add(z5, v12)
-	s1 = e.fp.Add(v6, v7)
-	z5 = e.fp.Sub(z5, s1)
 	z5 = e.fp.MulConst(z5, big.NewInt(2))
 	s1 = e.fp.MulConst(v14, big.NewInt(3))
 	z5 = e.fp.Add(z5, s1)
 	s1 = e.fp.Add(v3, v4)
 	s1 = e.fp.Add(s1, v5)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
 	z5 = e.fp.Sub(z5, s1)
 
 	return &E6{
@@ -290,34 +304,192 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 	return [5]*baseEl{zC0B0, x01, x1, x04, x14}
 }
 
-/*
-// MulBy01245 multiplies z by an E6 sparse element of the form
+// MulBy02345 multiplies z by an E6 sparse element of the form
 //
 //	E6{
 //		B0: E3{A0: c0, A1: c1, A2: c2},
 //		B1: E3{A0: 0, A1: c4, A2: c5},
 //	}
-func (e *Ext6) MulBy01245(z *E6, x [5]*baseEl) *E6 {
-	c0 := &E3{A0: *x[0], A1: *x[1], A2: *x[2]}
-	a := e.Ext3.Add(&z.B0, &z.B1)
-	b := &E3{
-		A0: c0.A0,
-		A1: *e.fp.Add(&c0.A1, x[3]),
-		A2: *e.fp.Add(&c0.A2, x[4]),
-	}
-	a = e.Ext3.Mul(a, b)
-	b = e.Ext3.Mul(&z.B0, c0)
-	c := e.Ext3.MulBy12(&z.B1, x[3], x[4])
-	z1 := e.Ext3.Sub(a, b)
-	z1 = e.Ext3.Sub(z1, c)
-	z0 := e.Ext3.MulByNonResidue(c)
-	z0 = e.Ext3.Add(z0, b)
+func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
+	_t0 := e.fp.Add(&x.A0, &x.A1)
+	t0 := e.fp.Add(_t0, &x.A2)
+	t1 := e.fp.Add(&x.A3, &x.A4)
+	t2 := e.fp.Add(_t0, t1)
+	t3 := e.fp.Add(t2, &x.A5)
+	t3 = e.fp.Add(t3, &x.A2)
+
+	_s0 := y[0]
+	s0 := e.fp.Add(_s0, y[1])
+	s1 := e.fp.Add(y[2], y[3])
+	s2 := e.fp.Add(_s0, s1)
+	s3 := e.fp.Add(s2, y[4])
+	s3 = e.fp.Add(s3, y[1])
+
+	v0 := e.fp.Mul(t3, s3)
+	v2 := e.fp.Mul(t2, s2)
+	v6 := e.fp.Mul(t0, s0)
+	t4 := e.fp.Add(t1, &x.A5)
+	s4 := e.fp.Add(s1, y[4])
+	v7 := e.fp.Mul(t4, s4)
+	v12 := e.fp.Mul(_t0, _s0)
+	v11 := e.fp.Mul(t1, s1)
+	t0 = e.fp.Add(&x.A2, &x.A3)
+	s0 = e.fp.Add(y[1], y[2])
+	v8 := e.fp.Mul(t0, s0)
+	_t0 = e.fp.Sub(&x.A4, &x.A1)
+	v9 := e.fp.Mul(_t0, y[3])
+	t1 = e.fp.Add(&x.A1, &x.A2)
+	v10 := e.fp.Mul(t1, y[2])
+	t1 = e.fp.Add(&x.A4, &x.A5)
+	s1 = e.fp.Add(y[3], y[4])
+	v13 := e.fp.Mul(t1, s1)
+	v3 := e.fp.Add(&x.A0, &x.A5)
+	v3 = e.fp.Sub(v3, t0)
+	s1 = e.fp.Add(y[0], y[4])
+	s1 = e.fp.Sub(s1, s0)
+	v3 = e.fp.Mul(v3, s1)
+	t1 = e.fp.Add(&x.A2, &x.A5)
+	t2 = e.fp.Sub(&x.A0, t1)
+	s1 = e.fp.Add(y[1], y[4])
+	s2 = e.fp.Sub(y[0], s1)
+	v4 := e.fp.Mul(t2, s2)
+	t1 = e.fp.Add(&x.A0, &x.A3)
+	t1 = e.fp.Sub(t1, &x.A5)
+	s1 = e.fp.Add(y[0], y[2])
+	s1 = e.fp.Sub(s1, y[4])
+	v5 := e.fp.Mul(t1, s1)
+	v14 := e.fp.Mul(&x.A0, y[0])
+	v16 := e.fp.Mul(&x.A4, y[3])
+	v17 := e.fp.Mul(&x.A5, y[4])
+
+	c0 := e.fp.MulConst(v2, big.NewInt(4))
+	s1 = e.fp.Add(v8, v10)
+	s1 = e.fp.Add(s1, v11)
+	s1 = e.fp.MulConst(s1, big.NewInt(12))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v12, big.NewInt(8))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v13, big.NewInt(16))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v14, big.NewInt(21))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v17, big.NewInt(20))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(4))
+	s2 = e.fp.MulConst(v3, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v4, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v5, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v6, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
+	c0 = e.fp.Sub(c0, s1)
+
+	c1 := e.fp.Add(v3, v5)
+	c1 = e.fp.Add(c1, v6)
+	c1 = e.fp.MulConst(c1, big.NewInt(4))
+	s1 = e.fp.MulConst(v7, big.NewInt(8))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v16, big.NewInt(12))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v12, big.NewInt(3))
+	s2 = e.fp.MulConst(v14, big.NewInt(9))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v8, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v11, big.NewInt(12))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v13, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v17, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	c1 = e.fp.Sub(c1, s1)
+
+	c2 := v6
+	s1 = e.fp.MulConst(v11, big.NewInt(4))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.MulConst(v13, big.NewInt(4))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.Add(v10, v12)
+	s2 = e.fp.MulConst(v7, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v16, big.NewInt(8))
+	s1 = e.fp.Add(s1, s2)
+	c2 = e.fp.Sub(c2, s1)
+
+	c3 := e.fp.Add(v8, v11)
+	s1 = e.fp.MulConst(v10, big.NewInt(3))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v12, big.NewInt(2))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v14, big.NewInt(2))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v16, big.NewInt(3))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v17, big.NewInt(6))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s1, v7)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v13, big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	c3 = e.fp.Sub(c3, s1)
+
+	c4 := e.fp.Add(v2, v3)
+	c4 = e.fp.Add(c4, v9)
+	c4 = e.fp.Add(c4, v7)
+	c4 = e.fp.Add(c4, v4)
+	s1 = e.fp.MulConst(v6, big.NewInt(2))
+	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.Add(v13, v8)
+	s2 = e.fp.MulConst(v10, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v11, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v12, big.NewInt(3))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v14, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v17, big.NewInt(6))
+	s1 = e.fp.Add(s1, s2)
+	c4 = e.fp.Sub(c4, s1)
+
+	c5 := e.fp.Add(v8, v10)
+	c5 = e.fp.Add(c5, v11)
+	c5 = e.fp.Add(c5, v12)
+	c5 = e.fp.Add(c5, v13)
+	c5 = e.fp.MulConst(c5, big.NewInt(2))
+	s1 = e.fp.MulConst(v14, big.NewInt(3))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v17, big.NewInt(3))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.Add(v4, v3)
+	s1 = e.fp.Add(s1, v5)
+	s1 = e.fp.Add(s1, v9)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(2))
+	s1 = e.fp.Add(s1, s2)
+	c5 = e.fp.Sub(c5, s1)
+
 	return &E6{
-		B0: *z0,
-		B1: *z1,
+		A0: *c0,
+		A1: *c1,
+		A2: *c2,
+		A3: *c3,
+		A4: *c4,
+		A5: *c5,
 	}
+
 }
 
+/*
 // Mul01245By014 multiplies two E6 sparse element of the form
 //
 //	E6{

From 4df060ae87e5609453c5500315f5c354c268d69c Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 26 Apr 2024 13:56:14 -0400
Subject: [PATCH 07/24] perf(bw6): save some adds in specialized mul e6

---
 std/algebra/emulated/fields_bw6761/e6.go      | 33 ++++++++-----------
 .../emulated/fields_bw6761/e6_pairing.go      | 31 ++++++++---------
 std/algebra/emulated/sw_bw6761/pairing.go     | 12 +++----
 3 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 4022a07109..197eabaf19 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -363,9 +363,9 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	//  (v6 + v7)) + 3(v14 + v17)
 
 	c0 := e.fp.MulConst(v[2], big.NewInt(4))
-	s1 := e.fp.Add(v[8], v[10])
-	s1 = e.fp.Add(s1, v[11])
-	s1 = e.fp.MulConst(s1, big.NewInt(12))
+	s811 := e.fp.Add(v[8], v[11])
+	s81110 := e.fp.Add(s811, v[10])
+	s1 := e.fp.MulConst(s81110, big.NewInt(12))
 	c0 = e.fp.Add(c0, s1)
 	s1 = e.fp.MulConst(v[12], big.NewInt(8))
 	c0 = e.fp.Add(c0, s1)
@@ -392,8 +392,8 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c0 = e.fp.Sub(c0, s1)
 
-	c1 := e.fp.Add(v[3], v[5])
-	c1 = e.fp.Add(c1, v[6])
+	s35 := e.fp.Add(v[3], v[5])
+	c1 := e.fp.Add(s35, v[6])
 	c1 = e.fp.MulConst(c1, big.NewInt(4))
 	s1 = e.fp.MulConst(v[7], big.NewInt(8))
 	c1 = e.fp.Add(c1, s1)
@@ -422,16 +422,15 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	c2 = e.fp.Add(c2, s1)
 	s1 = e.fp.MulConst(v[13], big.NewInt(4))
 	c2 = e.fp.Add(c2, s1)
-	s1 = e.fp.Add(v[10], v[12])
+	s1012 := e.fp.Add(v[10], v[12])
 	s2 = e.fp.MulConst(v[7], big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
+	s1 = e.fp.Add(s1012, s2)
 	s2 = e.fp.MulConst(v[16], big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
 
-	c3 := e.fp.Add(v[8], v[11])
 	s1 = e.fp.MulConst(v[10], big.NewInt(3))
-	c3 = e.fp.Add(c3, s1)
+	c3 := e.fp.Add(s811, s1)
 	s1 = e.fp.MulConst(v[12], big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
 	s1 = e.fp.MulConst(v[14], big.NewInt(2))
@@ -440,8 +439,8 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	c3 = e.fp.Add(c3, s1)
 	s1 = e.fp.MulConst(v[17], big.NewInt(6))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.Add(v[3], v[4])
-	s1 = e.fp.Add(s1, v[7])
+	s34 := e.fp.Add(v[3], v[4])
+	s1 = e.fp.Add(s34, v[7])
 	s2 = e.fp.MulConst(v[6], big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v[13], big.NewInt(3))
@@ -450,11 +449,10 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c3 = e.fp.Sub(c3, s1)
 
-	c4 := e.fp.Add(v[2], v[3])
-	c4 = e.fp.Add(c4, v[15])
+	c4 := e.fp.Add(v[2], v[15])
 	c4 = e.fp.Add(c4, v[9])
 	c4 = e.fp.Add(c4, v[7])
-	c4 = e.fp.Add(c4, v[4])
+	c4 = e.fp.Add(c4, s34)
 	s1 = e.fp.MulConst(v[6], big.NewInt(2))
 	c4 = e.fp.Add(c4, s1)
 	s1 = e.fp.Add(v[13], v[8])
@@ -470,9 +468,7 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c4 = e.fp.Sub(c4, s1)
 
-	c5 := e.fp.Add(v[8], v[10])
-	c5 = e.fp.Add(c5, v[11])
-	c5 = e.fp.Add(c5, v[12])
+	c5 := e.fp.Add(s81110, v[12])
 	c5 = e.fp.Add(c5, v[13])
 	c5 = e.fp.MulConst(c5, big.NewInt(2))
 	s1 = e.fp.MulConst(v[14], big.NewInt(3))
@@ -480,8 +476,7 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	s1 = e.fp.MulConst(v[17], big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
 	s1 = e.fp.Add(v[15], v[16])
-	s1 = e.fp.Add(s1, v[3])
-	s1 = e.fp.Add(s1, v[4])
+	s1 = e.fp.Add(s1, s34)
 	s1 = e.fp.Add(s1, v[5])
 	s1 = e.fp.Add(s1, v[9])
 	s2 = e.fp.MulConst(v[6], big.NewInt(2))
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 2932bf167c..fcc81465e5 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -184,9 +184,9 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	v14 := e.fp.Mul(&x.A0, c0)
 
 	z0 := e.fp.MulConst(v2, big.NewInt(4))
-	s1 = e.fp.Add(v8, v10)
-	s1 = e.fp.Add(s1, v11)
-	s1 = e.fp.MulConst(s1, big.NewInt(12))
+	s811 := e.fp.Add(v8, v11)
+	s81110 := e.fp.Add(s811, v10)
+	s1 = e.fp.MulConst(s81110, big.NewInt(12))
 	z0 = e.fp.Add(z0, s1)
 	s1 = e.fp.MulConst(v12, big.NewInt(8))
 	z0 = e.fp.Add(z0, s1)
@@ -205,8 +205,8 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	z0 = e.fp.Sub(z0, s1)
 
-	z1 := e.fp.Add(v3, v5)
-	z1 = e.fp.Add(z1, v6)
+	s35 := e.fp.Add(v3, v5)
+	z1 := e.fp.Add(s35, v6)
 	z1 = e.fp.MulConst(z1, big.NewInt(4))
 	s1 = e.fp.MulConst(v7, big.NewInt(8))
 	z1 = e.fp.Add(z1, s1)
@@ -228,22 +228,20 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	z2 = e.fp.Sub(z2, s1)
 
-	z3 := e.fp.Add(v8, v11)
 	s1 = e.fp.MulConst(v10, big.NewInt(3))
-	z3 = e.fp.Add(z3, s1)
+	z3 := e.fp.Add(s811, s1)
 	s1 = e.fp.MulConst(v12, big.NewInt(2))
 	z3 = e.fp.Add(z3, s1)
 	s1 = e.fp.MulConst(v14, big.NewInt(2))
 	z3 = e.fp.Add(z3, s1)
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s1, v7)
+	s34 := e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s34, v7)
 	s2 = e.fp.MulConst(v6, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
 	z3 = e.fp.Sub(z3, s1)
 
-	z4 := e.fp.Add(v2, v3)
-	z4 = e.fp.Add(z4, v7)
-	z4 = e.fp.Add(z4, v4)
+	z4 := e.fp.Add(v2, v7)
+	z4 = e.fp.Add(z4, s34)
 	s1 = e.fp.MulConst(v6, big.NewInt(2))
 	z4 = e.fp.Add(z4, s1)
 	s2 = e.fp.MulConst(v10, big.NewInt(2))
@@ -256,14 +254,11 @@ func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	z4 = e.fp.Sub(z4, s1)
 
-	z5 := e.fp.Add(v8, v10)
-	z5 = e.fp.Add(z5, v11)
-	z5 = e.fp.Add(z5, v12)
+	z5 := e.fp.Add(s81110, v12)
 	z5 = e.fp.MulConst(z5, big.NewInt(2))
 	s1 = e.fp.MulConst(v14, big.NewInt(3))
 	z5 = e.fp.Add(z5, s1)
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s1, v5)
+	s1 = e.fp.Add(s34, v5)
 	s2 = e.fp.MulConst(v6, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v7, big.NewInt(2))
@@ -301,7 +296,7 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 	four := emulated.ValueOf[emulated.BW6761Fp](big.NewInt(4))
 	zC0B0 := e.fp.Sub(x0, &four)
 
-	return [5]*baseEl{zC0B0, x01, x1, x04, x14}
+	return [5]*baseEl{zC0B0, x01, x04, x1, x14}
 }
 
 // MulBy02345 multiplies z by an E6 sparse element of the form
diff --git a/std/algebra/emulated/sw_bw6761/pairing.go b/std/algebra/emulated/sw_bw6761/pairing.go
index b07aa0a7bb..bda8c5f218 100644
--- a/std/algebra/emulated/sw_bw6761/pairing.go
+++ b/std/algebra/emulated/sw_bw6761/pairing.go
@@ -319,8 +319,8 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 			A0: *prodLines[0],
 			A1: result.A1,
 			A2: *prodLines[1],
-			A3: *prodLines[3],
-			A4: *prodLines[2],
+			A3: *prodLines[2],
+			A4: *prodLines[3],
 			A5: *prodLines[4],
 		}
 	}
@@ -351,8 +351,8 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 						A0: *prodLines[0],
 						A1: *pr.curveF.Zero(),
 						A2: *prodLines[1],
-						A3: *prodLines[3],
-						A4: *prodLines[2],
+						A3: *prodLines[2],
+						A4: *prodLines[3],
 						A5: *prodLines[4],
 					},
 				)
@@ -381,8 +381,8 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 						A0: *prodLines[0],
 						A1: *pr.curveF.Zero(),
 						A2: *prodLines[1],
-						A3: *prodLines[3],
-						A4: *prodLines[2],
+						A3: *prodLines[2],
+						A4: *prodLines[3],
 						A5: *prodLines[4],
 					},
 				)

From 72e558ab3bf77b727f45258485b22e5eef7e09f2 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 26 Apr 2024 15:50:19 -0400
Subject: [PATCH 08/24] perf(bw6): sparse mul by lines

---
 .../emulated/fields_bw6761/e6_pairing.go      | 52 ++++++++-----------
 std/algebra/emulated/sw_bw6761/pairing.go     | 24 +--------
 2 files changed, 25 insertions(+), 51 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index fcc81465e5..2c58b79b6e 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -301,9 +301,7 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 
 // MulBy02345 multiplies z by an E6 sparse element of the form
 //
-//	E6{
-//		B0: E3{A0: c0, A1: c1, A2: c2},
-//		B1: E3{A0: 0, A1: c4, A2: c5},
+//	E6{A0: y0, A1: 0, A2: y1, A3: y2, A4: y3, A5: y4},
 //	}
 func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	_t0 := e.fp.Add(&x.A0, &x.A1)
@@ -313,10 +311,9 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	t3 := e.fp.Add(t2, &x.A5)
 	t3 = e.fp.Add(t3, &x.A2)
 
-	_s0 := y[0]
-	s0 := e.fp.Add(_s0, y[1])
+	s0 := e.fp.Add(y[0], y[1])
 	s1 := e.fp.Add(y[2], y[3])
-	s2 := e.fp.Add(_s0, s1)
+	s2 := e.fp.Add(y[0], s1)
 	s3 := e.fp.Add(s2, y[4])
 	s3 = e.fp.Add(s3, y[1])
 
@@ -326,7 +323,7 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	t4 := e.fp.Add(t1, &x.A5)
 	s4 := e.fp.Add(s1, y[4])
 	v7 := e.fp.Mul(t4, s4)
-	v12 := e.fp.Mul(_t0, _s0)
+	v12 := e.fp.Mul(_t0, y[0])
 	v11 := e.fp.Mul(t1, s1)
 	t0 = e.fp.Add(&x.A2, &x.A3)
 	s0 = e.fp.Add(y[1], y[2])
@@ -334,7 +331,7 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	_t0 = e.fp.Sub(&x.A4, &x.A1)
 	v9 := e.fp.Mul(_t0, y[3])
 	t1 = e.fp.Add(&x.A1, &x.A2)
-	v10 := e.fp.Mul(t1, y[2])
+	v10 := e.fp.Mul(t1, y[1])
 	t1 = e.fp.Add(&x.A4, &x.A5)
 	s1 = e.fp.Add(y[3], y[4])
 	v13 := e.fp.Mul(t1, s1)
@@ -358,9 +355,9 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	v17 := e.fp.Mul(&x.A5, y[4])
 
 	c0 := e.fp.MulConst(v2, big.NewInt(4))
-	s1 = e.fp.Add(v8, v10)
-	s1 = e.fp.Add(s1, v11)
-	s1 = e.fp.MulConst(s1, big.NewInt(12))
+	s811 := e.fp.Add(v8, v11)
+	s81110 := e.fp.Add(s811, v10)
+	s1 = e.fp.MulConst(s81110, big.NewInt(12))
 	c0 = e.fp.Add(c0, s1)
 	s1 = e.fp.MulConst(v12, big.NewInt(8))
 	c0 = e.fp.Add(c0, s1)
@@ -370,7 +367,9 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	c0 = e.fp.Add(c0, s1)
 	s1 = e.fp.MulConst(v17, big.NewInt(20))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(4))
+	s1 = e.fp.MulConst(v16, big.NewInt(16))
+	s2 = e.fp.MulConst(v0, big.NewInt(4))
+	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v3, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v4, big.NewInt(4))
@@ -383,8 +382,8 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c0 = e.fp.Sub(c0, s1)
 
-	c1 := e.fp.Add(v3, v5)
-	c1 = e.fp.Add(c1, v6)
+	s35 := e.fp.Add(v3, v5)
+	c1 := e.fp.Add(s35, v6)
 	c1 = e.fp.MulConst(c1, big.NewInt(4))
 	s1 = e.fp.MulConst(v7, big.NewInt(8))
 	c1 = e.fp.Add(c1, s1)
@@ -405,21 +404,19 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c1 = e.fp.Sub(c1, s1)
 
-	c2 := v6
 	s1 = e.fp.MulConst(v11, big.NewInt(4))
-	c2 = e.fp.Add(c2, s1)
+	c2 := e.fp.Add(v6, s1)
 	s1 = e.fp.MulConst(v13, big.NewInt(4))
 	c2 = e.fp.Add(c2, s1)
-	s1 = e.fp.Add(v10, v12)
+	s1012 := e.fp.Add(v10, v12)
 	s2 = e.fp.MulConst(v7, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
+	s1 = e.fp.Add(s1012, s2)
 	s2 = e.fp.MulConst(v16, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
 
-	c3 := e.fp.Add(v8, v11)
 	s1 = e.fp.MulConst(v10, big.NewInt(3))
-	c3 = e.fp.Add(c3, s1)
+	c3 := e.fp.Add(s811, s1)
 	s1 = e.fp.MulConst(v12, big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
 	s1 = e.fp.MulConst(v14, big.NewInt(2))
@@ -428,18 +425,17 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	c3 = e.fp.Add(c3, s1)
 	s1 = e.fp.MulConst(v17, big.NewInt(6))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s1, v7)
+	s34 := e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s34, v7)
 	s2 = e.fp.MulConst(v6, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v13, big.NewInt(3))
 	s1 = e.fp.Add(s1, s2)
 	c3 = e.fp.Sub(c3, s1)
 
-	c4 := e.fp.Add(v2, v3)
-	c4 = e.fp.Add(c4, v9)
+	c4 := e.fp.Add(v2, v9)
 	c4 = e.fp.Add(c4, v7)
-	c4 = e.fp.Add(c4, v4)
+	c4 = e.fp.Add(c4, s34)
 	s1 = e.fp.MulConst(v6, big.NewInt(2))
 	c4 = e.fp.Add(c4, s1)
 	s1 = e.fp.Add(v13, v8)
@@ -455,16 +451,14 @@ func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c4 = e.fp.Sub(c4, s1)
 
-	c5 := e.fp.Add(v8, v10)
-	c5 = e.fp.Add(c5, v11)
-	c5 = e.fp.Add(c5, v12)
+	c5 := e.fp.Add(s81110, v12)
 	c5 = e.fp.Add(c5, v13)
 	c5 = e.fp.MulConst(c5, big.NewInt(2))
 	s1 = e.fp.MulConst(v14, big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
 	s1 = e.fp.MulConst(v17, big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.Add(v4, v3)
+	s1 = e.fp.Add(v16, s34)
 	s1 = e.fp.Add(s1, v5)
 	s1 = e.fp.Add(s1, v9)
 	s2 = e.fp.MulConst(v6, big.NewInt(2))
diff --git a/std/algebra/emulated/sw_bw6761/pairing.go b/std/algebra/emulated/sw_bw6761/pairing.go
index bda8c5f218..b6fb462139 100644
--- a/std/algebra/emulated/sw_bw6761/pairing.go
+++ b/std/algebra/emulated/sw_bw6761/pairing.go
@@ -345,17 +345,7 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 					pr.curveF.Mul(&lines[k][1][i].R1, yInv[k]),
 					pr.curveF.Mul(&lines[k][1][i].R0, xNegOverY[k]),
 				)
-				result = pr.Mul(
-					result,
-					&fields_bw6761.E6{
-						A0: *prodLines[0],
-						A1: *pr.curveF.Zero(),
-						A2: *prodLines[1],
-						A3: *prodLines[2],
-						A4: *prodLines[3],
-						A5: *prodLines[4],
-					},
-				)
+				result = pr.MulBy02345(result, prodLines)
 			}
 		} else {
 			// if number of lines is odd, mul last line by res
@@ -375,17 +365,7 @@ func (pr Pairing) millerLoopLines(P []*G1Affine, lines []lineEvaluations) (*GTEl
 					pr.curveF.Mul(&lines[k-1][0][i].R1, yInv[k-1]),
 					pr.curveF.Mul(&lines[k-1][0][i].R0, xNegOverY[k-1]),
 				)
-				result = pr.Mul(
-					result,
-					&fields_bw6761.E6{
-						A0: *prodLines[0],
-						A1: *pr.curveF.Zero(),
-						A2: *prodLines[1],
-						A3: *prodLines[2],
-						A4: *prodLines[3],
-						A5: *prodLines[4],
-					},
-				)
+				result = pr.MulBy02345(result, prodLines)
 			}
 		}
 	}

From a0949567c06505303ac048e8d282e8c29d355e14 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Sat, 27 Apr 2024 13:57:17 -0400
Subject: [PATCH 09/24] perf(bw6): Square uses Karatsuba over Chung-Hasan
 instead of TC6

---
 std/algebra/emulated/fields_bw6761/e6.go | 102 ++++++++++++++++++++---
 1 file changed, 92 insertions(+), 10 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 197eabaf19..78a8ea58e8 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -189,14 +189,6 @@ func mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
 	return z
 }
 
-func (e Ext6) Mul(x, y *E6) *E6 {
-	x = e.Reduce(x)
-	y = e.Reduce(y)
-	v := e.interpolationX6Mul(x, y)
-	return e.mulMontgomery6(v)
-	// return e.mulToomCook6(v)
-}
-
 func (e Ext6) interpolationX6Mul(x, y *E6) [18]*baseEl {
 	// Fixing the polynomial to X^6 we first compute the interpolation points
 	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
@@ -495,13 +487,103 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	}
 }
 
-func (e Ext6) Square(x *E6) *E6 {
+func (e Ext6) Mul(x, y *E6) *E6 {
 	x = e.Reduce(x)
-	v := e.interpolationX6Sq(x)
+	y = e.Reduce(y)
+	v := e.interpolationX6Mul(x, y)
 	return e.mulMontgomery6(v)
 	// return e.mulToomCook6(v)
 }
 
+func (e Ext6) Square(x *E6) *E6 {
+	// We don't use Montgomery-6 or Toom-Cook-6 for the squaring but instead we
+	// simulate a quadratic over cubic extension tower because Karatsuba over
+	// Chung-Hasan SQR2 is better constraint wise.
+	//
+	// Algorithm 22 from https://eprint.iacr.org/2010/354.pdf
+	x = e.Reduce(x)
+
+	// c0
+	c00 := e.fp.Sub(&x.A0, &x.A1)
+	c01 := e.fp.Sub(&x.A2, &x.A3)
+	c02 := e.fp.Sub(&x.A4, &x.A5)
+
+	// c3
+	c30 := e.fp.Add(&x.A0, e.fp.MulConst(&x.A5, big.NewInt(4)))
+	c31 := e.fp.Sub(&x.A2, &x.A1)
+	c32 := e.fp.Sub(&x.A4, &x.A3)
+
+	t0 := e.fp.Mul(&x.A0, &x.A1)
+	t1 := e.fp.Mul(&x.A2, &x.A3)
+	t2 := e.fp.Mul(&x.A4, &x.A5)
+	c0 := e.fp.Add(&x.A2, &x.A4)
+	tmp := e.fp.Add(&x.A3, &x.A5)
+	c0 = e.fp.Mul(c0, tmp)
+	c0 = e.fp.Sub(c0, t1)
+	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.MulConst(c0, big.NewInt(4))
+	tmp = e.fp.Add(&x.A0, &x.A4)
+	c2 := e.fp.Add(&x.A1, &x.A5)
+	c2 = e.fp.Mul(c2, tmp)
+	c2 = e.fp.Sub(c2, t0)
+	c2 = e.fp.Sub(c2, t2)
+	c1 := e.fp.Add(&x.A0, &x.A2)
+	tmp = e.fp.Add(&x.A1, &x.A3)
+	c1 = e.fp.Mul(c1, tmp)
+	c1 = e.fp.Sub(c1, t0)
+	c1 = e.fp.Sub(c1, t1)
+	t2 = mulFpByNonResidue(e.fp, t2)
+	// c2
+	c20 := e.fp.Add(c0, t0)
+	c21 := e.fp.Add(c1, t2)
+	c22 := e.fp.Add(c2, t1)
+
+	t0 = e.fp.Mul(c00, c30)
+	t1 = e.fp.Mul(c01, c31)
+	t2 = e.fp.Mul(c02, c32)
+	c0 = e.fp.Add(c01, c02)
+	tmp = e.fp.Add(c31, c32)
+	c0 = e.fp.Mul(c0, tmp)
+	c0 = e.fp.Sub(c0, t1)
+	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.MulConst(c0, big.NewInt(4))
+	tmp = e.fp.Add(c00, c02)
+	c2 = e.fp.Add(c30, c32)
+	c2 = e.fp.Mul(c2, tmp)
+	c2 = e.fp.Sub(c2, t0)
+	c2 = e.fp.Sub(c2, t2)
+	c1 = e.fp.Add(c00, c01)
+	tmp = e.fp.Add(c30, c31)
+	c1 = e.fp.Mul(c1, tmp)
+	c1 = e.fp.Sub(c1, t0)
+	c1 = e.fp.Sub(c1, t1)
+	t2 = mulFpByNonResidue(e.fp, t2)
+	c00 = e.fp.Add(c0, t0)
+	c01 = e.fp.Add(c1, t2)
+	c02 = e.fp.Add(c2, t1)
+
+	c00 = e.fp.Add(c00, c20)
+	c01 = e.fp.Add(c01, c21)
+	c02 = e.fp.Add(c02, c22)
+
+	b10 := e.fp.MulConst(c20, big.NewInt(2))
+	b11 := e.fp.MulConst(c21, big.NewInt(2))
+	b12 := e.fp.MulConst(c22, big.NewInt(2))
+
+	b00 := e.fp.Sub(c00, e.fp.MulConst(c22, big.NewInt(4)))
+	b01 := e.fp.Add(c01, c20)
+	b02 := e.fp.Add(c02, c21)
+
+	return &E6{
+		A0: *b00,
+		A1: *b10,
+		A2: *b01,
+		A3: *b11,
+		A4: *b02,
+		A5: *b12,
+	}
+}
+
 /*
 func (e Ext6) MulToomCook6x(x, y *E6) *E6 {
 	//	Then we compute the product  362880*x*y to avoid divisions:

From 00c01e534e0d73d214a9bb0e220aeda498fab275 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Sat, 27 Apr 2024 14:33:31 -0400
Subject: [PATCH 10/24] perf(bw6): save some subs in Fp6 square

---
 std/algebra/emulated/fields_bw6761/e6.go | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 78a8ea58e8..5564d6773a 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -519,19 +519,16 @@ func (e Ext6) Square(x *E6) *E6 {
 	c0 := e.fp.Add(&x.A2, &x.A4)
 	tmp := e.fp.Add(&x.A3, &x.A5)
 	c0 = e.fp.Mul(c0, tmp)
-	c0 = e.fp.Sub(c0, t1)
-	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.Sub(e.fp.Add(t1, t2), c0)
 	c0 = e.fp.MulConst(c0, big.NewInt(4))
 	tmp = e.fp.Add(&x.A0, &x.A4)
 	c2 := e.fp.Add(&x.A1, &x.A5)
 	c2 = e.fp.Mul(c2, tmp)
-	c2 = e.fp.Sub(c2, t0)
-	c2 = e.fp.Sub(c2, t2)
+	c2 = e.fp.Sub(c2, e.fp.Add(t0, t2))
 	c1 := e.fp.Add(&x.A0, &x.A2)
 	tmp = e.fp.Add(&x.A1, &x.A3)
 	c1 = e.fp.Mul(c1, tmp)
-	c1 = e.fp.Sub(c1, t0)
-	c1 = e.fp.Sub(c1, t1)
+	c1 = e.fp.Sub(c1, e.fp.Add(t0, t1))
 	t2 = mulFpByNonResidue(e.fp, t2)
 	// c2
 	c20 := e.fp.Add(c0, t0)
@@ -544,19 +541,16 @@ func (e Ext6) Square(x *E6) *E6 {
 	c0 = e.fp.Add(c01, c02)
 	tmp = e.fp.Add(c31, c32)
 	c0 = e.fp.Mul(c0, tmp)
-	c0 = e.fp.Sub(c0, t1)
-	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.Sub(e.fp.Add(t1, t2), c0)
 	c0 = e.fp.MulConst(c0, big.NewInt(4))
 	tmp = e.fp.Add(c00, c02)
 	c2 = e.fp.Add(c30, c32)
 	c2 = e.fp.Mul(c2, tmp)
-	c2 = e.fp.Sub(c2, t0)
-	c2 = e.fp.Sub(c2, t2)
+	c2 = e.fp.Sub(c2, e.fp.Add(t0, t2))
 	c1 = e.fp.Add(c00, c01)
 	tmp = e.fp.Add(c30, c31)
 	c1 = e.fp.Mul(c1, tmp)
-	c1 = e.fp.Sub(c1, t0)
-	c1 = e.fp.Sub(c1, t1)
+	c1 = e.fp.Sub(c1, e.fp.Add(t0, t1))
 	t2 = mulFpByNonResidue(e.fp, t2)
 	c00 = e.fp.Add(c0, t0)
 	c01 = e.fp.Add(c1, t2)

From a203ff89d192db9ad4de85c2e4e22371d2269ea3 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 29 Apr 2024 14:18:59 -0400
Subject: [PATCH 11/24] perf(bw6): mulby02345

---
 std/algebra/emulated/fields_bw6761/e6.go      |  62 ----
 .../emulated/fields_bw6761/e6_pairing.go      | 265 ++++++------------
 2 files changed, 93 insertions(+), 234 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 5564d6773a..23d25cf7a1 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -268,68 +268,6 @@ func (e Ext6) interpolationX6Mul(x, y *E6) [18]*baseEl {
 	return [18]*baseEl{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17}
 }
 
-func (e Ext6) interpolationX6Sq(x *E6) [18]*baseEl {
-	// Fixing the polynomial to X^6 we first compute the interpolation points
-	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
-	//
-	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)^2
-	//		v2 = (a0 + a1 + a3 + a4)^2
-	//		v3 = (a0 − a2 − a3 + a5)^2
-	//		v4 = (a0 − a2 − a5)^2
-	//		v5 = (a0 + a3 − a5)^2
-	//		v6 = (a0 + a1 + a2)^2
-	//		v7 = (a3 + a4 + a5)^2
-	//		v8 = (a2 + a3)^2
-	//		v9 = (a1 − a4)^2
-	//		v10 = (a1 + a2)^2
-	//		v11 = (a3 + a4)^2
-	//		v12 = (a0 + a1)^2
-	//		v13 = (a4 + a5)^2
-	//		v14 = a0^2
-	//		v15 = a1^2
-	//		v16 = a4^2
-	//		v17 = a5^2
-
-	_t0 := e.fp.Add(&x.A0, &x.A1)
-	t0 := e.fp.Add(_t0, &x.A2)
-	t1 := e.fp.Add(&x.A3, &x.A4)
-	t2 := e.fp.Add(_t0, t1)
-	t3 := e.fp.Add(t2, &x.A5)
-	t3 = e.fp.Add(t3, &x.A2)
-
-	v0 := e.fp.Mul(t3, t3)
-	v2 := e.fp.Mul(t2, t2)
-	v6 := e.fp.Mul(t0, t0)
-	t4 := e.fp.Add(t1, &x.A5)
-	v7 := e.fp.Mul(t4, t4)
-	v12 := e.fp.Mul(_t0, _t0)
-	v11 := e.fp.Mul(t1, t1)
-	t0 = e.fp.Add(&x.A2, &x.A3)
-	v8 := e.fp.Mul(t0, t0)
-	_t0 = e.fp.Sub(&x.A1, &x.A4)
-	v9 := e.fp.Mul(_t0, _t0)
-	t1 = e.fp.Add(&x.A1, &x.A2)
-	v10 := e.fp.Mul(t1, t1)
-	t1 = e.fp.Add(&x.A4, &x.A5)
-	v13 := e.fp.Mul(t1, t1)
-	v3 := e.fp.Add(&x.A0, &x.A5)
-	v3 = e.fp.Sub(v3, t0)
-	v3 = e.fp.Mul(v3, v3)
-	t1 = e.fp.Add(&x.A2, &x.A5)
-	t2 = e.fp.Sub(&x.A0, t1)
-	v4 := e.fp.Mul(t2, t2)
-	t1 = e.fp.Add(&x.A0, &x.A3)
-	t1 = e.fp.Sub(t1, &x.A5)
-	v5 := e.fp.Mul(t1, t1)
-	v14 := e.fp.Mul(&x.A0, &x.A0)
-	v15 := e.fp.Mul(&x.A1, &x.A1)
-	v16 := e.fp.Mul(&x.A4, &x.A4)
-	v17 := e.fp.Mul(&x.A5, &x.A5)
-	v1 := e.fp.Zero()
-
-	return [18]*baseEl{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17}
-}
-
 func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	// Then we compute the coefficients c0,c1,c3,c4 and c5 in the direct sextic
 	// extension of the product x*y as follows:
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 2c58b79b6e..bc10352845 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -293,8 +293,8 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 	x01 = e.fp.Sub(x01, tmp)
 	x14 := e.fp.Add(c1, d1)
 
-	four := emulated.ValueOf[emulated.BW6761Fp](big.NewInt(4))
-	zC0B0 := e.fp.Sub(x0, &four)
+	minusFour := emulated.ValueOf[emulated.BW6761Fp]("6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068295") // -4 % p
+	zC0B0 := e.fp.Add(x0, &minusFour)
 
 	return [5]*baseEl{zC0B0, x01, x04, x1, x14}
 }
@@ -303,179 +303,100 @@ func (e Ext6) Mul023By023(d0, d1, c0, c1 *baseEl) [5]*baseEl {
 //
 //	E6{A0: y0, A1: 0, A2: y1, A3: y2, A4: y3, A5: y4},
 //	}
-func (e *Ext6) MulBy02345(x *E6, y [5]*baseEl) *E6 {
-	_t0 := e.fp.Add(&x.A0, &x.A1)
-	t0 := e.fp.Add(_t0, &x.A2)
-	t1 := e.fp.Add(&x.A3, &x.A4)
-	t2 := e.fp.Add(_t0, t1)
-	t3 := e.fp.Add(t2, &x.A5)
-	t3 = e.fp.Add(t3, &x.A2)
-
-	s0 := e.fp.Add(y[0], y[1])
-	s1 := e.fp.Add(y[2], y[3])
-	s2 := e.fp.Add(y[0], s1)
-	s3 := e.fp.Add(s2, y[4])
-	s3 = e.fp.Add(s3, y[1])
-
-	v0 := e.fp.Mul(t3, s3)
-	v2 := e.fp.Mul(t2, s2)
-	v6 := e.fp.Mul(t0, s0)
-	t4 := e.fp.Add(t1, &x.A5)
-	s4 := e.fp.Add(s1, y[4])
-	v7 := e.fp.Mul(t4, s4)
-	v12 := e.fp.Mul(_t0, y[0])
-	v11 := e.fp.Mul(t1, s1)
-	t0 = e.fp.Add(&x.A2, &x.A3)
-	s0 = e.fp.Add(y[1], y[2])
-	v8 := e.fp.Mul(t0, s0)
-	_t0 = e.fp.Sub(&x.A4, &x.A1)
-	v9 := e.fp.Mul(_t0, y[3])
-	t1 = e.fp.Add(&x.A1, &x.A2)
-	v10 := e.fp.Mul(t1, y[1])
-	t1 = e.fp.Add(&x.A4, &x.A5)
-	s1 = e.fp.Add(y[3], y[4])
-	v13 := e.fp.Mul(t1, s1)
-	v3 := e.fp.Add(&x.A0, &x.A5)
-	v3 = e.fp.Sub(v3, t0)
-	s1 = e.fp.Add(y[0], y[4])
-	s1 = e.fp.Sub(s1, s0)
-	v3 = e.fp.Mul(v3, s1)
-	t1 = e.fp.Add(&x.A2, &x.A5)
-	t2 = e.fp.Sub(&x.A0, t1)
-	s1 = e.fp.Add(y[1], y[4])
-	s2 = e.fp.Sub(y[0], s1)
-	v4 := e.fp.Mul(t2, s2)
-	t1 = e.fp.Add(&x.A0, &x.A3)
-	t1 = e.fp.Sub(t1, &x.A5)
-	s1 = e.fp.Add(y[0], y[2])
-	s1 = e.fp.Sub(s1, y[4])
-	v5 := e.fp.Mul(t1, s1)
-	v14 := e.fp.Mul(&x.A0, y[0])
-	v16 := e.fp.Mul(&x.A4, y[3])
-	v17 := e.fp.Mul(&x.A5, y[4])
-
-	c0 := e.fp.MulConst(v2, big.NewInt(4))
-	s811 := e.fp.Add(v8, v11)
-	s81110 := e.fp.Add(s811, v10)
-	s1 = e.fp.MulConst(s81110, big.NewInt(12))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(8))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v13, big.NewInt(16))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v14, big.NewInt(21))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v17, big.NewInt(20))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v16, big.NewInt(16))
-	s2 = e.fp.MulConst(v0, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v3, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v4, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v5, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v6, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v7, big.NewInt(12))
-	s1 = e.fp.Add(s1, s2)
-	c0 = e.fp.Sub(c0, s1)
-
-	s35 := e.fp.Add(v3, v5)
-	c1 := e.fp.Add(s35, v6)
-	c1 = e.fp.MulConst(c1, big.NewInt(4))
-	s1 = e.fp.MulConst(v7, big.NewInt(8))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v16, big.NewInt(12))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(3))
-	s2 = e.fp.MulConst(v14, big.NewInt(9))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v8, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v11, big.NewInt(12))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v13, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v17, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	c1 = e.fp.Sub(c1, s1)
-
-	s1 = e.fp.MulConst(v11, big.NewInt(4))
-	c2 := e.fp.Add(v6, s1)
-	s1 = e.fp.MulConst(v13, big.NewInt(4))
-	c2 = e.fp.Add(c2, s1)
-	s1012 := e.fp.Add(v10, v12)
-	s2 = e.fp.MulConst(v7, big.NewInt(4))
-	s1 = e.fp.Add(s1012, s2)
-	s2 = e.fp.MulConst(v16, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	c2 = e.fp.Sub(c2, s1)
-
-	s1 = e.fp.MulConst(v10, big.NewInt(3))
-	c3 := e.fp.Add(s811, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(2))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v14, big.NewInt(2))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v16, big.NewInt(3))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v17, big.NewInt(6))
-	c3 = e.fp.Add(c3, s1)
-	s34 := e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s34, v7)
-	s2 = e.fp.MulConst(v6, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v13, big.NewInt(3))
-	s1 = e.fp.Add(s1, s2)
-	c3 = e.fp.Sub(c3, s1)
-
-	c4 := e.fp.Add(v2, v9)
-	c4 = e.fp.Add(c4, v7)
-	c4 = e.fp.Add(c4, s34)
-	s1 = e.fp.MulConst(v6, big.NewInt(2))
-	c4 = e.fp.Add(c4, s1)
-	s1 = e.fp.Add(v13, v8)
-	s2 = e.fp.MulConst(v10, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v11, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v12, big.NewInt(3))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v14, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v17, big.NewInt(6))
-	s1 = e.fp.Add(s1, s2)
-	c4 = e.fp.Sub(c4, s1)
-
-	c5 := e.fp.Add(s81110, v12)
-	c5 = e.fp.Add(c5, v13)
-	c5 = e.fp.MulConst(c5, big.NewInt(2))
-	s1 = e.fp.MulConst(v14, big.NewInt(3))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v17, big.NewInt(3))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.Add(v16, s34)
-	s1 = e.fp.Add(s1, v5)
-	s1 = e.fp.Add(s1, v9)
-	s2 = e.fp.MulConst(v6, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v7, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	c5 = e.fp.Sub(c5, s1)
+func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
+	a0 := e.fp.Add(&z.A0, &z.A1)
+	a1 := e.fp.Add(&z.A2, &z.A3)
+	a2 := e.fp.Add(&z.A4, &z.A5)
+
+	b1 := e.fp.Add(x[1], x[2])
+	b2 := e.fp.Add(x[3], x[4])
+
+	t0 := e.fp.Mul(a0, x[0])
+	t1 := e.fp.Mul(a1, b1)
+	t2 := e.fp.Mul(a2, b2)
+	c0 := e.fp.Add(a1, a2)
+	tmp := e.fp.Add(b1, b2)
+	c0 = e.fp.Mul(c0, tmp)
+	c0 = e.fp.Sub(c0, t1)
+	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.MulConst(c0, big.NewInt(4))
+	tmp = e.fp.Add(a0, a2)
+	c2 := e.fp.Add(x[0], b2)
+	c2 = e.fp.Mul(c2, tmp)
+	c2 = e.fp.Sub(c2, t0)
+	c2 = e.fp.Sub(c2, t2)
+	c1 := e.fp.Add(a0, a1)
+	tmp = e.fp.Add(x[0], b1)
+	c1 = e.fp.Mul(c1, tmp)
+	c1 = e.fp.Sub(c1, t0)
+	c1 = e.fp.Sub(c1, t1)
+	t2 = mulFpByNonResidue(e.fp, t2)
+	a0 = e.fp.Add(c0, t0)
+	a1 = e.fp.Add(c1, t2)
+	a2 = e.fp.Add(c2, t1)
+
+	t0 = e.fp.Mul(&z.A0, x[0])
+	t1 = e.fp.Mul(&z.A2, x[1])
+	t2 = e.fp.Mul(&z.A4, x[3])
+	c0 = e.fp.Add(&z.A2, &z.A4)
+	tmp = e.fp.Add(x[1], x[3])
+	c0 = e.fp.Mul(c0, tmp)
+	c0 = e.fp.Sub(c0, t1)
+	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.MulConst(c0, big.NewInt(4))
+	tmp = e.fp.Add(&z.A0, &z.A4)
+	c2 = e.fp.Add(x[0], x[3])
+	c2 = e.fp.Mul(c2, tmp)
+	c2 = e.fp.Sub(c2, t0)
+	c2 = e.fp.Sub(c2, t2)
+	c1 = e.fp.Add(&z.A0, &z.A2)
+	tmp = e.fp.Add(x[0], x[1])
+	c1 = e.fp.Mul(c1, tmp)
+	c1 = e.fp.Sub(c1, t0)
+	c1 = e.fp.Sub(c1, t1)
+	t2 = mulFpByNonResidue(e.fp, t2)
+	b0 := e.fp.Add(c0, t0)
+	b1 = e.fp.Add(c1, t2)
+	b2 = e.fp.Add(c2, t1)
+
+	t1 = e.fp.Mul(&z.A3, x[2])
+	t2 = e.fp.Mul(&z.A5, x[4])
+	c0 = e.fp.Add(&z.A3, &z.A5)
+	tmp = e.fp.Add(x[2], x[4])
+	c0 = e.fp.Mul(c0, tmp)
+	c0 = e.fp.Sub(c0, t1)
+	c0 = e.fp.Sub(t2, c0)
+	c0 = e.fp.MulConst(c0, big.NewInt(4))
+	c1 = e.fp.Add(&z.A1, &z.A3)
+	c1 = e.fp.Mul(c1, x[2])
+	c1 = e.fp.Sub(c1, t1)
+	tmp = mulFpByNonResidue(e.fp, t2)
+	c1 = e.fp.Add(c1, tmp)
+	tmp = e.fp.Add(&z.A1, &z.A5)
+	c2 = e.fp.Mul(x[4], tmp)
+	c2 = e.fp.Sub(c2, t2)
+	c2 = e.fp.Add(c2, t1)
+
+	tmp = e.fp.Add(b0, c0)
+	z10 := e.fp.Sub(a0, tmp)
+	tmp = e.fp.Add(b1, c1)
+	z11 := e.fp.Sub(a1, tmp)
+	tmp = e.fp.Add(b2, c2)
+	z12 := e.fp.Sub(a2, tmp)
+
+	z00 := mulFpByNonResidue(e.fp, c2)
+	z00 = e.fp.Add(z00, b0)
+	z01 := e.fp.Add(c0, b1)
+	z02 := e.fp.Add(c1, b2)
 
 	return &E6{
-		A0: *c0,
-		A1: *c1,
-		A2: *c2,
-		A3: *c3,
-		A4: *c4,
-		A5: *c5,
+		A0: *z00,
+		A1: *z10,
+		A2: *z01,
+		A3: *z11,
+		A4: *z02,
+		A5: *z12,
 	}
-
 }
 
 /*

From 0351ff1b2c41b02118c40c70dda90be982815138 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 29 Apr 2024 16:08:09 -0400
Subject: [PATCH 12/24] perf(bw6): optimize mulby023

---
 .../emulated/fields_bw6761/e6_pairing.go      | 208 ++++++------------
 1 file changed, 69 insertions(+), 139 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index bc10352845..24ed322a75 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -130,149 +130,79 @@ func (e Ext6) ExpC2(z *E6) *E6 {
 // MulBy023 multiplies z by an E6 sparse element of the form
 //
 //	E6{A0: c0, A1: 0, A2: c1, A3: 1,  A4: 0,  A5: 0}
-func (e *Ext6) MulBy023(x *E6, c0, c1 *baseEl) *E6 {
-	x = e.Reduce(x)
-	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)(c0 + c1 + 1)
-	//		v2 = (a0 + a1 + a3 + a4)(c0 + 1)
-	//		v3 = (a0 − a2 − a3 + a5)(c0 − c1 − 1)
-	//		v4 = (a0 − a2 − a5)(c0 − c1)
-	//		v5 = (a0 + a3 − a5)(c0 + 1)
-	//		v6 = (a0 + a1 + a2)(c0 + c1)
-	//		v7 = (a3 + a4 + a5)
-	//		v8 = (a2 + a3)(c1 + 1)
-	//		v10 = (a1 + a2)c1
-	//		v11 = (a3 + a4)
-	//		v12 = (a0 + a1)c0
-	//		v14 = a0c0
-
-	_t0 := e.fp.Add(&x.A0, &x.A1)
-	t0 := e.fp.Add(_t0, &x.A2)
-	t1 := e.fp.Add(&x.A3, &x.A4)
-	t2 := e.fp.Add(_t0, t1)
-	t3 := e.fp.Add(t2, &x.A5)
-	t3 = e.fp.Add(t3, &x.A2)
-
-	s0 := e.fp.Add(c0, c1)
+func (e *Ext6) MulBy023(z *E6, c0, c1 *baseEl) *E6 {
+	z = e.Reduce(z)
+
+	// a := e.MulBy01(&z.B0, c0, c1)
+	a := e.fp.Mul(&z.A0, c0)
+	b := e.fp.Mul(&z.A2, c1)
+	tmp := e.fp.Add(&z.A2, &z.A4)
+	a0 := e.fp.Mul(c1, tmp)
+	a0 = e.fp.Sub(b, a0)
+	a0 = e.fp.MulConst(a0, big.NewInt(4))
+	a0 = e.fp.Add(a0, a)
+	a2 := e.fp.Mul(&z.A4, c0)
+	a2 = e.fp.Add(a2, b)
+	a1 := e.fp.Add(c0, c1)
+	tmp = e.fp.Add(&z.A0, &z.A2)
+	a1 = e.fp.Mul(a1, tmp)
+	a1 = e.fp.Sub(a1, a)
+	a1 = e.fp.Sub(a1, b)
+
+	b0 := e.fp.MulConst(&z.A5, big.NewInt(4))
+	b2 := e.fp.Neg(&z.A3)
+	b1 := e.fp.Neg(&z.A1)
+
 	one := e.fp.One()
-	s2 := e.fp.Add(c0, one)
-	s3 := e.fp.Add(s2, c1)
-
-	v0 := e.fp.Mul(t3, s3)
-	v2 := e.fp.Mul(t2, s2)
-	v6 := e.fp.Mul(t0, s0)
-	t4 := e.fp.Add(t1, &x.A5)
-	v7 := t4
-	v12 := e.fp.Mul(_t0, c0)
-	v11 := t1
-	t0 = e.fp.Add(&x.A2, &x.A3)
-	s0 = e.fp.Add(c1, one)
-	v8 := e.fp.Mul(t0, s0)
-	t1 = e.fp.Add(&x.A1, &x.A2)
-	v10 := e.fp.Mul(t1, c1)
-	v3 := e.fp.Add(&x.A0, &x.A5)
-	v3 = e.fp.Sub(v3, t0)
-	s1 := e.fp.Sub(c0, s0)
-	v3 = e.fp.Mul(v3, s1)
-	t1 = e.fp.Add(&x.A2, &x.A5)
-	t2 = e.fp.Sub(&x.A0, t1)
-	s2 = e.fp.Sub(c0, c1)
-	v4 := e.fp.Mul(t2, s2)
-	t1 = e.fp.Add(&x.A0, &x.A3)
-	t1 = e.fp.Sub(t1, &x.A5)
-	s1 = e.fp.Add(c0, one)
-	v5 := e.fp.Mul(t1, s1)
-	v14 := e.fp.Mul(&x.A0, c0)
-
-	z0 := e.fp.MulConst(v2, big.NewInt(4))
-	s811 := e.fp.Add(v8, v11)
-	s81110 := e.fp.Add(s811, v10)
-	s1 = e.fp.MulConst(s81110, big.NewInt(12))
-	z0 = e.fp.Add(z0, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(8))
-	z0 = e.fp.Add(z0, s1)
-	s1 = e.fp.MulConst(v14, big.NewInt(21))
-	z0 = e.fp.Add(z0, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(4))
-	s2 = e.fp.MulConst(v3, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v4, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v5, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v6, big.NewInt(8))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v7, big.NewInt(12))
-	s1 = e.fp.Add(s1, s2)
-	z0 = e.fp.Sub(z0, s1)
-
-	s35 := e.fp.Add(v3, v5)
-	z1 := e.fp.Add(s35, v6)
-	z1 = e.fp.MulConst(z1, big.NewInt(4))
-	s1 = e.fp.MulConst(v7, big.NewInt(8))
-	z1 = e.fp.Add(z1, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(3))
-	s2 = e.fp.MulConst(v14, big.NewInt(9))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v8, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v11, big.NewInt(12))
-	s1 = e.fp.Add(s1, s2)
-	z1 = e.fp.Sub(z1, s1)
-
-	s1 = e.fp.MulConst(v11, big.NewInt(4))
-	z2 := e.fp.Add(v6, s1)
-	s1 = e.fp.Add(v10, v12)
-	s2 = e.fp.MulConst(v7, big.NewInt(4))
-	s1 = e.fp.Add(s1, s2)
-	z2 = e.fp.Sub(z2, s1)
-
-	s1 = e.fp.MulConst(v10, big.NewInt(3))
-	z3 := e.fp.Add(s811, s1)
-	s1 = e.fp.MulConst(v12, big.NewInt(2))
-	z3 = e.fp.Add(z3, s1)
-	s1 = e.fp.MulConst(v14, big.NewInt(2))
-	z3 = e.fp.Add(z3, s1)
-	s34 := e.fp.Add(v3, v4)
-	s1 = e.fp.Add(s34, v7)
-	s2 = e.fp.MulConst(v6, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	z3 = e.fp.Sub(z3, s1)
-
-	z4 := e.fp.Add(v2, v7)
-	z4 = e.fp.Add(z4, s34)
-	s1 = e.fp.MulConst(v6, big.NewInt(2))
-	z4 = e.fp.Add(z4, s1)
-	s2 = e.fp.MulConst(v10, big.NewInt(2))
-	s1 = e.fp.Add(v8, s2)
-	s2 = e.fp.MulConst(v11, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v12, big.NewInt(3))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v14, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	z4 = e.fp.Sub(z4, s1)
-
-	z5 := e.fp.Add(s81110, v12)
-	z5 = e.fp.MulConst(z5, big.NewInt(2))
-	s1 = e.fp.MulConst(v14, big.NewInt(3))
-	z5 = e.fp.Add(z5, s1)
-	s1 = e.fp.Add(s34, v5)
-	s2 = e.fp.MulConst(v6, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v7, big.NewInt(2))
-	s1 = e.fp.Add(s1, s2)
-	z5 = e.fp.Sub(z5, s1)
+	d := e.fp.Add(c1, one)
+
+	// zC1 := e.Ext3.Add(&z.B1, &z.B0)
+	// 		a00 a01 a02 a10 a11 a12
+	// 		A0  A2  A4  A1  A3  A5
+	zC10 := e.fp.Add(&z.A1, &z.A0)
+	zC11 := e.fp.Add(&z.A3, &z.A2)
+	zC12 := e.fp.Add(&z.A5, &z.A4)
+
+	// zC1 = e.Ext3.MulBy01(zC1, c0, d)
+	a = e.fp.Mul(zC10, c0)
+	b = e.fp.Mul(zC11, d)
+	tmp = e.fp.Add(zC11, zC12)
+	t0 := e.fp.Mul(d, tmp)
+	t0 = e.fp.Sub(b, t0)
+	t0 = e.fp.MulConst(t0, big.NewInt(4))
+	t0 = e.fp.Add(t0, a)
+	t2 := e.fp.Mul(zC12, c0)
+	t2 = e.fp.Add(t2, b)
+	t1 := e.fp.Add(c0, d)
+	tmp = e.fp.Add(zC10, zC11)
+	t1 = e.fp.Mul(t1, tmp)
+	t1 = e.fp.Sub(t1, a)
+	t1 = e.fp.Sub(t1, b)
+
+	// zC1 = e.Ext3.Sub(zC1, a)
+	zC10 = e.fp.Sub(t0, a0)
+	zC11 = e.fp.Sub(t1, a1)
+	zC12 = e.fp.Sub(t2, a2)
+
+	// zC1 = e.Ext3.Add(zC1, &b)
+	zC10 = e.fp.Add(zC10, b0)
+	zC11 = e.fp.Add(zC11, b1)
+	zC12 = e.fp.Add(zC12, b2)
+
+	// zC0 = e.Ext3.Add(zC0, a)
+	zC00 := e.fp.Add(a0, e.fp.MulConst(b2, big.NewInt(4)))
+	zC01 := e.fp.Sub(a1, b0)
+	zC02 := e.fp.Sub(a2, b1)
 
 	return &E6{
-		A0: *z0,
-		A1: *z1,
-		A2: *z2,
-		A3: *z3,
-		A4: *z4,
-		A5: *z5,
+		A0: *zC00,
+		A1: *zC10,
+		A2: *zC01,
+		A3: *zC11,
+		A4: *zC02,
+		A5: *zC12,
 	}
+
 }
 
 //	Mul023By023 multiplies two E6 sparse element of the form:

From ece2c044d7504db7a61ef5ad99a169ec101a657b Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 29 Apr 2024 16:43:28 -0400
Subject: [PATCH 13/24] refactor(bw6): remove dead code

---
 .../emulated/fields_bw6761/e6_pairing.go      | 53 -------------------
 1 file changed, 53 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 24ed322a75..3365e900f2 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -133,7 +133,6 @@ func (e Ext6) ExpC2(z *E6) *E6 {
 func (e *Ext6) MulBy023(z *E6, c0, c1 *baseEl) *E6 {
 	z = e.Reduce(z)
 
-	// a := e.MulBy01(&z.B0, c0, c1)
 	a := e.fp.Mul(&z.A0, c0)
 	b := e.fp.Mul(&z.A2, c1)
 	tmp := e.fp.Add(&z.A2, &z.A4)
@@ -156,14 +155,10 @@ func (e *Ext6) MulBy023(z *E6, c0, c1 *baseEl) *E6 {
 	one := e.fp.One()
 	d := e.fp.Add(c1, one)
 
-	// zC1 := e.Ext3.Add(&z.B1, &z.B0)
-	// 		a00 a01 a02 a10 a11 a12
-	// 		A0  A2  A4  A1  A3  A5
 	zC10 := e.fp.Add(&z.A1, &z.A0)
 	zC11 := e.fp.Add(&z.A3, &z.A2)
 	zC12 := e.fp.Add(&z.A5, &z.A4)
 
-	// zC1 = e.Ext3.MulBy01(zC1, c0, d)
 	a = e.fp.Mul(zC10, c0)
 	b = e.fp.Mul(zC11, d)
 	tmp = e.fp.Add(zC11, zC12)
@@ -179,17 +174,14 @@ func (e *Ext6) MulBy023(z *E6, c0, c1 *baseEl) *E6 {
 	t1 = e.fp.Sub(t1, a)
 	t1 = e.fp.Sub(t1, b)
 
-	// zC1 = e.Ext3.Sub(zC1, a)
 	zC10 = e.fp.Sub(t0, a0)
 	zC11 = e.fp.Sub(t1, a1)
 	zC12 = e.fp.Sub(t2, a2)
 
-	// zC1 = e.Ext3.Add(zC1, &b)
 	zC10 = e.fp.Add(zC10, b0)
 	zC11 = e.fp.Add(zC11, b1)
 	zC12 = e.fp.Add(zC12, b2)
 
-	// zC0 = e.Ext3.Add(zC0, a)
 	zC00 := e.fp.Add(a0, e.fp.MulConst(b2, big.NewInt(4)))
 	zC01 := e.fp.Sub(a1, b0)
 	zC02 := e.fp.Sub(a2, b1)
@@ -328,48 +320,3 @@ func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
 		A5: *z12,
 	}
 }
-
-/*
-// Mul01245By014 multiplies two E6 sparse element of the form
-//
-//	E6{
-//		C0: E3{B0: x0, B1: x1, B2: x2},
-//		C1: E3{B0: 0,  B1: x4, B2: x5},
-//	}
-//
-//	and
-//
-//	E6{
-//		C0: E3{B0: d0, B1: d1, B2: 0},
-//		C1: E3{B0: 0,  B1: 1,  B2: 0},
-//	}
-func (e *Ext6) Mul01245By014(x [5]*baseEl, d0, d1 *baseEl) *E6 {
-	zero := e.fp.Zero()
-	c0 := &E3{A0: *x[0], A1: *x[1], A2: *x[2]}
-	b := &E3{
-		A0: *x[0],
-		A1: *e.fp.Add(x[1], x[3]),
-		A2: *e.fp.Add(x[2], x[4]),
-	}
-	a := e.Ext3.MulBy01(b, d0, e.fp.Add(d1, e.fp.One()))
-	b = e.Ext3.MulBy01(c0, d0, d1)
-	c := &E3{
-		A0: *e.fp.MulConst(x[4], big.NewInt(4)),
-		A1: *e.fp.Neg(zero),
-		A2: *e.fp.Neg(x[3]),
-	}
-	z1 := e.Ext3.Sub(a, b)
-	z1 = e.Ext3.Add(z1, c)
-	z0 := &E3{
-		A0: *e.fp.MulConst(&c.A2, big.NewInt(4)),
-		A1: *e.fp.Neg(&c.A0),
-		A2: *e.fp.Neg(&c.A1),
-	}
-
-	z0 = e.Ext3.Add(z0, b)
-	return &E6{
-		B0: *z0,
-		B1: *z1,
-	}
-}
-*/

From f3af4b3f8648881021e43706a89c159e2d2b36cf Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Wed, 1 May 2024 10:48:26 -0400
Subject: [PATCH 14/24] perf(bw6): save 2 subs in Fp6 squaring

---
 std/algebra/emulated/fields_bw6761/e6.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 23d25cf7a1..84a5448304 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -442,14 +442,16 @@ func (e Ext6) Square(x *E6) *E6 {
 	x = e.Reduce(x)
 
 	// c0
-	c00 := e.fp.Sub(&x.A0, &x.A1)
-	c01 := e.fp.Sub(&x.A2, &x.A3)
+	x1n := e.fp.Neg(&x.A1)
+	x3n := e.fp.Neg(&x.A3)
+	c00 := e.fp.Add(&x.A0, x1n)
+	c01 := e.fp.Add(&x.A2, x3n)
 	c02 := e.fp.Sub(&x.A4, &x.A5)
 
 	// c3
 	c30 := e.fp.Add(&x.A0, e.fp.MulConst(&x.A5, big.NewInt(4)))
-	c31 := e.fp.Sub(&x.A2, &x.A1)
-	c32 := e.fp.Sub(&x.A4, &x.A3)
+	c31 := e.fp.Add(&x.A2, x1n)
+	c32 := e.fp.Add(&x.A4, x3n)
 
 	t0 := e.fp.Mul(&x.A0, &x.A1)
 	t1 := e.fp.Mul(&x.A2, &x.A3)

From 909e11e91e5dee8216de63a4663de43a8a263578 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Wed, 1 May 2024 13:21:11 -0400
Subject: [PATCH 15/24] refactor: clean up bw6761 and bls12377 field code

---
 std/algebra/emulated/fields_bw6761/e6.go  | 408 +++++++++++-----------
 std/algebra/native/fields_bls12377/e12.go |  14 +-
 std/algebra/native/fields_bls12377/e2.go  |   6 +-
 std/algebra/native/fields_bls12377/e6.go  |  24 +-
 4 files changed, 240 insertions(+), 212 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 84a5448304..9a65d62ae1 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -430,9 +430,218 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	y = e.Reduce(y)
 	v := e.interpolationX6Mul(x, y)
 	return e.mulMontgomery6(v)
-	// return e.mulToomCook6(v)
 }
 
+/*
+func (e Ext6) Mul(x, y *E6) *E6 {
+	x = e.Reduce(x)
+	y = e.Reduce(y)
+	// Toom-Cook 6-way multiplication:
+	//
+	// Fixing the polynomial to X^6 we first compute the interpolation points
+	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
+	//
+	//     v0 = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
+	//     v2 = (a0 + a1 + a3 + a4)(b0 + b1 + b3 + b4)
+	//     v3 = (a0 − a2 − a3 + a5)(b0 − b2 − b3 + b5)
+	//     v4 = (a0 − a2 − a5)(b0 − b2 − b5)
+	//     v5 = (a0 + a3 − a5)(b0 + b3 − b5)
+	//     v6 = (a0 + a1 + a2)(b0 + b1 + b2)
+	//     v7 = (a3 + a4 + a5)(b3 + b4 + b5)
+	//     v8 = (a2 + a3)(b2 + b3)
+	//     v9 = (a1 − a4)(b1 − b4)
+	//     v10 = (a1 + a2)(b1 + b2)
+	//     v11 = (a3 + a4)(b3 + b4)
+	//     v12 = (a0 + a1)(b0 + b1)
+	//     v13 = (a4 + a5)(b4 + b5)
+	//     v14 = a0b0
+	//     v15 = a1b1
+	//     v16 = a4b4
+	//     v17 = a5b5
+	_t0 := e.fp.Add(&x.A0, &x.A1)
+	t0 := e.fp.Add(_t0, &x.A2)
+	t1 := e.fp.Add(&x.A3, &x.A4)
+	t2 := e.fp.Add(_t0, t1)
+	t3 := e.fp.Add(t2, &x.A5)
+	t3 = e.fp.Add(t3, &x.A2)
+
+	_s0 := e.fp.Add(&y.A0, &y.A1)
+	s0 := e.fp.Add(_s0, &y.A2)
+	s1 := e.fp.Add(&y.A3, &y.A4)
+	s2 := e.fp.Add(_s0, s1)
+	s3 := e.fp.Add(s2, &y.A5)
+	s3 = e.fp.Add(s3, &y.A2)
+
+	v0 := e.fp.Mul(t3, s3)
+	v2 := e.fp.Mul(t2, s2)
+	v6 := e.fp.Mul(t0, s0)
+	t4 := e.fp.Add(t1, &x.A5)
+	s4 := e.fp.Add(s1, &y.A5)
+	v7 := e.fp.Mul(t4, s4)
+	t0 = e.fp.Add(&x.A2, &x.A3)
+	s0 = e.fp.Add(&y.A2, &y.A3)
+	v8 := e.fp.Mul(t0, s0)
+	_t0 = e.fp.Sub(&x.A1, &x.A4)
+	_s0 = e.fp.Sub(&y.A1, &y.A4)
+	v9 := e.fp.Mul(_t0, _s0)
+	t1 = e.fp.Add(&x.A1, &x.A2)
+	s1 = e.fp.Add(&y.A1, &y.A2)
+	v10 := e.fp.Mul(t1, s1)
+	v3 := e.fp.Add(&x.A0, &x.A5)
+	v3 = e.fp.Sub(v3, t0)
+	s1 = e.fp.Add(&y.A0, &y.A5)
+	s1 = e.fp.Sub(s1, s0)
+	v3 = e.fp.Mul(v3, s1)
+	t1 = e.fp.Add(&x.A2, &x.A5)
+	t2 = e.fp.Sub(&x.A0, t1)
+	s1 = e.fp.Add(&y.A2, &y.A5)
+	s2 = e.fp.Sub(&y.A0, s1)
+	v4 := e.fp.Mul(t2, s2)
+	t1 = e.fp.Add(&x.A0, &x.A3)
+	t1 = e.fp.Sub(t1, &x.A5)
+	s1 = e.fp.Add(&y.A0, &y.A3)
+	s1 = e.fp.Sub(s1, &y.A5)
+	v5 := e.fp.Mul(t1, s1)
+	v1 := e.fp.One()
+
+	//	Then we compute the product  362880*x*y to avoid divisions:
+	//
+	// 		c0 = 438480 v0 + 26208(v3 + v4) + 504(v7 + v8)
+	// 		- (58464(v1 + v2) + 6048(v5 + v6) + 396264960 v10)
+	//
+	c0 := e.fp.MulConst(v0, big.NewInt(438480))
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.MulConst(s1, big.NewInt(26208))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v7, big.NewInt(504))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.MulConst(v8, big.NewInt(504))
+	c0 = e.fp.Add(c0, s1)
+	s1 = e.fp.Add(v2, v1)
+	s1 = e.fp.MulConst(s1, big.NewInt(58464))
+	s2 = e.fp.Add(v5, v6)
+	s2 = e.fp.MulConst(s2, big.NewInt(6048))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(396264960))
+	s1 = e.fp.Add(s1, s2)
+	c0 = e.fp.Sub(c0, s1)
+	// 		c1 = 744 v8 + 696 v9 + 49536 v4 + 39744 v5  + 379016 v1
+	// 		− (87696 v0 + 233856 v2 + 133056 v3 + 8424 v6 + 7704 v7 + 1260814400 v10)
+	c1 := e.fp.MulConst(v8, big.NewInt(744))
+	s1 = e.fp.MulConst(v9, big.NewInt(696))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(49536))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v5, big.NewInt(39744))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v1, big.NewInt(379016))
+	c1 = e.fp.Add(c1, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(87696))
+	s2 = e.fp.MulConst(v2, big.NewInt(233856))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v3, big.NewInt(133056))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v6, big.NewInt(8424))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(7704))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(1260814400))
+	s1 = e.fp.Add(s1, s2)
+	c1 = e.fp.Sub(c1, s1)
+	// 		c2 = 4896(v5 + v6) + 292320(v1 + v2) + 252564480 v10
+	// 		− (519120 v0 + 360(v7 + v8) + 37296(v3 + v4))
+	c2 := e.fp.Add(v5, v6)
+	c2 = e.fp.MulConst(c2, big.NewInt(4896))
+	s1 = e.fp.Add(v1, v2)
+	s1 = e.fp.MulConst(s1, big.NewInt(292320))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(252564480))
+	c2 = e.fp.Add(c2, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(519120))
+	s2 = e.fp.Add(v7, v8)
+	s2 = e.fp.MulConst(s2, big.NewInt(360))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.Add(v3, v4)
+	s2 = e.fp.MulConst(s2, big.NewInt(37296))
+	s1 = e.fp.Add(s1, s2)
+	c2 = e.fp.Sub(c2, s1)
+	// 		c3 = 103824 v0 + 1495065600 v10 + 10728 v6 + 9180 v7 + 53760 v2 + 154392 v3
+	//  	− (55512 v4 + 47520 v5 + 940 v8 + 816 v9 + 225792 v1)
+	c3 := e.fp.MulConst(v0, big.NewInt(103824))
+	s1 = e.fp.MulConst(v10, big.NewInt(1495065600))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v6, big.NewInt(10728))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v7, big.NewInt(9180))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v2, big.NewInt(53760))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v3, big.NewInt(154392))
+	c3 = e.fp.Add(c3, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(55512))
+	s2 = e.fp.MulConst(v5, big.NewInt(47520))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v8, big.NewInt(940))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v9, big.NewInt(816))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v1, big.NewInt(225792))
+	s1 = e.fp.Add(s1, s2)
+	c3 = e.fp.Sub(c3, s1)
+	// 		c4 = 171990 v0 + 42588(v3 + v4) + 63(v7 + v8)
+	// 		− (299013120 v10 + 122976(v1 + v2) + 6048(v5 + v6))
+	c4 := e.fp.MulConst(v0, big.NewInt(171990))
+	s1 = e.fp.Add(v3, v4)
+	s1 = e.fp.MulConst(s1, big.NewInt(42588))
+	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.Add(v7, v8)
+	s1 = e.fp.MulConst(s1, big.NewInt(63))
+	c4 = e.fp.Add(c4, s1)
+	s1 = e.fp.MulConst(v10, big.NewInt(299013120))
+	s2 = e.fp.Add(v1, v2)
+	s2 = e.fp.MulConst(s2, big.NewInt(122976))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.Add(v5, v6)
+	s2 = e.fp.MulConst(s2, big.NewInt(6048))
+	s1 = e.fp.Add(s1, s2)
+	c4 = e.fp.Sub(c4, s1)
+	// 		c5 = 231 v8 + 273 v9 + 3276 v4 + 8316 v2 + 14364 v5 + 49014 v1
+	// 		- (34398 v0 + 36036 v3 + 2079 v6 + 2961 v7 + 495331200 v10)
+	c5 := e.fp.MulConst(v8, big.NewInt(231))
+	s1 = e.fp.MulConst(v9, big.NewInt(273))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v4, big.NewInt(3276))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v2, big.NewInt(8316))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v5, big.NewInt(14364))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v1, big.NewInt(49014))
+	c5 = e.fp.Add(c5, s1)
+	s1 = e.fp.MulConst(v0, big.NewInt(34398))
+	s2 = e.fp.MulConst(v3, big.NewInt(36036))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v6, big.NewInt(2079))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v7, big.NewInt(2961))
+	s1 = e.fp.Add(s1, s2)
+	s2 = e.fp.MulConst(v10, big.NewInt(495331200))
+	s1 = e.fp.Add(s1, s2)
+	c5 = e.fp.Sub(c5, s1)
+
+	// inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
+
+	return &E6{
+		A0: *c0, //e.fp.Mul(c0, &inv362880),
+		A1: *c1, //e.fp.Mul(c1, &inv362880),
+		A2: *c2, //e.fp.Mul(c2, &inv362880),
+		A3: *c3, //e.fp.Mul(c3, &inv362880),
+		A4: *c4, //e.fp.Mul(c4, &inv362880),
+		A5: *c5, //e.fp.Mul(c5, &inv362880),
+	}
+}
+*/
+
 func (e Ext6) Square(x *E6) *E6 {
 	// We don't use Montgomery-6 or Toom-Cook-6 for the squaring but instead we
 	// simulate a quadratic over cubic extension tower because Karatsuba over
@@ -518,203 +727,6 @@ func (e Ext6) Square(x *E6) *E6 {
 	}
 }
 
-/*
-func (e Ext6) MulToomCook6x(x, y *E6) *E6 {
-	//	Then we compute the product  362880*x*y to avoid divisions:
-	//
-	//		c0 = 362880v0 + β(−18900v0 + 14616v2 − 6552(v3 + v4) + 1512(v5 +
-	//		v6) − 126(v7 + v8) + 99066240v10)
-	//
-	//		c1 = −(72576v0 + 241920v2 + 120960v3 - 51840v4 - 34560v5 + 8640v6 +
-	//		6480v7 - 720v8 - 576v9 + 1045094400v10 + β(-3780v0 + 2016v2 -
-	//		3024v3 - 576v4 + 1296v5 + 54v6 - 306v7 + 6v8 + 30v9 - 54432000v10))
-	//
-	//		c2 = −516600v0 + 290304v2 − 36288(v3 + v4) + 4608(v5 + v6) − 324(v7
-	//		+ v8) + 209018880v10 + β(630v0 − 504v2 + 252(v3 + v4) − 72(v5 + v6)
-	//		+ 9(v7 + v8) − 10886400v10)
-	//
-	//		c3 = 103320v0 + 54096v2 + 154056v3 − 55656v4 − 47664v5 + 10764v6 +
-	//		9144v7 − 944v8 − 820v9 + 1487808000v10 + β(−126v0 + 84(v2 − v3) −
-	//		36(v4 + v5) + 9(v6 − v7) − (v8 + v9) − 1814400v10)
-	//
-	//		c4 = 171990v0 − 122976v2 + 42588(v3 + v4) − 6048(v5 + v6) + 63(v7 +
-	//		v8) − 297561600v10 + β(362880v10)
-	//
-	//		c5 = −34398v0 + 8316v2 + 14364v5 − 36036v3 + 3276v4 − 2079v6 −
-	//		2961v7 + 231v8 + 273v9 − 495331200v10.
-
-	t1 = e.fp.Add(v3, v4) // v3 + v4
-	t2 = e.fp.Add(v5, v6) // v5 + v6
-	t3 = e.fp.Add(v7, v8) // v7 + v8
-	t4 = e.fp.Add(v4, v5) // v4 + v5
-	// _t0 = e.fp.Add(v8, v9) // v8 + v9
-
-	c0 := e.fp.MulConst(t2, big.NewInt(1512))
-	s1 = e.fp.MulConst(t1, big.NewInt(6552))
-	c0 = e.fp.Sub(c0, s1)
-	s1 = e.fp.MulConst(v2, big.NewInt(14616))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(18900))
-	c0 = e.fp.Sub(c0, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(99066240))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(t3, big.NewInt(126))
-	c0 = e.fp.Sub(c0, s1)
-	c0 = mulFpByNonResidue(e.fp, c0)
-	s1 = e.fp.MulConst(v0, big.NewInt(362880))
-	c0 = e.fp.Add(c0, s1)
-
-	c1 := e.fp.MulConst(v0, big.NewInt(72576))
-	s1 = e.fp.MulConst(v2, big.NewInt(241920))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v3, big.NewInt(120960))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v4, big.NewInt(51840))
-	c1 = e.fp.Sub(c1, s1)
-	s1 = e.fp.MulConst(v5, big.NewInt(34560))
-	c1 = e.fp.Sub(c1, s1)
-	s1 = e.fp.MulConst(v6, big.NewInt(8640))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v7, big.NewInt(6480))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v8, big.NewInt(720))
-	c1 = e.fp.Sub(c1, s1)
-	s1 = e.fp.MulConst(v9, big.NewInt(576))
-	c1 = e.fp.Sub(c1, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(1045094400))
-	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(3780))
-	s2 = e.fp.MulConst(v2, big.NewInt(2016))
-	s1 = e.fp.Sub(s2, s1)
-	s2 = e.fp.MulConst(v3, big.NewInt(3024))
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.MulConst(v4, big.NewInt(576))
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.MulConst(v5, big.NewInt(1296))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v6, big.NewInt(54))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v7, big.NewInt(306))
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.MulConst(v8, big.NewInt(6))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v9, big.NewInt(30))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(54432000))
-	s1 = e.fp.Sub(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	c1 = e.fp.Add(c1, s1)
-	c1 = e.fp.Neg(c1)
-
-	c2 := e.fp.MulConst(v2, big.NewInt(290304))
-	s1 = e.fp.MulConst(t1, big.NewInt(36288))
-	c2 = e.fp.Sub(c2, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(516600))
-	c2 = e.fp.Sub(c2, s1)
-	s1 = e.fp.MulConst(t2, big.NewInt(4608))
-	c2 = e.fp.Add(c2, s1)
-	s1 = e.fp.MulConst(t3, big.NewInt(324))
-	c2 = e.fp.Sub(c2, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(209018880))
-	c2 = e.fp.Add(c2, s1)
-	s2 = e.fp.MulConst(v0, big.NewInt(630))
-	s1 = e.fp.MulConst(v2, big.NewInt(504))
-	s1 = e.fp.Sub(s2, s1)
-	s2 = e.fp.MulConst(t1, big.NewInt(252))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(t2, big.NewInt(72))
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.MulConst(t3, big.NewInt(9))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(10886400))
-	s1 = e.fp.Sub(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	c2 = e.fp.Add(c2, s1)
-
-	c3 := e.fp.MulConst(v0, big.NewInt(103320))
-	s1 = e.fp.MulConst(v2, big.NewInt(54096))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v3, big.NewInt(154056))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v4, big.NewInt(55656))
-	c3 = e.fp.Sub(c3, s1)
-	s1 = e.fp.MulConst(v5, big.NewInt(47664))
-	c3 = e.fp.Sub(c3, s1)
-	s1 = e.fp.MulConst(v6, big.NewInt(10764))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v7, big.NewInt(9144))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v8, big.NewInt(944))
-	c3 = e.fp.Sub(c3, s1)
-	s1 = e.fp.MulConst(v9, big.NewInt(820))
-	c3 = e.fp.Sub(c3, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(1487808000))
-	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v0, big.NewInt(126))
-	s2 = e.fp.Sub(v2, v3)
-	s2 = e.fp.MulConst(s2, big.NewInt(84))
-	s1 = e.fp.Sub(s2, s1)
-	s2 = e.fp.Add(v4, v5)
-	s2 = e.fp.MulConst(s2, big.NewInt(36))
-	s1 = e.fp.Sub(s1, s2)
-	s2 = e.fp.Sub(v6, v7)
-	s2 = e.fp.MulConst(s2, big.NewInt(9))
-	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(1814400))
-	s2 = e.fp.Add(s2, v8)
-	s2 = e.fp.Add(s2, v9)
-	s1 = e.fp.Sub(s1, s2)
-	s1 = mulFpByNonResidue(e.fp, s1)
-	c3 = e.fp.Add(c3, s1)
-
-	c4 := e.fp.MulConst(v0, big.NewInt(171990))
-	s1 = e.fp.MulConst(v2, big.NewInt(122976))
-	c4 = e.fp.Sub(c4, s1)
-	s1 = e.fp.MulConst(t1, big.NewInt(42588))
-	c4 = e.fp.Add(c4, s1)
-	s1 = e.fp.MulConst(t2, big.NewInt(6048))
-	c4 = e.fp.Sub(c4, s1)
-	s1 = e.fp.MulConst(t3, big.NewInt(63))
-	c4 = e.fp.Add(c4, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(297561600))
-	c4 = e.fp.Sub(c4, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(362880))
-	s1 = mulFpByNonResidue(e.fp, s1)
-	c4 = e.fp.Add(c4, s1)
-
-	c5 := e.fp.MulConst(v2, big.NewInt(8316))
-	s1 = e.fp.MulConst(v0, big.NewInt(34398))
-	c5 = e.fp.Sub(c5, s1)
-	s1 = e.fp.MulConst(v5, big.NewInt(14364))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v3, big.NewInt(36036))
-	c5 = e.fp.Sub(c5, s1)
-	s1 = e.fp.MulConst(v4, big.NewInt(3276))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v6, big.NewInt(2079))
-	c5 = e.fp.Sub(c5, s1)
-	s1 = e.fp.MulConst(v7, big.NewInt(2961))
-	c5 = e.fp.Sub(c5, s1)
-	s1 = e.fp.MulConst(v8, big.NewInt(231))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v9, big.NewInt(273))
-	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v10, big.NewInt(495331200))
-	c5 = e.fp.Sub(c5, s1)
-
-	inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
-
-	return &E6{
-		A0: *e.fp.Mul(&inv362880, c0),
-		A1: *e.fp.Mul(&inv362880, c1),
-		A2: *e.fp.Mul(&inv362880, c2),
-		A3: *e.fp.Mul(&inv362880, c3),
-		A4: *e.fp.Mul(&inv362880, c4),
-		A5: *e.fp.Mul(&inv362880, c5),
-	}
-}
-*/
-
 // Karabina's compressed cyclotomic square SQR12345
 // https://eprint.iacr.org/2010/542.pdf
 // Sec. 5.6 with minor modifications to fit our tower
diff --git a/std/algebra/native/fields_bls12377/e12.go b/std/algebra/native/fields_bls12377/e12.go
index 73c2050ee9..e23b8ea5aa 100644
--- a/std/algebra/native/fields_bls12377/e12.go
+++ b/std/algebra/native/fields_bls12377/e12.go
@@ -176,7 +176,7 @@ func (e *E12) Square(api frontend.API, x E12) *E12 {
 	c3.Sub(api, x.C0, c3)
 	c2.Mul(api, x.C0, x.C1)
 	c0.Mul(api, c0, c3).Add(api, c0, c2)
-	e.C1.Add(api, c2, c2)
+	e.C1.Double(api, c2)
 	c2.MulByNonResidue(api, c2)
 	e.C0.Add(api, c0, c2)
 
@@ -423,13 +423,13 @@ func (e *E12) CyclotomicSquare(api frontend.API, x E12) *E12 {
 	t[2].MulByNonResidue(api, t[2]).Add(api, t[2], t[3]) // x2²*u + x3²
 	t[4].MulByNonResidue(api, t[4]).Add(api, t[4], t[5]) // x5²*u + x1²
 
-	e.C0.B0.Sub(api, t[0], x.C0.B0).Add(api, e.C0.B0, e.C0.B0).Add(api, e.C0.B0, t[0])
-	e.C0.B1.Sub(api, t[2], x.C0.B1).Add(api, e.C0.B1, e.C0.B1).Add(api, e.C0.B1, t[2])
-	e.C0.B2.Sub(api, t[4], x.C0.B2).Add(api, e.C0.B2, e.C0.B2).Add(api, e.C0.B2, t[4])
+	e.C0.B0.Sub(api, t[0], x.C0.B0).Double(api, e.C0.B0).Add(api, e.C0.B0, t[0])
+	e.C0.B1.Sub(api, t[2], x.C0.B1).Double(api, e.C0.B1).Add(api, e.C0.B1, t[2])
+	e.C0.B2.Sub(api, t[4], x.C0.B2).Double(api, e.C0.B2).Add(api, e.C0.B2, t[4])
 
-	e.C1.B0.Add(api, t[8], x.C1.B0).Add(api, e.C1.B0, e.C1.B0).Add(api, e.C1.B0, t[8])
-	e.C1.B1.Add(api, t[6], x.C1.B1).Add(api, e.C1.B1, e.C1.B1).Add(api, e.C1.B1, t[6])
-	e.C1.B2.Add(api, t[7], x.C1.B2).Add(api, e.C1.B2, e.C1.B2).Add(api, e.C1.B2, t[7])
+	e.C1.B0.Add(api, t[8], x.C1.B0).Double(api, e.C1.B0).Add(api, e.C1.B0, t[8])
+	e.C1.B1.Add(api, t[6], x.C1.B1).Double(api, e.C1.B1).Add(api, e.C1.B1, t[6])
+	e.C1.B2.Add(api, t[7], x.C1.B2).Double(api, e.C1.B2).Add(api, e.C1.B2, t[7])
 
 	return e
 }
diff --git a/std/algebra/native/fields_bls12377/e2.go b/std/algebra/native/fields_bls12377/e2.go
index fd6f99ecde..1a8390367c 100644
--- a/std/algebra/native/fields_bls12377/e2.go
+++ b/std/algebra/native/fields_bls12377/e2.go
@@ -83,13 +83,11 @@ func (e *E2) Sub(api frontend.API, e1, e2 E2) *E2 {
 // Mul e2 elmts
 func (e *E2) Mul(api frontend.API, e1, e2 E2) *E2 {
 
-	// 1C
 	l1 := api.Add(e1.A0, e1.A1)
 	l2 := api.Add(e2.A0, e2.A1)
 
 	u := api.Mul(l1, l2)
 
-	// 2C
 	ac := api.Mul(e1.A0, e2.A0)
 	bd := api.Mul(e1.A1, e2.A1)
 
@@ -111,9 +109,9 @@ func (e *E2) Square(api frontend.API, x E2) *E2 {
 
 	c0 = api.Mul(c0, c2) // (x1+x2)*(x1+(u**2)x2)
 	c2 = api.Mul(x.A0, x.A1)
-	c2 = api.Add(c2, c2)
+	c2 = api.Mul(c2, 2)
 	e.A1 = c2
-	c2 = api.Add(c2, c2)
+	c2 = api.Mul(c2, 2)
 	e.A0 = api.Add(c0, c2)
 
 	return e
diff --git a/std/algebra/native/fields_bls12377/e6.go b/std/algebra/native/fields_bls12377/e6.go
index 4ea19f9e25..05c6b194d4 100644
--- a/std/algebra/native/fields_bls12377/e6.go
+++ b/std/algebra/native/fields_bls12377/e6.go
@@ -52,6 +52,14 @@ func (e *E6) assign(e1 []frontend.Variable) {
 	e.B2.A1 = e1[5]
 }
 
+// Double sets e to 2*e1 by doubling each coefficient B0, B1, B2, and returns e
+func (e *E6) Double(api frontend.API, e1 E6) *E6 {
+	e.B0.Double(api, e1.B0)
+	e.B1.Double(api, e1.B1)
+	e.B2.Double(api, e1.B2)
+	return e
+}
+
 // Add creates a fp6elmt from fp elmts
 func (e *E6) Add(api frontend.API, e1, e2 E6) *E6 {
 
@@ -101,16 +109,26 @@ func (e *E6) Mul(api frontend.API, e1, e2 E6) *E6 {
 
 	c0.Add(api, e1.B1, e1.B2)
 	tmp.Add(api, e2.B1, e2.B2)
-	c0.Mul(api, c0, tmp).Sub(api, c0, t1).Sub(api, c0, t2).MulByNonResidue(api, c0).Add(api, c0, t0)
+	c0.Mul(api, c0, tmp).
+		Sub(api, c0, t1).
+		Sub(api, c0, t2).
+		MulByNonResidue(api, c0).
+		Add(api, c0, t0)
 
 	c1.Add(api, e1.B0, e1.B1)
 	tmp.Add(api, e2.B0, e2.B1)
-	c1.Mul(api, c1, tmp).Sub(api, c1, t0).Sub(api, c1, t1)
+	c1.Mul(api, c1, tmp).
+		Sub(api, c1, t0).
+		Sub(api, c1, t1)
 	tmp.MulByNonResidue(api, t2)
 	c1.Add(api, c1, tmp)
 
 	tmp.Add(api, e1.B0, e1.B2)
-	c2.Add(api, e2.B0, e2.B2).Mul(api, c2, tmp).Sub(api, c2, t0).Sub(api, c2, t2).Add(api, c2, t1)
+	c2.Add(api, e2.B0, e2.B2).
+		Mul(api, c2, tmp).
+		Sub(api, c2, t0).
+		Sub(api, c2, t2).
+		Add(api, c2, t1)
 
 	e.B0 = c0
 	e.B1 = c1

From 31d61bc540f617e576252b44ac2a72b4346fd90d Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 3 May 2024 18:10:32 -0400
Subject: [PATCH 16/24] fix(bw6): Toom-Cook 6-way mul

---
 std/algebra/emulated/fields_bw6761/e6.go      | 252 ++++++++++++------
 std/algebra/emulated/fields_bw6761/e6_test.go |  31 +++
 2 files changed, 195 insertions(+), 88 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 9a65d62ae1..ba9abfa7fa 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -428,94 +428,171 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 func (e Ext6) Mul(x, y *E6) *E6 {
 	x = e.Reduce(x)
 	y = e.Reduce(y)
+	return e.mulToomCook6(x, y)
+}
+
+func (e Ext6) mulMontgomery(x, y *E6) *E6 {
 	v := e.interpolationX6Mul(x, y)
 	return e.mulMontgomery6(v)
 }
 
-/*
-func (e Ext6) Mul(x, y *E6) *E6 {
+func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	x = e.Reduce(x)
 	y = e.Reduce(y)
 	// Toom-Cook 6-way multiplication:
 	//
-	// Fixing the polynomial to X^6 we first compute the interpolation points
-	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
+	// We first represent a, b as the polynomials:
+	// 	x(X) = a0 + a1*X + a2*X^2 + a3*X^3 + a4*X^4 + a5*X^5
+	// 	y(X) = b0 + b1*X + b2*X^2 + b3*X^3 + b4*X^4 + b5*X^5
 	//
-	//     v0 = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
-	//     v2 = (a0 + a1 + a3 + a4)(b0 + b1 + b3 + b4)
-	//     v3 = (a0 − a2 − a3 + a5)(b0 − b2 − b3 + b5)
-	//     v4 = (a0 − a2 − a5)(b0 − b2 − b5)
-	//     v5 = (a0 + a3 − a5)(b0 + b3 − b5)
-	//     v6 = (a0 + a1 + a2)(b0 + b1 + b2)
-	//     v7 = (a3 + a4 + a5)(b3 + b4 + b5)
-	//     v8 = (a2 + a3)(b2 + b3)
-	//     v9 = (a1 − a4)(b1 − b4)
-	//     v10 = (a1 + a2)(b1 + b2)
-	//     v11 = (a3 + a4)(b3 + b4)
-	//     v12 = (a0 + a1)(b0 + b1)
-	//     v13 = (a4 + a5)(b4 + b5)
-	//     v14 = a0b0
-	//     v15 = a1b1
-	//     v16 = a4b4
-	//     v17 = a5b5
-	_t0 := e.fp.Add(&x.A0, &x.A1)
-	t0 := e.fp.Add(_t0, &x.A2)
-	t1 := e.fp.Add(&x.A3, &x.A4)
-	t2 := e.fp.Add(_t0, t1)
-	t3 := e.fp.Add(t2, &x.A5)
-	t3 = e.fp.Add(t3, &x.A2)
-
-	_s0 := e.fp.Add(&y.A0, &y.A1)
-	s0 := e.fp.Add(_s0, &y.A2)
-	s1 := e.fp.Add(&y.A3, &y.A4)
-	s2 := e.fp.Add(_s0, s1)
-	s3 := e.fp.Add(s2, &y.A5)
-	s3 = e.fp.Add(s3, &y.A2)
-
-	v0 := e.fp.Mul(t3, s3)
-	v2 := e.fp.Mul(t2, s2)
-	v6 := e.fp.Mul(t0, s0)
-	t4 := e.fp.Add(t1, &x.A5)
-	s4 := e.fp.Add(s1, &y.A5)
-	v7 := e.fp.Mul(t4, s4)
-	t0 = e.fp.Add(&x.A2, &x.A3)
-	s0 = e.fp.Add(&y.A2, &y.A3)
-	v8 := e.fp.Mul(t0, s0)
-	_t0 = e.fp.Sub(&x.A1, &x.A4)
-	_s0 = e.fp.Sub(&y.A1, &y.A4)
-	v9 := e.fp.Mul(_t0, _s0)
-	t1 = e.fp.Add(&x.A1, &x.A2)
-	s1 = e.fp.Add(&y.A1, &y.A2)
-	v10 := e.fp.Mul(t1, s1)
-	v3 := e.fp.Add(&x.A0, &x.A5)
-	v3 = e.fp.Sub(v3, t0)
-	s1 = e.fp.Add(&y.A0, &y.A5)
-	s1 = e.fp.Sub(s1, s0)
-	v3 = e.fp.Mul(v3, s1)
-	t1 = e.fp.Add(&x.A2, &x.A5)
-	t2 = e.fp.Sub(&x.A0, t1)
-	s1 = e.fp.Add(&y.A2, &y.A5)
-	s2 = e.fp.Sub(&y.A0, s1)
-	v4 := e.fp.Mul(t2, s2)
-	t1 = e.fp.Add(&x.A0, &x.A3)
-	t1 = e.fp.Sub(t1, &x.A5)
-	s1 = e.fp.Add(&y.A0, &y.A3)
-	s1 = e.fp.Sub(s1, &y.A5)
-	v5 := e.fp.Mul(t1, s1)
-	v1 := e.fp.One()
+	// and we compute the interpolation points
+	// vi = a(Xi)*b(Xi) at Xi={0, ±1, ±2, ±3, ±4, 5, ∞}:
+	//
+	//     v0 = x(0)y(0)   = a0b0
+	//     v1 = x(1)y(1)   = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
+	//     v2 = x(-1)y(-1) = (a0 - a1 + a2 - a3 + a4 - a5)(b0 - b1 + b2 - b3 + b4 - b5)
+	//     v3 = x(2)y(2)   = (a0 + 2a1 + 4a2 + 8a3 + 16a4 + 32a5)(b0 + 2b1 + 4b2 + 8b3 + 16b4 + 32b5)
+	//     v4 = x(-2)y(-2) = (a0 - 2a1 + 4a2 - 8a3 + 16a4 - 32a5)(b0 - 2b1 + 4b2 - 8b3 + 16b4 - 32b5)
+	//     v5 = x(3)y(3)   = (a0 + 3a1 + 9a2 + 27a3 + 81a4 + 243a5)(b0 + 3b1 + 9b2 + 27b3 + 81b4 + 243b5)
+	//     v6 = x(-3)y(-3) = (a0 - 3a1 + 9a2 - 27a3 + 81a4 - 243a5)(b0 - 3b1 + 9b2 - 27b3 + 81b4 - 243b5)
+	//     v7 = x(4)y(4)   = (a0 + 4a1 + 16a2 + 64a3 + 256a4 + 1024a5)(b0 + 4b1 + 16b2 + 64b3 + 256b4 + 1024b5)
+	//     v8 = x(-4)y(-4) = (a0 - 4a1 + 16a2 - 64a3 + 256a4 - 1024a5)(b0 - 4b1 + 16b2 - 64b3 + 256b4 - 1024b5)
+	//     v9 = x(5)y(5)   = (a0 + 5a1 + 25a2 + 125a3 + 625a4 + 3125a5)(b0 + 5b1 + 25b2 + 125b3 + 625b4 + 3125b5)
+	// 	   v10 = x(∞)y(∞)  = a5b5
+	v0 := e.fp.Mul(&x.A0, &y.A0)
+
+	t1 := e.fp.Add(&x.A0, &x.A2)
+	t1 = e.fp.Add(t1, &x.A4)
+	s1 := e.fp.Add(&y.A0, &y.A2)
+	s1 = e.fp.Add(s1, &y.A4)
+	t2 := e.fp.Add(&x.A1, &x.A3)
+	t2 = e.fp.Add(t2, &x.A5)
+	s2 := e.fp.Add(&y.A1, &y.A3)
+	s2 = e.fp.Add(s2, &y.A5)
+
+	v1 := e.fp.Add(t1, t2)
+	s3 := e.fp.Add(s1, s2)
+	v1 = e.fp.Mul(v1, s3)
+
+	v2 := e.fp.Sub(t1, t2)
+	s3 = e.fp.Sub(s1, s2)
+	v2 = e.fp.Mul(v2, s3)
+
+	t1 = e.fp.MulConst(&x.A2, big.NewInt(4))
+	t1 = e.fp.Add(&x.A0, t1)
+	t := e.fp.MulConst(&x.A4, big.NewInt(16))
+	t1 = e.fp.Add(t1, t)
+	t2 = e.fp.MulConst(&x.A1, big.NewInt(2))
+	t = e.fp.MulConst(&x.A3, big.NewInt(8))
+	t2 = e.fp.Add(t2, t)
+	t = e.fp.MulConst(&x.A5, big.NewInt(32))
+	t2 = e.fp.Add(t2, t)
+	s1 = e.fp.MulConst(&y.A2, big.NewInt(4))
+	s1 = e.fp.Add(&y.A0, s1)
+	s := e.fp.MulConst(&y.A4, big.NewInt(16))
+	s1 = e.fp.Add(s1, s)
+	s2 = e.fp.MulConst(&y.A1, big.NewInt(2))
+	s = e.fp.MulConst(&y.A3, big.NewInt(8))
+	s2 = e.fp.Add(s2, s)
+	s = e.fp.MulConst(&y.A5, big.NewInt(32))
+	s2 = e.fp.Add(s2, s)
+
+	v3 := e.fp.Add(t1, t2)
+	s3 = e.fp.Add(s1, s2)
+	v3 = e.fp.Mul(v3, s3)
+
+	v4 := e.fp.Sub(t1, t2)
+	s3 = e.fp.Sub(s1, s2)
+	v4 = e.fp.Mul(v4, s3)
+
+	t1 = e.fp.MulConst(&x.A2, big.NewInt(9))
+	t1 = e.fp.Add(&x.A0, t1)
+	t = e.fp.MulConst(&x.A4, big.NewInt(81))
+	t1 = e.fp.Add(t1, t)
+	t2 = e.fp.MulConst(&x.A1, big.NewInt(3))
+	t = e.fp.MulConst(&x.A3, big.NewInt(27))
+	t2 = e.fp.Add(t2, t)
+	t = e.fp.MulConst(&x.A5, big.NewInt(243))
+	t2 = e.fp.Add(t2, t)
+	s1 = e.fp.MulConst(&y.A2, big.NewInt(9))
+	s1 = e.fp.Add(&y.A0, s1)
+	s = e.fp.MulConst(&y.A4, big.NewInt(81))
+	s1 = e.fp.Add(s1, s)
+	s2 = e.fp.MulConst(&y.A1, big.NewInt(3))
+	s = e.fp.MulConst(&y.A3, big.NewInt(27))
+	s2 = e.fp.Add(s2, s)
+	s = e.fp.MulConst(&y.A5, big.NewInt(243))
+	s2 = e.fp.Add(s2, s)
+
+	v5 := e.fp.Add(t1, t2)
+	s3 = e.fp.Add(s1, s2)
+	v5 = e.fp.Mul(v5, s3)
+
+	v6 := e.fp.Sub(t1, t2)
+	s3 = e.fp.Sub(s1, s2)
+	v6 = e.fp.Mul(v6, s3)
+
+	t1 = e.fp.MulConst(&x.A2, big.NewInt(16))
+	t1 = e.fp.Add(&x.A0, t1)
+	t = e.fp.MulConst(&x.A4, big.NewInt(256))
+	t1 = e.fp.Add(t1, t)
+	t2 = e.fp.MulConst(&x.A1, big.NewInt(4))
+	t = e.fp.MulConst(&x.A3, big.NewInt(64))
+	t2 = e.fp.Add(t2, t)
+	t = e.fp.MulConst(&x.A5, big.NewInt(1024))
+	t2 = e.fp.Add(t2, t)
+	s1 = e.fp.MulConst(&y.A2, big.NewInt(16))
+	s1 = e.fp.Add(&y.A0, s1)
+	s = e.fp.MulConst(&y.A4, big.NewInt(256))
+	s1 = e.fp.Add(s1, s)
+	s2 = e.fp.MulConst(&y.A1, big.NewInt(4))
+	s = e.fp.MulConst(&y.A3, big.NewInt(64))
+	s2 = e.fp.Add(s2, s)
+	s = e.fp.MulConst(&y.A5, big.NewInt(1024))
+	s2 = e.fp.Add(s2, s)
+
+	v7 := e.fp.Add(t1, t2)
+	s3 = e.fp.Add(s1, s2)
+	v7 = e.fp.Mul(v7, s3)
+
+	v8 := e.fp.Sub(t1, t2)
+	s3 = e.fp.Sub(s1, s2)
+	v8 = e.fp.Mul(v8, s3)
+
+	t1 = e.fp.MulConst(&x.A2, big.NewInt(25))
+	t1 = e.fp.Add(&x.A0, t1)
+	t = e.fp.MulConst(&x.A4, big.NewInt(625))
+	t1 = e.fp.Add(t1, t)
+	t2 = e.fp.MulConst(&x.A1, big.NewInt(5))
+	t = e.fp.MulConst(&x.A3, big.NewInt(125))
+	t2 = e.fp.Add(t2, t)
+	t = e.fp.MulConst(&x.A5, big.NewInt(3125))
+	t2 = e.fp.Add(t2, t)
+	s1 = e.fp.MulConst(&y.A2, big.NewInt(25))
+	s1 = e.fp.Add(&y.A0, s1)
+	s = e.fp.MulConst(&y.A4, big.NewInt(625))
+	s1 = e.fp.Add(s1, s)
+	s2 = e.fp.MulConst(&y.A1, big.NewInt(5))
+	s = e.fp.MulConst(&y.A3, big.NewInt(125))
+	s2 = e.fp.Add(s2, s)
+	s = e.fp.MulConst(&y.A5, big.NewInt(3125))
+	s2 = e.fp.Add(s2, s)
+	v9 := e.fp.Add(t1, t2)
+	s3 = e.fp.Add(s1, s2)
+	v9 = e.fp.Mul(v9, s3)
+
+	v10 := e.fp.Mul(&x.A5, &y.A5)
 
 	//	Then we compute the product  362880*x*y to avoid divisions:
 	//
 	// 		c0 = 438480 v0 + 26208(v3 + v4) + 504(v7 + v8)
 	// 		- (58464(v1 + v2) + 6048(v5 + v6) + 396264960 v10)
-	//
 	c0 := e.fp.MulConst(v0, big.NewInt(438480))
 	s1 = e.fp.Add(v3, v4)
 	s1 = e.fp.MulConst(s1, big.NewInt(26208))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v7, big.NewInt(504))
-	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v8, big.NewInt(504))
+	s1 = e.fp.Add(v7, v8)
+	s1 = e.fp.MulConst(s1, big.NewInt(504))
 	c0 = e.fp.Add(c0, s1)
 	s1 = e.fp.Add(v2, v1)
 	s1 = e.fp.MulConst(s1, big.NewInt(58464))
@@ -525,8 +602,8 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	s2 = e.fp.MulConst(v10, big.NewInt(396264960))
 	s1 = e.fp.Add(s1, s2)
 	c0 = e.fp.Sub(c0, s1)
-	// 		c1 = 744 v8 + 696 v9 + 49536 v4 + 39744 v5  + 379016 v1
-	// 		− (87696 v0 + 233856 v2 + 133056 v3 + 8424 v6 + 7704 v7 + 1260814400 v10)
+	//  	c1 = 744 v8 + 696 v9 + 49536 v4 + 39744 v5 + 380016 v1
+	//  	− (87696 v0 + 233856 v2 + 133056 v3 + 8424* v6 + 7704 v7 + 1262822400 v10)
 	c1 := e.fp.MulConst(v8, big.NewInt(744))
 	s1 = e.fp.MulConst(v9, big.NewInt(696))
 	c1 = e.fp.Add(c1, s1)
@@ -534,7 +611,7 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	c1 = e.fp.Add(c1, s1)
 	s1 = e.fp.MulConst(v5, big.NewInt(39744))
 	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v1, big.NewInt(379016))
+	s1 = e.fp.MulConst(v1, big.NewInt(380016))
 	c1 = e.fp.Add(c1, s1)
 	s1 = e.fp.MulConst(v0, big.NewInt(87696))
 	s2 = e.fp.MulConst(v2, big.NewInt(233856))
@@ -545,7 +622,7 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v7, big.NewInt(7704))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v10, big.NewInt(1260814400))
+	s2 = e.fp.MulConst(v10, big.NewInt(1262822400))
 	s1 = e.fp.Add(s1, s2)
 	c1 = e.fp.Sub(c1, s1)
 	// 		c2 = 4896(v5 + v6) + 292320(v1 + v2) + 252564480 v10
@@ -566,7 +643,7 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
 	// 		c3 = 103824 v0 + 1495065600 v10 + 10728 v6 + 9180 v7 + 53760 v2 + 154392 v3
-	//  	− (55512 v4 + 47520 v5 + 940 v8 + 816 v9 + 225792 v1)
+	// 		- (55512 v4 + 47808* v5 + 940 v8 + 824* v9 + 226800* v1)
 	c3 := e.fp.MulConst(v0, big.NewInt(103824))
 	s1 = e.fp.MulConst(v10, big.NewInt(1495065600))
 	c3 = e.fp.Add(c3, s1)
@@ -579,23 +656,23 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	s1 = e.fp.MulConst(v3, big.NewInt(154392))
 	c3 = e.fp.Add(c3, s1)
 	s1 = e.fp.MulConst(v4, big.NewInt(55512))
-	s2 = e.fp.MulConst(v5, big.NewInt(47520))
+	s2 = e.fp.MulConst(v5, big.NewInt(47808))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v8, big.NewInt(940))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v9, big.NewInt(816))
+	s2 = e.fp.MulConst(v9, big.NewInt(824))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v1, big.NewInt(225792))
+	s2 = e.fp.MulConst(v1, big.NewInt(226800))
 	s1 = e.fp.Add(s1, s2)
 	c3 = e.fp.Sub(c3, s1)
-	// 		c4 = 171990 v0 + 42588(v3 + v4) + 63(v7 + v8)
+	// 		c4 = 171990 v0 + 42588(v3 + v4) + 441* (v7 + v8)
 	// 		− (299013120 v10 + 122976(v1 + v2) + 6048(v5 + v6))
 	c4 := e.fp.MulConst(v0, big.NewInt(171990))
 	s1 = e.fp.Add(v3, v4)
 	s1 = e.fp.MulConst(s1, big.NewInt(42588))
 	c4 = e.fp.Add(c4, s1)
 	s1 = e.fp.Add(v7, v8)
-	s1 = e.fp.MulConst(s1, big.NewInt(63))
+	s1 = e.fp.MulConst(s1, big.NewInt(441))
 	c4 = e.fp.Add(c4, s1)
 	s1 = e.fp.MulConst(v10, big.NewInt(299013120))
 	s2 = e.fp.Add(v1, v2)
@@ -629,18 +706,17 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c5 = e.fp.Sub(c5, s1)
 
-	// inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
+	inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
 
 	return &E6{
-		A0: *c0, //e.fp.Mul(c0, &inv362880),
-		A1: *c1, //e.fp.Mul(c1, &inv362880),
-		A2: *c2, //e.fp.Mul(c2, &inv362880),
-		A3: *c3, //e.fp.Mul(c3, &inv362880),
-		A4: *c4, //e.fp.Mul(c4, &inv362880),
-		A5: *c5, //e.fp.Mul(c5, &inv362880),
+		A0: *e.fp.Mul(c0, &inv362880),
+		A1: *e.fp.Mul(c1, &inv362880),
+		A2: *e.fp.Mul(c2, &inv362880),
+		A3: *e.fp.Mul(c3, &inv362880),
+		A4: *e.fp.Mul(c4, &inv362880),
+		A5: *e.fp.Mul(c5, &inv362880),
 	}
 }
-*/
 
 func (e Ext6) Square(x *E6) *E6 {
 	// We don't use Montgomery-6 or Toom-Cook-6 for the squaring but instead we
diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index 5f38dcd4b1..269e8802a4 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -102,6 +102,37 @@ func TestDoubleFp6(t *testing.T) {
 	assert.NoError(err)
 }
 
+type e6MulVariants struct {
+	A, B, C E6
+}
+
+func (circuit *e6MulVariants) Define(api frontend.API) error {
+	e := NewExt6(api)
+	expected1 := *e.mulMontgomery(&circuit.A, &circuit.B)
+	expected2 := *e.mulToomCook6(&circuit.A, &circuit.B)
+	e.AssertIsEqual(&expected1, &circuit.C)
+	e.AssertIsEqual(&expected2, &circuit.C)
+	return nil
+}
+
+func TestMulVariantsFp6(t *testing.T) {
+	assert := test.NewAssert(t)
+	// witness values
+	var a, b, c bw6761.E6
+	_, _ = a.SetRandom()
+	_, _ = b.SetRandom()
+	c.Mul(&a, &b)
+
+	witness := e6MulVariants{
+		A: FromE6(&a),
+		B: FromE6(&b),
+		C: FromE6(&c),
+	}
+
+	err := test.IsSolved(&e6MulVariants{}, &witness, ecc.BN254.ScalarField())
+	assert.NoError(err)
+}
+
 type e6Mul struct {
 	A, B, C E6
 }

From b27d4a575f55eef24f3728cb9e41239e93c204aa Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 3 May 2024 18:49:58 -0400
Subject: [PATCH 17/24] perf(bw6): use hint to divide by 362880 in Toom-6

---
 std/algebra/emulated/fields_bw6761/e6.go      | 244 ++++++++++--------
 std/algebra/emulated/fields_bw6761/e6_test.go |   2 +-
 std/algebra/emulated/fields_bw6761/hints.go   |  31 +++
 3 files changed, 173 insertions(+), 104 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index ba9abfa7fa..273a9cbcc6 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -189,7 +189,17 @@ func mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
 	return z
 }
 
-func (e Ext6) interpolationX6Mul(x, y *E6) [18]*baseEl {
+func (e Ext6) Mul(x, y *E6) *E6 {
+	x = e.Reduce(x)
+	y = e.Reduce(y)
+	return e.mulToomCook6(x, y)
+}
+
+func (e Ext6) mulMontgomery6(x, y *E6) *E6 {
+	// Ref.: Peter L. Montgomery. Five, six, and seven-term Karatsuba-like formulae. IEEE
+	// Transactions on Computers, 54(3):362–369, 2005.
+	x = e.Reduce(x)
+	y = e.Reduce(y)
 	// Fixing the polynomial to X^6 we first compute the interpolation points
 	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
 	//
@@ -263,18 +273,10 @@ func (e Ext6) interpolationX6Mul(x, y *E6) [18]*baseEl {
 	v15 := e.fp.Mul(&x.A1, &y.A1)
 	v16 := e.fp.Mul(&x.A4, &y.A4)
 	v17 := e.fp.Mul(&x.A5, &y.A5)
-	v1 := e.fp.Zero()
-
-	return [18]*baseEl{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17}
-}
 
-func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	// Then we compute the coefficients c0,c1,c3,c4 and c5 in the direct sextic
 	// extension of the product x*y as follows:
 	//
-	// Ref.: Peter L. Montgomery. Five, six, and seven-term Karatsuba-like formulae. IEEE
-	// Transactions on Computers, 54(3):362–369, 2005.
-	//
 	// 	c0 = v14 + β(v0 − v2 + v4 + 2(v3+v5+v6-v12) + 3(v7+v15-v8-v10-v11) +
 	// 	4(v16-v13) − 5(v14+v17))
 	//
@@ -292,126 +294,126 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	//  c5 = −(v3 + v4 + v5 + v9 + v15 + v16) + 2(v8 + v10 + v11 + v12 + v13 −
 	//  (v6 + v7)) + 3(v14 + v17)
 
-	c0 := e.fp.MulConst(v[2], big.NewInt(4))
-	s811 := e.fp.Add(v[8], v[11])
-	s81110 := e.fp.Add(s811, v[10])
-	s1 := e.fp.MulConst(s81110, big.NewInt(12))
+	c0 := e.fp.MulConst(v2, big.NewInt(4))
+	s811 := e.fp.Add(v8, v11)
+	s81110 := e.fp.Add(s811, v10)
+	s1 = e.fp.MulConst(s81110, big.NewInt(12))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v[12], big.NewInt(8))
+	s1 = e.fp.MulConst(v12, big.NewInt(8))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v[13], big.NewInt(16))
+	s1 = e.fp.MulConst(v13, big.NewInt(16))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v[14], big.NewInt(21))
+	s1 = e.fp.MulConst(v14, big.NewInt(21))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v[17], big.NewInt(20))
+	s1 = e.fp.MulConst(v17, big.NewInt(20))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.MulConst(v[15], big.NewInt(12))
-	s2 := e.fp.MulConst(v[16], big.NewInt(16))
+	s1 = e.fp.MulConst(v15, big.NewInt(12))
+	s2 = e.fp.MulConst(v16, big.NewInt(16))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[0], big.NewInt(4))
+	s2 = e.fp.MulConst(v0, big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[3], big.NewInt(8))
+	s2 = e.fp.MulConst(v3, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[4], big.NewInt(4))
+	s2 = e.fp.MulConst(v4, big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[5], big.NewInt(8))
+	s2 = e.fp.MulConst(v5, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[6], big.NewInt(8))
+	s2 = e.fp.MulConst(v6, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[7], big.NewInt(12))
+	s2 = e.fp.MulConst(v7, big.NewInt(12))
 	s1 = e.fp.Add(s1, s2)
 	c0 = e.fp.Sub(c0, s1)
 
-	s35 := e.fp.Add(v[3], v[5])
-	c1 := e.fp.Add(s35, v[6])
+	s35 := e.fp.Add(v3, v5)
+	c1 := e.fp.Add(s35, v6)
 	c1 = e.fp.MulConst(c1, big.NewInt(4))
-	s1 = e.fp.MulConst(v[7], big.NewInt(8))
+	s1 = e.fp.MulConst(v7, big.NewInt(8))
 	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v[16], big.NewInt(12))
+	s1 = e.fp.MulConst(v16, big.NewInt(12))
 	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v[15], big.NewInt(3))
+	s1 = e.fp.MulConst(v15, big.NewInt(3))
 	c1 = e.fp.Add(c1, s1)
-	s1 = e.fp.MulConst(v[12], big.NewInt(3))
-	s2 = e.fp.MulConst(v[14], big.NewInt(9))
+	s1 = e.fp.MulConst(v12, big.NewInt(3))
+	s2 = e.fp.MulConst(v14, big.NewInt(9))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[8], big.NewInt(4))
+	s2 = e.fp.MulConst(v8, big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[10], big.NewInt(4))
+	s2 = e.fp.MulConst(v10, big.NewInt(4))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[11], big.NewInt(12))
+	s2 = e.fp.MulConst(v11, big.NewInt(12))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[13], big.NewInt(8))
+	s2 = e.fp.MulConst(v13, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[17], big.NewInt(8))
+	s2 = e.fp.MulConst(v17, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
 	c1 = e.fp.Sub(c1, s1)
 
-	c2 := e.fp.MulConst(v[15], big.NewInt(2))
-	c2 = e.fp.Add(c2, v[6])
-	s1 = e.fp.MulConst(v[11], big.NewInt(4))
+	c2 := e.fp.MulConst(v15, big.NewInt(2))
+	c2 = e.fp.Add(c2, v6)
+	s1 = e.fp.MulConst(v11, big.NewInt(4))
 	c2 = e.fp.Add(c2, s1)
-	s1 = e.fp.MulConst(v[13], big.NewInt(4))
+	s1 = e.fp.MulConst(v13, big.NewInt(4))
 	c2 = e.fp.Add(c2, s1)
-	s1012 := e.fp.Add(v[10], v[12])
-	s2 = e.fp.MulConst(v[7], big.NewInt(4))
+	s1012 := e.fp.Add(v10, v12)
+	s2 = e.fp.MulConst(v7, big.NewInt(4))
 	s1 = e.fp.Add(s1012, s2)
-	s2 = e.fp.MulConst(v[16], big.NewInt(8))
+	s2 = e.fp.MulConst(v16, big.NewInt(8))
 	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
 
-	s1 = e.fp.MulConst(v[10], big.NewInt(3))
+	s1 = e.fp.MulConst(v10, big.NewInt(3))
 	c3 := e.fp.Add(s811, s1)
-	s1 = e.fp.MulConst(v[12], big.NewInt(2))
+	s1 = e.fp.MulConst(v12, big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v[14], big.NewInt(2))
+	s1 = e.fp.MulConst(v14, big.NewInt(2))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v[16], big.NewInt(3))
+	s1 = e.fp.MulConst(v16, big.NewInt(3))
 	c3 = e.fp.Add(c3, s1)
-	s1 = e.fp.MulConst(v[17], big.NewInt(6))
+	s1 = e.fp.MulConst(v17, big.NewInt(6))
 	c3 = e.fp.Add(c3, s1)
-	s34 := e.fp.Add(v[3], v[4])
-	s1 = e.fp.Add(s34, v[7])
-	s2 = e.fp.MulConst(v[6], big.NewInt(2))
+	s34 := e.fp.Add(v3, v4)
+	s1 = e.fp.Add(s34, v7)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[13], big.NewInt(3))
+	s2 = e.fp.MulConst(v13, big.NewInt(3))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[15], big.NewInt(3))
+	s2 = e.fp.MulConst(v15, big.NewInt(3))
 	s1 = e.fp.Add(s1, s2)
 	c3 = e.fp.Sub(c3, s1)
 
-	c4 := e.fp.Add(v[2], v[15])
-	c4 = e.fp.Add(c4, v[9])
-	c4 = e.fp.Add(c4, v[7])
+	c4 := e.fp.Add(v2, v15)
+	c4 = e.fp.Add(c4, v9)
+	c4 = e.fp.Add(c4, v7)
 	c4 = e.fp.Add(c4, s34)
-	s1 = e.fp.MulConst(v[6], big.NewInt(2))
+	s1 = e.fp.MulConst(v6, big.NewInt(2))
 	c4 = e.fp.Add(c4, s1)
-	s1 = e.fp.Add(v[13], v[8])
-	s2 = e.fp.MulConst(v[10], big.NewInt(2))
+	s1 = e.fp.Add(v13, v8)
+	s2 = e.fp.MulConst(v10, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[11], big.NewInt(2))
+	s2 = e.fp.MulConst(v11, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[12], big.NewInt(3))
+	s2 = e.fp.MulConst(v12, big.NewInt(3))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[14], big.NewInt(2))
+	s2 = e.fp.MulConst(v14, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[17], big.NewInt(6))
+	s2 = e.fp.MulConst(v17, big.NewInt(6))
 	s1 = e.fp.Add(s1, s2)
 	c4 = e.fp.Sub(c4, s1)
 
-	c5 := e.fp.Add(s81110, v[12])
-	c5 = e.fp.Add(c5, v[13])
+	c5 := e.fp.Add(s81110, v12)
+	c5 = e.fp.Add(c5, v13)
 	c5 = e.fp.MulConst(c5, big.NewInt(2))
-	s1 = e.fp.MulConst(v[14], big.NewInt(3))
+	s1 = e.fp.MulConst(v14, big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.MulConst(v[17], big.NewInt(3))
+	s1 = e.fp.MulConst(v17, big.NewInt(3))
 	c5 = e.fp.Add(c5, s1)
-	s1 = e.fp.Add(v[15], v[16])
+	s1 = e.fp.Add(v15, v16)
 	s1 = e.fp.Add(s1, s34)
-	s1 = e.fp.Add(s1, v[5])
-	s1 = e.fp.Add(s1, v[9])
-	s2 = e.fp.MulConst(v[6], big.NewInt(2))
+	s1 = e.fp.Add(s1, v5)
+	s1 = e.fp.Add(s1, v9)
+	s2 = e.fp.MulConst(v6, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.MulConst(v[7], big.NewInt(2))
+	s2 = e.fp.MulConst(v7, big.NewInt(2))
 	s1 = e.fp.Add(s1, s2)
 	c5 = e.fp.Sub(c5, s1)
 
@@ -425,22 +427,14 @@ func (e Ext6) mulMontgomery6(v [18]*baseEl) *E6 {
 	}
 }
 
-func (e Ext6) Mul(x, y *E6) *E6 {
-	x = e.Reduce(x)
-	y = e.Reduce(y)
-	return e.mulToomCook6(x, y)
-}
-
-func (e Ext6) mulMontgomery(x, y *E6) *E6 {
-	v := e.interpolationX6Mul(x, y)
-	return e.mulMontgomery6(v)
-}
-
 func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	x = e.Reduce(x)
 	y = e.Reduce(y)
 	// Toom-Cook 6-way multiplication:
 	//
+	// Ref.: https://eprint.iacr.org/2006/471.pdf
+	// ⚠️  but has sign errors in c1 and coefficient errors in c3 and c4.
+	//
 	// We first represent a, b as the polynomials:
 	// 	x(X) = a0 + a1*X + a2*X^2 + a3*X^3 + a4*X^4 + a5*X^5
 	// 	y(X) = b0 + b1*X + b2*X^2 + b3*X^3 + b4*X^4 + b5*X^5
@@ -448,17 +442,18 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	// and we compute the interpolation points
 	// vi = a(Xi)*b(Xi) at Xi={0, ±1, ±2, ±3, ±4, 5, ∞}:
 	//
-	//     v0 = x(0)y(0)   = a0b0
-	//     v1 = x(1)y(1)   = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
-	//     v2 = x(-1)y(-1) = (a0 - a1 + a2 - a3 + a4 - a5)(b0 - b1 + b2 - b3 + b4 - b5)
-	//     v3 = x(2)y(2)   = (a0 + 2a1 + 4a2 + 8a3 + 16a4 + 32a5)(b0 + 2b1 + 4b2 + 8b3 + 16b4 + 32b5)
-	//     v4 = x(-2)y(-2) = (a0 - 2a1 + 4a2 - 8a3 + 16a4 - 32a5)(b0 - 2b1 + 4b2 - 8b3 + 16b4 - 32b5)
-	//     v5 = x(3)y(3)   = (a0 + 3a1 + 9a2 + 27a3 + 81a4 + 243a5)(b0 + 3b1 + 9b2 + 27b3 + 81b4 + 243b5)
-	//     v6 = x(-3)y(-3) = (a0 - 3a1 + 9a2 - 27a3 + 81a4 - 243a5)(b0 - 3b1 + 9b2 - 27b3 + 81b4 - 243b5)
-	//     v7 = x(4)y(4)   = (a0 + 4a1 + 16a2 + 64a3 + 256a4 + 1024a5)(b0 + 4b1 + 16b2 + 64b3 + 256b4 + 1024b5)
-	//     v8 = x(-4)y(-4) = (a0 - 4a1 + 16a2 - 64a3 + 256a4 - 1024a5)(b0 - 4b1 + 16b2 - 64b3 + 256b4 - 1024b5)
-	//     v9 = x(5)y(5)   = (a0 + 5a1 + 25a2 + 125a3 + 625a4 + 3125a5)(b0 + 5b1 + 25b2 + 125b3 + 625b4 + 3125b5)
-	// 	   v10 = x(∞)y(∞)  = a5b5
+	//     v0 = x(0)y(0)   = x0y0
+	//     v1 = x(1)y(1)   = (x0 + x1 + x2 + x3 + x4 + x5)(y0 + y1 + y2 + y3 + y4 + y5)
+	//     v2 = x(-1)y(-1) = (x0 - x1 + x2 - x3 + x4 - x5)(y0 - y1 + y2 - y3 + y4 - y5)
+	//     v3 = x(2)y(2)   = (x0 + 2x1 + 4x2 + 8x3 + 16x4 + 32x5)(y0 + 2y1 + 4y2 + 8y3 + 16y4 + 32y5)
+	//     v4 = x(-2)y(-2) = (x0 - 2x1 + 4x2 - 8x3 + 16x4 - 32x5)(y0 - 2y1 + 4y2 - 8y3 + 16y4 - 32y5)
+	//     v5 = x(3)y(3)   = (x0 + 3x1 + 9x2 + 27x3 + 81x4 + 243x5)(y0 + 3y1 + 9y2 + 27y3 + 81y4 + 243y5)
+	//     v6 = x(-3)y(-3) = (x0 - 3x1 + 9x2 - 27x3 + 81x4 - 243x5)(y0 - 3y1 + 9y2 - 27y3 + 81y4 - 243y5)
+	//     v7 = x(4)y(4)   = (x0 + 4x1 + 16x2 + 64x3 + 256x4 + 1024x5)(y0 + 4y1 + 16y2 + 64y3 + 256y4 + 1024y5)
+	//     v8 = x(-4)y(-4) = (x0 - 4x1 + 16x2 - 64x3 + 256x4 - 1024x5)(y0 - 4y1 + 16y2 - 64y3 + 256y4 - 1024y5)
+	//     v9 = x(5)y(5)   = (x0 + 5x1 + 25x2 + 125x3 + 625x4 + 3125x5)(y0 + 5y1 + 25y2 + 125y3 + 625y4 + 3125y5)
+	// 	   v10 = x(∞)y(∞)  = x5y5
+
 	v0 := e.fp.Mul(&x.A0, &y.A0)
 
 	t1 := e.fp.Add(&x.A0, &x.A2)
@@ -583,7 +578,7 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 
 	v10 := e.fp.Mul(&x.A5, &y.A5)
 
-	//	Then we compute the product  362880*x*y to avoid divisions:
+	//	Then we compute the product  362880 * x * y to avoid divisions (mul by large coeffs):
 	//
 	// 		c0 = 438480 v0 + 26208(v3 + v4) + 504(v7 + v8)
 	// 		- (58464(v1 + v2) + 6048(v5 + v6) + 396264960 v10)
@@ -706,15 +701,27 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c5 = e.fp.Sub(c5, s1)
 
-	inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
+	/*
+		inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
 
+		return &E6{
+			A0: *e.fp.Mul(c0, &inv362880),
+			A1: *e.fp.Mul(c1, &inv362880),
+			A2: *e.fp.Mul(c2, &inv362880),
+			A3: *e.fp.Mul(c3, &inv362880),
+			A4: *e.fp.Mul(c4, &inv362880),
+			A5: *e.fp.Mul(c5, &inv362880),
+		}
+	*/
+
+	res := e.divE6By362880([6]*baseEl{c0, c1, c2, c3, c4, c5})
 	return &E6{
-		A0: *e.fp.Mul(c0, &inv362880),
-		A1: *e.fp.Mul(c1, &inv362880),
-		A2: *e.fp.Mul(c2, &inv362880),
-		A3: *e.fp.Mul(c3, &inv362880),
-		A4: *e.fp.Mul(c4, &inv362880),
-		A5: *e.fp.Mul(c5, &inv362880),
+		A0: *res[0],
+		A1: *res[1],
+		A2: *res[2],
+		A3: *res[3],
+		A4: *res[4],
+		A5: *res[5],
 	}
 }
 
@@ -927,6 +934,37 @@ func (e Ext6) DivUnchecked(x, y *E6) *E6 {
 
 }
 
+// divE6By362880 returns hinted values yi constrained by x[i] == 362880 * yi.
+func (e Ext6) divE6By362880(x [6]*baseEl) [6]*baseEl {
+	res, err := e.fp.NewHint(divE6By362880Hint, 6, x[0], x[1], x[2], x[3], x[4], x[5])
+	if err != nil {
+		// err is non-nil only for invalid number of inputs
+		panic(err)
+	}
+	// copy the hint outputs into locals whose addresses are returned below
+	y0 := *res[0]
+	y1 := *res[1]
+	y2 := *res[2]
+	y3 := *res[3]
+	y4 := *res[4]
+	y5 := *res[5]
+
+	// xi == 362880 * yi: recompute each input from the hint and assert equality
+	x0 := e.fp.MulConst(&y0, big.NewInt(362880))
+	x1 := e.fp.MulConst(&y1, big.NewInt(362880))
+	x2 := e.fp.MulConst(&y2, big.NewInt(362880))
+	x3 := e.fp.MulConst(&y3, big.NewInt(362880))
+	x4 := e.fp.MulConst(&y4, big.NewInt(362880))
+	x5 := e.fp.MulConst(&y5, big.NewInt(362880))
+	e.fp.AssertIsEqual(x[0], x0)
+	e.fp.AssertIsEqual(x[1], x1)
+	e.fp.AssertIsEqual(x[2], x2)
+	e.fp.AssertIsEqual(x[3], x3)
+	e.fp.AssertIsEqual(x[4], x4)
+	e.fp.AssertIsEqual(x[5], x5)
+	return [6]*baseEl{&y0, &y1, &y2, &y3, &y4, &y5}
+}
+
 func (e Ext6) AssertIsEqual(a, b *E6) {
 	e.fp.AssertIsEqual(&a.A0, &b.A0)
 	e.fp.AssertIsEqual(&a.A1, &b.A1)
diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index 269e8802a4..d5bf8147c2 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -108,7 +108,7 @@ type e6MulVariants struct {
 
 func (circuit *e6MulVariants) Define(api frontend.API) error {
 	e := NewExt6(api)
-	expected1 := *e.mulMontgomery(&circuit.A, &circuit.B)
+	expected1 := *e.mulMontgomery6(&circuit.A, &circuit.B)
 	expected2 := *e.mulToomCook6(&circuit.A, &circuit.B)
 	e.AssertIsEqual(&expected1, &circuit.C)
 	e.AssertIsEqual(&expected2, &circuit.C)
diff --git a/std/algebra/emulated/fields_bw6761/hints.go b/std/algebra/emulated/fields_bw6761/hints.go
index cf983c51c7..994daa0d18 100644
--- a/std/algebra/emulated/fields_bw6761/hints.go
+++ b/std/algebra/emulated/fields_bw6761/hints.go
@@ -4,6 +4,7 @@ import (
 	"math/big"
 
 	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
+	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark/constraint/solver"
 	"github.com/consensys/gnark/std/math/emulated"
 )
@@ -17,6 +18,7 @@ func GetHints() []solver.Hint {
 	return []solver.Hint{
 		divE6Hint,
 		inverseE6Hint,
+		divE6By362880Hint,
 	}
 }
 
@@ -75,3 +77,32 @@ func divE6Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error
 			return nil
 		})
 }
+
+// divE6By362880Hint multiplies each of the six input coefficients by the inverse of
+// 362880 (= 9!) in the BW6-761 base field and writes the six results to the outputs.
+func divE6By362880Hint(nativeMod *big.Int, nativeInputs, nativeOutputs []*big.Int) error {
+	return emulated.UnwrapHint(nativeInputs, nativeOutputs,
+		func(mod *big.Int, inputs, outputs []*big.Int) error {
+			var a, c bw6761.E6
+			// even-indexed inputs fill B0.{A0,A1,A2}; odd-indexed inputs fill B1.{A0,A1,A2}
+			a.B0.A0.SetBigInt(inputs[0])
+			a.B0.A1.SetBigInt(inputs[2])
+			a.B0.A2.SetBigInt(inputs[4])
+			a.B1.A0.SetBigInt(inputs[1])
+			a.B1.A1.SetBigInt(inputs[3])
+			a.B1.A2.SetBigInt(inputs[5])
+
+			var inv362880 fp.Element
+			inv362880.SetUint64(362880).Inverse(&inv362880)
+			c.B0.MulByElement(&a.B0, &inv362880)
+			c.B1.MulByElement(&a.B1, &inv362880)
+			// write the scaled coefficients back using the same even/odd index mapping
+			c.B0.A0.BigInt(outputs[0])
+			c.B0.A1.BigInt(outputs[2])
+			c.B0.A2.BigInt(outputs[4])
+			c.B1.A0.BigInt(outputs[1])
+			c.B1.A1.BigInt(outputs[3])
+			c.B1.A2.BigInt(outputs[5])
+			return nil
+		})
+}

From 09004632186ec145026f451ccff19e1ce5547fe6 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 3 May 2024 19:19:41 -0400
Subject: [PATCH 18/24] refactor: clean code

---
 std/algebra/emulated/fields_bw6761/e6.go | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 273a9cbcc6..554ebb5cce 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -198,10 +198,9 @@ func (e Ext6) Mul(x, y *E6) *E6 {
 func (e Ext6) mulMontgomery6(x, y *E6) *E6 {
 	// Ref.: Peter L. Montgomery. Five, six, and seven-term Karatsuba-like formulae. IEEE
 	// Transactions on Computers, 54(3):362–369, 2005.
-	x = e.Reduce(x)
-	y = e.Reduce(y)
-	// Fixing the polynomial to X^6 we first compute the interpolation points
-	// vi = x(pi)*y(pi) at {0, ±1, ±2, ±3, ±4, 5,∞}:
+	//
+	// Fixing the polynomial C to X^6 we first compute the interpolation points
+	// vi = x(Xi)*y(Xi) at Xi={0, ±1, ±2, ±3, ±4, 5,∞}:
 	//
 	//		v0 = (a0 + a1 + a2 + a3 + a4 + a5)(b0 + b1 + b2 + b3 + b4 + b5)
 	//		v2 = (a0 + a1 + a3 + a4)(b0 + b1 + b3 + b4)
@@ -428,8 +427,6 @@ func (e Ext6) mulMontgomery6(x, y *E6) *E6 {
 }
 
 func (e Ext6) mulToomCook6(x, y *E6) *E6 {
-	x = e.Reduce(x)
-	y = e.Reduce(y)
 	// Toom-Cook 6-way multiplication:
 	//
 	// Ref.: https://eprint.iacr.org/2006/471.pdf
@@ -701,19 +698,6 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	s1 = e.fp.Add(s1, s2)
 	c5 = e.fp.Sub(c5, s1)
 
-	/*
-		inv362880 := emulated.ValueOf[emulated.BW6761Fp]("4671422665851984694040348663017660157508519176517181272289218522372474038323623073011971993796055931265397672069676435635279488178552288409646583546248183456271259848848724056226545014884280653287710097584502403952205015690976464")
-
-		return &E6{
-			A0: *e.fp.Mul(c0, &inv362880),
-			A1: *e.fp.Mul(c1, &inv362880),
-			A2: *e.fp.Mul(c2, &inv362880),
-			A3: *e.fp.Mul(c3, &inv362880),
-			A4: *e.fp.Mul(c4, &inv362880),
-			A5: *e.fp.Mul(c5, &inv362880),
-		}
-	*/
-
 	res := e.divE6By362880([6]*baseEl{c0, c1, c2, c3, c4, c5})
 	return &E6{
 		A0: *res[0],

From 471563de8959954f15c343729bc2deeb2bb19446 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Fri, 3 May 2024 19:49:30 -0400
Subject: [PATCH 19/24] test: update stats

---
 internal/stats/latest.stats | Bin 2246 -> 2246 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/internal/stats/latest.stats b/internal/stats/latest.stats
index f005c61d24cb12d8ef503023cb7d597d2e9fc8fd..793af61c3a0e2188f4e06ba7c40d882fd343533c 100644
GIT binary patch
delta 132
zcmX>mcua6Y=H!!13nq84&Y8@{Xg|4`ecHs#ct*X+57@*f+psO3c*1V+OJ>H&ZH#j!
zOLDB8ti-f@@=KQalkc-FnViYo;K)~ym|2vWmmZ%~Zf0&~$jre21ssfjV;J@@{+<5w
U6GG?@V_d%$;~z$kXD%>70G$RZPXGV_

delta 125
zcmX>mcua6Y=Hx1-_{me4?Itf}w72IgNX#tC%uA0?DmODXGi2sqfC3K2zX=Q782`@C
zi9iVbVa#or!}y0Wtal4UaPkF~d6T8tmQT!FFqxI3Veutq#>t1+=1=BioHO|*)9%TM
P>`Nv;VVpKOk992oOx!4G


From de595f7d19599e63e7d3e84bb0f09ffbc11e3c47 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 6 May 2024 14:25:22 -0400
Subject: [PATCH 20/24] refactor(bw6/Fp6-mul): record some common additions

---
 std/algebra/emulated/fields_bw6761/e6.go | 47 ++++++++++--------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 554ebb5cce..976d4f9df4 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -575,27 +575,29 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 
 	v10 := e.fp.Mul(&x.A5, &y.A5)
 
+	// recording common sub-expressions
+	v12 := e.fp.Add(v1, v2)
+	v34 := e.fp.Add(v3, v4)
+	v56 := e.fp.Add(v5, v6)
+	v78 := e.fp.Add(v7, v8)
+
 	//	Then we compute the product  362880 * x * y to avoid divisions (mul by large coeffs):
 	//
 	// 		c0 = 438480 v0 + 26208(v3 + v4) + 504(v7 + v8)
 	// 		- (58464(v1 + v2) + 6048(v5 + v6) + 396264960 v10)
 	c0 := e.fp.MulConst(v0, big.NewInt(438480))
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.MulConst(s1, big.NewInt(26208))
+	s1 = e.fp.MulConst(v34, big.NewInt(26208))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.Add(v7, v8)
-	s1 = e.fp.MulConst(s1, big.NewInt(504))
+	s1 = e.fp.MulConst(v78, big.NewInt(504))
 	c0 = e.fp.Add(c0, s1)
-	s1 = e.fp.Add(v2, v1)
-	s1 = e.fp.MulConst(s1, big.NewInt(58464))
-	s2 = e.fp.Add(v5, v6)
-	s2 = e.fp.MulConst(s2, big.NewInt(6048))
+	s1 = e.fp.MulConst(v12, big.NewInt(58464))
+	s2 = e.fp.MulConst(v56, big.NewInt(6048))
 	s1 = e.fp.Add(s1, s2)
 	s2 = e.fp.MulConst(v10, big.NewInt(396264960))
 	s1 = e.fp.Add(s1, s2)
 	c0 = e.fp.Sub(c0, s1)
 	//  	c1 = 744 v8 + 696 v9 + 49536 v4 + 39744 v5 + 380016 v1
-	//  	− (87696 v0 + 226800 v2 + 136080 v3 + 8424* v6 + 7704 v7 + 1262822400 v10)
+	//  	− (87696 v0 + 226800 v2 + 133056 v3 + 8424* v6 + 7704 v7 + 1262822400 v10)
 	c1 := e.fp.MulConst(v8, big.NewInt(744))
 	s1 = e.fp.MulConst(v9, big.NewInt(696))
 	c1 = e.fp.Add(c1, s1)
@@ -619,19 +621,15 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	c1 = e.fp.Sub(c1, s1)
 	// 		c2 = 4896(v5 + v6) + 292320(v1 + v2) + 252564480 v10
 	// 		− (519120 v0 + 360(v7 + v8) + 37296(v3 + v4))
-	c2 := e.fp.Add(v5, v6)
-	c2 = e.fp.MulConst(c2, big.NewInt(4896))
-	s1 = e.fp.Add(v1, v2)
-	s1 = e.fp.MulConst(s1, big.NewInt(292320))
+	c2 := e.fp.MulConst(v56, big.NewInt(4896))
+	s1 = e.fp.MulConst(v12, big.NewInt(292320))
 	c2 = e.fp.Add(c2, s1)
 	s1 = e.fp.MulConst(v10, big.NewInt(252564480))
 	c2 = e.fp.Add(c2, s1)
 	s1 = e.fp.MulConst(v0, big.NewInt(519120))
-	s2 = e.fp.Add(v7, v8)
-	s2 = e.fp.MulConst(s2, big.NewInt(360))
+	s2 = e.fp.MulConst(v78, big.NewInt(360))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.Add(v3, v4)
-	s2 = e.fp.MulConst(s2, big.NewInt(37296))
+	s2 = e.fp.MulConst(v34, big.NewInt(37296))
 	s1 = e.fp.Add(s1, s2)
 	c2 = e.fp.Sub(c2, s1)
 	// 		c3 = 103824 v0 + 1495065600 v10 + 10728 v6 + 9180 v7 + 53760 v2 + 154392 v3
@@ -660,18 +658,14 @@ func (e Ext6) mulToomCook6(x, y *E6) *E6 {
 	// 		c4 = 171990 v0 + 42588(v3 + v4) + 441* (v7 + v8)
 	// 		− (299013120 v10 + 122976(v1 + v2) + 6048(v5 + v6))
 	c4 := e.fp.MulConst(v0, big.NewInt(171990))
-	s1 = e.fp.Add(v3, v4)
-	s1 = e.fp.MulConst(s1, big.NewInt(42588))
+	s1 = e.fp.MulConst(v34, big.NewInt(42588))
 	c4 = e.fp.Add(c4, s1)
-	s1 = e.fp.Add(v7, v8)
-	s1 = e.fp.MulConst(s1, big.NewInt(441))
+	s1 = e.fp.MulConst(v78, big.NewInt(441))
 	c4 = e.fp.Add(c4, s1)
 	s1 = e.fp.MulConst(v10, big.NewInt(299013120))
-	s2 = e.fp.Add(v1, v2)
-	s2 = e.fp.MulConst(s2, big.NewInt(122976))
+	s2 = e.fp.MulConst(v12, big.NewInt(122976))
 	s1 = e.fp.Add(s1, s2)
-	s2 = e.fp.Add(v5, v6)
-	s2 = e.fp.MulConst(s2, big.NewInt(6048))
+	s2 = e.fp.MulConst(v56, big.NewInt(6048))
 	s1 = e.fp.Add(s1, s2)
 	c4 = e.fp.Sub(c4, s1)
 	// 		c5 = 231 v8 + 273 v9 + 3276 v4 + 8316 v2 + 14364 v5 + 49014 v1
@@ -1001,14 +995,13 @@ func (e Ext6) Frobenius(x *E6) *E6 {
 	_frobA := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775648")
 	_frobB := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292650")
 	_frobC := emulated.ValueOf[emulated.BW6761Fp]("4922464560225523242118178942575080391082002530232324381063048548642823052024664478336818169867474395270858391911405337707247735739826664939444490469542109391530482826728203582549674992333383150446779312029624171857054392282775649")
-	_frobAC := emulated.ValueOf[emulated.BW6761Fp]("-1")
 	_frobBC := emulated.ValueOf[emulated.BW6761Fp]("1968985824090209297278610739700577151397666382303825728450741611566800370218827257750865013421937292370006175842381275743914023380727582819905021229583192207421122272650305267822868639090213645505120388400344940985710520836292651")
 	var z E6
 	z.A0 = x.A0
 	z.A2 = *e.fp.Mul(&x.A2, &_frobA)
 	z.A4 = *e.fp.Mul(&x.A4, &_frobB)
 	z.A1 = *e.fp.Mul(&x.A1, &_frobC)
-	z.A3 = *e.fp.Mul(&x.A3, &_frobAC)
+	z.A3 = *e.fp.Neg(&x.A3)
 	z.A5 = *e.fp.Mul(&x.A5, &_frobBC)
 
 	return &z

From a4e6b23b54b8a0cf24cd382ceb78418aac63d462 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 6 May 2024 14:54:51 -0400
Subject: [PATCH 21/24] test: update stats

---
 internal/stats/latest.stats | Bin 2246 -> 2246 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/internal/stats/latest.stats b/internal/stats/latest.stats
index 793af61c3a0e2188f4e06ba7c40d882fd343533c..b5f1781ab7d7717a4e5f036849e6a313177812df 100644
GIT binary patch
delta 84
zcmV-a0IUDT5ylaaX_L|eV3QjLr;`N;Ka+(B9tOAp{g?Uflc@+DlQIOLlfnj{lL-Nw
qlWqmAli32hlX?PQlVJy>k!hfllmkAK<N=nG(gd87r2#*aL<m8^x+1jz

delta 114
zcmX>mcua6YCZq7=k3j6j)-ze3dCp`v4*SU>?9(P2v&K)ZVA(zS8k^n3%!J7w7#B=l
z#Xe{9Y38*PPcEN4focBaNvumI7ckmSZsl;Cyq-}`c@N{?=|4X)KmiBiAI7+TEg<Rf
I%mpS00M_y-82|tP


From 9c26d64b60441c4ac4263badf27f4abc8c49d947 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 6 May 2024 17:32:18 -0400
Subject: [PATCH 22/24] refactor(bw6): remove benchmark

---
 std/algebra/emulated/fields_bw6761/e6_test.go | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index d5bf8147c2..cd0193ae9c 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -351,19 +351,3 @@ func TestFp6MulBy023(t *testing.T) {
 	err := test.IsSolved(&e6MulBy023{}, &witness, ecc.BN254.ScalarField())
 	assert.NoError(err)
 }
-
-func BenchmarkMulMontgomery6(b *testing.B) {
-	var c e6Mul
-	p := profile.Start()
-	_, _ = frontend.Compile(ecc.BN254.ScalarField(), scs.NewBuilder, &c)
-	p.Stop()
-	fmt.Println("Fp6 Mul (Montgomery-6): ", p.NbConstraints())
-}
-
-func BenchmarkSqMontgomery6(b *testing.B) {
-	var c e6Square
-	p := profile.Start()
-	_, _ = frontend.Compile(ecc.BN254.ScalarField(), scs.NewBuilder, &c)
-	p.Stop()
-	fmt.Println("Fp6 Square (Montgomery-6): ", p.NbConstraints())
-}

From e62195168e590061d3b5109949a043396cafdaa1 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Mon, 6 May 2024 17:45:03 -0400
Subject: [PATCH 23/24] refactor(bw6): remove benchmark

---
 std/algebra/emulated/fields_bw6761/e6_test.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6_test.go b/std/algebra/emulated/fields_bw6761/e6_test.go
index cd0193ae9c..b5745e77c6 100644
--- a/std/algebra/emulated/fields_bw6761/e6_test.go
+++ b/std/algebra/emulated/fields_bw6761/e6_test.go
@@ -1,15 +1,12 @@
 package fields_bw6761
 
 import (
-	"fmt"
 	"testing"
 
 	"github.com/consensys/gnark-crypto/ecc"
 	bw6761 "github.com/consensys/gnark-crypto/ecc/bw6-761"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark/frontend"
-	"github.com/consensys/gnark/frontend/cs/scs"
-	"github.com/consensys/gnark/profile"
 	"github.com/consensys/gnark/std/math/emulated"
 	"github.com/consensys/gnark/test"
 )

From 5d27c866a09b1716a37188173561a7a7053828f4 Mon Sep 17 00:00:00 2001
From: Youssef El Housni <youssef.housni21@gmail.com>
Date: Tue, 7 May 2024 18:30:44 -0400
Subject: [PATCH 24/24] refactor(bw6): apply review suggestion

---
 std/algebra/emulated/fields_bw6761/e6.go         | 14 +++++++-------
 std/algebra/emulated/fields_bw6761/e6_pairing.go | 11 ++++++-----
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/std/algebra/emulated/fields_bw6761/e6.go b/std/algebra/emulated/fields_bw6761/e6.go
index 976d4f9df4..433b92cc5a 100644
--- a/std/algebra/emulated/fields_bw6761/e6.go
+++ b/std/algebra/emulated/fields_bw6761/e6.go
@@ -182,7 +182,7 @@ func (e Ext6) Conjugate(x *E6) *E6 {
 	}
 }
 
-func mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
+func (e Ext6) mulFpByNonResidue(fp *curveF, x *baseEl) *baseEl {
 
 	z := fp.Neg(x)
 	z = fp.MulConst(z, big.NewInt(4))
@@ -739,7 +739,7 @@ func (e Ext6) Square(x *E6) *E6 {
 	tmp = e.fp.Add(&x.A1, &x.A3)
 	c1 = e.fp.Mul(c1, tmp)
 	c1 = e.fp.Sub(c1, e.fp.Add(t0, t1))
-	t2 = mulFpByNonResidue(e.fp, t2)
+	t2 = e.mulFpByNonResidue(e.fp, t2)
 	// c2
 	c20 := e.fp.Add(c0, t0)
 	c21 := e.fp.Add(c1, t2)
@@ -761,7 +761,7 @@ func (e Ext6) Square(x *E6) *E6 {
 	tmp = e.fp.Add(c30, c31)
 	c1 = e.fp.Mul(c1, tmp)
 	c1 = e.fp.Sub(c1, e.fp.Add(t0, t1))
-	t2 = mulFpByNonResidue(e.fp, t2)
+	t2 = e.mulFpByNonResidue(e.fp, t2)
 	c00 = e.fp.Add(c0, t0)
 	c01 = e.fp.Add(c1, t2)
 	c02 = e.fp.Add(c2, t1)
@@ -797,7 +797,7 @@ func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
 	// h4 = -g4 + 3((g3+g5)(g1+c*g2)-g1g5-c*g3g2)
 	g1g5 := e.fp.Mul(&x.A2, &x.A5)
 	g3g2 := e.fp.Mul(&x.A1, &x.A4)
-	h4 := mulFpByNonResidue(e.fp, &x.A4)
+	h4 := e.mulFpByNonResidue(e.fp, &x.A4)
 	h4 = e.fp.Add(h4, &x.A2)
 	t := e.fp.Add(&x.A1, &x.A5)
 	h4 = e.fp.Mul(h4, t)
@@ -808,13 +808,13 @@ func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
 	h4 = e.fp.Sub(h4, &x.A3)
 
 	// h3 = 2(g3+3c*g1g5)
-	h3 := mulFpByNonResidue(e.fp, g1g5)
+	h3 := e.mulFpByNonResidue(e.fp, g1g5)
 	h3 = e.fp.MulConst(h3, big.NewInt(3))
 	h3 = e.fp.Add(h3, &x.A1)
 	h3 = e.fp.MulConst(h3, big.NewInt(2))
 
 	// h2 = 3((g1+g5)(g1+c*g5)-(c+1)*g1g5)-2g2
-	t = mulFpByNonResidue(e.fp, &x.A5)
+	t = e.mulFpByNonResidue(e.fp, &x.A5)
 	t = e.fp.Add(t, &x.A2)
 	h2 := e.fp.Add(&x.A5, &x.A2)
 	h2 = e.fp.Mul(h2, t)
@@ -825,7 +825,7 @@ func (e Ext6) CyclotomicSquareKarabina12345(x *E6) *E6 {
 	h2 = e.fp.Sub(h2, t)
 
 	// h1 = 3((g3+g2)(g3+c*g2)-(c+1)*g3g2)-2g1
-	t = mulFpByNonResidue(e.fp, &x.A4)
+	t = e.mulFpByNonResidue(e.fp, &x.A4)
 	t = e.fp.Add(t, &x.A1)
 	h1 := e.fp.Add(&x.A4, &x.A1)
 	h1 = e.fp.Mul(h1, t)
diff --git a/std/algebra/emulated/fields_bw6761/e6_pairing.go b/std/algebra/emulated/fields_bw6761/e6_pairing.go
index 3365e900f2..12cbf4ab41 100644
--- a/std/algebra/emulated/fields_bw6761/e6_pairing.go
+++ b/std/algebra/emulated/fields_bw6761/e6_pairing.go
@@ -1,8 +1,9 @@
 package fields_bw6761
 
 import (
-	"github.com/consensys/gnark/std/math/emulated"
 	"math/big"
+
+	"github.com/consensys/gnark/std/math/emulated"
 )
 
 func (e Ext6) nSquareKarabina12345(z *E6, n int) *E6 {
@@ -252,7 +253,7 @@ func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
 	c1 = e.fp.Mul(c1, tmp)
 	c1 = e.fp.Sub(c1, t0)
 	c1 = e.fp.Sub(c1, t1)
-	t2 = mulFpByNonResidue(e.fp, t2)
+	t2 = e.mulFpByNonResidue(e.fp, t2)
 	a0 = e.fp.Add(c0, t0)
 	a1 = e.fp.Add(c1, t2)
 	a2 = e.fp.Add(c2, t1)
@@ -276,7 +277,7 @@ func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
 	c1 = e.fp.Mul(c1, tmp)
 	c1 = e.fp.Sub(c1, t0)
 	c1 = e.fp.Sub(c1, t1)
-	t2 = mulFpByNonResidue(e.fp, t2)
+	t2 = e.mulFpByNonResidue(e.fp, t2)
 	b0 := e.fp.Add(c0, t0)
 	b1 = e.fp.Add(c1, t2)
 	b2 = e.fp.Add(c2, t1)
@@ -292,7 +293,7 @@ func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
 	c1 = e.fp.Add(&z.A1, &z.A3)
 	c1 = e.fp.Mul(c1, x[2])
 	c1 = e.fp.Sub(c1, t1)
-	tmp = mulFpByNonResidue(e.fp, t2)
+	tmp = e.mulFpByNonResidue(e.fp, t2)
 	c1 = e.fp.Add(c1, tmp)
 	tmp = e.fp.Add(&z.A1, &z.A5)
 	c2 = e.fp.Mul(x[4], tmp)
@@ -306,7 +307,7 @@ func (e *Ext6) MulBy02345(z *E6, x [5]*baseEl) *E6 {
 	tmp = e.fp.Add(b2, c2)
 	z12 := e.fp.Sub(a2, tmp)
 
-	z00 := mulFpByNonResidue(e.fp, c2)
+	z00 := e.mulFpByNonResidue(e.fp, c2)
 	z00 = e.fp.Add(z00, b0)
 	z01 := e.fp.Add(c0, b1)
 	z02 := e.fp.Add(c1, b2)