Perf: emulated pairing BN254 #714

yelhousni · 2023-06-02T18:03:02Z

One more optimization found while writing the blog post: in the case of multi-pairings, when bit=0 we can first store the lines in a first loop and then iterate over them in a second loop, multiplying them together 2-by-2 before multiplying by the accumulator.

l1 := make([]*lineEvaluation, n)

// ...

switch loopCounter[i] {

case 0:
        // precompute lines
        for k := 0; k < n; k++ {
                // Qacc[k] ← 2Qacc[k] and l1 the tangent ℓ passing 2Qacc[k]
                Qacc[k], l1[k] = pr.doubleStep(Qacc[k])

                // line evaluation at P[k]
                l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
                l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])

        }

        // if number of lines is odd, mul last line by res
        // works for n=1 as well
        if n%2 != 0 {
                // ℓ × res
                res = pr.MulBy034(res, &l1[n-1].R0, &l1[n-1].R1)

        }

        // mul lines 2-by-2
        for k := 1; k < n; k += 2 {
                // ℓ × ℓ
                prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l1[k-1].R0, &l1[k-1].R1)
                // (ℓ × ℓ) × res
                res = pr.MulBy01234(res, &prodLines)

        }
        // ...
}

This saves quite some constraints as the batch size grows. For example:

Batch of size 2: -3 588 r1cs
Batch of size 9: -89 441 r1cs

P.S.: this is not worth it for BLS12-381, as Mul014By014 (2-by-2 line multiplication for an M-type twist and a quadratic final sub-extension) is not efficient circuit-wise compared to 2 plain multiplications by a line (MulBy014).

P.P.S.: this has a direct performance impact on the ECPAIR precompile if we keep the 2-by-2 MillerLoop logic or increase it.

ivokub · 2023-06-23T15:48:50Z

Suggested edit:

diff --git a/std/algebra/emulated/sw_bn254/pairing_test.go b/std/algebra/emulated/sw_bn254/pairing_test.go
index 30fdd148..4623245e 100644
--- a/std/algebra/emulated/sw_bn254/pairing_test.go
+++ b/std/algebra/emulated/sw_bn254/pairing_test.go
@@ -112,66 +112,16 @@ func (c *MultiPairCircuit) Define(api frontend.API) error {
 	}
 	pairing.AssertIsOnG1(&c.InG1)
 	pairing.AssertIsOnG2(&c.InG2)
-	switch c.n {
-	case 2:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 3:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 4:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 5:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 6:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 7:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 8:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-
-	case 9:
-		res, err := pairing.Pair([]*G1Affine{&c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1, &c.InG1}, []*G2Affine{&c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2, &c.InG2})
-		if err != nil {
-			return fmt.Errorf("pair: %w", err)
-		}
-		pairing.AssertIsEqual(res, &c.Res)
-	default:
-		return fmt.Errorf("not handled %d", c.n)
-
+	P, Q := []*G1Affine{}, []*G2Affine{}
+	for i := 0; i < c.n; i++ {
+		P = append(P, &c.InG1)
+		Q = append(Q, &c.InG2)
 	}
+	res, err := pairing.Pair(P, Q)
+	if err != nil {
+		return fmt.Errorf("pair: %w", err)
+	}
+	pairing.AssertIsEqual(res, &c.Res)
 	return nil
 }
 
@@ -195,7 +145,6 @@ func TestMultiPairTestSolve(t *testing.T) {
 		}
 		err = test.IsSolved(&MultiPairCircuit{n: i}, &witness, ecc.BN254.ScalarField())
 		assert.NoError(err)
-		fmt.Println("Batch of size", i, "✅")
 	}
 }

ivokub · 2023-06-23T15:50:08Z

Suggested edit:

diff --git a/std/algebra/emulated/sw_bn254/pairing.go b/std/algebra/emulated/sw_bn254/pairing.go
index e1f523d8..6dbf11af 100644
--- a/std/algebra/emulated/sw_bn254/pairing.go
+++ b/std/algebra/emulated/sw_bn254/pairing.go
@@ -329,8 +329,7 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 	res := pr.Ext12.One()
 	var prodLines [5]fields_bn254.E2
 
-	l1 := make([]*lineEvaluation, n)
-	l2 := make([]*lineEvaluation, n)
+	var l1, l2 *lineEvaluation
 	Qacc := make([]*G2Affine, n)
 	QNeg := make([]*G2Affine, n)
 	yInv := make([]*emulated.Element[emulated.BN254Fp], n)
@@ -354,22 +353,22 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 
 	// k = 0, separately to avoid MulBy034 (res × ℓ)
 	// (assign line to res)
-	Qacc[0], l1[0] = pr.doubleStep(Qacc[0])
+	Qacc[0], l1 = pr.doubleStep(Qacc[0])
 	// line evaluation at P[0]
-	res.C1.B0 = *pr.MulByElement(&l1[0].R0, xOverY[0])
-	res.C1.B1 = *pr.MulByElement(&l1[0].R1, yInv[0])
+	res.C1.B0 = *pr.MulByElement(&l1.R0, xOverY[0])
+	res.C1.B1 = *pr.MulByElement(&l1.R1, yInv[0])
 
 	if n >= 2 {
 		// k = 1, separately to avoid MulBy034 (res × ℓ)
 		// (res is also a line at this point, so we use Mul034By034 ℓ × ℓ)
-		Qacc[1], l1[1] = pr.doubleStep(Qacc[1])
+		Qacc[1], l1 = pr.doubleStep(Qacc[1])
 
 		// line evaluation at P[1]
-		l1[1].R0 = *pr.MulByElement(&l1[1].R0, xOverY[1])
-		l1[1].R1 = *pr.MulByElement(&l1[1].R1, yInv[1])
+		l1.R0 = *pr.MulByElement(&l1.R0, xOverY[1])
+		l1.R1 = *pr.MulByElement(&l1.R1, yInv[1])
 
 		// ℓ × res
-		prodLines = *pr.Mul034By034(&l1[1].R0, &l1[1].R1, &res.C1.B0, &res.C1.B1)
+		prodLines = *pr.Mul034By034(&l1.R0, &l1.R1, &res.C1.B0, &res.C1.B1)
 		res.C0.B0 = prodLines[0]
 		res.C0.B1 = prodLines[1]
 		res.C0.B2 = prodLines[2]
@@ -380,26 +379,26 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 	if n >= 3 {
 		// k = 2, separately to avoid MulBy034 (res × ℓ)
 		// (res has a zero E2 element, so we use Mul01234By034)
-		Qacc[2], l1[2] = pr.doubleStep(Qacc[2])
+		Qacc[2], l1 = pr.doubleStep(Qacc[2])
 
 		// line evaluation at P[1]
-		l1[2].R0 = *pr.MulByElement(&l1[2].R0, xOverY[2])
-		l1[2].R1 = *pr.MulByElement(&l1[2].R1, yInv[2])
+		l1.R0 = *pr.MulByElement(&l1.R0, xOverY[2])
+		l1.R1 = *pr.MulByElement(&l1.R1, yInv[2])
 
 		// ℓ × res
-		res = pr.Mul01234By034(&prodLines, &l1[2].R0, &l1[2].R1)
+		res = pr.Mul01234By034(&prodLines, &l1.R0, &l1.R1)
 
 		// k >= 3
 		for k := 3; k < n; k++ {
-			// Qacc[k] ← 2Qacc[k] and l1[k] the tangent ℓ passing 2Qacc[k]
-			Qacc[k], l1[k] = pr.doubleStep(Qacc[k])
+			// Qacc[k] ← 2Qacc[k] and l1 the tangent ℓ passing 2Qacc[k]
+			Qacc[k], l1 = pr.doubleStep(Qacc[k])
 
 			// line evaluation at P[k]
-			l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
-			l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])
+			l1.R0 = *pr.MulByElement(&l1.R0, xOverY[k])
+			l1.R1 = *pr.MulByElement(&l1.R1, yInv[k])
 
 			// ℓ × res
-			res = pr.MulBy034(res, &l1[k].R0, &l1[k].R1)
+			res = pr.MulBy034(res, &l1.R0, &l1.R1)
 		}
 	}
 
@@ -409,22 +408,22 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 	res = pr.Square(res)
 	for k := 0; k < n; k++ {
 		// l2 the line passing Qacc[k] and -Q
-		l2[k] = pr.lineCompute(Qacc[k], QNeg[k])
+		l2 = pr.lineCompute(Qacc[k], QNeg[k])
 
 		// line evaluation at P[k]
-		l2[k].R0 = *pr.MulByElement(&l2[k].R0, xOverY[k])
-		l2[k].R1 = *pr.MulByElement(&l2[k].R1, yInv[k])
+		l2.R0 = *pr.MulByElement(&l2.R0, xOverY[k])
+		l2.R1 = *pr.MulByElement(&l2.R1, yInv[k])
 
 		// Qacc[k] ← Qacc[k]+Q[k] and
-		// l1[k] the line ℓ passing Qacc[k] and Q[k]
-		Qacc[k], l1[k] = pr.addStep(Qacc[k], Q[k])
+		// l1 the line ℓ passing Qacc[k] and Q[k]
+		Qacc[k], l1 = pr.addStep(Qacc[k], Q[k])
 
 		// line evaluation at P[k]
-		l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
-		l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])
+		l1.R0 = *pr.MulByElement(&l1.R0, xOverY[k])
+		l1.R1 = *pr.MulByElement(&l1.R1, yInv[k])
 
 		// ℓ × ℓ
-		prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l2[k].R0, &l2[k].R1)
+		prodLines = *pr.Mul034By034(&l1.R0, &l1.R1, &l2.R0, &l2.R1)
 		// (ℓ × ℓ) × res
 		res = pr.MulBy01234(res, &prodLines)
 	}
@@ -468,20 +467,20 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 		case 1:
 			for k := 0; k < n; k++ {
 				// Qacc[k] ← 2Qacc[k]+Q[k],
-				// l1[k] the line ℓ passing Qacc[k] and Q[k]
-				// l2[k] the line ℓ passing (Qacc[k]+Q[k]) and Qacc[k]
-				Qacc[k], l1[k], l2[k] = pr.doubleAndAddStep(Qacc[k], Q[k])
+				// l1 the line ℓ passing Qacc[k] and Q[k]
+				// l2 the line ℓ passing (Qacc[k]+Q[k]) and Qacc[k]
+				Qacc[k], l1, l2 = pr.doubleAndAddStep(Qacc[k], Q[k])
 
 				// line evaluation at P[k]
-				l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
-				l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])
+				l1.R0 = *pr.MulByElement(&l1.R0, xOverY[k])
+				l1.R1 = *pr.MulByElement(&l1.R1, yInv[k])
 
 				// line evaluation at P[k]
-				l2[k].R0 = *pr.MulByElement(&l2[k].R0, xOverY[k])
-				l2[k].R1 = *pr.MulByElement(&l2[k].R1, yInv[k])
+				l2.R0 = *pr.MulByElement(&l2.R0, xOverY[k])
+				l2.R1 = *pr.MulByElement(&l2.R1, yInv[k])
 
 				// ℓ × ℓ
-				prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l2[k].R0, &l2[k].R1)
+				prodLines = *pr.Mul034By034(&l1.R0, &l1.R1, &l2.R0, &l2.R1)
 				// (ℓ × ℓ) × res
 				res = pr.MulBy01234(res, &prodLines)
 
@@ -490,20 +489,20 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 		case -1:
 			for k := 0; k < n; k++ {
 				// Qacc[k] ← 2Qacc[k]-Q[k],
-				// l1[k] the line ℓ passing Qacc[k] and -Q[k]
-				// l2[k] the line ℓ passing (Qacc[k]-Q[k]) and Qacc[k]
-				Qacc[k], l1[k], l2[k] = pr.doubleAndAddStep(Qacc[k], QNeg[k])
+				// l1 the line ℓ passing Qacc[k] and -Q[k]
+				// l2 the line ℓ passing (Qacc[k]-Q[k]) and Qacc[k]
+				Qacc[k], l1, l2 = pr.doubleAndAddStep(Qacc[k], QNeg[k])
 
 				// line evaluation at P[k]
-				l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
-				l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])
+				l1.R0 = *pr.MulByElement(&l1.R0, xOverY[k])
+				l1.R1 = *pr.MulByElement(&l1.R1, yInv[k])
 
 				// line evaluation at P[k]
-				l2[k].R0 = *pr.MulByElement(&l2[k].R0, xOverY[k])
-				l2[k].R1 = *pr.MulByElement(&l2[k].R1, yInv[k])
+				l2.R0 = *pr.MulByElement(&l2.R0, xOverY[k])
+				l2.R1 = *pr.MulByElement(&l2.R1, yInv[k])
 
 				// ℓ × ℓ
-				prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l2[k].R0, &l2[k].R1)
+				prodLines = *pr.Mul034By034(&l1.R0, &l1.R1, &l2.R0, &l2.R1)
 				// (ℓ × ℓ) × res
 				res = pr.MulBy01234(res, &prodLines)
 
@@ -529,21 +528,21 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 		Q2.Y = *pr.Ext2.Neg(&Q2.Y)
 
 		// Qacc[k] ← Qacc[k]+π(Q) and
-		// l1[k] the line passing Qacc[k] and π(Q)
-		Qacc[k], l1[k] = pr.addStep(Qacc[k], Q1)
+		// l1 the line passing Qacc[k] and π(Q)
+		Qacc[k], l1 = pr.addStep(Qacc[k], Q1)
 
 		// line evaluation at P[k]
-		l1[k].R0 = *pr.Ext2.MulByElement(&l1[k].R0, xOverY[k])
-		l1[k].R1 = *pr.Ext2.MulByElement(&l1[k].R1, yInv[k])
+		l1.R0 = *pr.Ext2.MulByElement(&l1.R0, xOverY[k])
+		l1.R1 = *pr.Ext2.MulByElement(&l1.R1, yInv[k])
 
-		// l2[k] the line passing Qacc[k] and -π²(Q)
-		l2[k] = pr.lineCompute(Qacc[k], Q2)
+		// l2 the line passing Qacc[k] and -π²(Q)
+		l2 = pr.lineCompute(Qacc[k], Q2)
 		// line evaluation at P[k]
-		l2[k].R0 = *pr.MulByElement(&l2[k].R0, xOverY[k])
-		l2[k].R1 = *pr.MulByElement(&l2[k].R1, yInv[k])
+		l2.R0 = *pr.MulByElement(&l2.R0, xOverY[k])
+		l2.R1 = *pr.MulByElement(&l2.R1, yInv[k])
 
 		// ℓ × ℓ
-		prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l2[k].R0, &l2[k].R1)
+		prodLines = *pr.Mul034By034(&l1.R0, &l1.R1, &l2.R0, &l2.R1)
 		// (ℓ × ℓ) × res
 		res = pr.MulBy01234(res, &prodLines)

ivokub · 2023-06-23T15:50:29Z

Suggested edit:

diff --git a/std/algebra/emulated/sw_bn254/pairing.go b/std/algebra/emulated/sw_bn254/pairing.go
index e1f523d8..8198bbc1 100644
--- a/std/algebra/emulated/sw_bn254/pairing.go
+++ b/std/algebra/emulated/sw_bn254/pairing.go
@@ -429,6 +429,7 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 		res = pr.MulBy01234(res, &prodLines)
 	}
 
+	l1s := make([]*lineEvaluation, n)
 	for i := 62; i >= 0; i-- {
 		// mutualize the square among n Miller loops
 		// (∏ᵢfᵢ)²
@@ -440,11 +441,11 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 			// precompute lines
 			for k := 0; k < n; k++ {
 				// Qacc[k] ← 2Qacc[k] and l1 the tangent ℓ passing 2Qacc[k]
-				Qacc[k], l1[k] = pr.doubleStep(Qacc[k])
+				Qacc[k], l1s[k] = pr.doubleStep(Qacc[k])
 
 				// line evaluation at P[k]
-				l1[k].R0 = *pr.MulByElement(&l1[k].R0, xOverY[k])
-				l1[k].R1 = *pr.MulByElement(&l1[k].R1, yInv[k])
+				l1s[k].R0 = *pr.MulByElement(&l1s[k].R0, xOverY[k])
+				l1s[k].R1 = *pr.MulByElement(&l1s[k].R1, yInv[k])
 
 			}
 
@@ -452,14 +453,14 @@ func (pr Pairing) MillerLoop(P []*G1Affine, Q []*G2Affine) (*GTEl, error) {
 			// works for n=1 as well
 			if n%2 != 0 {
 				// ℓ × res
-				res = pr.MulBy034(res, &l1[n-1].R0, &l1[n-1].R1)
+				res = pr.MulBy034(res, &l1s[n-1].R0, &l1s[n-1].R1)
 
 			}
 
 			// mul lines 2-by-2
 			for k := 1; k < n; k += 2 {
 				// ℓ × ℓ
-				prodLines = *pr.Mul034By034(&l1[k].R0, &l1[k].R1, &l1[k-1].R0, &l1[k-1].R1)
+				prodLines = *pr.Mul034By034(&l1s[k].R0, &l1s[k].R1, &l1s[k-1].R0, &l1s[k-1].R1)
 				// (ℓ × ℓ) × res
 				res = pr.MulBy01234(res, &prodLines)

ivokub

In general looks good and makes sense.

I suggested some edits:

In the main MillerLoop method we actually only use the whole slice in one loop. I kept the current implementation everywhere else, as it is easier to review.
In the test I used a loop instead of a switch for the different cases.
removed fmt.Println in test.

Of these changes, I think only removing fmt.Println is essential. For the rest, see if they make sense.

perf(bn254/multi-pairing): mul lines 2-by-2 when bit=0

6304532

yelhousni added the perf label Jun 2, 2023

yelhousni added this to the v0.9.0 milestone Jun 2, 2023

yelhousni requested a review from ivokub June 2, 2023 18:03

yelhousni mentioned this pull request Jun 5, 2023

feat: emulated pairing 2-by-2 fixed circuit for EVM #698

Merged

ivokub approved these changes Jun 23, 2023

View reviewed changes

refactor: apply suggested edits

8bdadb1

yelhousni merged commit bb5a773 into develop Jul 4, 2023
4 checks passed

yelhousni deleted the perf/emulated-pairing branch July 4, 2023 12:30

mratsim mentioned this pull request Aug 2, 2023

Faster pairings axiom-crypto/halo2-lib#101

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Perf: emulated pairing BN254 #714

Perf: emulated pairing BN254 #714

yelhousni commented Jun 2, 2023 •

edited

Loading

ivokub commented Jun 23, 2023

ivokub commented Jun 23, 2023

ivokub commented Jun 23, 2023

ivokub left a comment

Perf: emulated pairing BN254 #714

Perf: emulated pairing BN254 #714

Conversation

yelhousni commented Jun 2, 2023 • edited Loading

ivokub commented Jun 23, 2023

ivokub commented Jun 23, 2023

ivokub commented Jun 23, 2023

ivokub left a comment

Choose a reason for hiding this comment

yelhousni commented Jun 2, 2023 •

edited

Loading