Permalink
Cannot retrieve contributors at this time
4722 lines (3977 sloc)
200 KB
| /* | |
| This file is part of mfaktc (mfakto). | |
| Copyright (C) 2009 - 2014 Oliver Weihe (o.weihe@t-online.de) | |
| Bertram Franz (bertramf@gmx.net) | |
| mfaktc (mfakto) is free software: you can redistribute it and/or modify | |
| it under the terms of the GNU General Public License as published by | |
| the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| mfaktc (mfakto) is distributed in the hope that it will be useful, | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with mfaktc (mfakto). If not, see <http://www.gnu.org/licenses/>. | |
| Version 0.15 | |
| */ | |
| /**************************************** | |
| **************************************** | |
| * 15-bit based 75-bit barrett-kernels | |
| * | |
| **************************************** | |
| ****************************************/ | |
int75_v sub_if_gte_75(const int75_v a, const int75_v b)
/* return (a>b)?a-b:a */
/* Conditional subtraction on 75-bit values stored as five 15-bit limbs
   (d0 = least significant). Computes a - b with borrow propagation; if the
   subtraction underflowed (detected via the top limb), the original a is
   returned unchanged instead. */
{
  int75_v tmp;
  /* do the subtraction and use tmp.d4 to decide if the result is valid (if a was > b) */
  /* Each limb holds 15 bits, so after a raw subtraction any borrow shows up in
     the bits above bit 14; that is what the (tmp.dX > 0x7FFF) test detects.
     NOTE(review): relies on the OpenCL vector relational operator yielding
     all-ones (-1) for true, so adding AS_UINT_V(cond) subtracts the borrow. */
  tmp.d0 = (a.d0 - b.d0);
  tmp.d1 = (a.d1 - b.d1 + AS_UINT_V((tmp.d0 > 0x7FFF) ));
  tmp.d2 = (a.d2 - b.d2 + AS_UINT_V((tmp.d1 > 0x7FFF) ));
  tmp.d3 = (a.d3 - b.d3 + AS_UINT_V((tmp.d2 > 0x7FFF) ));
  tmp.d4 = (a.d4 - b.d4 + AS_UINT_V((tmp.d3 > 0x7FFF) ));
  /* reduce the lower limbs back to 15 bits */
  tmp.d0&= 0x7FFF;
  tmp.d1&= 0x7FFF;
  tmp.d2&= 0x7FFF;
  tmp.d3&= 0x7FFF;
  /* if the top limb grew (wrapped below zero), a was smaller than b: keep a */
  tmp.d0 = (tmp.d4 > a.d4) ? a.d0 : tmp.d0;
  tmp.d1 = (tmp.d4 > a.d4) ? a.d1 : tmp.d1;
  tmp.d2 = (tmp.d4 > a.d4) ? a.d2 : tmp.d2;
  tmp.d3 = (tmp.d4 > a.d4) ? a.d3 : tmp.d3;
  tmp.d4 = (tmp.d4 > a.d4) ? a.d4 : tmp.d4; // & 0x7FFF not necessary as tmp.d4 is <= a.d4
  return tmp;
}
void mul_75(int75_v * const res, const int75_v a, const int75_v b)
/* res = a * b (low 75 bits)
   15x mul24/mad24, 4x >>, 4x & = 23 ops */
/* Schoolbook multiply of two 5x15-bit-limb numbers, keeping only the low
   five limbs. Per result limb, all cross products a.di*b.dj with i+j = limb
   index are accumulated in 32 bits, then the carry (>> 15) is folded into the
   next limb before the current one is masked back to 15 bits. The statement
   order is the carry schedule — do not reorder. */
{
  res->d0 = mul24(a.d0, b.d0);
  res->d1 = mad24(a.d1, b.d0, res->d0 >> 15);
  res->d1 = mad24(a.d0, b.d1, res->d1);
  res->d0 &= 0x7FFF;
  res->d2 = mad24(a.d2, b.d0, res->d1 >> 15);
  res->d2 = mad24(a.d1, b.d1, res->d2);
  res->d2 = mad24(a.d0, b.d2, res->d2);
  res->d1 &= 0x7FFF;
  res->d3 = mad24(a.d3, b.d0, res->d2 >> 15);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);
  res->d2 &= 0x7FFF;
  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15); // if a.d4 is > 15 bits, then overflow can happen faster.
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  res->d4 = mad24(a.d0, b.d4, res->d4); // the 5th mad can overflow d4, but that's ok for this function.
  res->d3 &= 0x7FFF;
  // res->d4 &= 0x7FFF;  // intentionally omitted: callers tolerate a wide top limb
}
void mul_75_big(int75_v * const res, const int75_v a, const int75_v b)
/* res = a * b (low 75 bits)
   19x mul24/mad24, 4x >>, 4x & = 27 ops */
/* Same as mul_75 but the top-limb products pull in one extra bit of each
   operand (see comment below), so the result's d4 carries one more valid bit.
   Limb layout and carry schedule are identical to mul_75 for d0..d3. */
{
  res->d0 = mul24(a.d0, b.d0);
  res->d1 = mad24(a.d1, b.d0, res->d0 >> 15);
  res->d1 = mad24(a.d0, b.d1, res->d1);
  res->d0 &= 0x7FFF;
  res->d2 = mad24(a.d2, b.d0, res->d1 >> 15);
  res->d2 = mad24(a.d1, b.d1, res->d2);
  res->d2 = mad24(a.d0, b.d2, res->d2);
  res->d1 &= 0x7FFF;
  res->d3 = mad24(a.d3, b.d0, res->d2 >> 15);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);
  res->d2 &= 0x7FFF;
  // in order to get one more bit in the result, we need to add the next bigger component into each multiplicant
  // mad24(a.dX, 32768u, a.dX-1) fuses two adjacent 15-bit limbs into one (up to) 24-bit factor.
  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  res->d4 = mad24(mad24(a.d4, 32768u, a.d3), b.d1, res->d4);
  res->d4 = mad24(mad24(a.d3, 32768u, a.d2), b.d2, res->d4);
  res->d4 = mad24(mad24(a.d2, 32768u, a.d1), b.d3, res->d4);
  res->d4 = mad24(mad24(a.d1, 32768u, a.d0), b.d4, res->d4);
  res->d3 &= 0x7FFF;
  // res->d4 &= 0xFFFF;  // intentionally omitted: d4 may keep 16 valid bits here
}
void mul_75_150_no_low3(int150_v * const res, const int75_v a, const int75_v b)
/*
  res ~= a * b
  res.d0 to res.d2 are NOT computed. Carries to res.d3 are ignored,
  too. So the digits res.d{3-9} might differ from mul_75_150().
*/
/* Truncated 5x5-limb multiply producing only the upper digits of the 150-bit
   product; used where the low bits are known not to matter (Barrett-style
   reduction). int150_v stores ten 15-bit limbs d0..d9. */
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // this optimized mul 5x5 requires: 19 mul/mad24, 7 shift, 6 and, 1 add
  res->d3 = mul24(a.d3, b.d0);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);
  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  // res->d3 &= 0x7FFF; // d3 itself is not used, only its carry to d4 is required
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4); // 31-bit at most
  res->d5 = mad24(a.d3, b.d2, res->d4 >> 15) + res->d5;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  res->d4 &= 0x7FFF;
  // now we have in d5: 4x mad24() + 1x 17-bit carry + 1x 16-bit carry: still fits into 32 bits
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15);
  res->d6 = mad24(a.d3, b.d3, res->d6);
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15);
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, b.d4, res->d7 >> 15);
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15;
  res->d8 &= 0x7FFF;
}
void mul_75_150_no_low5(int150_v * const res, const int75_v a, const int75_v b)
/*
  res ~= a * b
  res.d0 to res.d3 are NOT computed. res.d4 is computed only to get its upper half carried to res.d5.
  Due to the missing carries from d3 into d4, at most a 17-bit value is missing in d4. This means,
  d4 >> 15 can be too low by up to 3, thus d5 can be low by 3.
*/
/* Even more aggressively truncated variant of mul_75_150_no_low3: drops the
   d3 column entirely, accepting the bounded error documented above. */
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // this optimized mul 5x5 requires: 19 mul/mad24, 7 shift, 6 and, 1 add
  res->d4 = mul24(a.d4, b.d0);
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4); // 31-bit at most
  res->d5 = mad24(a.d3, b.d2, res->d4 >> 15) + res->d5;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  // res->d4 &= 0x7FFF; // not needed, we won't use d4 anyway
  // now we have in d5: 4x mad24() + 1x 17-bit carry + 1x 16-bit carry: still fits into 32 bits
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15);
  res->d6 = mad24(a.d3, b.d3, res->d6);
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15);
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, b.d4, res->d7 >> 15);
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15;
  res->d8 &= 0x7FFF;
}
void mul_75_150_no_low5_big(int150_v * const res, const int75_v a, const int75_v b)
/*
  res ~= a * b
  res.d0 to res.d3 are NOT computed. res.d4 is computed only to get its upper half carried to res.d5.
  Due to the missing carries from d3 into d4, at most a 17-bit value is missing in d4. This means,
  d4 >> 15 can be too low by up to 3, thus d5 can be low by 3.
  This version allows for a "big" a, meaning, a.d4 can be up to 17 bits.
*/
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // this optimized mul 5x5 requires: 19 mul/mad24, 7 shift, 6 and, 1 add
  // mad24(a.d4, ...) will already return 32 bit. Handle a.d4 first all the way through res->d8
  // Phase 1: the a.d4 column alone, carried through immediately because each
  // of its products can already fill 32 bits.
  res->d4 = mul24(a.d4, b.d0);
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d6 = mad24(a.d4, b.d2, res->d5 >> 15);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d4, b.d3, res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, b.d4, res->d7 >> 15);
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15;
  res->d8 &= 0x7FFF;
  // Phase 2: the remaining (15-bit) partial products, accumulated on top of
  // the already-normalized phase-1 limbs, with a final carry sweep upward.
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  res->d4 = mad24(a.d0, b.d4, res->d4);
  res->d5 = mad24(a.d3, b.d2, res->d5) + (res->d4 >> 15);
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  // res->d4 &= 0x7FFF; // not needed, we won't use d4 anyway
  res->d6 = mad24(a.d3, b.d3, res->d6) + (res->d5 >> 15);
  res->d6 = mad24(a.d2, b.d4, res->d6);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d3, b.d4, res->d7) + (res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d8 += res->d7 >> 15;
  res->d7 &= 0x7FFF;
  res->d9 += res->d8 >> 15;
  res->d8 &= 0x7FFF;
}
void mul_75_150(int150_v * const res, const int75_v a, const int75_v b)
/*
  res = a * b
*/
/* Full 75x75 -> 150-bit multiply on 15-bit limbs: every partial product is
   computed, so res.d0..d9 are exact. */
{
  /* this is the complete implementation, no longer used, but was the basis for
     the _no_low3 and square functions */
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // mul 5x5 requires: 25 mul/mad24, 10 shift, 10 and, 1 add
  res->d0 = mul24(a.d0, b.d0);
  res->d1 = mad24(a.d1, b.d0, res->d0 >> 15);
  res->d1 = mad24(a.d0, b.d1, res->d1);
  res->d0 &= 0x7FFF;
  res->d2 = mad24(a.d2, b.d0, res->d1 >> 15);
  res->d2 = mad24(a.d1, b.d1, res->d2);
  res->d2 = mad24(a.d0, b.d2, res->d2);
  res->d1 &= 0x7FFF;
  res->d3 = mad24(a.d3, b.d0, res->d2 >> 15);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);
  res->d2 &= 0x7FFF;
  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  res->d3 &= 0x7FFF;
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4); // 31-bit at most
  res->d5 = mad24(a.d3, b.d2, res->d4 >> 15) + res->d5;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  res->d4 &= 0x7FFF;
  // now we have in d5: 4x mad24() + 1x 17-bit carry + 1x 16-bit carry: still fits into 32 bits
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15);
  res->d6 = mad24(a.d3, b.d3, res->d6);
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15);
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, b.d4, res->d7 >> 15);
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15;
  res->d8 &= 0x7FFF;
}
void square_75_150(int150_v * const res, const int75_v a)
/* res = a^2 = d0^2 + 2d0d1 + d1^2 + 2d0d2 + 2(d1d2 + d0d3) + d2^2 +
   2(d0d4 + d1d3) + 2(d1d4 + d2d3) + d3^2 + 2d2d4 + 2d3d4 + d4^2
*/
/* Squaring specialization of mul_75_150: symmetric cross products a.di*a.dj
   (i != j) are computed once with one factor pre-doubled (a.dX << 1), halving
   the multiply count versus the general multiply. */
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // square 5x5 requires: 15 mul/mad24, 20 shift, 10 and, 1 add
  res->d0 = mul24(a.d0, a.d0);
  res->d1 = mad24(a.d1, a.d0 << 1, res->d0 >> 15);
  res->d0 &= 0x7FFF;
  res->d2 = mad24(a.d1, a.d1, res->d1 >> 15);
  res->d2 = mad24(a.d2, a.d0 << 1, res->d2);
  res->d1 &= 0x7FFF;
  res->d3 = mad24(a.d3, a.d0 << 1, res->d2 >> 15);
  res->d3 = mad24(a.d2, a.d1 << 1, res->d3);
  res->d2 &= 0x7FFF;
  res->d4 = mad24(a.d4, a.d0 << 1, res->d3 >> 15);
  res->d3 &= 0x7FFF;
  res->d4 = mad24(a.d3, a.d1 << 1, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d4, a.d1 << 1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d2, a.d2, res->d4); // 31-bit at most
  res->d5 = mad24(a.d3, a.d2 << 1, res->d4 >> 15) + res->d5;
  res->d4 &= 0x7FFF;
  // now we have in d5: 4x mad24() + 1x 17-bit carry + 1x 16-bit carry: still fits into 32 bits
  res->d6 = mad24(a.d4, a.d2 << 1, res->d5 >> 15);
  res->d6 = mad24(a.d3, a.d3, res->d6);
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d4, a.d3 << 1, res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, a.d4, res->d7 >> 15);
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15;
  res->d8 &= 0x7FFF;
}
void square_75_150_big(int150_v * const res, const int75_v a)
/* res = a^2 = d0^2 + 2d0d1 + d1^2 + 2d0d2 + 2(d1d2 + d0d3) + d2^2 +
   2(d0d4 + d1d3) + 2(d1d4 + d2d3) + d3^2 + 2d2d4 + 2d3d4 + d4^2
   "big" because a.d4 can have 16 bits.
*/
/* Like square_75_150, but every limb involving a.d4 is carried immediately
   (its products can reach the full 32 bits, see per-line max annotations),
   and the remaining 15-bit partial products are folded in afterwards. */
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // square 5x5 requires: 15 mul/mad24, 20 shift, 10 and, 1 add
  // for 16-bit: 0xFFFF * 0xFFFF = 0xFFFE0001, can add 2 15-bit carries
  // square 5x5 big requires: 15 mul/mad24, 22 shift, 12 and, 3 add (+6 ops for "big")
  res->d0 = mul24(a.d0, a.d0); // max: 0x3FFF0001
  res->d1 = mad24(a.d1, a.d0 << 1, res->d0 >> 15); // max: 0x7FFF * 0xFFFE + 0x7FFE = 0x7FFE8000
  res->d0 &= 0x7FFF;
  res->d2 = mad24(a.d1, a.d1, res->d1 >> 15); // max: 0x3FFFFFFE
  res->d2 = mad24(a.d2, a.d0 << 1, res->d2); // max: 0xBFFE0000
  res->d1 &= 0x7FFF;
  res->d3 = mad24(a.d3, a.d0 << 1, res->d2 >> 15); // max: 0x7FFF7FFE
  res->d3 = mad24(a.d2, a.d1 << 1, res->d3); // max: 0xFFFD8000
  res->d2 &= 0x7FFF;
  res->d4 = mad24(a.d4, a.d0 << 1, res->d3 >> 15); // max: 0xFFFF * 0xFFFE + 0x1FFFB = 0xFFFEFFFD
  res->d3 &= 0x7FFF; // need to propagate the carry to the top
  res->d5 = mad24(a.d4, a.d1 << 1, res->d4 >> 15); // max: 0xFFFF * 0xFFFE + 0x1FFFD = 0xFFFEFFFF
  res->d4 &= 0x7FFF;
  res->d6 = mad24(a.d4, a.d2 << 1, res->d5 >> 15); // max: 0xFFFF * 0xFFFE + 0x1FFFD = 0xFFFEFFFF
  res->d5 &= 0x7FFF;
  res->d7 = mad24(a.d4, a.d3 << 1, res->d6 >> 15); // max: 0xFFFF * 0xFFFE + 0x1FFFD = 0xFFFEFFFF
  res->d6 &= 0x7FFF;
  // now fold in the 15-bit-only partial products and sweep the carries up
  res->d4 = mad24(a.d2, a.d2, res->d4);
  res->d4 = mad24(a.d3, a.d1 << 1, res->d4); // max: 0x7FFF * 0x7FFF + 0x7FFF * 0xFFFE + 0x7FFF = 0xBFFD8002
  res->d5 = mad24(a.d3, a.d2 << 1, res->d5) + (res->d4 >> 15); // max: 0x7FFFFFFC
  res->d4 &= 0x7FFF;
  res->d6 = mad24(a.d3, a.d3, res->d6) + (res->d5 >> 15); // max: 0x40007FFF
  res->d5 &= 0x7FFF;
  res->d7 += (res->d6 >> 15); // max: 0xFFFEFFFF + 0x8000 = 0xFFFF7FFF
  res->d6 &= 0x7FFF;
  res->d8 = mad24(a.d4, a.d4, res->d7 >> 15); // max: 0xFFFF * 0xFFFF + 0x1FFFE = 0xFFFFFFFF
  res->d7 &= 0x7FFF;
  res->d9 = res->d8 >> 15; // max: 0x1FFFF
  res->d8 &= 0x7FFF;
}
void shl_75(int75_v * const a)
/* shiftleft a one bit */
/* In-place a <<= 1 on five 15-bit limbs. Runs top-down so each limb still
   reads its lower neighbour's OLD value; dX >> 14 extracts the bit that
   shifts into the next limb. Order is load-bearing — do not reorder. */
{
  a->d4 = mad24(a->d4, 2u, a->d3 >> 14); // keep the extra top bit
  a->d3 = mad24(a->d3, 2u, a->d2 >> 14) & 0x7FFF;
  a->d2 = mad24(a->d2, 2u, a->d1 >> 14) & 0x7FFF;
  a->d1 = mad24(a->d1, 2u, a->d0 >> 14) & 0x7FFF;
  a->d0 = (a->d0 << 1u) & 0x7FFF;
}
void shl_150(int150_v * const a)
/* shiftleft a one bit */
/* In-place a <<= 1 on ten 15-bit limbs; same top-down scheme as shl_75 so
   each limb reads its lower neighbour before that neighbour is overwritten. */
{
  a->d9 = mad24(a->d9, 2u, a->d8 >> 14); // keep the extra top bit
  a->d8 = mad24(a->d8, 2u, a->d7 >> 14) & 0x7FFF;
  a->d7 = mad24(a->d7, 2u, a->d6 >> 14) & 0x7FFF;
  a->d6 = mad24(a->d6, 2u, a->d5 >> 14) & 0x7FFF;
  a->d5 = mad24(a->d5, 2u, a->d4 >> 14) & 0x7FFF;
  a->d4 = mad24(a->d4, 2u, a->d3 >> 14) & 0x7FFF;
  a->d3 = mad24(a->d3, 2u, a->d2 >> 14) & 0x7FFF;
  a->d2 = mad24(a->d2, 2u, a->d1 >> 14) & 0x7FFF;
  a->d1 = mad24(a->d1, 2u, a->d0 >> 14) & 0x7FFF;
  a->d0 = (a->d0 << 1u) & 0x7FFF;
}
#if defined USE_DP
void div_150_75_d(int75_v * const res, const uint qhi, const int75_v n, const double_v nf
#if (TRACE_KERNEL > 1)
                  , const uint tid
#endif
                  MODBASECASE_PAR_DEF
)/* res = q / n (integer division)
during function entry, qhi contains the upper 30 bits of an 180-bit-value. The remaining bits are zero implicitely.
this is not a vector, as the first value is the same for all FCs*/
// do 2*45 bit reductions using double: should be sufficient for 90 bits (and 86 anyways)
/* Double-precision variant of the 150/75 division: estimates ~45 quotient
   bits per step via qf*nf (nf is a precomputed reciprocal approximation of n
   — assumed from usage; confirm at caller), subtracts qi*n from the running
   remainder, repeats once, and deliberately skips a final correction step
   (result may be off by one, see bottom comment). */
{
  __private double_v qf;
  __private double qf_1; // for the first conversion which does not need vectors yet
  __private ulong_v qi;
  __private uint_v qil, qim, qih;
  __private int150_v nn, q;
#if (TRACE_KERNEL > 1)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#0: q=%x:<150x0>, n=%x:%x:%x:%x:%x, nf=%#G\n",
                             qhi, V(n.d4), V(n.d3), V(n.d2), V(n.d1), V(n.d0), V(nf));
#endif
  /********** Step 1, Offset 2^67 (4*15 + 7) **********/
  // 40564819207303340847894502572032.0 == 2^105: scale qhi into double range
  qf_1 = convert_double(qhi) * 40564819207303340847894502572032.0;
  qi=CONVERT_ULONG_V(qf_1*nf); // vectorize just here
  MODBASECASE_QI_ERROR(1L<<46, 1, qi, 0); // qi here is about 45 bits
  // split the ~45-bit quotient estimate into three 15-bit limbs of res
  qih = res->d4 = CONVERT_UINT_V(qi >> 30); // PERF: amd_bitalign ?
  qim = res->d3 = (CONVERT_UINT_V(qi) >> 15) & 0x7FFF;
  qil = res->d2 = CONVERT_UINT_V(qi ) & 0x7FFF;
#if (TRACE_KERNEL > 1)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1: qf=%#G, nf=%#G, *=%#G, qi=%lld=0x%llx, res=%x:%x:%x:..:..\n",
                             qf_1, V(nf), qf_1*V(nf), V(qi), V(qi), V(res->d4), V(res->d3), V(res->d2));
#endif
  /*******************************************************/
  // nn = n * qi
  // qi sits at limb offset 2 (2^30), so the product starts at nn.d2.
  nn.d2 = mul24(n.d0, qil);
  nn.d3 = mad24(n.d0, qim, nn.d2 >> 15);
  nn.d3 = mad24(n.d1, qil, nn.d3);
  nn.d2 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.1: nn=..:..:..:..:..:..:%x:%x:..:..\n",
                             V(nn.d3), V(nn.d2));
#endif
  nn.d4 = mad24(n.d0, qih, nn.d3 >> 15);
  nn.d4 = mad24(n.d1, qim, nn.d4);
  nn.d4 = mad24(n.d2, qil, nn.d4);
  nn.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.2: nn=..:..:..:..:..:%x:%x:%x:...\n",
                             V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  nn.d5 = mad24(n.d1, qih, nn.d4 >> 15);
  nn.d5 = mad24(n.d2, qim, nn.d5);
  nn.d5 = mad24(n.d3, qil, nn.d5);
  nn.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.3: nn=..:..:..:..:%x:%x:%x:%x:...\n",
                             V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  nn.d6 = mad24(n.d2, qih, nn.d5 >> 15);
  nn.d6 = mad24(n.d3, qim, nn.d6);
  nn.d6 = mad24(n.d4, qil, nn.d6);
  nn.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.4: nn=..:..:..:%x:%x:%x:%x:%x:...\n",
                             V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  nn.d7 = mad24(n.d3, qih, nn.d6 >> 15);
  nn.d7 = mad24(n.d4, qim, nn.d7);
  nn.d6 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.5: nn=..:..:%x:%x:%x:%x:%x:%x:...\n",
                             V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  nn.d8 = mad24(n.d4, qih, nn.d7 >> 15);
  nn.d7 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.6: nn=..:%x:%x:%x:%x:%x:%x:%x:...\n",
                             V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  nn.d9 = nn.d8 >> 15;
  nn.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.7: nn=..:%x:%x:%x:%x:%x:%x:%x:%x:...\n",
                             V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
  // q = q - nn, but upon function entry, qhi contains all the bits for d9. All bits below are zero.
  // Since the minuend limbs below d9 are zero, a borrow out of limb X occurs
  // exactly when the raw negated value is nonzero — hence the (… > 0) tests
  // (AS_UINT_V of a true vector compare is all-ones, i.e. -1).
  q.d2 = (-nn.d2) & 0x7FFF;
  q.d3 = (-nn.d3 + AS_UINT_V((nn.d2 > 0)));
  q.d4 = (-nn.d4 + AS_UINT_V((q.d3 > 0)));
  q.d5 = (-nn.d5 + AS_UINT_V((q.d4 > 0)));
  q.d6 = (-nn.d6 + AS_UINT_V((q.d5 > 0)));
  q.d7 = (-nn.d7 + AS_UINT_V((q.d6 > 0)));
  q.d8 = (-nn.d8 + AS_UINT_V((q.d7 > 0)));
  q.d9 = (qhi - nn.d9 + AS_UINT_V((q.d8 > 0)));
  q.d3 &= 0x7FFF;
  q.d4 &= 0x7FFF;
  q.d5 &= 0x7FFF;
  q.d6 &= 0x7FFF;
  q.d7 &= 0x7FFF;
  q.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#1.8: q=..:%x:%x!%x:%x:%x:%x:%x:%x:..:..\n",
                             V(q.d9), V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3), V(q.d2));
#endif
  MODBASECASE_NONZERO_ERROR(q.d9, 2, 10, 1);
  MODBASECASE_NONZERO_ERROR(q.d8, 2, 9, 2);
  /********** Step 2, Offset 2^30 (2*15 + 0) **********/
  // rebuild the top of the remainder as a double: d7:d6 then d5:d4 (2^30 apart)
  qf= CONVERT_DOUBLE_V(mad24(q.d7, 32768u, q.d6));
  qf= qf * 1073741824.0f + CONVERT_DOUBLE_V(mad24(q.d5, 32768u, q.d4)); // now we need only 30 bits
  //  qf= qf * 1073741824.0f + CONVERT_DOUBLE_V(mad24(q.d4, 32768u, q.d3));
  //  qf*= 35184372088832.0;
  qf*= 1152921504606846976.0; // 2^60: align the remainder for the next quotient chunk
  qih=CONVERT_UINT_V(qf*nf);
  MODBASECASE_QI_ERROR(1L<<30, 2, qih, 3);
  //  res->d2 = CONVERT_UINT_V(qi >> 30);
  res->d1 = qih >> 15;
  res->d0 = qih & 0x7FFF;
#if (TRACE_KERNEL > 1)
  if (tid==TRACE_TID) printf((__constant char *)"div_150_75_d#2: qf=%#G, nf=%#G, *=%#G, qi=%lld=0x%llx, res=%x:%x:%x:%x:%x\n",
                             V(qf), V(nf), V(qf)*V(nf), V(qih), V(qih), V(res->d4), V(res->d3), V(res->d2), V(res->d1), V(res->d0));
#endif
  /*******************************************************/
  // skip the last part - it will change the result by one at most - we can live with a result that is off by one
}
#endif
| void div_150_75(int75_v * const res, const uint qhi, const int75_v n, const float_v nf | |
| #if (TRACE_KERNEL > 1) | |
| , const uint tid | |
| #endif | |
| MODBASECASE_PAR_DEF | |
| )/* res = q / n (integer division) */ | |
| { | |
| __private float_v qf; | |
| __private float qf_1; // for the first conversion which does not need vectors yet | |
| __private uint_v qi, qil, qih; | |
| __private int150_v nn, q; | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#0: q=%x:<135x0>, n=%x:%x:%x:%x:%x, nf=%#G\n", | |
| qhi, V(n.d4), V(n.d3), V(n.d2), V(n.d1), V(n.d0), V(nf)); | |
| #endif | |
| /********** Step 1, Offset 2^60 (4*15 + 0) **********/ | |
| qf_1= convert_float(qhi) * 35184372088832.0f; // =32768.0f * 32768.0f * 32768.0f; // no vector yet | |
| qi=CONVERT_UINT_V(qf_1*nf); // vectorize just here | |
| MODBASECASE_QI_ERROR(1<<16, 1, qi, 0); // first step is smaller, but 74 kernel needs 16 bits here | |
| res->d4 = qi; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:..:..:..:..\n", | |
| qf_1, V(nf), qf_1*V(nf), V(qi), V(qi), V(res->d4)); | |
| q.d9=0; // for correct printing later | |
| #endif | |
| /*******************************************************/ | |
| // nn = n * qi | |
| nn.d0 = mul24(n.d0, qi); | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.1: nn=..:..:..:..:..:%x:..:..:..:..\n", | |
| V(nn.d0)); | |
| #endif | |
| nn.d1 = mad24(n.d1, qi, nn.d0 >> 15); | |
| nn.d0 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.2: nn=..:..:..:..:%x:%x:...\n", | |
| V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d2 = mad24(n.d2, qi, nn.d1 >> 15); | |
| nn.d1 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.3: nn=..:..:..:%x:%x:%x:...\n", | |
| V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d3 = mad24(n.d3, qi, nn.d2 >> 15); | |
| nn.d2 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.4: nn=..:..:%x:%x:%x:%x:...\n", | |
| V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d4 = mad24(n.d4, qi, nn.d3 >> 15); | |
| nn.d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.5: nn=..:%x:%x:%x:%x:%x:...\n", | |
| V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| // no shift-left | |
| #ifdef CHECKS_MODBASECASE | |
| nn.d5 = nn.d4 >> 15; // PERF: not needed as it will be gone anyway after sub | |
| nn.d4 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.6: nn=%x:%x:%x:%x:%x:%x:...\n", | |
| V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| #endif | |
| // q.d0-q.d8 are all zero | |
| q.d4 = -nn.d0; | |
| q.d5 = SUB_COND(-nn.d1, q.d4 > 0x7FFF); | |
| q.d6 = SUB_COND(-nn.d2, q.d5 > 0x7FFF); | |
| q.d7 = SUB_COND(-nn.d3, q.d6 > 0x7FFF); | |
| q.d8 = SUB_COND(-nn.d4, q.d7 > 0x7FFF); | |
| #ifdef CHECKS_MODBASECASE | |
| q.d9 = SUB_COND(qhi - nn.d5, q.d8 > 0x7FFF); // PERF: not needed: should be zero anyway | |
| // compiler errors: qhi=8, nn.d5=7, q.d8 > 0x7fff ==> q.d9 = 2 : skip this check | |
| #endif | |
| q.d4 &= 0x7FFF; | |
| q.d5 &= 0x7FFF; | |
| q.d6 &= 0x7FFF; | |
| q.d7 &= 0x7FFF; | |
| q.d8 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#1.7: q=%x!%x:%x:%x:%x:%x:..:..:..:..\n", | |
| V(q.d9), V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4)); | |
| #endif | |
| // MODBASECASE_NONZERO_ERROR(q.d9, 1, 9, 1); // gives false positives | |
| /********** Step 2, Offset 2^40 (2*15 + 10) **********/ | |
| qf= CONVERT_FLOAT_V(mad24(q.d8, 32768u, q.d7)); | |
| qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d6, 32768u, q.d5)); | |
| qf*= 32.0f; | |
| qi=CONVERT_UINT_V(qf*nf); | |
| MODBASECASE_QI_ERROR(1<<21, 2, qi, 2); | |
| res->d3 = (qi >> 5); | |
| res->d2 = (qi << 10) & 0x7FFF; | |
| qil = qi & 0x7FFF; | |
| qih = (qi >> 15); | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:..:..\n", | |
| V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d4), V(res->d3), V(res->d2)); | |
| #endif | |
| /*******************************************************/ | |
| // nn = n * qi | |
| nn.d0 = mul24(n.d0, qil); | |
| nn.d1 = mad24(n.d0, qih, nn.d0 >> 15); | |
| nn.d0 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.1: nn=..:..:..:..:%x:%x:..:..\n", | |
| V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d1 = mad24(n.d1, qil, nn.d1); | |
| nn.d2 = mad24(n.d1, qih, nn.d1 >> 15); | |
| nn.d1 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.2: nn=..:..:..:%x:%x:%x:..:..\n", | |
| V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d2 = mad24(n.d2, qil, nn.d2); | |
| nn.d3 = mad24(n.d2, qih, nn.d2 >> 15); | |
| nn.d2 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.3: nn=..:..:%x:%x:%x:%x:..:..\n", | |
| V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d3 = mad24(n.d3, qil, nn.d3); | |
| nn.d4 = mad24(n.d3, qih, nn.d3 >> 15); | |
| nn.d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.4: nn=..:%x:%x:%x:%x:%x:..:..\n", | |
| V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d4 = mad24(n.d4, qil, nn.d4); | |
| nn.d5 = mad24(n.d4, qih, nn.d4 >> 15); | |
| nn.d4 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.5: nn=..:%x:%x:%x:%x:%x:%x:..:..\n", | |
| V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| // now shift-left 10 bits | |
| #ifdef CHECKS_MODBASECASE | |
| nn.d6 = nn.d5 >> 5; // PERF: not needed as it will be gone anyway after sub | |
| #endif | |
| nn.d5 = mad24(nn.d5 & 0x1F, 1024u, nn.d4 >> 5); | |
| nn.d4 = mad24(nn.d4 & 0x1F, 1024u, nn.d3 >> 5); | |
| nn.d3 = mad24(nn.d3 & 0x1F, 1024u, nn.d2 >> 5); | |
| nn.d2 = mad24(nn.d2 & 0x1F, 1024u, nn.d1 >> 5); | |
| nn.d1 = mad24(nn.d1 & 0x1F, 1024u, nn.d0 >> 5); | |
| nn.d0 = (nn.d0 & 0x1F) << 10; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.6: nn=..:%x:%x:%x:%x:%x:%x:%x:..:..\n", | |
| V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| // q = q - nn | |
| q.d2 = -nn.d0; | |
| q.d3 = SUB_COND(-nn.d1, q.d2 > 0x7FFF); | |
| q.d4 = SUB_COND(q.d4 - nn.d2, q.d3 > 0x7FFF); | |
| q.d5 = SUB_COND(q.d5 - nn.d3, q.d4 > 0x7FFF); | |
| q.d6 = SUB_COND(q.d6 - nn.d4, q.d5 > 0x7FFF); | |
| q.d7 = SUB_COND(q.d7 - nn.d5, q.d6 > 0x7FFF); | |
| #ifdef CHECKS_MODBASECASE | |
| q.d8 = SUB_COND(q.d8 - nn.d6, q.d7 > 0x7FFF); // PERF: not needed: should be zero anyway | |
| #endif | |
| q.d2 &= 0x7FFF; | |
| q.d3 &= 0x7FFF; | |
| q.d4 &= 0x7FFF; | |
| q.d5 &= 0x7FFF; | |
| q.d6 &= 0x7FFF; | |
| q.d7 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#2.7: q=..:%x!%x:%x:%x:%x:%x:%x:..:..\n", | |
| V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3), V(q.d2)); | |
| #endif | |
| MODBASECASE_NONZERO_ERROR(q.d8, 2, 8, 3); | |
| /********** Step 3, Offset 2^20 (1*15 + 5) **********/ | |
| qf= CONVERT_FLOAT_V(mad24(q.d7, 32768u, q.d6)); | |
| qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d5, 32768u, q.d4)); | |
| qf*= 32768.0f; | |
| qi=CONVERT_UINT_V(qf*nf); | |
| MODBASECASE_QI_ERROR(1<<26, 3, qi, 5); // very big qi, but then we can skip the bit-shifting later | |
| qih = (qi >> 15); | |
| qil = qi & 0x7FFF; | |
| res->d1 = qi; // carry to d2 is handled at the end anyway | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:%x:..\n", | |
| V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d4), V(res->d3), V(res->d2), V(res->d1)); | |
| #endif | |
| /*******************************************************/ | |
| // nn = n * qi | |
| nn.d0 = mul24(n.d0, qil); | |
| nn.d1 = mad24(n.d0, qih, nn.d0 >> 15); | |
| nn.d0 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.1: nn=..:..:..:..:%x:%x:..\n", | |
| V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d1 = mad24(n.d1, qil, nn.d1); | |
| nn.d2 = mad24(n.d1, qih, nn.d1 >> 15); | |
| nn.d1 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.2: nn=..:..:..:%x:%x:%x:..\n", | |
| V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d2 = mad24(n.d2, qil, nn.d2); | |
| nn.d3 = mad24(n.d2, qih, nn.d2 >> 15); | |
| nn.d2 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 4) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.3: nn=..:..:%x:%x:%x:%x:..\n", | |
| V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d3 = mad24(n.d3, qil, nn.d3); | |
| nn.d4 = mad24(n.d3, qih, nn.d3 >> 15); | |
| nn.d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.4: nn=..:%x:%x:%x:%x:%x:..\n", | |
| V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| nn.d4 = mad24(n.d4, qil, nn.d4); | |
| nn.d5 = mad24(n.d4, qih, nn.d4 >> 15); | |
| nn.d4 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.5: nn=..:%x:%x:%x:%x:%x:%x:..\n", | |
| V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| #ifdef CHECKS_MODBASECASE | |
| nn.d6 = nn.d5 >> 15; // PERF: not needed as it will be gone anyway after sub | |
| nn.d5 &= 0x7FFF; | |
| #endif | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.6: nn=..:..:%x:%x:%x:%x:%x:%x:..\n", | |
| V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1), V(nn.d0)); | |
| #endif | |
| // q = q - nn | |
| q.d1 = -nn.d0; | |
| q.d2 = SUB_COND(q.d2 - nn.d1, q.d1 > 0x7FFF); | |
| q.d3 = SUB_COND(q.d3 - nn.d2, q.d2 > 0x7FFF); | |
| q.d4 = SUB_COND(q.d4 - nn.d3, q.d3 > 0x7FFF); | |
| q.d5 = SUB_COND(q.d5 - nn.d4, q.d4 > 0x7FFF); | |
| q.d6 = SUB_COND(q.d6 - nn.d5, q.d5 > 0x7FFF); | |
| #ifdef CHECKS_MODBASECASE | |
| q.d7 = SUB_COND(q.d7 - nn.d6, q.d6 > 0x7FFF); // PERF: not needed: should be zero anyway | |
| q.d7 &= 0x7FFF; | |
| #endif | |
| q.d1 &= 0x7FFF; | |
| q.d2 &= 0x7FFF; | |
| q.d3 &= 0x7FFF; | |
| q.d4 &= 0x7FFF; | |
| q.d5 &= 0x7FFF; | |
| q.d6 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#3.7: q=..:%x:%x!%x:%x:%x:%x:%x:%x:..\n", | |
| V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3), V(q.d2), V(q.d1)); | |
| #endif | |
| MODBASECASE_NONZERO_ERROR(q.d7, 3, 7, 6); | |
| /********** Step 4, Offset 2^0 (0*15 + 0) **********/ | |
| qf= CONVERT_FLOAT_V(mad24(q.d6, 32768u, q.d5)); | |
| qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d4, 32768u, q.d3)); | |
| qf*= 32768.0f; | |
| qi=CONVERT_UINT_V(qf*nf); | |
| MODBASECASE_QI_ERROR(1<<22, 4, qi, 7); | |
| qil = qi & 0x7FFF; | |
| qih = (qi >> 15); | |
| res->d1 += qih; | |
| res->d0 = qil; | |
| // skip the last part - it will change the result by one at most - we can live with a result that is off by one | |
| // but need to handle outstanding carries instead | |
| res->d2 += res->d1 >> 15; | |
| res->d1 &= 0x7FFF; | |
| res->d3 += res->d2 >> 15; | |
| res->d2 &= 0x7FFF; | |
| res->d4 += res->d3 >> 15; | |
| res->d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"div_150_75#4: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:%x:%x\n", | |
| V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d4), V(res->d3), V(res->d2), V(res->d1), V(res->d0)); | |
| #endif | |
| } | |
| /**** | |
| * the trial factoring implementations for 5x15 bit | |
| * bit_max65 is bit_max - 65 | |
| ****/ | |
/* Trial-factoring core for factor candidates f of up to ~69 bits
 * (trace tag "cl_barrett15_69"), built on 5x15-bit limb arithmetic
 * (int75_v) with Barrett modular reduction.
 *
 * Computes a = 2^E (mod f) by left-to-right binary exponentiation over the
 * bits in `shifter`: each loop pass squares a into the 150-bit value b,
 * Barrett-reduces b back to 75 bits using the precomputed reciprocal u,
 * and doubles a when the current top bit of shifter is set. The final
 * residue is handed to mod_simple_even_75_and_check_big_factor75, which
 * reports any factor found via RES.
 *
 * Parameters:
 *   shifter   - exponent bit pattern, consumed MSB-first (shifter+=shifter
 *               shifts the next bit into position 31 each iteration)
 *   f         - factor candidate, five 15-bit limbs (d0 = lowest)
 *   tid       - thread id, used only by TRACE_KERNEL debug printf
 *   b_in      - upper six 15-bit limbs of the precomputed 150-bit start
 *               value; the low four limbs are zero (host preprocessing)
 *   bit_max65 - bit_max - 65, where bit_max is the bit-size class of f
 *   RES       - global result buffer written by the final factor check
 */
void check_barrett15_69(uint shifter, const int75_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
MODBASECASE_PAR_DEF)
{
__private int75_v a, u;
__private int150_v b, tmp150;
__private int75_v tmp75;
__private float_v ff;
__private uint bit_max_75=11-bit_max65, bit_max_60=bit_max65+4; //bit_max is 61 .. 70
__private uint tmp, bit_max75_mult = 1 << bit_max_75; /* used for bit shifting... */
__private int150_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5};
#if defined USE_DP
__private double_v ffd;
#endif
/*
ff = 1/f as float, needed in div_150_75() and the final mod_simple check.
Precalculated here since it is the same for all steps in the following loop */
ff= CONVERT_FLOAT_RTP_V(mad24(f.d4, 32768u, f.d3));
ff= ff * 32768.0f + CONVERT_FLOAT_RTP_V(f.d2); // f.d1 needed?
// 0x3f7ffffc is just below 1.0f, so ff slightly underestimates 1/f
// (keeps the quotient estimates from overshooting)
ff= as_float(0x3f7ffffc) / ff;
tmp = 1 << bit_max_60; // tmp150 = 2^(74 + bits in f)
#if defined USE_DP
// ffd = 1/f as double, needed in div_150_75_d().
ffd = CONVERT_DOUBLE_RTP_V(mad24(f.d4, 32768u, f.d3));
ffd = ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d2, 32768u, f.d1));
// this is at least 45 bits - f.d0 is not needed
ffd = ffd * 32768.0;
ffd = as_double(0x3feffffffffffffdL) / ffd; // should be a bit less than 1.0
div_150_75_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
// PERF: as div is only used here, use all those zeros directly in there
// here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
// tmp contains the upper part (15 bits) of a 150-bit value. The lower 135 bits are all zero implicitly
div_150_75(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR
); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: u=%x:%x:%x:%x:%x, ff=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif
#if (TRACE_KERNEL > 11)
mul_75_150(&tmp150, f, u); // verify division: tmp150 should be 1
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: f=%x:%x:%x:%x:%x * u=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x\n",
V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4), V(tmp150.d3), V(tmp150.d2), V(tmp150.d1), V(tmp150.d0));
#endif
// limb-wise right shift of bb by bit_max bits: take bit_max_75 low bits of
// the limb above (via the multiply) and bit_max_60 high bits of this limb
a.d0 = mad24(bb.d5, bit_max75_mult, (bb.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(bb.d6, bit_max75_mult, (bb.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(bb.d7, bit_max75_mult, (bb.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(bb.d8, bit_max75_mult, (bb.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(bb.d9, bit_max75_mult, (bb.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = bb - tmp75 (mod 2^75) with limb-wise borrow propagation; each
// SUB_COND subtracts the borrow when the condition signals an underflow.
// bb.d0-bb.d3 are 0 due to preprocessing on the host, thus always require a borrow
a.d0 = (-tmp75.d0) & 0x7FFF;
a.d1 = SUB_COND(-tmp75.d1, a.d0 > 0);
a.d2 = SUB_COND(-tmp75.d2, a.d1 > 0x7FFF);
a.d3 = SUB_COND(-tmp75.d3, a.d2 > 0x7FFF);
a.d4 = SUB_COND(bb.d4-tmp75.d4, a.d3 > 0x7FFF) & 0x7FFF;
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
// main exponentiation loop: one squaring + Barrett reduction per exponent bit
while(shifter)
{
square_75_150(&b, a); // b = a^2
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
a.d0 = mad24(b.d5, bit_max75_mult, (b.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(b.d6, bit_max75_mult, (b.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(b.d7, bit_max75_mult, (b.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(b.d8, bit_max75_mult, (b.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(b.d9, bit_max75_mult, (b.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = b - tmp75 (low 75 bits); AS_UINT_V(cond) contributes the borrow
// when the lower limb's subtraction wrapped (result > original limb)
a.d0 = (b.d0 - tmp75.d0) & 0x7FFF;
a.d1 = (b.d1 - tmp75.d1 + AS_UINT_V((a.d0 > b.d0) ));
a.d2 = (b.d2 - tmp75.d2 + AS_UINT_V((a.d1 > b.d1) ));
a.d3 = (b.d3 - tmp75.d3 + AS_UINT_V((a.d2 > b.d2) ));
a.d4 = (b.d4 - tmp75.d4 + AS_UINT_V((a.d3 > b.d3) ));
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
a.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
if(shifter&0x80000000)shl_75(&a); // "optional multiply by 2" in Prime 95 documentation
#ifdef CHECKS_MODBASECASE
// a.d4 must not exceed 0x7fff, otherwise the following squaring may overflow
#endif
shifter+=shifter; // shift the next exponent bit into the MSB
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, a= %x:%x:%x:%x:%x\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
}
// final exact reduction of a mod f plus factor check; result goes to RES
// (the "_even_" variant's exact contract is defined elsewhere — see callee)
mod_simple_even_75_and_check_big_factor75(a, f, ff, RES
#ifdef CHECKS_MODBASECASE
, bit_max_75, 10, modbasecase_debug
#endif
);
}
/* Trial-factoring core for factor candidates f of up to ~70 bits
 * (trace tag "cl_barrett15_70"), 5x15-bit limb arithmetic (int75_v)
 * with Barrett modular reduction.
 *
 * Same structure as check_barrett15_69: computes a = 2^E (mod f) by
 * MSB-first binary exponentiation (square into 150-bit b, Barrett-reduce
 * via the precomputed reciprocal u, optional doubling of a), then calls
 * mod_simple_75_and_check_big_factor75 for the final exact check.
 *
 * Parameters:
 *   shifter   - exponent bit pattern, consumed MSB-first
 *   f         - factor candidate, five 15-bit limbs (d0 = lowest)
 *   tid       - thread id, used only by TRACE_KERNEL debug printf
 *   b_in      - upper six 15-bit limbs of the 150-bit start value;
 *               low four limbs are zero (host preprocessing)
 *   bit_max65 - bit_max - 65, where bit_max is the bit-size class of f
 *   RES       - global result buffer written by the final factor check
 */
void check_barrett15_70(uint shifter, const int75_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
MODBASECASE_PAR_DEF)
{
__private int75_v a, u;
__private int150_v b, tmp150;
__private int75_v tmp75;
__private float_v ff;
__private uint bit_max_75=11-bit_max65, bit_max_60=bit_max65+4; //bit_max is 61 .. 70
__private uint tmp, bit_max75_mult = 1 << bit_max_75; /* used for bit shifting... */
__private int150_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5};
#if defined USE_DP
__private double_v ffd;
#endif
/*
ff = 1/f as float, needed in div_150_75() and the final mod_simple check.
Precalculated here since it is the same for all steps in the following loop */
ff= CONVERT_FLOAT_RTP_V(mad24(f.d4, 32768u, f.d3));
ff= ff * 32768.0f + CONVERT_FLOAT_RTP_V(f.d2); // f.d1 needed?
// 0x3f7ffffc is just below 1.0f: ff slightly underestimates 1/f
ff= as_float(0x3f7ffffc) / ff;
tmp = 1 << bit_max_60; // tmp150 = 2^(74 + bits in f)
#if defined USE_DP
// ffd = 1/f as double, needed in div_150_75_d().
ffd = CONVERT_DOUBLE_RTP_V(mad24(f.d4, 32768u, f.d3));
ffd = ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d2, 32768u, f.d1));
// this is at least 45 bits - f.d0 is not needed
ffd = ffd * 32768.0;
ffd = as_double(0x3feffffffffffffdL) / ffd; // should be a bit less than 1.0
div_150_75_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
// PERF: as div is only used here, use all those zeros directly in there
// here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
// tmp contains the upper part (15 bits) of a 150-bit value. The lower 135 bits are all zero implicitly
div_150_75(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR
); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: u=%x:%x:%x:%x:%x, ff=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif
// limb-wise right shift of bb by bit_max bits (see mad24 pattern)
a.d0 = mad24(bb.d5, bit_max75_mult, (bb.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(bb.d6, bit_max75_mult, (bb.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(bb.d7, bit_max75_mult, (bb.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(bb.d8, bit_max75_mult, (bb.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(bb.d9, bit_max75_mult, (bb.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = bb - tmp75 (mod 2^75); AS_UINT_V(cond) adds all-ones (i.e. -1)
// to propagate the borrow from the limb below
// bb.d0-bb.d3 are 0 due to preprocessing on the host, thus always require a borrow
a.d0 = (-tmp75.d0) & 0x7FFF;
a.d1 = (-tmp75.d1 + AS_UINT_V((a.d0 > 0) ));
a.d2 = (-tmp75.d2 + AS_UINT_V((a.d1 > 0x7FFF) ));
a.d3 = (-tmp75.d3 + AS_UINT_V((a.d2 > 0x7FFF) ));
a.d4 = (bb.d4-tmp75.d4 + AS_UINT_V((a.d3 > 0x7FFF) )) & 0x7FFF;
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
// main exponentiation loop: one squaring + Barrett reduction per exponent bit
while(shifter)
{
square_75_150(&b, a); // b = a^2
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
a.d0 = mad24(b.d5, bit_max75_mult, (b.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(b.d6, bit_max75_mult, (b.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(b.d7, bit_max75_mult, (b.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(b.d8, bit_max75_mult, (b.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(b.d9, bit_max75_mult, (b.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = b - tmp75 (low 75 bits) with limb-wise borrow propagation
a.d0 = (b.d0 - tmp75.d0) & 0x7FFF;
a.d1 = (b.d1 - tmp75.d1 + AS_UINT_V((a.d0 > b.d0) ));
a.d2 = (b.d2 - tmp75.d2 + AS_UINT_V((a.d1 > b.d1) ));
a.d3 = (b.d3 - tmp75.d3 + AS_UINT_V((a.d2 > b.d2) ));
a.d4 = (b.d4 - tmp75.d4 + AS_UINT_V((a.d3 > b.d3) ));
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
a.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
if(shifter&0x80000000)shl_75(&a); // "optional multiply by 2" in Prime 95 documentation
#ifdef CHECKS_MODBASECASE
// a.d4 must not exceed 0x7fff, otherwise the following squaring may overflow
#endif
shifter+=shifter; // shift the next exponent bit into the MSB
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, a= %x:%x:%x:%x:%x\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
}
// final exact reduction of a mod f plus factor check; result goes to RES
mod_simple_75_and_check_big_factor75(a, f, ff, RES
#ifdef CHECKS_MODBASECASE
, bit_max_75, 10, modbasecase_debug
#endif
);
}
/* Trial-factoring core for factor candidates f of up to ~71 bits
 * (trace tag "cl_barrett15_71"), 5x15-bit limb arithmetic (int75_v)
 * with Barrett modular reduction.
 *
 * Variant of check_barrett15_69/_70: the "optional multiply by 2" is done
 * on the full 150-bit square b (shl_150, before the Barrett reduction)
 * instead of on the reduced 75-bit a (shl_75, after it) — presumably to
 * keep a within the range the squaring can handle at this larger factor
 * size; TODO confirm against the kernel-selection logic on the host.
 *
 * Parameters:
 *   shifter   - exponent bit pattern, consumed MSB-first
 *   f         - factor candidate, five 15-bit limbs (d0 = lowest)
 *   tid       - thread id, used only by TRACE_KERNEL debug printf
 *   b_in      - upper six 15-bit limbs of the 150-bit start value;
 *               low four limbs are zero (host preprocessing)
 *   bit_max65 - bit_max - 65, where bit_max is the bit-size class of f
 *   RES       - global result buffer written by the final factor check
 */
void check_barrett15_71(uint shifter, const int75_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
MODBASECASE_PAR_DEF)
{
__private int75_v a, u;
__private int150_v b, tmp150;
__private int75_v tmp75;
__private float_v ff;
__private uint bit_max_75=11-bit_max65, bit_max_60=bit_max65+4; //bit_max is 61 .. 70
__private uint tmp, bit_max75_mult = 1 << bit_max_75; /* used for bit shifting... */
__private int150_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5};
#if defined USE_DP
__private double_v ffd;
#endif
/*
ff = 1/f as float, needed in div_150_75() and the final mod_simple check.
*/
ff= CONVERT_FLOAT_RTP_V(mad24(f.d4, 32768u, f.d3));
ff= ff * 32768.0f + CONVERT_FLOAT_RTP_V(f.d2); // f.d1 needed?
// 0x3f7ffffc is just below 1.0f: ff slightly underestimates 1/f
ff= as_float(0x3f7ffffc) / ff;
tmp = 1 << bit_max_60; // tmp150 = 2^(74 + bits in f)
#if defined USE_DP
// ffd = 1/f as double, needed in div_150_75_d().
ffd = CONVERT_DOUBLE_RTP_V(mad24(f.d4, 32768u, f.d3));
ffd = ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d2, 32768u, f.d1));
// this is at least 45 bits - f.d0 is not needed
ffd = ffd * 32768.0;
ffd = as_double(0x3feffffffffffffdL) / ffd; // should be a bit less than 1.0
div_150_75_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
// PERF: as div is only used here, use all those zeros directly in there
// here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
// tmp contains the upper part (15 bits) of a 150-bit value. The lower 135 bits are all zero implicitly
div_150_75(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR
); // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: u=%x:%x:%x:%x:%x, ff=%G\n",
V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif
// limb-wise right shift of bb by bit_max bits (see mad24 pattern)
a.d0 = mad24(bb.d5, bit_max75_mult, (bb.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(bb.d6, bit_max75_mult, (bb.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(bb.d7, bit_max75_mult, (bb.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(bb.d8, bit_max75_mult, (bb.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(bb.d9, bit_max75_mult, (bb.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = bb - tmp75 (mod 2^75); AS_UINT_V(cond) adds all-ones (i.e. -1)
// to propagate the borrow from the limb below
// bb.d0-bb.d3 are 0 due to preprocessing on the host, thus always require a borrow
a.d0 = (-tmp75.d0) & 0x7FFF;
a.d1 = (-tmp75.d1 + AS_UINT_V((a.d0 > 0) ));
a.d2 = (-tmp75.d2 + AS_UINT_V((a.d1 > 0x7FFF) ));
a.d3 = (-tmp75.d3 + AS_UINT_V((a.d2 > 0x7FFF) ));
a.d4 = (bb.d4-tmp75.d4 + AS_UINT_V((a.d3 > 0x7FFF) )) & 0x7FFF;
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
// main exponentiation loop: one squaring + Barrett reduction per exponent bit
while(shifter)
{
square_75_150(&b, a); // b = a^2
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
// in this variant the doubling happens on the 150-bit square, before reduction
if(shifter&0x80000000)
{
shl_150(&b); // "optional multiply by 2" in Prime 95 documentation
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
}
a.d0 = mad24(b.d5, bit_max75_mult, (b.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(b.d6, bit_max75_mult, (b.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(b.d7, bit_max75_mult, (b.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(b.d8, bit_max75_mult, (b.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(b.d9, bit_max75_mult, (b.d8 >> bit_max_60)); // a = b / (2^bit_max)
mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x...\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif
a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max)
mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
// a = b - tmp75 (low 75 bits) with limb-wise borrow propagation
a.d0 = (b.d0 - tmp75.d0) & 0x7FFF;
a.d1 = (b.d1 - tmp75.d1 + AS_UINT_V((a.d0 > b.d0) ));
a.d2 = (b.d2 - tmp75.d2 + AS_UINT_V((a.d1 > b.d1) ));
a.d3 = (b.d3 - tmp75.d3 + AS_UINT_V((a.d2 > b.d2) ));
a.d4 = (b.d4 - tmp75.d4 + AS_UINT_V((a.d3 > b.d3) ));
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
a.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
shifter+=shifter; // shift the next exponent bit into the MSB
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, a= %x:%x:%x:%x:%x\n",
shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
}
// final exact reduction of a mod f plus factor check; result goes to RES
mod_simple_75_and_check_big_factor75(a, f, ff, RES
#ifdef CHECKS_MODBASECASE
, bit_max_75, 10, modbasecase_debug
#endif
);
}
| void check_barrett15_73(uint shifter, const int75_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES | |
| MODBASECASE_PAR_DEF) | |
| { | |
| __private int75_v a, u; | |
| __private int150_v b, tmp150; | |
| __private int75_v tmp75; | |
| __private float_v ff; | |
| __private uint bit_max_75=11-bit_max65, bit_max_60=bit_max65+4; //bit_max is 61 .. 70 | |
| __private uint tmp, bit_max75_mult = 1 << bit_max_75; /* used for bit shifting... */ | |
| __private int150_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5}; | |
| #if defined USE_DP | |
| __private double_v ffd; | |
| #endif | |
| /* | |
| ff = 1/f as float, needed in div_192_96(). | |
| */ | |
| ff= CONVERT_FLOAT_RTP_V(mad24(f.d4, 32768u, f.d3)); | |
| ff= ff * 32768.0f + CONVERT_FLOAT_RTP_V(f.d2); // these are at least 30 significant bits for 60-bit FC's | |
| ff= as_float(0x3f7ffffc) / ff; | |
| tmp = 1 << bit_max_60; // tmp150 = 2^(74 + bits in f) | |
| #if defined USE_DP | |
| // ffd = f as double, needed in div_180_90_d). | |
| ffd = CONVERT_DOUBLE_RTP_V(mad24(f.d4, 32768u, f.d3)); | |
| ffd = ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d2, 32768u, f.d1)); | |
| // this is at least 45 bits - f.d0 is not needed | |
| ffd = ffd * 32768.0; | |
| ffd = as_double(0x3feffffffffffffdL) / ffd; // should be a bit less than 1.0 | |
| div_150_75_d(&u, tmp, f, ffd | |
| #if (TRACE_KERNEL > 1) | |
| , tid | |
| #endif | |
| MODBASECASE_PAR); // u = floor(tmp180 / f) | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n", | |
| V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd)); | |
| #endif | |
| #else | |
| // PERF: as div is only used here, use all those zeros directly in there | |
| // here, no vectorized data is necessary yet: the precalculated "b" value is the same for all | |
| // tmp contains the upper part (15 bits) of a 150-bit value. The lower 135 bits are all zero implicitely | |
| div_150_75(&u, tmp, f, ff | |
| #if (TRACE_KERNEL > 1) | |
| , tid | |
| #endif | |
| MODBASECASE_PAR | |
| ); // u = floor(tmp150 / f) | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: u=%x:%x:%x:%x:%x, ff=%G\n", | |
| V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff)); | |
| #endif | |
| #endif | |
| a.d0 = mad24(bb.d5, bit_max75_mult, (bb.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d1 = mad24(bb.d6, bit_max75_mult, (bb.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d2 = mad24(bb.d7, bit_max75_mult, (bb.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d3 = mad24(bb.d8, bit_max75_mult, (bb.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d4 = mad24(bb.d9, bit_max75_mult, (bb.d8 >> bit_max_60)); // a = b / (2^bit_max) | |
| mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;) | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:...\n", | |
| V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), | |
| V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4)); | |
| #endif | |
| a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n", | |
| V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0)); | |
| #endif | |
| // all those bb's are 0 due to preprocessing on the host, thus always require a borrow | |
| a.d0 = (-tmp75.d0) & 0x7FFF; | |
| a.d1 = SUB_COND(-tmp75.d1, a.d0 > 0); | |
| a.d2 = SUB_COND(-tmp75.d2, a.d1 > 0x7FFF); | |
| a.d3 = SUB_COND(-tmp75.d3, a.d2 > 0x7FFF); | |
| a.d4 = SUB_COND(bb.d4 - tmp75.d4, a.d3 > 0x7FFF) & 0x7FFF; | |
| a.d1 &= 0x7FFF; | |
| a.d2 &= 0x7FFF; | |
| a.d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n", | |
| bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0)); | |
| #endif | |
| for(;;) | |
| { | |
| square_75_150(&b, a); // b = a^2 | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n", | |
| shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), | |
| V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) ); | |
| #endif | |
| #if (TRACE_KERNEL > 14) | |
| // verify squaring by dividing again. | |
| __private float_v f1 = CONVERT_FLOAT_RTP_V(mad24(a.d4, 32768u, a.d3)); | |
| f1= f1 * 32768.0f + CONVERT_FLOAT_RTP_V(a.d2); // f.d1 needed? | |
| f1= as_float(0x3f7ffffc) / f1; | |
| div_150_75(&tmp75, V(b.d9), a, f1, tid | |
| MODBASECASE_PAR | |
| ); | |
| if (tid==TRACE_TID) printf((__constant char *)"vrfy: b = %x:0:0:0:0:0:0:0:0:0 / a=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x\n", | |
| V(b.d9), | |
| V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0)); | |
| #endif | |
| a.d0 = mad24(b.d5, bit_max75_mult, (b.d4 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d1 = mad24(b.d6, bit_max75_mult, (b.d5 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d2 = mad24(b.d7, bit_max75_mult, (b.d6 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d3 = mad24(b.d8, bit_max75_mult, (b.d7 >> bit_max_60))&0x7FFF; // a = b / (2^bit_max) | |
| a.d4 = mad24(b.d9, bit_max75_mult, (b.d8 >> bit_max_60)); // a = b / (2^bit_max) | |
| mul_75_150_no_low5(&tmp150, a, u); // tmp150 = (b / (2^bit_max)) * u # at least close to ;) | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:...\n", | |
| V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), | |
| V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4)); | |
| #endif | |
| a.d0 = tmp150.d5; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d1 = tmp150.d6; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d2 = tmp150.d7; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d3 = tmp150.d8; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| a.d4 = tmp150.d9; // a = ((b / (2^bit_max)) * u) / (2^bit_max) | |
| mul_75(&tmp75, a, f); // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n", | |
| V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0)); | |
| #endif | |
| tmp75.d0 = (b.d0 - tmp75.d0) & 0x7FFF; | |
| tmp75.d1 = SUB_COND(b.d1 - tmp75.d1, tmp75.d0 > b.d0); | |
| tmp75.d2 = SUB_COND(b.d2 - tmp75.d2, tmp75.d1 > 0x7FFF); | |
| tmp75.d3 = SUB_COND(b.d3 - tmp75.d3, tmp75.d2 > 0x7FFF); | |
| tmp75.d4 = SUB_COND(b.d4 - tmp75.d4, tmp75.d3 > 0x7FFF); | |
| tmp75.d1 &= 0x7FFF; | |
| tmp75.d2 &= 0x7FFF; | |
| tmp75.d3 &= 0x7FFF; | |
| tmp75.d4 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 3) | |
| if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (tmp)\n", | |
| V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0)); | |
| #endif | |
| if (shifter & 0x80000000) shl_75(&tmp75); | |
| if (shifter == 0x80000000) break; | |
| shifter+=shifter; | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, tmp=%x:%x:%x:%x:%x mod f=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x (a)\n", | |
| shifter, V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0), | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) ); | |
| #endif | |
| #ifndef CHECKS_MODBASECASE | |
| mod_simple_75(&a, tmp75, f, ff | |
| #if (TRACE_KERNEL > 1) | |
| , tid | |
| #endif | |
| ); // adjustment, plain barrett returns N = AB mod M where N < 3M! | |
| #else | |
| int limit = 10; | |
| if(bit_max_75 == 2) limit = 12; | |
| if(bit_max_75 == 3) limit = 11; | |
| mod_simple_75(&a, tmp75, f, ff | |
| #if (TRACE_KERNEL > 1) | |
| , tid | |
| #endif | |
| , bit_max_75, limit, modbasecase_debug); | |
| #endif | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: tmp=%x:%x:%x:%x:%x mod f=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x (a)\n", | |
| V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0), | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) ); | |
| #endif | |
| } | |
| mod_simple_even_75_and_check_big_factor75(tmp75, f, ff, RES | |
| #ifdef CHECKS_MODBASECASE | |
| , bit_max_75, 10, modbasecase_debug | |
| #endif | |
| ); | |
| } | |
void check_barrett15_74(uint shifter, const int75_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
                        MODBASECASE_PAR_DEF)
/* Barrett trial-factoring core for factor candidates (FCs) of up to 74 bits,
   stored as 5x15-bit limbs. Based on the 73-bit variant, but the top word of
   the working value keeps one extra bit (see the 0xFFFF / 0x1FFFF masks below).

   shifter   - the exponent, left-aligned so bit 31 drives the square-and-shift
               steps; the loop terminates when only the sentinel 0x80000000 remains.
   f         - factor candidate(s), 5x15-bit limbs (vectorized).
   tid       - thread index, used for trace output only.
   b_in      - host-precomputed start value; becomes the upper limbs of bb.
   bit_max65 - (size of FC in bits) offset, used to derive the shift amounts
               bit_max_75 / bit_max_60 below.
   RES       - global result buffer; a found factor is reported into it by
               mod_simple_even_75_and_check_big_factor75_big() at the end. */
{
  __private int75_v a, u;
  __private int150_v b, tmp150;
  __private int75_v tmp75;
  __private float_v ff;
  __private uint bit_max_75=11-bit_max65, bit_max_60=bit_max65+4; //bit_max is 61 .. 70
  __private uint tmp, bit_max75_mult = 1 << bit_max_75; /* used for bit shifting... */
  // bb = b_in shifted up by 4 limbs (60 bits); the low limbs are zero by host-side preprocessing
  __private int150_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5};
#if defined USE_DP
  __private double_v ffd;
#endif

  // this kernel is based on the 73-bit kernel but stores one more bit in the top word, allowing to factor up to 74 bits.
  /*
  ff = 1/f as float, needed in div_192_96().
  */
  ff= CONVERT_FLOAT_RTP_V(mad24(f.d4, 32768u, f.d3));
  ff= ff * 32768.0f + CONVERT_FLOAT_RTP_V(f.d2);   // these are at least 30 significant bits for 60-bit FC's
  ff= as_float(0x3f7ffffc) / ff;   // slightly below 1.0/ff so the float division never overshoots

  tmp = 1 << bit_max_60; // tmp150 = 2^(74 + bits in f)

#if defined USE_DP
  // ffd = f as double, needed in div_180_90_d).
  ffd = CONVERT_DOUBLE_RTP_V(mad24(f.d4, 32768u, f.d3));
  ffd = ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d2, 32768u, f.d1));
  // this is at least 45 bits - f.d0 is not needed
  ffd = ffd * 32768.0;
  ffd = as_double(0x3feffffffffffffdL) / ffd;  // should be a bit less than 1.0

  div_150_75_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
               , tid
#endif
               MODBASECASE_PAR);   // u = floor(tmp180 / f)
#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
      V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
  // PERF: as div is only used here, use all those zeros directly in there
  // here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
  // tmp contains the upper part (15 bits) of a 150-bit value. The lower 135 bits are all zero implicitly
  div_150_75(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
             , tid
#endif
             MODBASECASE_PAR
             );   // u = floor(tmp150 / f)
#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: u=%x:%x:%x:%x:%x, ff=%G\n",
      V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif
#if (TRACE_KERNEL > 10)
  // verify u
  mul_75_150(&tmp150, u, f);
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: vrfy: f*u=%x:%x:%x:%x:%x:%x:%x:%x:%x:%x\n",
      V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5),
      V(tmp150.d4), V(tmp150.d3), V(tmp150.d2), V(tmp150.d1), V(tmp150.d0));
#endif

  // initial reduction of bb: shift right by bit_max, then one Barrett step
  a.d0 = mad24(bb.d5, bit_max75_mult, (bb.d4 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
  a.d1 = mad24(bb.d6, bit_max75_mult, (bb.d5 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
  a.d2 = mad24(bb.d7, bit_max75_mult, (bb.d6 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
  a.d3 = mad24(bb.d8, bit_max75_mult, (bb.d7 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
  a.d4 = mad24(bb.d9, bit_max75_mult, (bb.d8 >> bit_max_60));          // a = b / (2^bit_max)

  mul_75_150_no_low5(&tmp150, a, u);   // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:...\n",
      V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
      V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif

  a.d0 = tmp150.d5;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
  a.d1 = tmp150.d6;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
  a.d2 = tmp150.d7;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
  a.d3 = tmp150.d8;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
  a.d4 = tmp150.d9;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)

  mul_75_big(&tmp75, a, f);   // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
      V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif

  // a = bb - tmp75 (borrow-chained); SUB_COND subtracts the borrow when the condition holds
  // all those bb's are 0 due to preprocessing on the host, thus always require a borrow
  a.d0 = (-tmp75.d0) & 0x7FFF;
  a.d1 = SUB_COND(-tmp75.d1, a.d0 > 0);
  a.d2 = SUB_COND(-tmp75.d2, a.d1 > 0x7FFF);
  a.d3 = SUB_COND(-tmp75.d3, a.d2 > 0x7FFF);
  a.d4 = SUB_COND(mad24(bb.d5, 32768u, bb.d4) - tmp75.d4, a.d3 > 0x7FFF) & 0xFFFF;  // keep one extra bit
  a.d1 &= 0x7FFF;
  a.d2 &= 0x7FFF;
  a.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (a)\n",
      mad24(bb.d5, 32768u, bb.d4), bb.d3, bb.d2, bb.d1, bb.d0, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif

  // main square-and-shift loop: square a, Barrett-reduce mod f, and shift left
  // by one bit whenever the current exponent bit (bit 31 of shifter) is set
  for(;;)
  {
    square_75_150(&b, a);   // b = a^2
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
        shifter, V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
#if (TRACE_KERNEL > 14)
    // verify squaring by dividing again.
    __private float_v f1 = CONVERT_FLOAT_RTP_V(mad24(a.d4, 32768u, a.d3));
    f1= f1 * 32768.0f + CONVERT_FLOAT_RTP_V(a.d2);   // f.d1 needed?
    f1= as_float(0x3f7ffffc) / f1;
    div_150_75(&tmp75, V(b.d9), a, f1, tid
               MODBASECASE_PAR
               );
    if (tid==TRACE_TID) printf((__constant char *)"vrfy: b = %x:0:0:0:0:0:0:0:0:0 / a=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x\n",
        V(b.d9),
        V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
    // a = b >> bit_max (approximate quotient input for the Barrett step)
    a.d0 = mad24(b.d5, bit_max75_mult, (b.d4 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
    a.d1 = mad24(b.d6, bit_max75_mult, (b.d5 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
    a.d2 = mad24(b.d7, bit_max75_mult, (b.d6 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
    a.d3 = mad24(b.d8, bit_max75_mult, (b.d7 >> bit_max_60))&0x7FFF;   // a = b / (2^bit_max)
    a.d4 = mad24(b.d9, bit_max75_mult, (b.d8 >> bit_max_60));          // a = b / (2^bit_max)

    mul_75_150_no_low5(&tmp150, a, u);   // tmp150 = (b / (2^bit_max)) * u # at least close to ;)
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:...\n",
        V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(tmp150.d9), V(tmp150.d8), V(tmp150.d7), V(tmp150.d6), V(tmp150.d5), V(tmp150.d4));
#endif

    a.d0 = tmp150.d5;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
    a.d1 = tmp150.d6;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
    a.d2 = tmp150.d7;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
    a.d3 = tmp150.d8;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)
    a.d4 = tmp150.d9;  // a = ((b / (2^bit_max)) * u) / (2^bit_max)

    mul_75_big(&tmp75, a, f);   // tmp75 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x (tmp)\n",
        V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif
    // tmp75 = b - tmp75 (the remainder candidate), borrow-chained over the limbs
    tmp75.d0 = (b.d0 - tmp75.d0) & 0x7FFF;
    tmp75.d1 = SUB_COND(b.d1 - tmp75.d1, tmp75.d0 > b.d0);
    tmp75.d2 = SUB_COND(b.d2 - tmp75.d2, tmp75.d1 > 0x7FFF);
    tmp75.d3 = SUB_COND(b.d3 - tmp75.d3, tmp75.d2 > 0x7FFF);
    tmp75.d4 = SUB_COND(mad24(b.d5, 32768u, b.d4) - tmp75.d4, tmp75.d3 > 0x7FFF);
    tmp75.d1 &= 0x7FFF;
    tmp75.d2 &= 0x7FFF;
    tmp75.d3 &= 0x7FFF;
    tmp75.d4 &= 0x1FFFF;   // keep 2 extra bits
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x (tmp)\n",
        mad24(V(b.d5), 32768u, V(b.d4)), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0));
#endif

    if (shifter & 0x80000000) shl_75(&tmp75);   // exponent bit set: multiply by 2
    if (shifter == 0x80000000) break;           // last bit consumed: leave before the mod adjustment
    shifter+=shifter;                           // advance to the next exponent bit

#ifndef CHECKS_MODBASECASE
    mod_simple_75_big(&a, tmp75, f, ff
#if (TRACE_KERNEL > 1)
                      , tid
#endif
                      );   // adjustment, plain barrett returns N = AB mod M where N < 3M!
#else
    int limit = 14;
    if(bit_max_75 == 2) limit = 16;   // bit_max == 65, due to decreased accuracy of mul_96_192_no_low2() above we need a higher threshold
    if(bit_max_75 == 3) limit = 15;   // bit_max == 66, ...
    mod_simple_75_big(&a, tmp75, f, ff
#if (TRACE_KERNEL > 1)
                      , tid
#endif
                      , bit_max_75, limit, modbasecase_debug);
#endif
#if (TRACE_KERNEL > 1)
    if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, tmp=%x:%x:%x:%x:%x mod f=%x:%x:%x:%x:%x = %x:%x:%x:%x:%x (a)\n",
        shifter, V(tmp75.d4), V(tmp75.d3), V(tmp75.d2), V(tmp75.d1), V(tmp75.d0),
        V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
  }

  // final reduction + factor test: writes to RES if f divides the Mersenne number
  mod_simple_even_75_and_check_big_factor75_big(tmp75, f, ff, RES
#ifdef CHECKS_MODBASECASE
                                                , bit_max_75, 10, modbasecase_debug
#endif
                                                );
}
| /****** | |
| * now the actual kernels for 5x15 bit calculations | |
| * | |
| * shiftcount is used for precomputing without mod | |
| * b_in is precomputed on host ONCE. | |
| ******/ | |
| __kernel void cl_barrett15_69(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int75_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC75(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69: f=%x:%x:%x:%x:%x, shift=%d\n", | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_69(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_70(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int75_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC75(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70: f=%x:%x:%x:%x:%x, shift=%d\n", | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_70(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_71(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int75_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC75(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71: f=%x:%x:%x:%x:%x, shift=%d\n", | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_71(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_73(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int75_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC75(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73: f=%x:%x:%x:%x:%x, shift=%d\n", | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_73(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_74(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int75_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC75(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74: f=%x:%x:%x:%x:%x, shift=%d\n", | |
| V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_74(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| /**************************************** | |
| **************************************** | |
| * 15-bit based 90-bit barrett-kernels | |
| * | |
| **************************************** | |
| ****************************************/ | |
int90_v sub_if_gte_90(const int90_v a, const int90_v b)
/* return (a>b)?a-b:a
   90-bit (6x15-bit limb) conditional subtraction. */
{
  int90_v tmp;
  /* do the subtraction and use tmp.d5 to decide if the result is valid (if a was > b) */
  /* NOTE(review): borrows are propagated by adding AS_UINT_V(cond), which is
     assumed to be an all-ones mask (-1) for a true comparison — confirm against
     the macro definition. The "tmp.dX > a.dX" tests detect wrap-around of the
     unmasked partial results. */
  tmp.d0 = (a.d0 - b.d0) & 0x7FFF;
  tmp.d1 = (a.d1 - b.d1 + AS_UINT_V((b.d0 > a.d0) ));
  tmp.d2 = (a.d2 - b.d2 + AS_UINT_V((tmp.d1 > a.d1) ));
  tmp.d3 = (a.d3 - b.d3 + AS_UINT_V((tmp.d2 > a.d2) ));
  tmp.d4 = (a.d4 - b.d4 + AS_UINT_V((tmp.d3 > a.d3) ));
  tmp.d5 = (a.d5 - b.d5 + AS_UINT_V((tmp.d4 > a.d4) ));
  tmp.d1 &= 0x7FFF;
  tmp.d2 &= 0x7FFF;
  tmp.d3 &= 0x7FFF;
  tmp.d4 &= 0x7FFF;
  /* the whole subtraction underflowed iff the top limb grew (tmp.d5 > a.d5):
     in that case keep a unchanged, per-component for vector types */
  tmp.d0 = (tmp.d5 > a.d5) ? a.d0 : tmp.d0;
  tmp.d1 = (tmp.d5 > a.d5) ? a.d1 : tmp.d1;
  tmp.d2 = (tmp.d5 > a.d5) ? a.d2 : tmp.d2;
  tmp.d3 = (tmp.d5 > a.d5) ? a.d3 : tmp.d3;
  tmp.d4 = (tmp.d5 > a.d5) ? a.d4 : tmp.d4;
  tmp.d5 = (tmp.d5 > a.d5) ? a.d5 : tmp.d5;   // & 0x7FFF not necessary as tmp.d5 is <= a.d5

  return tmp;
}
void mul_90(int90_v * const res, const int90_v a, const int90_v b)
/* res = a * b (low 90 bits of the product only; anything above limb d5 is
   dropped, and d5 itself may wrap — see the comments on the last mads)
   21 mul/mad24, 6 >>, 7 &=, 1 +
   Each limb holds 15 bits; partial products accumulate in 32-bit registers,
   with the carry (>> 15) pushed into the next limb before it can overflow. */
{
  res->d0 = mul24(a.d0, b.d0);

  res->d1 = mad24(a.d1, b.d0, res->d0 >> 15);
  res->d1 = mad24(a.d0, b.d1, res->d1);
  res->d0 &= 0x7FFF;

  res->d2 = mad24(a.d2, b.d0, res->d1 >> 15);
  res->d2 = mad24(a.d1, b.d1, res->d2);
  res->d2 = mad24(a.d0, b.d2, res->d2);
  res->d1 &= 0x7FFF;

  res->d3 = mad24(a.d3, b.d0, res->d2 >> 15);
  res->d2 &= 0x7FFF;
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);

  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  res->d3 &= 0x7FFF;
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);

  res->d5 = mad24(a.d5, b.d0, res->d4 >> 15);   // the 5th mad can overflow d4, need to handle carry before
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4);

  res->d5 += mad24(a.d4, b.d1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d5 = mad24(a.d3, b.d2, res->d5);
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);   // the 5th mad can overflow d5, but that's ok for this function.
  res->d5 = mad24(a.d0, b.d5, res->d5);   // the 6th mad can overflow d5, but that's ok for this function.
  res->d5 &= 0x7FFF;
}
void mul_90_180_no_low3(int180_v * const res, const int90_v a, const int90_v b)
/*
res ~= a * b (full 180-bit result except the lowest limbs)
res.d0 to res.d2 are NOT computed. Carries to res.d3 are ignored,
too. So the digits res.d{3-b} might differ from mul_90_180().
*/
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // this optimized mul 6x6 requires: 30 mul/mad24, 11 shift, 10 and, 3 add

  // d3 column is computed only so its carry can feed d4 (d3 itself is discarded)
  res->d3 = mul24(a.d3, b.d0);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);

  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  // res->d3 &= 0x7FFF;  // d3 itself is not used, only its carry to d4 is required
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d5, b.d0, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4);   // 31-bit at most
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15) + res->d5;
  res->d4 &= 0x7FFF;
  res->d5 = mad24(a.d3, b.d2, res->d5);
  // handle carry after 3 of 6 mad's for d5: pull in the first d6 line
  res->d6 = mad24(a.d1, b.d5, res->d5 >> 15);
  res->d5 &= 0x7FFF;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  res->d5 = mad24(a.d0, b.d5, res->d5);
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15) + res->d6;
  res->d5 &= 0x7FFF;
  res->d6 = mad24(a.d3, b.d3, res->d6);
  // handle carry after 3 of 5 mad's for d6: pull in the first d7 line
  res->d7 = mad24(a.d2, b.d5, res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d6 = mad24(a.d5, b.d1, res->d6);
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15) + res->d7;
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d7 = mad24(a.d5, b.d2, res->d7);   // in d7 we have 4 mad's, and 2 carries (both not full 17 bits)
  res->d6 &= 0x7FFF;

  res->d8 = mad24(a.d5, b.d3, res->d7 >> 15);
  res->d8 = mad24(a.d4, b.d4, res->d8);
  res->d8 = mad24(a.d3, b.d5, res->d8);
  res->d7 &= 0x7FFF;

  res->d9 = mad24(a.d5, b.d4, res->d8 >> 15);
  res->d9 = mad24(a.d4, b.d5, res->d9);
  res->d8 &= 0x7FFF;

  res->da = mad24(a.d5, b.d5, res->d9 >> 15);
  res->d9 &= 0x7FFF;

  res->db = res->da >> 15;
  res->da &= 0x7FFF;
}
void mul_90_180_no_low5(int180_v * const res, const int90_v a, const int90_v b)
/*
res ~= a * b (upper part of the 180-bit product)
res.d0 to res.d4 are NOT computed. res.d5 is computed only to provide carries to res.d6.
*/
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // this optimized mul 6x6 requires: 21 mul/mad24, 8 shift, 7 and, 2 add

  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mul24(a.d5, b.d0);
  res->d5 = mad24(a.d4, b.d1, res->d5);
  res->d5 = mad24(a.d3, b.d2, res->d5);
  // handle carry after 3 of 6 mad's for d5: pull in the first d6 line
  res->d6 = mad24(a.d1, b.d5, res->d5 >> 15);
  res->d5 &= 0x7FFF;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  res->d5 = mad24(a.d0, b.d5, res->d5);
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15) + res->d6;
  // res->d5 &= 0x7FFF;  // d5 is not part of the result, only its carry is needed
  res->d6 = mad24(a.d3, b.d3, res->d6);
  // handle carry after 3 of 5 mad's for d6: pull in the first d7 line
  res->d7 = mad24(a.d2, b.d5, res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d6 = mad24(a.d5, b.d1, res->d6);
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15) + res->d7;
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d7 = mad24(a.d5, b.d2, res->d7);   // in d7 we have 4 mad's, and 2 carries (both not full 17 bits)
  res->d6 &= 0x7FFF;

  res->d8 = mad24(a.d5, b.d3, res->d7 >> 15);
  res->d8 = mad24(a.d4, b.d4, res->d8);
  res->d8 = mad24(a.d3, b.d5, res->d8);
  res->d7 &= 0x7FFF;

  res->d9 = mad24(a.d5, b.d4, res->d8 >> 15);
  res->d9 = mad24(a.d4, b.d5, res->d9);
  res->d8 &= 0x7FFF;

  res->da = mad24(a.d5, b.d5, res->d9 >> 15);
  res->d9 &= 0x7FFF;

  res->db = res->da >> 15;
  res->da &= 0x7FFF;
}
void mul_90_180(int180_v * const res, const int90_v a, const int90_v b)
// res = a * b (full 180-bit product of two 90-bit values, 12x15-bit limbs)
{
  // this is the complete implementation, used in montgomery mul, and was the basis for
  // the _no_low3 and square functions
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // mul 6x6 requires: 36 mul/mad24, 14 shift, 14 and, 3 add
  res->d0 = mul24(a.d0, b.d0);

  res->d1 = mad24(a.d1, b.d0, res->d0 >> 15);
  res->d1 = mad24(a.d0, b.d1, res->d1);
  res->d0 &= 0x7FFF;

  res->d2 = mad24(a.d2, b.d0, res->d1 >> 15);
  res->d2 = mad24(a.d1, b.d1, res->d2);
  res->d2 = mad24(a.d0, b.d2, res->d2);
  res->d1 &= 0x7FFF;

  res->d3 = mad24(a.d3, b.d0, res->d2 >> 15);
  res->d3 = mad24(a.d2, b.d1, res->d3);
  res->d3 = mad24(a.d1, b.d2, res->d3);
  res->d3 = mad24(a.d0, b.d3, res->d3);
  res->d2 &= 0x7FFF;

  res->d4 = mad24(a.d4, b.d0, res->d3 >> 15);
  res->d3 &= 0x7FFF;
  res->d4 = mad24(a.d3, b.d1, res->d4);
  res->d4 = mad24(a.d2, b.d2, res->d4);
  res->d4 = mad24(a.d1, b.d3, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d5, b.d0, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d0, b.d4, res->d4);   // 31-bit at most
  res->d5 = mad24(a.d4, b.d1, res->d4 >> 15) + res->d5;
  res->d4 &= 0x7FFF;
  res->d5 = mad24(a.d3, b.d2, res->d5);
  // handle carry after 3 of 6 mad's for d5: pull in the first d6 line
  res->d6 = mad24(a.d1, b.d5, res->d5 >> 15);
  res->d5 &= 0x7FFF;
  res->d5 = mad24(a.d2, b.d3, res->d5);
  res->d5 = mad24(a.d1, b.d4, res->d5);
  res->d5 = mad24(a.d0, b.d5, res->d5);
  res->d6 = mad24(a.d2, b.d4, res->d5 >> 15) + res->d6;
  res->d5 &= 0x7FFF;
  res->d6 = mad24(a.d3, b.d3, res->d6);
  // handle carry after 3 of 5 mad's for d6: pull in the first d7 line
  res->d7 = mad24(a.d2, b.d5, res->d6 >> 15);
  res->d6 &= 0x7FFF;
  res->d6 = mad24(a.d4, b.d2, res->d6);
  res->d6 = mad24(a.d5, b.d1, res->d6);
  res->d7 = mad24(a.d3, b.d4, res->d6 >> 15) + res->d7;
  res->d7 = mad24(a.d4, b.d3, res->d7);
  res->d7 = mad24(a.d5, b.d2, res->d7);   // in d7 we have 4 mad's, and 2 carries (both not full 17 bits)
  res->d6 &= 0x7FFF;

  res->d8 = mad24(a.d5, b.d3, res->d7 >> 15);
  res->d8 = mad24(a.d4, b.d4, res->d8);
  res->d8 = mad24(a.d3, b.d5, res->d8);
  res->d7 &= 0x7FFF;

  res->d9 = mad24(a.d5, b.d4, res->d8 >> 15);
  res->d9 = mad24(a.d4, b.d5, res->d9);
  res->d8 &= 0x7FFF;

  res->da = mad24(a.d5, b.d5, res->d9 >> 15);
  res->d9 &= 0x7FFF;

  res->db = res->da >> 15;
  res->da &= 0x7FFF;
}
void square_90_180(int180_v * const res, const int90_v a)
/* res = a^2 = d0^2 + 2d0d1 + d1^2 + 2d0d2 + 2(d1d2 + d0d3) + d2^2 +
              2(d0d4 + d1d3) + 2(d1d4 + d2d3) + d3^2 + 2d2d4 + 2d3d4 + d4^2
   Specialized 90-bit squaring: cross products appear once, pre-doubled via
   "<< 1", halving the number of mul/mad24 compared to mul_90_180(). */
{
  // assume we have enough spare bits and can do all the carries at the very end:
  // 0x7FFF * 0x7FFF = 0x3FFF0001 = max result of mul24, up to 4 of these can be
  // added into 32-bit: 0x3FFF0001 * 4 = 0xFFFC0004, which even leaves room for
  // one (almost two) carry of 17 bit (32-bit >> 15)
  // square 6x6 requires: 21 mul/mad24, 29 shift (10 of them cacheable), 14 and, 3 add
  res->d0 = mul24(a.d0, a.d0);

  res->d1 = mad24(a.d1, a.d0 << 1, res->d0 >> 15);
  res->d0 &= 0x7FFF;

  res->d2 = mad24(a.d1, a.d1, res->d1 >> 15);
  res->d2 = mad24(a.d2, a.d0 << 1, res->d2);
  res->d1 &= 0x7FFF;

  res->d3 = mad24(a.d3, a.d0 << 1, res->d2 >> 15);
  res->d3 = mad24(a.d2, a.d1 << 1, res->d3);
  res->d2 &= 0x7FFF;

  res->d4 = mad24(a.d4, a.d0 << 1, res->d3 >> 15);
  res->d3 &= 0x7FFF;
  res->d4 = mad24(a.d3, a.d1 << 1, res->d4);
  // 5th mad24 can overflow d4, need to handle carry before: pull in the first d5 line
  res->d5 = mad24(a.d4, a.d1 << 1, res->d4 >> 15);
  res->d4 &= 0x7FFF;
  res->d4 = mad24(a.d2, a.d2, res->d4);   // 31-bit at most
  res->d5 = mad24(a.d3, a.d2 << 1, res->d4 >> 15) + res->d5;
  res->d4 &= 0x7FFF;

  res->d6 = mad24(a.d5, a.d1 << 1, res->d5 >> 15);   // d5 carry handling before overflowing
  res->d5 &= 0x7FFF;
  res->d5 = mad24(a.d5, a.d0 << 1, res->d5);

  res->d7 = mad24(a.d5, a.d2 << 1, res->d6 >> 15);   // d6 carry handling before overflowing
  res->d6 &= 0x7FFF;
  res->d6 = mad24(a.d4, a.d2 << 1, res->d5 >> 15) + res->d6;
  res->d6 = mad24(a.d3, a.d3, res->d6);
  res->d5 &= 0x7FFF;

  res->d7 = mad24(a.d4, a.d3 << 1, res->d6 >> 15) + res->d7;
  res->d6 &= 0x7FFF;

  res->d8 = mad24(a.d4, a.d4, res->d7 >> 15);
  res->d8 = mad24(a.d5, a.d3 << 1, res->d8);
  res->d7 &= 0x7FFF;

  res->d9 = mad24(a.d5, a.d4 << 1, res->d8 >> 15);
  res->d8 &= 0x7FFF;

  res->da = mad24(a.d5, a.d5, res->d9 >> 15);
  res->d9 &= 0x7FFF;

  res->db = res->da >> 15;
  res->da &= 0x7FFF;
}
| void shl_90(int90_v * const a) | |
| /* shiftleft a one bit */ | |
| { | |
| a->d5 = mad24(a->d5, 2u, a->d4 >> 14); // keep the extra top bit | |
| a->d4 = mad24(a->d4, 2u, a->d3 >> 14) & 0x7FFF; | |
| a->d3 = mad24(a->d3, 2u, a->d2 >> 14) & 0x7FFF; | |
| a->d2 = mad24(a->d2, 2u, a->d1 >> 14) & 0x7FFF; | |
| a->d1 = mad24(a->d1, 2u, a->d0 >> 14) & 0x7FFF; | |
| a->d0 = (a->d0 << 1u) & 0x7FFF; | |
| } | |
| void shl_180(int180_v * const a) | |
| /* shiftleft a one bit */ | |
| { | |
| a->db = mad24(a->db, 2u, a->da >> 14); // keep the extra top bit | |
| a->da = mad24(a->da, 2u, a->d9 >> 14) & 0x7FFF; | |
| a->d9 = mad24(a->d9, 2u, a->d8 >> 14) & 0x7FFF; | |
| a->d8 = mad24(a->d8, 2u, a->d7 >> 14) & 0x7FFF; | |
| a->d7 = mad24(a->d7, 2u, a->d6 >> 14) & 0x7FFF; | |
| a->d6 = mad24(a->d6, 2u, a->d5 >> 14) & 0x7FFF; | |
| a->d5 = mad24(a->d5, 2u, a->d4 >> 14) & 0x7FFF; | |
| a->d4 = mad24(a->d4, 2u, a->d3 >> 14) & 0x7FFF; | |
| a->d3 = mad24(a->d3, 2u, a->d2 >> 14) & 0x7FFF; | |
| a->d2 = mad24(a->d2, 2u, a->d1 >> 14) & 0x7FFF; | |
| a->d1 = mad24(a->d1, 2u, a->d0 >> 14) & 0x7FFF; | |
| a->d0 = (a->d0 << 1u) & 0x7FFF; | |
| } | |
| #if defined USE_DP | |
void div_180_90_d(int90_v * const res, const uint qhi, const int90_v n, const double_v nf
#if (TRACE_KERNEL > 1)
, const uint tid
#endif
MODBASECASE_PAR_DEF
)/* res = q / n (integer division)
during function entry, qhi contains the upper 30 bits of an 180-bit-value. The remaining bits are zero implicitely.
this is not a vector, as the first value is the same for all FCs*/
// do 2*45 bit reductions using double: should be sufficient for 90 bits (and 86 anyways)
{
// Long division in two ~45-bit quotient steps, each estimated by multiplying
// with the precomputed double reciprocal nf (~1/n), then verified by a
// multiply-back (nn = n * qi) and subtraction to form the remainder.
// All multiprecision values use 15-bit limbs masked with 0x7FFF.
__private double_v qf;
__private double qf_1; // for the first conversion which does not need vectors yet
__private ulong_v qi;
__private uint_v qil, qim, qih;
__private int180_v nn, q;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#0: q=%x:<150x0>, n=%x:%x:%x:%x:%x:%x, nf=%#G\n",
qhi, V(n.d5), V(n.d4), V(n.d3), V(n.d2), V(n.d1), V(n.d0), V(nf));
#endif
/********** Step 1, Offset 2^67 (4*15 + 7) **********/
// qf_1 = convert_float(qhi) * 4294967296.0f; // no vector yet, saving a few conversions!
// qf_1 = qf_1 * 32768.0f * 64.0f;
// qf_1 = convert_float(qhi) * 9007199254740992.0f; // no vector yet, saving a few conversions! 9007199254740992=4294967296*32768*64, which the compiler does not combine automatically
// 40564819207303340847894502572032.0 = 2^105: scales qhi (top 30 of 180 bits)
// down to the step-1 quotient window before the reciprocal multiply
qf_1 = convert_double(qhi) * 40564819207303340847894502572032.0;
qi=CONVERT_ULONG_V(qf_1*nf); // vectorize just here
MODBASECASE_QI_ERROR(1L<<46, 1, qi, 0); // qi here is about 45 bits
// split the ~45-bit quotient estimate into three 15-bit limbs; they are both
// stored in the result (d5:d4:d3) and kept for the multiply-back below
qih = res->d5 = CONVERT_UINT_V(qi >> 30); // PERF: amd_bitalign ?
qim = res->d4 = (CONVERT_UINT_V(qi) >> 15) & 0x7FFF;
qil = res->d3 = CONVERT_UINT_V(qi ) & 0x7FFF;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1: qf=%#G, nf=%#G, *=%#G, qi=%lld=0x%llx, res=%x:%x:%x:..:..:..\n",
qf_1, V(nf), qf_1*V(nf), V(qi), V(qi), V(res->d5), V(res->d4), V(res->d3));
#endif
/*******************************************************/
// nn = n * qi
// schoolbook multiply of the 6-limb n by the 3-limb qi, starting at limb d3
// (the quotient is offset by 2^45); carries are propagated row by row
nn.d3 = mul24(n.d0, qil);
nn.d4 = mad24(n.d0, qim, nn.d3 >> 15);
nn.d4 = mad24(n.d1, qil, nn.d4);
nn.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.1: nn=..:..:..:..:..:..:%x:%x:..:..:..\n",
V(nn.d4), V(nn.d3));
#endif
nn.d5 = mad24(n.d0, qih, nn.d4 >> 15);
nn.d5 = mad24(n.d1, qim, nn.d5);
nn.d5 = mad24(n.d2, qil, nn.d5);
nn.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.2: nn=..:..:..:..:..:%x:%x:%x:...\n",
V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d6 = mad24(n.d1, qih, nn.d5 >> 15);
nn.d6 = mad24(n.d2, qim, nn.d6);
nn.d6 = mad24(n.d3, qil, nn.d6);
nn.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.3: nn=..:..:..:..:%x:%x:%x:%x:...\n",
V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d7 = mad24(n.d2, qih, nn.d6 >> 15);
nn.d7 = mad24(n.d3, qim, nn.d7);
nn.d7 = mad24(n.d4, qil, nn.d7);
nn.d6 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.4: nn=..:..:..:%x:%x:%x:%x:%x:...\n",
V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d8 = mad24(n.d3, qih, nn.d7 >> 15);
nn.d8 = mad24(n.d4, qim, nn.d8);
nn.d8 = mad24(n.d5, qil, nn.d8);
nn.d7 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.5: nn=..:..:%x:%x:%x:%x:%x:%x:...\n",
V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
// the top limbs d9/da of nn cancel against qhi in the subtraction below,
// so they are only computed when error checking or tracing needs them
#if defined CHECKS_MODBASECASE || (TRACE_KERNEL > 3)
nn.d9 = mad24(n.d4, qih, nn.d8 >> 15);
nn.d9 = mad24(n.d5, qim, nn.d9);
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.6: nn=..:%x:%x:%x:%x:%x:%x:%x:...\n",
V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.da = mad24(n.d5, qih, nn.d9 >> 15); // can be up to 30 bits, just as the input qhi
nn.d9 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.7: nn=..:%x:%x:%x:%x:%x:%x:%x:%x:...\n",
V(nn.da), V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
#endif
nn.d8 &= 0x7FFF;
// q = q - nn, but upon function entry, qhi contains all the bits for db:da. All bits below are zero.
// AS_UINT_V(x > y) is 0 or all-ones (-1) per lane, so adding it subtracts the borrow
q.d3 = (-nn.d3) & 0x7FFF;
q.d4 = (-nn.d4 + AS_UINT_V((nn.d3 > 0)));
q.d5 = (-nn.d5 + AS_UINT_V((q.d4 > 0)));
q.d6 = (-nn.d6 + AS_UINT_V((q.d5 > 0)));
q.d7 = (-nn.d7 + AS_UINT_V((q.d6 > 0)));
q.d8 = (-nn.d8 + AS_UINT_V((q.d7 > 0)));
#if defined CHECKS_MODBASECASE || (TRACE_KERNEL > 2)
q.d9 = (-nn.d9 + AS_UINT_V((q.d8 > 0)));
q.da = qhi - nn.da + AS_UINT_V((q.d9 > 0));
q.d9 &= 0x7FFF;
q.da &= 0x7FFF;
#endif
q.d4 &= 0x7FFF;
q.d5 &= 0x7FFF;
q.d6 &= 0x7FFF;
q.d7 &= 0x7FFF;
q.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#1.8: q=..:%x:%x!%x:%x:%x:%x:%x:%x:..:..\n",
V(q.da), V(q.d9), V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3));
#endif
MODBASECASE_NONZERO_ERROR(q.da, 2, 10, 1);
MODBASECASE_NONZERO_ERROR(q.d9, 2, 9, 2);
/********** Step 2, Offset 2^30 (2*15 + 0) **********/
// rebuild the remainder as a double from pairs of limbs (d8:d7, d6:d5, d4:d3),
// scale into the step-2 window (35184372088832 = 2^45), and estimate the
// remaining ~45 quotient bits the same way as in step 1
qf= CONVERT_DOUBLE_V(mad24(q.d8, 32768u, q.d7));
qf= qf * 1073741824.0f + CONVERT_DOUBLE_V(mad24(q.d6, 32768u, q.d5));
qf= qf * 1073741824.0f + CONVERT_DOUBLE_V(mad24(q.d4, 32768u, q.d3));
qf*= 35184372088832.0;
qi=CONVERT_ULONG_V(qf*nf);
MODBASECASE_QI_ERROR(1L<<46, 2, qi, 3);
res->d2 = CONVERT_UINT_V(qi >> 30);
res->d1 = (CONVERT_UINT_V(qi) >> 15) & 0x7FFF;
res->d0 = CONVERT_UINT_V(qi) & 0x7FFF;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90_d#2: qf=%#G, nf=%#G, *=%#G, qi=%lld=0x%llx, res=%x:%x:%x:%x:%x:%x\n",
V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d5), V(res->d4), V(res->d3), V(res->d2), V(res->d1), V(res->d0));
#endif
/*******************************************************/
// skip the last part - it will change the result by one at most - we can live with a result that is off by one
}
| #else | |
| // no support for doubles | |
// no support for doubles
void div_180_90(int90_v * const res, const uint qhi, const int90_v n, const float_v nf
#if (TRACE_KERNEL > 1)
, const uint tid
#endif
MODBASECASE_PAR_DEF
)/* res = q / n (integer division)
during function entry, qhi contains the upper 30 bits of an 180-bit-value. The remaining bits are zero implicitely.
this is not a vector, as the first value is the same for all FCs*/
// try with 4 * 23 bit reductions: should be sufficient for 90 bits (and 86 anyways)
{
// Float-only variant of div_180_90_d: the 90-bit quotient is produced in four
// ~23-bit steps because a float reciprocal estimate (nf) only carries ~23
// significant bits. Each step: estimate qi = q*nf, fold qi into the result
// limbs, multiply back (nn = n*qi, shifted to the step's bit offset),
// subtract to get the next remainder. Limbs are 15 bits, masked with 0x7FFF.
__private float_v qf;
__private float qf_1; // for the first conversion which does not need vectors yet
__private uint_v qi, qil, qih;
__private int180_v nn, q; // PERF: reduce register usage by always using nn.d0-nn.d6 instead of shifting ?
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#0: q=%x:<150x0>, n=%x:%x:%x:%x:%x:%x, nf=%#G\n",
qhi, V(n.d5), V(n.d4), V(n.d3), V(n.d2), V(n.d1), V(n.d0), V(nf));
#endif
/********** Step 1, Offset 2^67 (4*15 + 7) **********/
// qf_1 = convert_float(qhi) * 4294967296.0f; // no vector yet, saving a few conversions!
// qf_1 = qf_1 * 32768.0f * 64.0f;
qf_1 = convert_float(qhi) * 9007199254740992.0f; // no vector yet, saving a few conversions! 9007199254740992=4294967296*32768*64, which the compiler does not combine automatically
qi=CONVERT_UINT_V(qf_1*nf); // vectorize just here
MODBASECASE_QI_ERROR(1<<24, 1, qi, 0); // qi here is about 23 bits
// the ~23-bit estimate lands at bit offset 67 = 4*15+7, i.e. split across
// result limbs d5 and d4 with a 7-bit shift
res->d5 = (qi >> 8);
res->d4 = (qi << 7) & 0x7FFF;
qil = qi & 0x7FFF;
qih = (qi >> 15);
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:..:..:..:..\n",
qf_1, V(nf), qf_1*V(nf), V(qi), V(qi), V(res->d5), V(res->d4));
#endif
/*******************************************************/
// nn = n * qi
// schoolbook multiply of 6-limb n by 2-limb qi at limb offset 4 (2^60);
// the extra *2^7 of the offset is applied by the shift-left block afterwards
nn.d4 = mul24(n.d0, qil);
nn.d5 = mad24(n.d0, qih, nn.d4 >> 15);
nn.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.1: nn=..:..:..:..:..:%x:%x:..:..:..\n",
V(nn.d5), V(nn.d4));
#endif
nn.d5 = mad24(n.d1, qil, nn.d5);
nn.d6 = mad24(n.d1, qih, nn.d5 >> 15);
nn.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.2: nn=..:..:..:..:%x:%x:%x:...\n",
V(nn.d6), V(nn.d5), V(nn.d4));
#endif
nn.d6 = mad24(n.d2, qil, nn.d6);
nn.d7 = mad24(n.d2, qih, nn.d6 >> 15);
nn.d6 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.3: nn=..:..:..:%x:%x:%x:%x:...\n",
V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4));
#endif
nn.d7 = mad24(n.d3, qil, nn.d7);
nn.d8 = mad24(n.d3, qih, nn.d7 >> 15);
nn.d7 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.4: nn=..:..:%x:%x:%x:%x:%x:...\n",
V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4));
#endif
nn.d8 = mad24(n.d4, qil, nn.d8);
nn.d9 = mad24(n.d4, qih, nn.d8 >> 15);
nn.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.5: nn=..:%x:%x:%x:%x:%x:%x:...\n",
V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4));
#endif
nn.d9 = mad24(n.d5, qil, nn.d9);
nn.da = mad24(n.d5, qih, nn.d9 >> 15);
nn.d9 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.6: nn=..:%x:%x:%x:%x:%x:%x:%x:...\n",
V(nn.da), V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4));
#endif
// now shift-left 7 bits PERF: would that still fit into qi to avoid the long shift?
#ifdef CHECKS_MODBASECASE
nn.db = nn.da >> 8; // PERF: not needed as it will be gone anyway after sub
#endif
nn.da = mad24(nn.da & 0xFF, 128u, nn.d9 >> 8);
nn.d9 = mad24(nn.d9 & 0xFF, 128u, nn.d8 >> 8);
nn.d8 = mad24(nn.d8 & 0xFF, 128u, nn.d7 >> 8);
nn.d7 = mad24(nn.d7 & 0xFF, 128u, nn.d6 >> 8);
nn.d6 = mad24(nn.d6 & 0xFF, 128u, nn.d5 >> 8);
nn.d5 = mad24(nn.d5 & 0xFF, 128u, nn.d4 >> 8);
nn.d4 = (nn.d4 & 0x3FF) << 7;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.7: nn=%x!%x:%x:%x:%x:%x:%x:%x:..:..:..\n",
V(nn.db), V(nn.da), V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4));
#endif
// q = q - nn, but upon function entry, qhi contains all the bits for db:da. All bits below are zero.
// AS_UINT_V(x > y) is 0 or all-ones (-1) per lane, so adding it subtracts the borrow
q.d4 = (-nn.d4) & 0x7FFF;
q.d5 = (-nn.d5 + AS_UINT_V((nn.d4 > 0)));
q.d6 = (-nn.d6 + AS_UINT_V((q.d5 > 0)));
q.d7 = (-nn.d7 + AS_UINT_V((q.d6 > 0)));
q.d8 = (-nn.d8 + AS_UINT_V((q.d7 > 0)));
q.d9 = (-nn.d9 + AS_UINT_V((q.d8 > 0)));
q.da = (qhi & 0x7FFF) - nn.da + AS_UINT_V((q.d9 > 0));
#ifdef CHECKS_MODBASECASE
q.db = (qhi >> 15) - nn.db + AS_UINT_V((q.da > 0x7FFF)); // PERF: not needed: should be zero anyway
q.db &= 0x7FFF;
#endif
q.d5 &= 0x7FFF;
q.d6 &= 0x7FFF;
q.d7 &= 0x7FFF;
q.d8 &= 0x7FFF;
q.d9 &= 0x7FFF;
q.da &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#1.8: q=%x!%x:%x:%x:%x:%x:%x:%x:..:..\n",
V(q.db), V(q.da), V(q.d9), V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4));
#endif
MODBASECASE_NONZERO_ERROR(q.db, 1, 11, 1);
/********** Step 2, Offset 2^45 (3*15 + 0) **********/
// rebuild the top of the remainder as a float from limb pairs and estimate
// the next ~23 quotient bits at offset 2^45
qf= CONVERT_FLOAT_V(mad24(q.da, 32768u, q.d9));
qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d8, 32768u, q.d7));
qf*= 4294967296.0f;
qi=CONVERT_UINT_V(qf*nf);
MODBASECASE_QI_ERROR(1<<23, 2, qi, 2); // here, we need 2^23 ...
qih = (qi >> 15);
qil = qi & 0x7FFF;
// fold qi into result limbs d4:d3:d2 (offset 45 means a 2-bit straddle);
// d4 uses += because step 1 already deposited bits there
res->d4 += (qi >> 17);
res->d3 = (qi >> 2) &0x7FFF;
res->d2 = (qi << 13) &0x7FFF;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:%x:..:..\n",
V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d5), V(res->d4), V(res->d3), V(res->d2));
#endif
/*******************************************************/
// nn = n * qi
nn.d3 = mul24(n.d0, qil);
nn.d4 = mad24(n.d0, qih, nn.d3 >> 15);
nn.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.1: nn=..:..:..:..:%x:%x:..:..:..\n",
V(nn.d4), V(nn.d3));
#endif
nn.d4 = mad24(n.d1, qil, nn.d4);
nn.d5 = mad24(n.d1, qih, nn.d4 >> 15);
nn.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.2: nn=..:..:..:%x:%x:%x:..:..:..\n",
V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d5 = mad24(n.d2, qil, nn.d5);
nn.d6 = mad24(n.d2, qih, nn.d5 >> 15);
nn.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.3: nn=..:..:%x:%x:%x:%x:..:..:..\n",
V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d6 = mad24(n.d3, qil, nn.d6);
nn.d7 = mad24(n.d3, qih, nn.d6 >> 15);
nn.d6 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.4: nn=..:%x:%x:%x:%x:%x:..:..:..\n",
V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d7 = mad24(n.d4, qil, nn.d7);
nn.d8 = mad24(n.d4, qih, nn.d7 >> 15);
nn.d7 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.5: nn=..:%x:%x:%x:%x:%x:%x:..:..:..\n",
V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
nn.d8 = mad24(n.d5, qil, nn.d8);
nn.d9 = mad24(n.d5, qih, nn.d8 >> 15);
nn.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.6: nn=..:..:%x:%x:%x:%x:%x:%x:%x:..:..:..\n",
V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3));
#endif
#ifdef CHECKS_MODBASECASE
nn.da = nn.d9 >> 15;
nn.d9 &= 0x7FFF;
#endif
// shift-right 2 bits
// (aligns nn from limb offset 3 down to the step-2 offset of 2^45 exactly)
nn.d2 = (nn.d3 << 13) & 0x7FFF;
nn.d3 = mad24(nn.d4 & 3, 8192u, nn.d3 >> 2);
nn.d4 = mad24(nn.d5 & 3, 8192u, nn.d4 >> 2);
nn.d5 = mad24(nn.d6 & 3, 8192u, nn.d5 >> 2);
nn.d6 = mad24(nn.d7 & 3, 8192u, nn.d6 >> 2);
nn.d7 = mad24(nn.d8 & 3, 8192u, nn.d7 >> 2);
nn.d8 = mad24(nn.d9 & 3, 8192u, nn.d8 >> 2);
#ifdef CHECKS_MODBASECASE
nn.d9 = mad24(nn.da & 3, 8192u, nn.d9 >> 2);
nn.da = nn.da >> 2;
#endif
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.8: nn=..:%x:%x!%x:%x:%x:%x:%x:%x:%x:..:..\n",
V(nn.da), V(nn.d9), V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2));
#endif
// q = q - nn; q.d2 and q.d3 are still 0
q.d2 = (-nn.d2) & 0x7FFF;
q.d3 = (-nn.d3 + AS_UINT_V((nn.d2 > 0)));
q.d4 = q.d4 - nn.d4 + AS_UINT_V((q.d3 > 0x7FFF));
q.d5 = q.d5 - nn.d5 + AS_UINT_V((q.d4 > 0x7FFF));
q.d6 = q.d6 - nn.d6 + AS_UINT_V((q.d5 > 0x7FFF));
q.d7 = q.d7 - nn.d7 + AS_UINT_V((q.d6 > 0x7FFF));
q.d8 = q.d8 - nn.d8 + AS_UINT_V((q.d7 > 0x7FFF));
#ifdef CHECKS_MODBASECASE
q.d9 = q.d9 - nn.d9 + AS_UINT_V((q.d8 > 0x7FFF)); // PERF: not needed: should be zero anyway
q.d9 &= 0x7FFF;
#endif
q.d3 &= 0x7FFF;
q.d4 &= 0x7FFF;
q.d5 &= 0x7FFF;
q.d6 &= 0x7FFF;
q.d7 &= 0x7FFF;
q.d8 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#2.9: q=..:..:%x:%x:%x:%x:%x:%x:%x:..:..\n",
V(q.d9), V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3));
#endif
MODBASECASE_NONZERO_ERROR(q.da, 2, 10, 3);
MODBASECASE_NONZERO_ERROR(q.d9, 2, 9, 4);
/********** Step 3, Offset 2^22 (1*15 + 7) **********/
qf= CONVERT_FLOAT_V(mad24(q.d8, 32768u, q.d7));
qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d6, 32768u, q.d5));
qf*= 8388608.0f;
qi=CONVERT_UINT_V(qf*nf);
MODBASECASE_QI_ERROR(1<<23, 3, qi, 5);
qih = (qi >> 15);
qil = qi & 0x7FFF;
// offset 22 = 1*15+7: qi straddles result limbs d2 (carry into it) and d1
res->d2 += (qi >> 8);
res->d1 = (qi << 7) & 0x7FFF;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:%x:%x:..\n",
V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d5), V(res->d4), V(res->d3), V(res->d2), V(res->d1));
#endif
/*******************************************************/
// nn = n * qi
nn.d1 = mul24(n.d0, qil);
nn.d2 = mad24(n.d0, qih, nn.d1 >> 15);
nn.d1 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.1: nn=..:..:..:..:%x:%x:..\n",
V(nn.d2), V(nn.d1));
#endif
nn.d2 = mad24(n.d1, qil, nn.d2);
nn.d3 = mad24(n.d1, qih, nn.d2 >> 15);
nn.d2 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.2: nn=..:..:..:%x:%x:%x:..\n",
V(nn.d3), V(nn.d2), V(nn.d1));
#endif
nn.d3 = mad24(n.d2, qil, nn.d3);
nn.d4 = mad24(n.d2, qih, nn.d3 >> 15);
nn.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 4)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.3: nn=..:..:%x:%x:%x:%x:..\n",
V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1));
#endif
nn.d4 = mad24(n.d3, qil, nn.d4);
nn.d5 = mad24(n.d3, qih, nn.d4 >> 15);
nn.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.4: nn=..:%x:%x:%x:%x:%x:..\n",
V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1));
#endif
nn.d5 = mad24(n.d4, qil, nn.d5);
nn.d6 = mad24(n.d4, qih, nn.d5 >> 15);
nn.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.5: nn=..:%x:%x:%x:%x:%x:%x:..\n",
V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1));
#endif
nn.d6 = mad24(n.d5, qil, nn.d6);
nn.d7 = mad24(n.d5, qih, nn.d6 >> 15);
nn.d6 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.6: nn=..:%x:%x:%x:%x:%x:%x:%x:..\n",
V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1));
#endif
// nn.d7 also contains the nn.d8 bits, will be distributed by the shifting below
// now shift-left 7 bits
#ifdef CHECKS_MODBASECASE
nn.d8 = nn.d7 >> 8; // PERF: not needed as it will be gone anyway after sub
#endif
nn.d7 = mad24(nn.d7 & 0xFF, 128u, nn.d6 >> 8);
nn.d6 = mad24(nn.d6 & 0xFF, 128u, nn.d5 >> 8);
nn.d5 = mad24(nn.d5 & 0xFF, 128u, nn.d4 >> 8);
nn.d4 = mad24(nn.d4 & 0xFF, 128u, nn.d3 >> 8);
nn.d3 = mad24(nn.d3 & 0xFF, 128u, nn.d2 >> 8);
nn.d2 = mad24(nn.d2 & 0xFF, 128u, nn.d1 >> 8);
nn.d1 = (nn.d1 & 0xFF) << 7;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.7: nn=..:..:%x!%x:%x:%x:%x:%x:%x:%x:..\n",
V(nn.d8), V(nn.d7), V(nn.d6), V(nn.d5), V(nn.d4), V(nn.d3), V(nn.d2), V(nn.d1));
#endif
// q = q - nn; q.d1 is still 0
q.d1 = (-nn.d1) & 0x7FFF;
q.d2 = q.d2 - nn.d2 + AS_UINT_V((nn.d1 > 0));
q.d3 = q.d3 - nn.d3 + AS_UINT_V((q.d2 > 0x7FFF));
q.d4 = q.d4 - nn.d4 + AS_UINT_V((q.d3 > 0x7FFF));
q.d5 = q.d5 - nn.d5 + AS_UINT_V((q.d4 > 0x7FFF));
q.d6 = q.d6 - nn.d6 + AS_UINT_V((q.d5 > 0x7FFF));
q.d7 = q.d7 - nn.d7 + AS_UINT_V((q.d6 > 0x7FFF));
#ifdef CHECKS_MODBASECASE
q.d8 = q.d8 - nn.d8 + AS_UINT_V((q.d7 > 0x7FFF)); // PERF: not needed: should be zero anyway
q.d8 &= 0x7FFF;
#endif
q.d2 &= 0x7FFF;
q.d3 &= 0x7FFF;
q.d4 &= 0x7FFF;
q.d5 &= 0x7FFF;
q.d6 &= 0x7FFF;
q.d7 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#3.8: q=..:%x!%x:%x:%x:%x:%x:%x:%x:..\n",
V(q.d8), V(q.d7), V(q.d6), V(q.d5), V(q.d4), V(q.d3), V(q.d2), V(q.d1));
#endif
MODBASECASE_NONZERO_ERROR(q.d8, 3, 8, 6);
/********** Step 4, Offset 2^0 (0*15 + 0) **********/
qf= CONVERT_FLOAT_V(mad24(q.d7, 32768u, q.d6));
qf= qf * 1073741824.0f + CONVERT_FLOAT_V(mad24(q.d5, 32768u, q.d4));
qf*= 1073741824.0f;
qi=CONVERT_UINT_V(qf*nf);
MODBASECASE_QI_ERROR(1<<23, 4, qi, 7);
qil = qi & 0x7FFF;
qih = (qi >> 15);
res->d1 += qih;
res->d0 = qil;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"div_180_90#4: qf=%#G, nf=%#G, *=%#G, qi=%d=0x%x, res=%x:%x:%x:%x:%x:%x\n",
V(qf), V(nf), V(qf)*V(nf), V(qi), V(qi), V(res->d5), V(res->d4), V(res->d3), V(res->d2), V(res->d1), V(res->d0));
#endif
// skip the last part - it will change the result by one at most - we can live with a result that is off by one
// but need to handle outstanding carries instead
// (the += folds above may have pushed limbs past 15 bits; normalize bottom-up)
res->d2 += res->d1 >> 15;
res->d1 &= 0x7FFF;
res->d3 += res->d2 >> 15;
res->d2 &= 0x7FFF;
res->d4 += res->d3 >> 15;
res->d3 &= 0x7FFF;
res->d5 += res->d4 >> 15;
res->d4 &= 0x7FFF;
}
| #endif | |
void check_barrett15_82(uint shifter, const int90_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
MODBASECASE_PAR_DEF)
{
// Barrett-reduction trial-factoring core (82-bit variant of the 6x15-bit
// kernels): computes a = b_in^(2^k) * 2^(popcount path of shifter) mod f by
// repeated square-and-reduce, then checks whether the residue proves f a factor.
// u = floor(2^(89 + bits_in_f) / f) is the precomputed Barrett reciprocal;
// each loop iteration replaces an exact division by two multiplies with u.
__private int90_v a, u;
__private int180_v b, tmp180;
__private int90_v tmp90;
__private float_v ff;
#if defined USE_DP
__private double_v ffd;
#endif
__private uint tmp, bit_max_bot, bit_max_mult;
// bb = b_in shifted up by 4 limbs (60 bits); the low four 15-bit limbs are zero
__private int180_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5, b_in.s6, b_in.s7};
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: bb=%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x, bit_max65=%d\n",
bb.db, bb.da, bb.d9, bb.d8, bb.d7, bb.d6, bb.d5, bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, bit_max65);
#endif
// ff = f as float, needed only for the final mod_simple
ff= CONVERT_FLOAT_RTP_V(mad24(f.d5, 32768u, f.d4));
ff= ff * 1073741824.0f+ CONVERT_FLOAT_RTP_V(mad24(f.d3, 32768u, f.d2));
// 0x3f7ffffc is slightly below 1.0f: biases the reciprocal low so the
// quotient estimate never overshoots
ff = as_float(0x3f7ffffc) / ff;
tmp = 1 << (bit_max65+4); // tmp180 = 2^(89 + bits in f)
#if defined USE_DP
// ffd = f as double, needed in div_180_90_d).
ffd= CONVERT_DOUBLE_RTP_V(mad24(f.d5, 32768u, f.d4));
ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d3, 32768u, f.d2));
ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d1, 32768u, f.d0));
ffd = as_double(0x3feffffffffffffdL) / ffd; // should be a bit less than 1.0
div_180_90_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR); // u = floor(tmp180 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
V(u.d5), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
// PERF: as div is only used here, use all those zeros directly in there
// here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
// tmp contains the upper 2 parts (30 bits) of a 180-bit value. The lower 150 bits are all zero implicitely
div_180_90(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
, tid
#endif
MODBASECASE_PAR); // u = floor(tmp180 / f)
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: u=%x:%x:%x:%x:%x:%x, ff=%G\n",
V(u.d5), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif
if (bit_max65 > 10) // need to distiguish how far to shift; the same branch will be taken by all threads
{
//bit_max is 76 .. 89
// extract a 90-bit window starting at bit (bits_in_f - 1): each result limb
// combines the top of limb d<n+5> with the bottom of limb d<n+6>
bit_max_bot = bit_max65-11;
bit_max_mult = 1 << (26-bit_max65);
// a.d<n> = bb.d<n+5> >> bit_max_bot + bb.d<n+6> << top_bit_max
//PERF: min limit of bb? bit_max > 75 ==> bb > 2^150 ==> d0..d9=0
a.d0 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF; // a = floor(b / 2 ^ (bits_in_f - 1))
a.d1 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF;
a.d2 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF;
a.d3 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF;
a.d4 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot))&0x7FFF;
a.d5 = mad24(bb.db, bit_max_mult, (bb.da >> bit_max_bot));
}
else
{
//bit_max is 61 .. 75
bit_max_bot = bit_max65+4;
bit_max_mult = 1 << (11-bit_max65);
// a.d<n> = bb.d<n+4> >> bit_max_bot + bb.d<n+5> << top_bit_max
//PERF: min limit of bb? bit_max >= 60 ==> bb >= 2^120 ==> d0..d7=0
a.d0 = mad24(bb.d5, bit_max_mult, (bb.d4 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d5 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot)); // a = b / (2^bit_max)
}
// PERF: could be no_low_5
mul_90_180_no_low5(&tmp180, a, u); // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:%x:...\n",
V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6), V(tmp180.d5));
#endif
a.d0 = tmp180.d6; // a = tmp180 / 2^90, which is b / f
a.d1 = tmp180.d7;
a.d2 = tmp180.d8;
a.d3 = tmp180.d9;
a.d4 = tmp180.da;
a.d5 = tmp180.db;
mul_90(&tmp90, a, f); // tmp90 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
// bb.d0-bb.d3 are all 0
// a = bb - tmp90 (the initial residue); AS_UINT_V(x > y) is 0 or all-ones
// (-1) per lane, so adding it subtracts the borrow
a.d0 = (-tmp90.d0) & 0x7FFF;
a.d1 = (-tmp90.d1 + AS_UINT_V((a.d0 > 0) ));
a.d2 = (-tmp90.d2 + AS_UINT_V((a.d1 > 0x7FFF) ));
a.d3 = (-tmp90.d3 + AS_UINT_V((a.d2 > 0x7FFF) ));
a.d4 = (bb.d4-tmp90.d4 + AS_UINT_V((a.d3 > 0x7FFF) ));
a.d5 = (bb.d5-tmp90.d5 + AS_UINT_V((a.d4 > 0x7FFF) ));
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
a.d4 &= 0x7FFF;
a.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: b=%x:%x:0:0:0:0 - tmp = %x:%x:%x:%x:%x:%x (a)\n",
bb.d5, bb.d4, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
///
///// here it starts to become different between the 3 6x15bit kernels
///
// main square-and-reduce loop: one iteration per remaining exponent bit
while(shifter)
{
square_90_180(&b, a); // b = a^2
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
shifter, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(b.db), V(b.da), V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
if (bit_max65 > 10) // need to distiguish how far to shift
{
a.d0 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d5 = mad24(b.db, bit_max_mult, (b.da >> bit_max_bot)); // a = b / (2^bit_max)
}
else
{
a.d0 = mad24(b.d5, bit_max_mult, (b.d4 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d1 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d2 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d3 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d4 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF; // a = b / (2^bit_max)
a.d5 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot)); // a = b / (2^bit_max)
}
// PERF: could be no_low_5
mul_90_180_no_low5(&tmp180, a, u); // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loopl: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:%x:...\n",
V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6), V(tmp180.d5));
#endif
a.d0 = tmp180.d6; // a = tmp180 / 2^90, which is b / f
a.d1 = tmp180.d7;
a.d2 = tmp180.d8;
a.d3 = tmp180.d9;
a.d4 = tmp180.da;
a.d5 = tmp180.db;
mul_90(&tmp90, a, f); // tmp90 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
// a = b - tmp90: the new residue, approximately b mod f (may be off by a
// small multiple of f; mod_simple at the end finishes the reduction)
a.d0 = (b.d0 - tmp90.d0) & 0x7FFF;
a.d1 = (b.d1 - tmp90.d1 + AS_UINT_V((a.d0 > b.d0) ));
a.d2 = (b.d2 - tmp90.d2 + AS_UINT_V((a.d1 > b.d1) ));
a.d3 = (b.d3 - tmp90.d3 + AS_UINT_V((a.d2 > b.d2) ));
a.d4 = (b.d4 - tmp90.d4 + AS_UINT_V((a.d3 > b.d3) ));
a.d5 = (b.d5 - tmp90.d5 + AS_UINT_V((a.d4 > b.d4) ));
a.d1 &= 0x7FFF;
a.d2 &= 0x7FFF;
a.d3 &= 0x7FFF;
a.d4 &= 0x7FFF;
a.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
if (tid==TRACE_TID) printf((__constant char *)"loop: b=...:%x:%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x:%x (a)\n",
V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
if(shifter&0x80000000)shl_90(&a); // "optional multiply by 2" in Prime 95 documentation
shifter+=shifter;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, a= %x:%x:%x:%x:%x:%x \n",
shifter, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
}
// final exact reduction mod f and factor check; writes any hit into RES
mod_simple_even_90_and_check_big_factor90(a, f, ff, RES
#ifdef CHECKS_MODBASECASE
, bit_max65, 10, modbasecase_debug
#endif
);
}
/*
 * check_barrett15_83: trial-factoring core for (vectorized) factor candidates f
 * of up to 83 bits, using 6x15-bit limb arithmetic and Barrett reduction.
 *
 * Computes 2^exponent mod f by left-to-right binary exponentiation: the bits of
 * `shifter` (the exponent, left-aligned so bit 31 is the next bit to process)
 * drive a square step plus an optional shift-left ("optional multiply by 2" in
 * the Prime95 documentation).  After each square the 180-bit product is reduced
 * mod f using the precomputed Barrett inverse u.  The final
 * mod_simple_90_and_check_big_factor90 call performs the last exact reduction
 * and reports f in RES if the residue shows f divides the Mersenne number.
 *
 * shifter    - exponent shifted left so that bit 31 is the first bit to process
 * f          - factor candidate, 6x15-bit limbs (vectorized)
 * tid        - thread id, used only for trace output
 * b_in       - upper 8 limbs of the precalculated 180-bit start value (lower 4 limbs are 0)
 * bit_max65  - (number of bits in f) - 65; selects the shift distances below
 * RES        - global result buffer receiving any found factors
 */
void check_barrett15_83(uint shifter, const int90_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
                        MODBASECASE_PAR_DEF)
{
  __private int90_v a, u;
  __private int180_v b, tmp180;
  __private int90_v tmp90;
  __private float_v ff;
#if defined USE_DP
  __private double_v ffd;
#endif
  __private uint tmp, bit_max_bot, bit_max_mult;
  // the lower 4 limbs of the precalculated start value are known to be zero
  __private int180_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5, b_in.s6, b_in.s7};

#if (TRACE_KERNEL > 1)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: bb=%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x, bit_max65=%d\n",
      bb.db, bb.da, bb.d9, bb.d8, bb.d7, bb.d6, bb.d5, bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, bit_max65);
#endif

  // ff = f as float, needed only for the final mod_simple
  // (f.d1/f.d0 are omitted; a float cannot hold those bits anyway)
  ff= CONVERT_FLOAT_RTP_V(mad24(f.d5, 32768u, f.d4));
  ff= ff * 1073741824.0f+ CONVERT_FLOAT_RTP_V(mad24(f.d3, 32768u, f.d2));
  ff = as_float(0x3f7ffffc) / ff;   // 0x3f7ffffc is just below 1.0f so the quotient estimate never overshoots

  tmp = 1 << (bit_max65+4);  // tmp180 = 2^(89 + bits in f)

#if defined USE_DP
  // ffd = f as double, needed in div_180_90_d).
  ffd= CONVERT_DOUBLE_RTP_V(mad24(f.d5, 32768u, f.d4));
  ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d3, 32768u, f.d2));
  ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d1, 32768u, f.d0));
  ffd = as_double(0x3feffffffffffffdL) / ffd;  // should be a bit less than 1.0

  div_180_90_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
               , tid
#endif
               MODBASECASE_PAR);  // u = floor(tmp180 / f)

#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: u(d)=%x:%x:%x:%x:%x:%x, ffd=%G\n",
      V(u.d5), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ffd));
#endif
#else
  // PERF: as div is only used here, use all those zeros directly in there
  // here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
  // tmp contains the upper 2 parts (30 bits) of a 180-bit value. The lower 150 bits are all zero implicitly
  div_180_90(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
             , tid
#endif
             MODBASECASE_PAR);  // u = floor(tmp180 / f)

#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: u=%x:%x:%x:%x:%x:%x, ff=%G\n",
      V(u.d5), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif
#endif

  if (bit_max65 > 10)  // need to distinguish how far to shift; the same branch will be taken by all threads
  {
    //bit_max is 76 .. 89
    bit_max_bot  = bit_max65-11;
    bit_max_mult = 1 << (26-bit_max65);

    // a.d<n> = bb.d<n+5> >> bit_max_bot + bb.d<n+6> << top_bit_max
    //PERF: min limit of bb? bit_max > 75 ==> bb > 2^150 ==> d0..d9=0
    a.d0 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF;  // a = floor(b / 2 ^ (bits_in_f - 1))
    a.d1 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF;
    a.d2 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF;
    a.d3 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF;
    a.d4 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot))&0x7FFF;
    a.d5 = mad24(bb.db, bit_max_mult, (bb.da >> bit_max_bot));
  }
  else
  {
    //bit_max is 61 .. 75
    bit_max_bot  = bit_max65+4;
    bit_max_mult = 1 << (11-bit_max65);

    // a.d<n> = bb.d<n+4> >> bit_max_bot + bb.d<n+5> << top_bit_max
    //PERF: min limit of bb? bit_max >= 60 ==> bb >= 2^120 ==> d0..d7=0
    a.d0 = mad24(bb.d5, bit_max_mult, (bb.d4 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
    a.d1 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF;
    a.d2 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF;
    a.d3 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF;
    a.d4 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF;
    a.d5 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot));
  }

  // PERF: could be no_low_5
  mul_90_180_no_low5(&tmp180, a, u);  // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:%x:...\n",
      V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
      V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6), V(tmp180.d5));
#endif
  a.d0 = tmp180.d6;  // a = tmp180 / 2^90, which is b / f
  a.d1 = tmp180.d7;
  a.d2 = tmp180.d8;
  a.d3 = tmp180.d9;
  a.d4 = tmp180.da;
  a.d5 = tmp180.db;

  mul_90(&tmp90, a, f);  // tmp90 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
      V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
      V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
  // a = bb - tmp90 (mod 2^90); bb.d0-bb.d3 are 0.
  // The AS_UINT_V(...) terms propagate the borrow (vector compares yield all-ones for true).
  a.d0 = (-tmp90.d0) & 0x7FFF;
  a.d1 = (-tmp90.d1 + AS_UINT_V((a.d0 > 0) ));
  a.d2 = (-tmp90.d2 + AS_UINT_V((a.d1 > 0x7FFF) ));
  a.d3 = (-tmp90.d3 + AS_UINT_V((a.d2 > 0x7FFF) ));
  a.d4 = (bb.d4-tmp90.d4 + AS_UINT_V((a.d3 > 0x7FFF) ));
  a.d5 = (bb.d5-tmp90.d5 + AS_UINT_V((a.d4 > 0x7FFF) ));
  a.d1 &= 0x7FFF;
  a.d2 &= 0x7FFF;
  a.d3 &= 0x7FFF;
  a.d4 &= 0x7FFF;
  a.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: b=%x:%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x:%x (a)\n",
      bb.d5, bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif

  while(shifter)  // one iteration per remaining exponent bit (MSB first)
  {
    square_90_180(&b, a);  // b = a^2
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
        shifter, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(b.db), V(b.da), V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
    if(shifter&0x80000000)
    {
      shl_180(&b);  // "optional multiply by 2" in Prime 95 documentation
#if (TRACE_KERNEL > 3)
      if (tid==TRACE_TID) printf((__constant char *)"loop: shl: %x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
          V(b.db), V(b.da), V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
    }

    if (bit_max65 > 10)  // need to distinguish how far to shift
    {
      a.d0 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
      a.d1 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF;
      a.d2 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF;
      a.d3 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF;
      a.d4 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot))&0x7FFF;
      a.d5 = mad24(b.db, bit_max_mult, (b.da >> bit_max_bot));
    }
    else
    {
      a.d0 = mad24(b.d5, bit_max_mult, (b.d4 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
      a.d1 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF;
      a.d2 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF;
      a.d3 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF;
      a.d4 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF;
      a.d5 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot));
    }
    // PERF: could be no_low_5
    mul_90_180_no_low5(&tmp180, a, u);  // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:%x:...\n",
        V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6), V(tmp180.d5));
#endif
    a.d0 = tmp180.d6;  // a = tmp180 / 2^90, which is b / f
    a.d1 = tmp180.d7;
    a.d2 = tmp180.d8;
    a.d3 = tmp180.d9;
    a.d4 = tmp180.da;
    a.d5 = tmp180.db;

    mul_90(&tmp90, a, f);  // tmp90 = (((b / (2^bit_max)) * u) / (2^bit_max)) * f
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
        V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
    // PERF: faster to compare against 0x7fff instead of b.dx?
    // a = b - tmp90 (mod 2^90), limb-wise subtraction with borrow propagation
    a.d0 = (b.d0 - tmp90.d0) & 0x7FFF;
    a.d1 = (b.d1 - tmp90.d1 + AS_UINT_V((a.d0 > b.d0) ));
    a.d2 = (b.d2 - tmp90.d2 + AS_UINT_V((a.d1 > b.d1) ));
    a.d3 = (b.d3 - tmp90.d3 + AS_UINT_V((a.d2 > b.d2) ));
    a.d4 = (b.d4 - tmp90.d4 + AS_UINT_V((a.d3 > b.d3) ));
    a.d5 = (b.d5 - tmp90.d5 + AS_UINT_V((a.d4 > b.d4) ));
    a.d1 &= 0x7FFF;
    a.d2 &= 0x7FFF;
    a.d3 &= 0x7FFF;
    a.d4 &= 0x7FFF;
    a.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x:%x (a)\n",
        V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif
    shifter+=shifter;  // advance to the next exponent bit
  }

  // final exact reduction of the (possibly not fully reduced) residue and factor test
  mod_simple_90_and_check_big_factor90(a, f, ff, RES
#ifdef CHECKS_MODBASECASE
      , bit_max65, 10, modbasecase_debug
#endif
      );
}
/*
 * check_barrett15_88: trial-factoring core for (vectorized) factor candidates f
 * of up to 88 bits, using 6x15-bit limb arithmetic and Barrett reduction.
 *
 * Same square-and-optionally-double exponentiation scheme as the other
 * check_barrett15_* routines, but here the "optional multiply by 2" is applied
 * to the 90-bit residue (shl_90 on tmp90) instead of the 180-bit square, and
 * each iteration runs an extra mod_simple_90 adjustment step because plain
 * Barrett only guarantees a residue < 3*f.  The loop exits via `break` on the
 * last exponent bit, leaving the unadjusted residue in tmp90 for the final
 * mod_simple_even_90_and_check_big_factor90 (which does the last reduction and
 * the factor test in one step).
 *
 * shifter    - exponent shifted left so that bit 31 is the first bit to process
 * f          - factor candidate, 6x15-bit limbs (vectorized)
 * tid        - thread id, used only for trace output
 * b_in       - upper 8 limbs of the precalculated 180-bit start value (lower 4 limbs are 0)
 * bit_max65  - (number of bits in f) - 65; selects the shift distances below
 * RES        - global result buffer receiving any found factors
 */
void check_barrett15_88(uint shifter, const int90_v f, const uint tid, const uint8 b_in, const int bit_max65, __global uint * restrict RES
                        MODBASECASE_PAR_DEF)
{
  __private int90_v a, u;
  __private int180_v b, tmp180;
  __private int90_v tmp90;
  __private float_v ff;
#if defined USE_DP
  __private double_v ffd;
#endif
  __private uint tmp, bit_max_bot, bit_max_mult;
  // the lower 4 limbs of the precalculated start value are known to be zero
  __private int180_t bb={0, 0, 0, 0, b_in.s0, b_in.s1, b_in.s2, b_in.s3, b_in.s4, b_in.s5, b_in.s6, b_in.s7};

#if (TRACE_KERNEL > 1)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: bb=%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x, bit_max65=%d\n",
      bb.db, bb.da, bb.d9, bb.d8, bb.d7, bb.d6, bb.d5, bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, bit_max65);
#endif

  // ff = f as float, needed only for the final mod_simple
  // (f.d1/f.d0 are omitted; a float cannot hold those bits anyway)
  ff= CONVERT_FLOAT_RTP_V(mad24(f.d5, 32768u, f.d4));
  ff= ff * 1073741824.0f+ CONVERT_FLOAT_RTP_V(mad24(f.d3, 32768u, f.d2));
  ff = as_float(0x3f7ffffc) / ff;   // 0x3f7ffffc is just below 1.0f so the quotient estimate never overshoots

  tmp = 1 << (bit_max65+4);  // tmp180 = 2^(89 + bits in f)

#if defined USE_DP
  // ffd = f as double, needed in div_180_90_d).
  ffd= CONVERT_DOUBLE_RTP_V(mad24(f.d5, 32768u, f.d4));
  ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d3, 32768u, f.d2));
  ffd= ffd * 1073741824.0+ CONVERT_DOUBLE_RTP_V(mad24(f.d1, 32768u, f.d0));
  ffd = as_double(0x3feffffffffffffdL) / ffd;  // should be a bit less than 1.0

  div_180_90_d(&u, tmp, f, ffd
#if (TRACE_KERNEL > 1)
               , tid
#endif
               MODBASECASE_PAR);  // u = floor(tmp180 / f)
#else
  // PERF: as div is only used here, use all those zeros directly in there
  // here, no vectorized data is necessary yet: the precalculated "b" value is the same for all
  // tmp contains the upper 2 parts (30 bits) of a 180-bit value. The lower 150 bits are all zero implicitly
  div_180_90(&u, tmp, f, ff
#if (TRACE_KERNEL > 1)
             , tid
#endif
             MODBASECASE_PAR);  // u = floor(tmp180 / f)
#endif

#if (TRACE_KERNEL > 2)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: u=%x:%x:%x:%x:%x:%x, ff=%G\n",
      V(u.d5), V(u.d4), V(u.d3), V(u.d2), V(u.d1), V(u.d0), V(ff));
#endif

  if (bit_max65 > 10)  // need to distinguish how far to shift; the same branch will be taken by all threads
  {
    //bit_max is 76 .. 89
    bit_max_bot  = bit_max65-11;
    bit_max_mult = 1 << (26-bit_max65);

    // a.d<n> = bb.d<n+5> >> bit_max_bot + bb.d<n+6> << top_bit_max
    //PERF: min limit of bb? bit_max > 75 ==> bb > 2^150 ==> d0..d9=0
    a.d0 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF;  // a = floor(b / 2 ^ (bits_in_f - 1))
    a.d1 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF;
    a.d2 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF;
    a.d3 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF;
    a.d4 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot))&0x7FFF;
    a.d5 = mad24(bb.db, bit_max_mult, (bb.da >> bit_max_bot));
  }
  else
  {
    //bit_max is 61 .. 75
    bit_max_bot  = bit_max65+4;
    bit_max_mult = 1 << (11-bit_max65);

    // a.d<n> = bb.d<n+4> >> bit_max_bot + bb.d<n+5> << top_bit_max
    //PERF: min limit of bb? bit_max >= 60 ==> bb >= 2^120 ==> d0..d7=0
    a.d0 = mad24(bb.d5, bit_max_mult, (bb.d4 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
    a.d1 = mad24(bb.d6, bit_max_mult, (bb.d5 >> bit_max_bot))&0x7FFF;
    a.d2 = mad24(bb.d7, bit_max_mult, (bb.d6 >> bit_max_bot))&0x7FFF;
    a.d3 = mad24(bb.d8, bit_max_mult, (bb.d7 >> bit_max_bot))&0x7FFF;
    a.d4 = mad24(bb.d9, bit_max_mult, (bb.d8 >> bit_max_bot))&0x7FFF;
    a.d5 = mad24(bb.da, bit_max_mult, (bb.d9 >> bit_max_bot));
  }

  // PERF: could be no_low_5
  mul_90_180_no_low5(&tmp180, a, u);  // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:%x:...\n",
      V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
      V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6), V(tmp180.d5));
#endif
  a.d0 = tmp180.d6;  // a = tmp180 / 2^90, which is b / f
  a.d1 = tmp180.d7;
  a.d2 = tmp180.d8;
  a.d3 = tmp180.d9;
  a.d4 = tmp180.da;
  a.d5 = tmp180.db;

  mul_90(&tmp90, a, f);  // tmp90 = quotient * f, we only compute the low 90-bits here
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
      V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
      V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
  // a = bb - tmp90 (mod 2^90); bb.d0-bb.d3 are 0.
  // The AS_UINT_V(...) terms propagate the borrow (vector compares yield all-ones for true).
  a.d0 = (-tmp90.d0) & 0x7FFF;
  a.d1 = (-tmp90.d1 + AS_UINT_V((a.d0 > 0) ));
  a.d2 = (-tmp90.d2 + AS_UINT_V((a.d1 > 0x7FFF) ));
  a.d3 = (-tmp90.d3 + AS_UINT_V((a.d2 > 0x7FFF) ));
  a.d4 = (bb.d4-tmp90.d4 + AS_UINT_V((a.d3 > 0x7FFF) ));
  a.d5 = (bb.d5-tmp90.d5 + AS_UINT_V((a.d4 > 0x7FFF) ));
  a.d1 &= 0x7FFF;
  a.d2 &= 0x7FFF;
  a.d3 &= 0x7FFF;
  a.d4 &= 0x7FFF;
  a.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 3)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: b=%x:%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x:%x (a)\n",
      bb.d5, bb.d4, bb.d3, bb.d2, bb.d1, bb.d0, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0));
#endif

  for(;;)  // exits via `break` after processing the last exponent bit
  {
    square_90_180(&b, a);  // b = a^2
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"loop: exp=%.8x, a=%x:%x:%x:%x:%x:%x ^2 = %x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x:%x (b)\n",
        shifter, V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(b.db), V(b.da), V(b.d9), V(b.d8), V(b.d7), V(b.d6), V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0) );
#endif
    if (bit_max65 > 10)  // need to distinguish how far to shift
    {
      a.d0 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
      a.d1 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF;
      a.d2 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF;
      a.d3 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF;
      a.d4 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot))&0x7FFF;
      a.d5 = mad24(b.db, bit_max_mult, (b.da >> bit_max_bot));
    }
    else
    {
      a.d0 = mad24(b.d5, bit_max_mult, (b.d4 >> bit_max_bot))&0x7FFF;  // a = b / (2^bit_max)
      a.d1 = mad24(b.d6, bit_max_mult, (b.d5 >> bit_max_bot))&0x7FFF;
      a.d2 = mad24(b.d7, bit_max_mult, (b.d6 >> bit_max_bot))&0x7FFF;
      a.d3 = mad24(b.d8, bit_max_mult, (b.d7 >> bit_max_bot))&0x7FFF;
      a.d4 = mad24(b.d9, bit_max_mult, (b.d8 >> bit_max_bot))&0x7FFF;
      a.d5 = mad24(b.da, bit_max_mult, (b.d9 >> bit_max_bot));
    }
    // PERF: could be no_low_5
    mul_90_180_no_low5(&tmp180, a, u);  // tmp180 = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (89 + bits_in_f) / f)
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x:%x * u = %x:%x:%x:%x:%x:%x:...\n",
        V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0),
        V(tmp180.db), V(tmp180.da), V(tmp180.d9), V(tmp180.d8), V(tmp180.d7), V(tmp180.d6));
#endif
    a.d0 = tmp180.d6;  // a = tmp180 / 2^90, which is b / f
    a.d1 = tmp180.d7;
    a.d2 = tmp180.d8;
    a.d3 = tmp180.d9;
    a.d4 = tmp180.da;
    a.d5 = tmp180.db;

    mul_90(&tmp90, a, f);  // tmp90 = quotient * f, we only compute the low 90-bits here
#if (TRACE_KERNEL > 3)
    if (tid==TRACE_TID) printf((__constant char *)"loop: a=%x:%x:%x:%x:%x:%x * f = %x:%x:%x:%x:%x:%x (tmp)\n",
        V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0), V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
    // tmp90 = b - tmp90 (mod 2^90), limb-wise subtraction with borrow propagation
    tmp90.d0 = (b.d0 - tmp90.d0) & 0x7FFF;
    tmp90.d1 = (b.d1 - tmp90.d1 + AS_UINT_V((tmp90.d0 > b.d0) ));
    tmp90.d2 = (b.d2 - tmp90.d2 + AS_UINT_V((tmp90.d1 > b.d1) ));
    tmp90.d3 = (b.d3 - tmp90.d3 + AS_UINT_V((tmp90.d2 > b.d2) ));
    tmp90.d4 = (b.d4 - tmp90.d4 + AS_UINT_V((tmp90.d3 > b.d3) ));
    tmp90.d5 = (b.d5 - tmp90.d5 + AS_UINT_V((tmp90.d4 > b.d4) ));
    tmp90.d1 &= 0x7FFF;
    tmp90.d2 &= 0x7FFF;
    tmp90.d3 &= 0x7FFF;
    tmp90.d4 &= 0x7FFF;
    tmp90.d5 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"loop: b=%x:%x:%x:%x:%x:%x - tmp = %x:%x:%x:%x:%x:%x (tmp)\n",
        V(b.d5), V(b.d4), V(b.d3), V(b.d2), V(b.d1), V(b.d0), V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0));
#endif
    if (shifter & 0x80000000) shl_90(&tmp90);  // "optional multiply by 2" on the 90-bit residue
    if (shifter == 0x80000000) break;  // last bit done: skip the adjustment, the final check handles it
    shifter+=shifter;  // advance to the next exponent bit
#ifndef CHECKS_MODBASECASE
    mod_simple_90(&a, tmp90, f, ff
#if (TRACE_KERNEL > 1)
                  , tid
#endif
                 );  // adjustment, plain barrett returns N = AB mod M where N < 3M!
#else
    int limit = 6;
    mod_simple_90(&a, tmp90, f, ff
#if (TRACE_KERNEL > 1)
                  , tid
#endif
                  , bit_max65, limit, modbasecase_debug);
#endif
#if (TRACE_KERNEL > 1)
    if (tid==TRACE_TID) printf((__constant char *)"loopend: exp=%x, tmp=%x:%x:%x:%x:%x:%x mod f=%x:%x:%x:%x:%x:%x = %x:%x:%x:%x:%x:%x (a)\n",
        shifter, V(tmp90.d5), V(tmp90.d4), V(tmp90.d3), V(tmp90.d2), V(tmp90.d1), V(tmp90.d0),
        V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), V(a.d5), V(a.d4), V(a.d3), V(a.d2), V(a.d1), V(a.d0) );
#endif
  }

  // final exact reduction of the unadjusted residue tmp90 and factor test
  mod_simple_even_90_and_check_big_factor90(tmp90, f, ff, RES
#ifdef CHECKS_MODBASECASE
      , bit_max65, 10, modbasecase_debug
#endif
      );
}
| #ifndef CL_GPU_SIEVE | |
| /**** | |
| * the actual kernels for handling 6x15bit computations | |
| ****/ | |
| __kernel void cl_barrett15_82(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int90_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC90(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82: tid=%d, f=%x:%x:%x:%x:%x:%x, shift=%d\n", | |
| tid, V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_82(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_83(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int90_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC90(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83: tid=%d, f=%x:%x:%x:%x:%x:%x, shift=%d\n", | |
| tid, V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_83(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| __kernel void cl_barrett15_88(__private uint exponent, const int75_t k_base, const __global uint * restrict k_tab, const int shiftcount, | |
| const uint8 b_in, __global uint * restrict RES, const int bit_max65 | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private int90_v f; | |
| __private uint tid; | |
| tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), (uint)get_local_id(0)) * VECTOR_SIZE; | |
| calculate_FC90(exponent, tid, k_tab, k_base, &f); | |
| #if (TRACE_KERNEL > 1) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88: tid=%d, f=%x:%x:%x:%x:%x:%x, shift=%d\n", | |
| tid, V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0), shiftcount); | |
| #endif | |
| check_barrett15_88(exponent << (32 - shiftcount), f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| #else | |
| /**************************************** | |
| **************************************** | |
| * 15-bit-kernel consuming the GPU sieve | |
| * included by main kernel file | |
| **************************************** | |
| ****************************************/ | |
/*
 * cl_barrett15_69_gs: GPU-sieve variant of the 69-bit barrett kernel.
 *
 * Each 256-thread work-group first extracts the set bits of its slice of the
 * sieve bit_array into shared memory (extract_bits, defined in gpusieve.cl),
 * then iterates over those surviving candidates: for each one it reconstructs
 * k from k_base plus the candidate's bit offset, forms f = 2*k*exponent + 1 in
 * 5x15-bit limbs, and runs the barrett test via check_barrett15_69.
 *
 * exponent             - Mersenne exponent p being factored
 * k_base               - k value corresponding to bit 0 of this grid's sieve block
 * bit_array            - sieve output: one bit per candidate
 * bits_to_process      - number of sieve bits handled per work-group
 * smem                 - local buffer receiving the extracted candidate offsets
 * shiftcount           - number of exponent bits to process (shifter setup)
 * b_in                 - precalculated start value for the barrett core
 * RES                  - global result buffer receiving any found factors
 * bit_max65            - (number of bits in f) - 65
 * shared_mem_allocated - only used to verify assumptions
 */
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_69_gs(const uint exponent, const int75_t k_base,
                   const __global uint * restrict bit_array,
                   const uint bits_to_process, __local ushort *smem,
                   const int shiftcount, const uint8 b_in,
                   __global uint * restrict RES, const int bit_max65,
                   const uint shared_mem_allocated // only used to verify assumptions
                   MODBASECASE_PAR_DEF )
{
  __private uint i, initial_shifter_value, total_bit_count;
  __local ushort bitcount[256];  // Each thread of our block puts bit-counts here
  __private int75_v k, f;
  __private uint tid, lid=get_local_id(0);
  __private int75_t exp75;

  tid = mad24((uint)get_group_id(0), (uint)get_local_size(0), lid);

#if (TRACE_SIEVE_KERNEL > 0)
  if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_69_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
      exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif

  // extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
  total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);

  // Here, all warps in our block have placed their candidates in shared memory.
  // Now we can start TFing candidates.

  // Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
  // Compute factor corresponding to first sieve bit in this block.
  initial_shifter_value = exponent << (32 - shiftcount);  // Initial shifter value
  // exp75 = 2 * exponent, split into 15-bit limbs // PERF: exp.d1=amd_bfe(exp, 15, 14)
  exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF;

#if (TRACE_KERNEL > 0)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
      exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif

  for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE)  // VECTOR_SIZE*THREADS_PER_BLOCK
  {
    // if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
    // this can result in the same factor being reported up to VECTOR_SIZE times.
    uint_v k_delta;

    // Get the (k - k_base) value to test: each lane picks its own candidate offset from shared memory
#if (VECTOR_SIZE == 1)
    k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
    k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
    k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
    k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
    k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
    k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
    k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
    k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
    k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif

    // Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
    // k = k_base + NUM_CLASSES * k_delta, accumulated in 15-bit limbs with carry propagation
    k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0);         // k_delta can exceed 2^24
    k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1);  // k is limited to 2^64 -1
    k.d2 = (k.d1 >> 15) + k_base.d2;
    k.d3 = (k.d2 >> 15) + k_base.d3;
    k.d4 = (k.d3 >> 15) + k_base.d4;
    k.d0 &= 0x7FFF;
    k.d1 &= 0x7FFF;
    k.d2 &= 0x7FFF;
    k.d3 &= 0x7FFF;

    // f = k * exp75 + 1 (exp75 = 2*exponent) ==> f = 2kp+1, schoolbook limb products with carry
    f.d0 = mad24(k.d0, exp75.d0, 1u);

    f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
    f.d1 = mad24(k.d0, exp75.d1, f.d1);
    f.d0 &= 0x7FFF;

    f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
    f.d2 = mad24(k.d1, exp75.d1, f.d2);
    f.d2 = mad24(k.d0, exp75.d2, f.d2);  // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
    f.d1 &= 0x7FFF;

    f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
    f.d3 = mad24(k.d2, exp75.d1, f.d3);
    f.d3 = mad24(k.d1, exp75.d2, f.d3);
//  f.d3 = mad24(k.d0, exp75.d3, f.d3);  // exp75.d3 = 0
    f.d2 &= 0x7FFF;

    f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15);  // PERF: see above
    f.d4 = mad24(k.d3, exp75.d1, f.d4);
    f.d4 = mad24(k.d2, exp75.d2, f.d4);
    f.d3 &= 0x7FFF;

#if (TRACE_KERNEL > 1)
    if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_69_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x\n",
        i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
    check_barrett15_69(initial_shifter_value, f, tid, b_in, bit_max65, RES
                       MODBASECASE_PAR);
  }
}
/*
 * cl_barrett15_70_gs: GPU-sieve trial-factoring kernel, 15-bit-limb barrett
 * variant for the "70-bit" factor range (the modular test itself is done by
 * check_barrett15_70, defined elsewhere in this file set).
 *
 * Each 256-thread work-group:
 *   1. extracts the indexes of the set bits of its slice of bit_array into
 *      local memory smem (extract_bits, defined in gpusieve.cl),
 *   2. converts every extracted index into k = k_base + NUM_CLASSES*k_delta
 *      and the factor candidate f = 2*k*exponent + 1, both kept in five
 *      15-bit limbs (d0..d4),
 *   3. calls check_barrett15_70 on each candidate vector; factors found are
 *      reported through RES.
 *
 * Parameters:
 *   exponent        - Mersenne exponent p; candidates f are tested against 2^p - 1
 *   k_base          - k value of the first candidate of this grid (15-bit limbs)
 *   bit_array       - sieve bitmap produced by the GPU sieve
 *   bits_to_process - number of sieve bits handled per work-group
 *   smem            - local scratch receiving the extracted bit indexes
 *   shiftcount      - number of exponent bits preprocessed on the host; seeds
 *                     the initial shifter value below
 *   b_in            - precomputed start value for the exponentiation
 *                     (semantics live in check_barrett15_70 -- not verifiable here)
 *   RES             - global result buffer for found factors
 *   bit_max65       - presumably bit_max - 65 (name suggests); forwarded
 *                     unchanged to check_barrett15_70
 *   shared_mem_allocated - only used to verify assumptions (trace output)
 */
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_70_gs(const uint exponent, const int75_t k_base,
const __global uint * restrict bit_array,
const uint bits_to_process, __local ushort *smem,
const int shiftcount, const uint8 b_in,
__global uint * restrict RES, const int bit_max65,
const uint shared_mem_allocated // only used to verify assumptions
MODBASECASE_PAR_DEF )
{
__private uint i, initial_shifter_value, total_bit_count;
__local ushort bitcount[256]; // Each thread of our block puts bit-counts here
__private int75_v k, f;
__private uint tid, lid=get_local_id(0);
__private int75_t exp75;
// global thread id = group id * group size + local id
tid = mad24(get_group_id(0), get_local_size(0), lid);
#if (TRACE_SIEVE_KERNEL > 0)
if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_70_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif
// extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);
// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.
// Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
// Compute factor corresponding to first sieve bit in this block.
initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value
exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF; // exp75 = 2 * exponent // PERF: exp.d1=amd_bfe(exp, 15, 14)
#if (TRACE_KERNEL > 0)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif
// each thread walks the extracted candidates with stride 256*VECTOR_SIZE
for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE) // VECTOR_SIZE*THREADS_PER_BLOCK
{
// if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
// this can result in the same factor being reported up to VECTOR_SIZE times.
uint_v k_delta;
// Get the (k - k_base) value to test
#if (VECTOR_SIZE == 1)
k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif
// Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
// k = k_base + NUM_CLASSES * k_delta, split into 15-bit limbs with carry propagation.
k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0); // k_delta can exceed 2^24
k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1
k.d2 = (k.d1 >> 15) + k_base.d2;
k.d3 = (k.d2 >> 15) + k_base.d3;
k.d4 = (k.d3 >> 15) + k_base.d4;
k.d0 &= 0x7FFF;
k.d1 &= 0x7FFF;
k.d2 &= 0x7FFF;
k.d3 &= 0x7FFF;
// 75-bit schoolbook multiply f = k * exp75 + 1, limb by limb, carries via >>15
f.d0 = mad24(k.d0, exp75.d0, 1u); // exp75 = 2*exponent ==> f = 2kp+1
f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
f.d1 = mad24(k.d0, exp75.d1, f.d1);
f.d0 &= 0x7FFF;
f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
f.d2 = mad24(k.d1, exp75.d1, f.d2);
f.d2 = mad24(k.d0, exp75.d2, f.d2); // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
f.d1 &= 0x7FFF;
f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
f.d3 = mad24(k.d2, exp75.d1, f.d3);
f.d3 = mad24(k.d1, exp75.d2, f.d3);
// f.d3 = mad24(k.d0, exp75.d3, f.d3); // exp75.d3 = 0
f.d2 &= 0x7FFF;
f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above
f.d4 = mad24(k.d3, exp75.d1, f.d4);
f.d4 = mad24(k.d2, exp75.d2, f.d4);
f.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 1)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_70_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x\n",
i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
// run the barrett powmod test on this candidate; results go to RES
check_barrett15_70(initial_shifter_value, f, tid, b_in, bit_max65, RES
MODBASECASE_PAR);
}
}
/*
 * cl_barrett15_71_gs: GPU-sieve trial-factoring kernel, 15-bit-limb barrett
 * variant for the "71-bit" factor range (the modular test itself is done by
 * check_barrett15_71, defined elsewhere in this file set).
 *
 * Per 256-thread work-group: extract set sieve bits into smem
 * (extract_bits, gpusieve.cl), build k = k_base + NUM_CLASSES*k_delta and
 * the candidate f = 2*k*exponent + 1 in five 15-bit limbs, then hand each
 * candidate vector to check_barrett15_71.  Factors are reported through RES.
 *
 * Parameters mirror cl_barrett15_70_gs: exponent is the Mersenne exponent;
 * k_base is the k of the first candidate of this grid; bit_array is the
 * sieve bitmap; bits_to_process is the per-group bit count; shiftcount
 * seeds the initial shifter; b_in is a precomputed start value for the
 * exponentiation (semantics in check_barrett15_71 -- not verifiable here);
 * bit_max65 is presumably bit_max - 65 (name suggests), forwarded
 * unchanged; shared_mem_allocated is only used to verify assumptions.
 */
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_71_gs(const uint exponent, const int75_t k_base,
const __global uint * restrict bit_array,
const uint bits_to_process, __local ushort *smem,
const int shiftcount, const uint8 b_in,
__global uint * restrict RES, const int bit_max65,
const uint shared_mem_allocated // only used to verify assumptions
MODBASECASE_PAR_DEF )
{
__private uint i, initial_shifter_value, total_bit_count;
__local ushort bitcount[256]; // Each thread of our block puts bit-counts here
__private int75_v k, f;
__private uint tid, lid=get_local_id(0);
__private int75_t exp75;
// global thread id = group id * group size + local id
tid = mad24(get_group_id(0), get_local_size(0), lid);
#if (TRACE_SIEVE_KERNEL > 0)
if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_71_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif
// extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);
// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.
// Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
// Compute factor corresponding to first sieve bit in this block.
initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value
exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF; // exp75 = 2 * exponent // PERF: exp.d1=amd_bfe(exp, 15, 14)
#if (TRACE_KERNEL > 0)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif
// each thread walks the extracted candidates with stride 256*VECTOR_SIZE
for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE) // VECTOR_SIZE*THREADS_PER_BLOCK
{
// if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
// this can result in the same factor being reported up to VECTOR_SIZE times.
uint_v k_delta;
// Get the (k - k_base) value to test
#if (VECTOR_SIZE == 1)
k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif
// Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
// k = k_base + NUM_CLASSES * k_delta, split into 15-bit limbs with carry propagation.
k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0); // k_delta can exceed 2^24
k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1
k.d2 = (k.d1 >> 15) + k_base.d2;
k.d3 = (k.d2 >> 15) + k_base.d3;
k.d4 = (k.d3 >> 15) + k_base.d4;
k.d0 &= 0x7FFF;
k.d1 &= 0x7FFF;
k.d2 &= 0x7FFF;
k.d3 &= 0x7FFF;
// 75-bit schoolbook multiply f = k * exp75 + 1, limb by limb, carries via >>15
f.d0 = mad24(k.d0, exp75.d0, 1u); // exp75 = 2*exponent ==> f = 2kp+1
f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
f.d1 = mad24(k.d0, exp75.d1, f.d1);
f.d0 &= 0x7FFF;
f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
f.d2 = mad24(k.d1, exp75.d1, f.d2);
f.d2 = mad24(k.d0, exp75.d2, f.d2); // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
f.d1 &= 0x7FFF;
f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
f.d3 = mad24(k.d2, exp75.d1, f.d3);
f.d3 = mad24(k.d1, exp75.d2, f.d3);
// f.d3 = mad24(k.d0, exp75.d3, f.d3); // exp75.d3 = 0
f.d2 &= 0x7FFF;
f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above
f.d4 = mad24(k.d3, exp75.d1, f.d4);
f.d4 = mad24(k.d2, exp75.d2, f.d4);
f.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_71_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x\n",
i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
// run the barrett powmod test on this candidate; results go to RES
check_barrett15_71(initial_shifter_value, f, tid, b_in, bit_max65, RES
MODBASECASE_PAR);
}
}
/*
 * cl_barrett15_73_gs: GPU-sieve trial-factoring kernel, 15-bit-limb barrett
 * variant for the "73-bit" factor range (the modular test itself is done by
 * check_barrett15_73, defined elsewhere in this file set).
 *
 * Per 256-thread work-group: extract set sieve bits into smem
 * (extract_bits, gpusieve.cl), build k = k_base + NUM_CLASSES*k_delta and
 * the candidate f = 2*k*exponent + 1 in five 15-bit limbs, then hand each
 * candidate vector to check_barrett15_73.  Factors are reported through RES.
 *
 * Parameters mirror cl_barrett15_70_gs: exponent is the Mersenne exponent;
 * k_base is the k of the first candidate of this grid; bit_array is the
 * sieve bitmap; bits_to_process is the per-group bit count; shiftcount
 * seeds the initial shifter; b_in is a precomputed start value for the
 * exponentiation (semantics in check_barrett15_73 -- not verifiable here);
 * bit_max65 is presumably bit_max - 65 (name suggests), forwarded
 * unchanged; shared_mem_allocated is only used to verify assumptions.
 */
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_73_gs(const uint exponent, const int75_t k_base,
const __global uint * restrict bit_array,
const uint bits_to_process, __local ushort *smem,
const int shiftcount, const uint8 b_in,
__global uint * restrict RES, const int bit_max65,
const uint shared_mem_allocated // only used to verify assumptions
MODBASECASE_PAR_DEF )
{
__private uint i, initial_shifter_value, total_bit_count;
__local ushort bitcount[256]; // Each thread of our block puts bit-counts here
__private int75_v k, f;
__private uint tid, lid=get_local_id(0);
__private int75_t exp75;
// global thread id = group id * group size + local id
tid = mad24(get_group_id(0), get_local_size(0), lid);
#if (TRACE_SIEVE_KERNEL > 0)
if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_73_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif
// extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);
// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.
// Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
// Compute factor corresponding to first sieve bit in this block.
initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value
exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF; // exp75 = 2 * exponent // PERF: exp.d1=amd_bfe(exp, 15, 14)
#if (TRACE_KERNEL > 0)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif
// each thread walks the extracted candidates with stride 256*VECTOR_SIZE
for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE) // VECTOR_SIZE*THREADS_PER_BLOCK
{
// if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
// this can result in the same factor being reported up to VECTOR_SIZE times.
uint_v k_delta;
// Get the (k - k_base) value to test
#if (VECTOR_SIZE == 1)
k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif
// Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
// k = k_base + NUM_CLASSES * k_delta, split into 15-bit limbs with carry propagation.
k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0); // k_delta can exceed 2^24
k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1
k.d2 = (k.d1 >> 15) + k_base.d2;
k.d3 = (k.d2 >> 15) + k_base.d3;
k.d4 = (k.d3 >> 15) + k_base.d4;
k.d0 &= 0x7FFF;
k.d1 &= 0x7FFF;
k.d2 &= 0x7FFF;
k.d3 &= 0x7FFF;
// 75-bit schoolbook multiply f = k * exp75 + 1, limb by limb, carries via >>15
f.d0 = mad24(k.d0, exp75.d0, 1u); // exp75 = 2*exponent ==> f = 2kp+1
f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
f.d1 = mad24(k.d0, exp75.d1, f.d1);
f.d0 &= 0x7FFF;
f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
f.d2 = mad24(k.d1, exp75.d1, f.d2);
f.d2 = mad24(k.d0, exp75.d2, f.d2); // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
f.d1 &= 0x7FFF;
f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
f.d3 = mad24(k.d2, exp75.d1, f.d3);
f.d3 = mad24(k.d1, exp75.d2, f.d3);
// f.d3 = mad24(k.d0, exp75.d3, f.d3); // exp75.d3 = 0
f.d2 &= 0x7FFF;
f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above
f.d4 = mad24(k.d3, exp75.d1, f.d4);
f.d4 = mad24(k.d2, exp75.d2, f.d4);
f.d3 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_73_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x\n",
i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
// run the barrett powmod test on this candidate; results go to RES
check_barrett15_73(initial_shifter_value, f, tid, b_in, bit_max65, RES
MODBASECASE_PAR);
}
}
| __kernel void __attribute__((reqd_work_group_size(256, 1, 1))) | |
| cl_barrett15_74_gs(const uint exponent, const int75_t k_base, | |
| const __global uint * restrict bit_array, | |
| const uint bits_to_process, __local ushort *smem, | |
| const int shiftcount, const uint8 b_in, | |
| __global uint * restrict RES, const int bit_max65, | |
| const uint shared_mem_allocated // only used to verify assumptions | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private uint i, initial_shifter_value, total_bit_count; | |
| __local ushort bitcount[256]; // Each thread of our block puts bit-counts here | |
| __private int75_v k, f; | |
| __private uint tid, lid=get_local_id(0); | |
| __private int75_t exp75; | |
| tid = mad24(get_group_id(0), get_local_size(0), lid); | |
| // extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl) | |
| total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array); | |
| // Here, all warps in our block have placed their candidates in shared memory. | |
| // Now we can start TFing candidates. | |
| // Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4 | |
| // Compute factor corresponding to first sieve bit in this block. | |
| initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value | |
| exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF; // exp75 = 2 * exponent // PERF: exp.d1=amd_bfe(exp, 15, 14) | |
| #if (TRACE_KERNEL > 0) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n", | |
| exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated); | |
| #endif | |
| for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE) // VECTOR_SIZE*THREADS_PER_BLOCK | |
| { | |
| // if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized). | |
| // this can result in the same factor being reported up to VECTOR_SIZE times. | |
| uint_v k_delta; | |
| // Get the (k - k_base) value to test | |
| #if (VECTOR_SIZE == 1) | |
| k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| #elif (VECTOR_SIZE == 2) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| #elif (VECTOR_SIZE == 3) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| #elif (VECTOR_SIZE == 4) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| #elif (VECTOR_SIZE == 8) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4])); | |
| k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5])); | |
| k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6])); | |
| k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7])); | |
| #elif (VECTOR_SIZE == 16) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4])); | |
| k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5])); | |
| k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6])); | |
| k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7])); | |
| k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8])); | |
| k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9])); | |
| k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10])); | |
| k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11])); | |
| k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12])); | |
| k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13])); | |
| k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14])); | |
| k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15])); | |
| #endif | |
| // Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp. | |
| k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0); // k_delta can exceed 2^24 | |
| k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1 | |
| k.d2 = (k.d1 >> 15) + k_base.d2; | |
| k.d3 = (k.d2 >> 15) + k_base.d3; | |
| k.d4 = (k.d3 >> 15) + k_base.d4; | |
| k.d0 &= 0x7FFF; | |
| k.d1 &= 0x7FFF; | |
| k.d2 &= 0x7FFF; | |
| k.d3 &= 0x7FFF; | |
| f.d0 = mad24(k.d0, exp75.d0, 1u); // exp75 = 2*exponent ==> f = 2kp+1 | |
| f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15); | |
| f.d1 = mad24(k.d0, exp75.d1, f.d1); | |
| f.d0 &= 0x7FFF; | |
| f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15); | |
| f.d2 = mad24(k.d1, exp75.d1, f.d2); | |
| f.d2 = mad24(k.d0, exp75.d2, f.d2); // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above. | |
| f.d1 &= 0x7FFF; | |
| f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15); | |
| f.d3 = mad24(k.d2, exp75.d1, f.d3); | |
| f.d3 = mad24(k.d1, exp75.d2, f.d3); | |
| // f.d3 = mad24(k.d0, exp75.d3, f.d3); // exp75.d3 = 0 | |
| f.d2 &= 0x7FFF; | |
| f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above | |
| f.d4 = mad24(k.d3, exp75.d1, f.d4); | |
| f.d4 = mad24(k.d2, exp75.d2, f.d4); | |
| f.d3 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_74_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x\n", | |
| i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0)); | |
| #endif | |
| check_barrett15_74(initial_shifter_value, f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| } | |
| /**************************************** | |
| **************************************** | |
| * 15-bit based 90-bit barrett-kernels based on GPU sieve | |
| * | |
| **************************************** | |
| ****************************************/ | |
| __kernel void __attribute__((reqd_work_group_size(256, 1, 1))) | |
| cl_barrett15_82_gs(const uint exponent, const int75_t k_base, | |
| const __global uint * restrict bit_array, | |
| const uint bits_to_process, __local ushort *smem, | |
| const int shiftcount, const uint8 b_in, | |
| __global uint * restrict RES, const int bit_max65, | |
| const uint shared_mem_allocated // only used to verify assumptions | |
| MODBASECASE_PAR_DEF ) | |
| { | |
| __private uint i, initial_shifter_value, total_bit_count; | |
| __local ushort bitcount[256]; // Each thread of our block puts bit-counts here | |
| __private int75_v k; | |
| __private int90_v f; | |
| __private uint tid, lid=get_local_id(0); | |
| __private int75_t exp75; | |
| tid = mad24(get_group_id(0), get_local_size(0), lid); | |
| #if (TRACE_SIEVE_KERNEL > 0) | |
| if (tid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_82_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n", | |
| exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array); | |
| #endif | |
| // extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl) | |
| total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array); | |
| // Here, all warps in our block have placed their candidates in shared memory. | |
| // Now we can start TFing candidates. | |
| // Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4 | |
| // Compute factor corresponding to first sieve bit in this block. | |
| initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value | |
| exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF; // exp75 = 2 * exponent // PERF: exp.d1=amd_bfe(exp, 15, 14) | |
| #if (TRACE_KERNEL > 0) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n", | |
| exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated); | |
| #endif | |
| for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE) // VECTOR_SIZE*THREADS_PER_BLOCK | |
| { | |
| // if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized). | |
| // this can result in the same factor being reported up to VECTOR_SIZE times. | |
| uint_v k_delta; | |
| // Get the (k - k_base) value to test | |
| #if (VECTOR_SIZE == 1) | |
| k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| #elif (VECTOR_SIZE == 2) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| #elif (VECTOR_SIZE == 3) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| #elif (VECTOR_SIZE == 4) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| #elif (VECTOR_SIZE == 8) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4])); | |
| k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5])); | |
| k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6])); | |
| k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7])); | |
| #elif (VECTOR_SIZE == 16) | |
| k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i])); | |
| k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1])); | |
| k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2])); | |
| k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3])); | |
| k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4])); | |
| k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5])); | |
| k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6])); | |
| k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7])); | |
| k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8])); | |
| k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9])); | |
| k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10])); | |
| k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11])); | |
| k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12])); | |
| k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13])); | |
| k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14])); | |
| k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15])); | |
| #endif | |
| // Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp. | |
| k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0); // k_delta can exceed 2^24 | |
| k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1 | |
| k.d2 = (k.d1 >> 15) + k_base.d2; | |
| k.d3 = (k.d2 >> 15) + k_base.d3; | |
| k.d4 = (k.d3 >> 15) + k_base.d4; | |
| k.d0 &= 0x7FFF; | |
| k.d1 &= 0x7FFF; | |
| k.d2 &= 0x7FFF; | |
| k.d3 &= 0x7FFF; | |
| f.d0 = mad24(k.d0, exp75.d0, 1u); // exp75 = 2*exponent ==> f = 2kp+1 | |
| f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15); | |
| f.d1 = mad24(k.d0, exp75.d1, f.d1); | |
| f.d0 &= 0x7FFF; | |
| f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15); | |
| f.d2 = mad24(k.d1, exp75.d1, f.d2); | |
| f.d2 = mad24(k.d0, exp75.d2, f.d2); // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above. | |
| f.d1 &= 0x7FFF; | |
| f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15); | |
| f.d3 = mad24(k.d2, exp75.d1, f.d3); | |
| f.d3 = mad24(k.d1, exp75.d2, f.d3); | |
| // f.d3 = mad24(k.d0, exp75.d3, f.d3); // exp75.d3 = 0 | |
| f.d2 &= 0x7FFF; | |
| f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above | |
| f.d4 = mad24(k.d3, exp75.d1, f.d4); | |
| f.d4 = mad24(k.d2, exp75.d2, f.d4); | |
| f.d3 &= 0x7FFF; | |
| f.d5 = mad24(k.d4, exp75.d1, f.d4 >> 15); // PERF: see above | |
| f.d5 = mad24(k.d3, exp75.d2, f.d5); | |
| f.d4 &= 0x7FFF; | |
| #if (TRACE_KERNEL > 2) | |
| if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_82_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x:%x\n", | |
| i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0)); | |
| #endif | |
| check_barrett15_82(initial_shifter_value, f, tid, b_in, bit_max65, RES | |
| MODBASECASE_PAR); | |
| } | |
| } | |
// cl_barrett15_83_gs: GPU-sieve variant of the 15-bit-limb Barrett kernel for
// trial factors up to 2^83.  Each 256-thread work-group extracts the set bits
// of the pre-sieved bit_array into local memory, reconstructs each surviving
// candidate k = k_base + NUM_CLASSES * k_delta, builds the trial factor
// f = 2*k*exponent + 1 in 15-bit limbs (int90_v), and hands f to
// check_barrett15_83 for the actual Barrett-reduction powmod test.
//
// Parameters:
//   exponent             Mersenne exponent p; candidates have the form f = 2kp+1.
//   k_base               base k for this grid launch, split into 15-bit limbs.
//   bit_array            global sieve bitmap; set bits mark surviving k offsets.
//   bits_to_process      number of sieve bits each work-group processes.
//   smem                 local scratch receiving the extracted bit positions.
//   shiftcount           count of exponent bits already consumed on the host;
//                        used to form the initial shifter value below.
//   b_in                 precomputed Barrett constant (8 x 32 bit), passed
//                        through unchanged to check_barrett15_83.
//   RES                  global result buffer where found factors are reported.
//   bit_max65            factor bit length minus 65; steers the Barrett shifts
//                        inside check_barrett15_83.
//   shared_mem_allocated only used in trace output to verify assumptions.
//   MODBASECASE_PAR_DEF  optional debug/check parameters (project macro,
//                        defined elsewhere in this file set).
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_83_gs(const uint exponent, const int75_t k_base,
                   const __global uint * restrict bit_array,
                   const uint bits_to_process, __local ushort *smem,
                   const int shiftcount, const uint8 b_in,
                   __global uint * restrict RES, const int bit_max65,
                   const uint shared_mem_allocated // only used to verify assumptions
                   MODBASECASE_PAR_DEF )
{
  __private uint i, initial_shifter_value, total_bit_count;
  __local ushort bitcount[256];  // Each thread of our block puts bit-counts here
  __private int75_v k;           // candidate k, five 15-bit limbs (vectorized)
  __private int90_v f;           // trial factor 2kp+1, six 15-bit limbs (vectorized)
  __private uint tid, lid=get_local_id(0);
  __private int75_t exp75;       // 2*exponent, split into 15-bit limbs
  tid = mad24(get_group_id(0), get_local_size(0), lid);
#if (TRACE_SIEVE_KERNEL > 0)
  // NOTE(review): this guard compares lid while the TRACE_KERNEL guard below
  // compares tid -- confirm which thread is meant to emit the sieve trace.
  if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_83_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
        exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif
  // extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
  total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);
  // Here, all warps in our block have placed their candidates in shared memory.
  // Now we can start TFing candidates.
  // Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
  // Compute factor corresponding to first sieve bit in this block.
  initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value
  // exp75 = 2 * exponent spread over three 15-bit limbs:
  // d0 = bits 0..13 of exponent shifted up by one, d1 = bits 14..28, d2 = bits 29+.
  exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF;	// exp75 = 2 * exponent  // PERF: exp.d1=amd_bfe(exp, 15, 14)
#if (TRACE_KERNEL > 0)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
        exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif
  // Each thread walks the extracted candidates in strides of one work-group's
  // worth of vector lanes: 256 threads * VECTOR_SIZE candidates per iteration.
  for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE)	// VECTOR_SIZE*THREADS_PER_BLOCK
  {
    // if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
    // this can result in the same factor being reported up to VECTOR_SIZE times.
    uint_v k_delta;
    // Get the (k - k_base) value to test: the sieve-bit position within this
    // group plus the group's offset into the overall sieve range.
#if (VECTOR_SIZE == 1)
    k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
    k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
    k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
    k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
    k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
    k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
    k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
    k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
    k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif
    // Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
    // First rebuild k = k_base + NUM_CLASSES * k_delta in 15-bit limbs;
    // k_delta is split at bit 15 so each mad24 stays within 24-bit operands.
    k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0);           // k_delta can exceed 2^24
    k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1
    k.d2 = (k.d1 >> 15) + k_base.d2;
    k.d3 = (k.d2 >> 15) + k_base.d3;
    k.d4 = (k.d3 >> 15) + k_base.d4;
    k.d0 &= 0x7FFF;
    k.d1 &= 0x7FFF;
    k.d2 &= 0x7FFF;
    k.d3 &= 0x7FFF;
    // Schoolbook 15-bit-limb multiply f = k * exp75 + 1, carrying via >>15 and
    // masking each limb after its last partial product has been accumulated.
    f.d0 = mad24(k.d0, exp75.d0, 1u);        // exp75 = 2*exponent ==> f = 2kp+1
    f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
    f.d1 = mad24(k.d0, exp75.d1, f.d1);
    f.d0 &= 0x7FFF;
    f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
    f.d2 = mad24(k.d1, exp75.d1, f.d2);
    f.d2 = mad24(k.d0, exp75.d2, f.d2);      // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
    f.d1 &= 0x7FFF;
    f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
    f.d3 = mad24(k.d2, exp75.d1, f.d3);
    f.d3 = mad24(k.d1, exp75.d2, f.d3);
    // f.d3 = mad24(k.d0, exp75.d3, f.d3);   // exp75.d3 = 0
    f.d2 &= 0x7FFF;
    f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above
    f.d4 = mad24(k.d3, exp75.d1, f.d4);
    f.d4 = mad24(k.d2, exp75.d2, f.d4);
    f.d3 &= 0x7FFF;
    f.d5 = mad24(k.d4, exp75.d1, f.d4 >> 15); // PERF: see above
    f.d5 = mad24(k.d3, exp75.d2, f.d5);
    f.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 2)
    if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_83_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x:%x\n",
        i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
    // Run the Barrett powmod test on this candidate; a found factor is
    // reported into RES inside check_barrett15_83.
    check_barrett15_83(initial_shifter_value, f, tid, b_in, bit_max65, RES
                       MODBASECASE_PAR);
  }
}
// cl_barrett15_88_gs: GPU-sieve variant of the 15-bit-limb Barrett kernel for
// trial factors up to 2^88.  Structurally identical to the _83 variant above:
// each 256-thread work-group extracts the set bits of the pre-sieved
// bit_array into local memory, reconstructs each surviving candidate
// k = k_base + NUM_CLASSES * k_delta, builds the trial factor
// f = 2*k*exponent + 1 in 15-bit limbs (int90_v), and hands f to
// check_barrett15_88 for the actual Barrett-reduction powmod test.
//
// Parameters:
//   exponent             Mersenne exponent p; candidates have the form f = 2kp+1.
//   k_base               base k for this grid launch, split into 15-bit limbs.
//   bit_array            global sieve bitmap; set bits mark surviving k offsets.
//   bits_to_process      number of sieve bits each work-group processes.
//   smem                 local scratch receiving the extracted bit positions.
//   shiftcount           count of exponent bits already consumed on the host;
//                        used to form the initial shifter value below.
//   b_in                 precomputed Barrett constant (8 x 32 bit), passed
//                        through unchanged to check_barrett15_88.
//   RES                  global result buffer where found factors are reported.
//   bit_max65            factor bit length minus 65; steers the Barrett shifts
//                        inside check_barrett15_88.
//   shared_mem_allocated only used in trace output to verify assumptions.
//   MODBASECASE_PAR_DEF  optional debug/check parameters (project macro,
//                        defined elsewhere in this file set).
__kernel void __attribute__((reqd_work_group_size(256, 1, 1)))
cl_barrett15_88_gs(const uint exponent, const int75_t k_base,
                   const __global uint * restrict bit_array,
                   const uint bits_to_process, __local ushort *smem,
                   const int shiftcount, const uint8 b_in,
                   __global uint * restrict RES, const int bit_max65,
                   const uint shared_mem_allocated // only used to verify assumptions
                   MODBASECASE_PAR_DEF )
{
  __private uint i, initial_shifter_value, total_bit_count;
  __local ushort bitcount[256];  // Each thread of our block puts bit-counts here
  __private int75_v k;           // candidate k, five 15-bit limbs (vectorized)
  __private int90_v f;           // trial factor 2kp+1, six 15-bit limbs (vectorized)
  __private uint tid, lid=get_local_id(0);
  __private int75_t exp75;       // 2*exponent, split into 15-bit limbs
  tid = mad24(get_group_id(0), get_local_size(0), lid);
#if (TRACE_SIEVE_KERNEL > 0)
  // NOTE(review): this guard compares lid while the TRACE_KERNEL guard below
  // compares tid -- confirm which thread is meant to emit the sieve trace.
  if (lid==TRACE_SIEVE_TID) printf((__constant char *)"cl_barrett15_88_gs: exp=%d=%#x, k=%x:%x:%x, bits=%d, shift=%d, bit_max65=%d, b_in=%x:%x:%x:%x:%x:%x:%x:%x, base addr=%#x\n",
        exponent, exponent, k_base.d2, k_base.d1, k_base.d0, bits_to_process, shiftcount, bit_max65, b_in.s7, b_in.s6, b_in.s5, b_in.s4, b_in.s3, b_in.s2, b_in.s1, b_in.s0, bit_array);
#endif
  // extract the bits set in bit_array into smem and get the total count (call to gpusieve.cl)
  total_bit_count = extract_bits(bits_to_process, tid, lid, bitcount, smem, bit_array);
  // Here, all warps in our block have placed their candidates in shared memory.
  // Now we can start TFing candidates.
  // Init some stuff that will be used for all k's tested <== this makes the OpenCL compiler abort, supposed to be fixed in Cat 13.4
  // Compute factor corresponding to first sieve bit in this block.
  initial_shifter_value = exponent << (32 - shiftcount); // Initial shifter value
  // exp75 = 2 * exponent spread over three 15-bit limbs:
  // d0 = bits 0..13 of exponent shifted up by one, d1 = bits 14..28, d2 = bits 29+.
  exp75.d2=exponent>>29;exp75.d1=(exponent>>14)&0x7FFF;exp75.d0=(exponent<<1)&0x7FFF;	// exp75 = 2 * exponent  // PERF: exp.d1=amd_bfe(exp, 15, 14)
#if (TRACE_KERNEL > 0)
  if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88_gs: exp=%u, shift=%d, shifted exp=%#x, total_bit_count=%u, shared_mem_size=%u\n",
        exponent, shiftcount, initial_shifter_value, total_bit_count, shared_mem_allocated);
#endif
  // Each thread walks the extracted candidates in strides of one work-group's
  // worth of vector lanes: 256 threads * VECTOR_SIZE candidates per iteration.
  for (i = lid*VECTOR_SIZE; i < total_bit_count; i += 256*VECTOR_SIZE)	// VECTOR_SIZE*THREADS_PER_BLOCK
  {
    // if i == total_bit_count-1, then we may read up to VECTOR_SIZE-1 elements beyond the array (uninitialized).
    // this can result in the same factor being reported up to VECTOR_SIZE times.
    uint_v k_delta;
    // Get the (k - k_base) value to test: the sieve-bit position within this
    // group plus the group's offset into the overall sieve range.
#if (VECTOR_SIZE == 1)
    k_delta = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
#elif (VECTOR_SIZE == 2)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
#elif (VECTOR_SIZE == 3)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
#elif (VECTOR_SIZE == 4)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
#elif (VECTOR_SIZE == 8)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
#elif (VECTOR_SIZE == 16)
    k_delta.s0 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i]));
    k_delta.s1 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+1]));
    k_delta.s2 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+2]));
    k_delta.s3 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+3]));
    k_delta.s4 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+4]));
    k_delta.s5 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+5]));
    k_delta.s6 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+6]));
    k_delta.s7 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+7]));
    k_delta.s8 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+8]));
    k_delta.s9 = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+9]));
    k_delta.sa = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+10]));
    k_delta.sb = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+11]));
    k_delta.sc = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+12]));
    k_delta.sd = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+13]));
    k_delta.se = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+14]));
    k_delta.sf = mad24(bits_to_process, (uint)get_group_id(0), (uint)(smem[i+15]));
#endif
    // Compute new f. This is computed as f = f_base + 2 * (k - k_base) * exp.
    // First rebuild k = k_base + NUM_CLASSES * k_delta in 15-bit limbs;
    // k_delta is split at bit 15 so each mad24 stays within 24-bit operands.
    k.d0 = mad24(NUM_CLASSES, k_delta & 0x7FFF, k_base.d0);           // k_delta can exceed 2^24
    k.d1 = (k.d0 >> 15) + mad24(NUM_CLASSES, k_delta >> 15, k_base.d1); // k is limited to 2^64 -1
    k.d2 = (k.d1 >> 15) + k_base.d2;
    k.d3 = (k.d2 >> 15) + k_base.d3;
    k.d4 = (k.d3 >> 15) + k_base.d4;
    k.d0 &= 0x7FFF;
    k.d1 &= 0x7FFF;
    k.d2 &= 0x7FFF;
    k.d3 &= 0x7FFF;
    // Schoolbook 15-bit-limb multiply f = k * exp75 + 1, carrying via >>15 and
    // masking each limb after its last partial product has been accumulated.
    f.d0 = mad24(k.d0, exp75.d0, 1u);        // exp75 = 2*exponent ==> f = 2kp+1
    f.d1 = mad24(k.d1, exp75.d0, f.d0 >> 15);
    f.d1 = mad24(k.d0, exp75.d1, f.d1);
    f.d0 &= 0x7FFF;
    f.d2 = mad24(k.d2, exp75.d0, f.d1 >> 15);
    f.d2 = mad24(k.d1, exp75.d1, f.d2);
    f.d2 = mad24(k.d0, exp75.d2, f.d2);      // PERF: if we limit exp at kernel compile time to 2^29, then we can skip exp75.d2 here and above.
    f.d1 &= 0x7FFF;
    f.d3 = mad24(k.d3, exp75.d0, f.d2 >> 15);
    f.d3 = mad24(k.d2, exp75.d1, f.d3);
    f.d3 = mad24(k.d1, exp75.d2, f.d3);
    // f.d3 = mad24(k.d0, exp75.d3, f.d3);   // exp75.d3 = 0
    f.d2 &= 0x7FFF;
    f.d4 = mad24(k.d4, exp75.d0, f.d3 >> 15); // PERF: see above
    f.d4 = mad24(k.d3, exp75.d1, f.d4);
    f.d4 = mad24(k.d2, exp75.d2, f.d4);
    f.d3 &= 0x7FFF;
    f.d5 = mad24(k.d4, exp75.d1, f.d4 >> 15); // PERF: see above
    f.d5 = mad24(k.d3, exp75.d2, f.d5);
    f.d4 &= 0x7FFF;
#if (TRACE_KERNEL > 1)
    if (tid==TRACE_TID) printf((__constant char *)"cl_barrett15_88_gs: x: smem[%d]=%d, k_delta=%d, k=%x:%x:%x:%x:%x, f=%x:%x:%x:%x:%x:%x\n",
        i, smem[i], V(k_delta), V(k.d4), V(k.d3), V(k.d2), V(k.d1), V(k.d0), V(f.d5), V(f.d4), V(f.d3), V(f.d2), V(f.d1), V(f.d0));
#endif
    // Run the Barrett powmod test on this candidate; a found factor is
    // reported into RES inside check_barrett15_88.
    check_barrett15_88(initial_shifter_value, f, tid, b_in, bit_max65, RES
                       MODBASECASE_PAR);
  }
}
| #endif |