Skip to content

Commit

Permalink
Alternate SSE unrolling, looks to be marginally better, though very
Browse files Browse the repository at this point in the history
close to measurement error.
  • Loading branch information
jedbrown committed May 10, 2009
1 parent b421aaf commit 4a40385
Showing 1 changed file with 51 additions and 4 deletions.
55 changes: 51 additions & 4 deletions src/jacobi/impls/tensor/inlinetmulthex.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ static dErr TensorMult_Hex_P4_Q4_D1(dInt D_is_1,const dInt P[3],const dInt Q[3],
const dReal (*restrict Ay)[P[1]] = (const dReal (*)[P[1]])A[1];
const dReal (*restrict Az)[P2] = (const dReal (*)[P2])A[2];
dErr err;
/* #define VERBOSE_TIMING 1 */
#ifdef VERBOSE_TIMING
uint32_t time0,time1,time2,time3;
#endif

dFunctionBegin;
if (P[2] != P2 || Q[2] != Q2 || D_is_1 != D) dERROR(1,"input sizes do not agree with unrolled sizes");
Expand All @@ -124,7 +128,7 @@ static dErr TensorMult_Hex_P4_Q4_D1(dInt D_is_1,const dInt P[3],const dInt Q[3],
}

#ifdef VERBOSE_TIMING
uint32_t time0 = read_time(),time1,time2,time3;
time0 = read_time();
#endif

#if 0
Expand Down Expand Up @@ -197,6 +201,32 @@ static dErr TensorMult_Hex_P4_Q4_D1(dInt D_is_1,const dInt P[3],const dInt Q[3],
_mm_prefetch((const char*)out,_MM_HINT_T1); /* Make sure it is at least in L2 when we get to it after this loop */

#if 0
for (dInt i=0; i<Q[0]; i++) {
const dInt JKD = P[1]*P2*D,KD = P2*D;
dScalar (*restrict aa)[KD] = (dScalar(*)[4])&amem[i*JKD];
dScalar (*restrict bb)[KD] = (dScalar(*)[4])&bmem[(i*Q[1])*KD];
for (dInt l=0; l<Q[1]; l++) {
for (dInt j=0; j<P[1]; j++) {
for (dInt k=0; k<KD; k++) {
bb[l][k] += Ay[l][j] * aa[j][k];
}
}
}
}
#elif 0
if (P[1] != 4) dERROR(1,"not supported");
for (dInt i=0; i<Q[0]; i++) {
const dInt JKD = P[1]*P2*D,KD = P2*D;
dScalar (*restrict aa)[KD] = (dScalar(*)[4])&amem[i*JKD];
dScalar (*restrict bb)[KD] = (dScalar(*)[4])&bmem[(i*Q[1])*KD];
for (dInt l=0; l<Q[1]; l++) { /* \a k is unrolled down, \a j is unrolled across */
bb[l][0] = Ay[l][0]*aa[0][0] + Ay[l][1]*aa[1][0] + Ay[l][2]*aa[2][0] + Ay[l][3]*aa[3][0];
bb[l][1] = Ay[l][0]*aa[0][1] + Ay[l][1]*aa[1][1] + Ay[l][2]*aa[2][1] + Ay[l][3]*aa[3][1];
bb[l][2] = Ay[l][0]*aa[0][2] + Ay[l][1]*aa[1][2] + Ay[l][2]*aa[2][2] + Ay[l][3]*aa[3][2];
bb[l][3] = Ay[l][0]*aa[0][3] + Ay[l][1]*aa[1][3] + Ay[l][2]*aa[2][3] + Ay[l][3]*aa[3][3];
}
}
#elif 0
for (dInt i=0; i<Q[0]; i++) {
const dInt JKD = P[1]*P2*D,KD = P2*D;
dScalar *restrict aa = &amem[i*JKD];
Expand Down Expand Up @@ -234,15 +264,14 @@ static dErr TensorMult_Hex_P4_Q4_D1(dInt D_is_1,const dInt P[3],const dInt Q[3],
}
}
}
#else
#elif 0
for (dInt i=0; i<Q[0]; i++) {
const dInt JKD = P[1]*P2*D,KD = P2*D;
dScalar *restrict aa = &amem[i*JKD];
for (dInt l=0; l<Q[1]; l+=2) {
dScalar *restrict bb0 = &bmem[(i*Q[1]+l)*KD],*restrict bb1 = &bmem[(i*Q[1]+l+1)*KD];
//if ((size_t)bb0 & 0xf || (size_t)bb1 & 0xf) dERROR(1,"bmem is unaligned");
__m128d // bb00,bb02,bb10,bb12;
bb00 = _mm_load_pd(bb0),bb02=bb00,bb10=bb00,bb12=bb00;
__m128d bb00 = _mm_setzero_pd(),bb02=bb00,bb10=bb00,bb12=bb00;
for (dInt j=0; j<P[1]; j+=2) {
const dScalar *restrict aaa0 = &aa[j*KD],*restrict aaa1 = &aa[(j+1)*KD];
//if ((size_t)aaa0 & 0xf || (size_t)aaa1 & 0xf) dERROR(1,"aa is unaligned");
Expand All @@ -260,6 +289,24 @@ static dErr TensorMult_Hex_P4_Q4_D1(dInt D_is_1,const dInt P[3],const dInt Q[3],
_mm_store_pd(bb1+2,bb12);
}
}
#else
if (P[1] != P2) dERROR(1,"not supported");
for (dInt i=0; i<Q[0]; i++) {
const dInt JKD = P[1]*P2*D,KD = P2*D;
dScalar (*restrict aa)[KD] = (dScalar(*)[KD])&amem[i*JKD];
dScalar (*restrict bb)[KD] = (dScalar(*)[KD])&bmem[(i*Q[1])*KD];
__m128d
aa00 = _mm_load_pd(&aa[0][0]),aa02 = _mm_load_pd(&aa[0][2]),aa10 = _mm_load_pd(&aa[1][0]),aa12 = _mm_load_pd(&aa[1][2]),
aa20 = _mm_load_pd(&aa[2][0]),aa22 = _mm_load_pd(&aa[2][2]),aa30 = _mm_load_pd(&aa[3][0]),aa32 = _mm_load_pd(&aa[3][2]);
for (dInt l=0; l<Q[1]; l++) {
__m128d ayl0p = _mm_loaddup_pd(&Ay[l][0]),ayl1p = _mm_loaddup_pd(&Ay[l][1]),ayl2p = _mm_loaddup_pd(&Ay[l][2]),ayl3p = _mm_loaddup_pd(&Ay[l][3]);
__m128d t0,t2;
t0 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(ayl0p,aa00),_mm_mul_pd(ayl1p,aa10)), _mm_add_pd(_mm_mul_pd(ayl2p,aa20),_mm_mul_pd(ayl3p,aa30)));
t2 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(ayl0p,aa02),_mm_mul_pd(ayl1p,aa12)), _mm_add_pd(_mm_mul_pd(ayl2p,aa22),_mm_mul_pd(ayl3p,aa32)));
_mm_store_pd(&bb[l][0],t0);
_mm_store_pd(&bb[l][2],t2);
}
}
#endif

#ifdef VERBOSE_TIMING
Expand Down

0 comments on commit 4a40385

Please sign in to comment.