Film simulation (HaldCLUT): fetch two adjacent CLUT entries with one load.

clutstore.cc: getClutValue() is replaced by getClutValues(), which performs a
single unaligned 128-bit load and returns both 4-value entries as a vfloat2.
getRGB() is updated accordingly and the padding of the image buffer in
loadFile() shrinks from 8 to 4 values. splitClutFilename() drops the
last_slash_pos lookup.
improcfun.cc: the CLUT profile-conversion loops switch from LVFU/STVFU to
LVF/STVF and blank lines around the SSE2 guards are moved.

NOTE(review): LVF/STVF are presumably the aligned load/store variants; confirm
that rtemp/gtemp/btemp are suitably aligned at every ti * TS + tj used here.
NOTE(review): template arguments and the leading whitespace of context lines
were reconstructed; verify them against the target tree before applying.

diff --git a/rtengine/clutstore.cc b/rtengine/clutstore.cc
index 5d859f90cc..5fd2cfae36 100644
--- a/rtengine/clutstore.cc
+++ b/rtengine/clutstore.cc
@@ -56,7 +56,7 @@ bool loadFile(
         img_src.convertColorSpace(img_float.get(), icm, curr_wb);
     }
 
-    AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValue
+    AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 4); // getClutValues() loads one pixel in advance
 
     std::size_t index = 0;
 
@@ -78,12 +78,32 @@ bool loadFile(
 }
 
 #ifdef __SSE2__
-vfloat getClutValue(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
+// Loads the eight 16-bit CLUT values starting at index and returns them as two float vectors (first four in x, next four in y)
+vfloat2 getClutValues(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
 {
+    const vint v_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index));
 #ifdef __SSE4_1__
-    return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index))));
+    return {
+        _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_values)),
+        _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(v_values, 8)))
+    };
 #else
-    return _mm_set_ps(clut_image.data[index + 3], clut_image.data[index + 2], clut_image.data[index + 1], clut_image.data[index]);
+    const vint v_mask = _mm_set1_epi32(0x0000FFFF);
+
+    // Duplicate each 16-bit value into both halves of a 32-bit lane, then mask off the upper half to zero-extend it
+    vint v_low = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(1, 0, 1, 0));
+    vint v_high = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(3, 2, 3, 2));
+    v_low = _mm_shufflelo_epi16(v_low, _MM_SHUFFLE(1, 1, 0, 0));
+    v_high = _mm_shufflelo_epi16(v_high, _MM_SHUFFLE(1, 1, 0, 0));
+    v_low = _mm_shufflehi_epi16(v_low, _MM_SHUFFLE(3, 3, 2, 2));
+    v_high = _mm_shufflehi_epi16(v_high, _MM_SHUFFLE(3, 3, 2, 2));
+    v_low = vandm(v_low, v_mask);
+    v_high = vandm(v_high, v_mask);
+
+    return {
+        _mm_cvtepi32_ps(v_low),
+        _mm_cvtepi32_ps(v_high)
+    };
 #endif
 }
 #endif
@@ -212,11 +232,13 @@ void rtengine::HaldCLUT::getRGB(
 
         const vfloat v_r = PERMUTEPS(v_rgb, _MM_SHUFFLE(0, 0, 0, 0));
 
-        vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        vfloat2 v_clut_values = getClutValues(clut_image, index);
+        vfloat v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         index = (color + level) * 4;
 
-        vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        vfloat v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         const vfloat v_g = PERMUTEPS(v_rgb, _MM_SHUFFLE(1, 1, 1, 1));
 
@@ -224,11 +246,13 @@ void rtengine::HaldCLUT::getRGB(
 
         index = (color + level_square) * 4;
 
-        v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         index = (color + level + level_square) * 4;
 
-        v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
+        v_clut_values = getClutValues(clut_image, index);
+        v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);
 
         v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1);
 
@@ -250,12 +274,6 @@ void rtengine::HaldCLUT::splitClutFilename(
 {
     Glib::ustring basename = Glib::path_get_basename(filename);
 
-    Glib::ustring::size_type last_slash_pos = basename.rfind('/');
-
-    if (last_slash_pos == Glib::ustring::npos) {
-        last_slash_pos = basename.rfind('\\');
-    }
-
     const Glib::ustring::size_type last_dot_pos = basename.rfind('.');
 
     if (last_dot_pos != Glib::ustring::npos) {
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index e9ae98e2aa..bc70bf5e5d 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -3226,8 +3226,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                 xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() );
                 xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working );
                 clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() );
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                 for (int i = 0; i < 3; ++i) {
                     for (int j = 0; j < 3; ++j) {
                         v_work2xyz[i][j] = F2V(work2xyz[i][j]);
@@ -3236,8 +3236,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                         v_clut2xyz[i][j] = F2V(clut2xyz[i][j]);
                     }
                 }
-
 #endif
+
             }
         }
     }
@@ -4361,12 +4361,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 // Convert from working to clut profile
                                 int j = jstart;
                                 int tj = 0;
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                                 for (; j < tW - 3; j += 4, tj += 4) {
-                                    vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
-                                    vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
-                                    vfloat sourceB = LVFU(btemp[ti * TS + tj]);
+                                    vfloat sourceR = LVF(rtemp[ti * TS + tj]);
+                                    vfloat sourceG = LVF(gtemp[ti * TS + tj]);
+                                    vfloat sourceB = LVF(btemp[ti * TS + tj]);
 
                                     vfloat x;
                                     vfloat y;
@@ -4374,11 +4374,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                     Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
                                     Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);
 
-                                    STVFU(rtemp[ti * TS + tj], sourceR);
-                                    STVFU(gtemp[ti * TS + tj], sourceG);
-                                    STVFU(btemp[ti * TS + tj], sourceB);
+                                    STVF(rtemp[ti * TS + tj], sourceR);
+                                    STVF(gtemp[ti * TS + tj], sourceG);
+                                    STVF(btemp[ti * TS + tj], sourceB);
                                 }
-
 #endif
 
                                 for (; j < tW; j++, tj++) {
@@ -4428,12 +4427,12 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                 // Convert from clut to working profile
                                 int j = jstart;
                                 int tj = 0;
-#ifdef __SSE2__
 
+#ifdef __SSE2__
                                 for (; j < tW - 3; j += 4, tj += 4) {
-                                    vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
-                                    vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
-                                    vfloat sourceB = LVFU(btemp[ti * TS + tj]);
+                                    vfloat sourceR = LVF(rtemp[ti * TS + tj]);
+                                    vfloat sourceG = LVF(gtemp[ti * TS + tj]);
+                                    vfloat sourceB = LVF(btemp[ti * TS + tj]);
 
                                     vfloat x;
                                     vfloat y;
@@ -4441,11 +4440,10 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
                                     Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
                                     Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work);
 
-                                    STVFU(rtemp[ti * TS + tj], sourceR);
-                                    STVFU(gtemp[ti * TS + tj], sourceG);
-                                    STVFU(btemp[ti * TS + tj], sourceB);
+                                    STVF(rtemp[ti * TS + tj], sourceR);
+                                    STVF(gtemp[ti * TS + tj], sourceG);
+                                    STVF(btemp[ti * TS + tj], sourceB);
                                 }
-
 #endif
 
                                 for (; j < tW; j++, tj++) {