Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HaldCLUT cleanups after the dust (#3154) has settled #3282

Merged
merged 4 commits into from
May 14, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions rtengine/clutstore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ bool loadFile(
img_src.convertColorSpace(img_float.get(), icm, curr_wb);
}

AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 8); // + 8 because of SSE4_1 version of getClutValue
AlignedBuffer<std::uint16_t> image(fw * fh * 4 + 4); // getClutValues() loads one pixel in advance

std::size_t index = 0;

Expand All @@ -78,12 +78,30 @@ bool loadFile(
}

#ifdef __SSE2__
vfloat getClutValue(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
vfloat2 getClutValues(const AlignedBuffer<std::uint16_t>& clut_image, size_t index)
{
const vint v_values = _mm_loadu_si128(reinterpret_cast<const vint*>(clut_image.data + index));
#ifdef __SSE4_1__
return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(clut_image.data + index))));
return {
_mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_values)),
_mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_srli_si128(v_values, 8)))
};
#else
return _mm_set_ps(clut_image.data[index + 3], clut_image.data[index + 2], clut_image.data[index + 1], clut_image.data[index]);
const vint v_mask = _mm_set1_epi32(0x0000FFFF);

vint v_low = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(1, 0, 1, 0));
vint v_high = _mm_shuffle_epi32(v_values, _MM_SHUFFLE(3, 2, 3, 2));
v_low = _mm_shufflelo_epi16(v_low, _MM_SHUFFLE(1, 1, 0, 0));
v_high = _mm_shufflelo_epi16(v_high, _MM_SHUFFLE(1, 1, 0, 0));
v_low = _mm_shufflehi_epi16(v_low, _MM_SHUFFLE(3, 3, 2, 2));
v_high = _mm_shufflehi_epi16(v_high, _MM_SHUFFLE(3, 3, 2, 2));
v_low = vandm(v_low, v_mask);
v_high = vandm(v_high, v_mask);

return {
_mm_cvtepi32_ps(v_low),
_mm_cvtepi32_ps(v_high)
};
#endif
}
#endif
Expand Down Expand Up @@ -212,23 +230,27 @@ void rtengine::HaldCLUT::getRGB(

const vfloat v_r = PERMUTEPS(v_rgb, _MM_SHUFFLE(0, 0, 0, 0));

vfloat v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
vfloat2 v_clut_values = getClutValues(clut_image, index);
vfloat v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);

index = (color + level) * 4;

vfloat v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
v_clut_values = getClutValues(clut_image, index);
vfloat v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);

const vfloat v_g = PERMUTEPS(v_rgb, _MM_SHUFFLE(1, 1, 1, 1));

vfloat v_out = vintpf(v_g, v_tmp2, v_tmp1);

index = (color + level_square) * 4;

v_tmp1 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
v_clut_values = getClutValues(clut_image, index);
v_tmp1 = vintpf(v_r, v_clut_values.y, v_clut_values.x);

index = (color + level + level_square) * 4;

v_tmp2 = vintpf(v_r, getClutValue(clut_image, index + 4), getClutValue(clut_image, index));
v_clut_values = getClutValues(clut_image, index);
v_tmp2 = vintpf(v_r, v_clut_values.y, v_clut_values.x);

v_tmp1 = vintpf(v_g, v_tmp2, v_tmp1);

Expand All @@ -250,12 +272,6 @@ void rtengine::HaldCLUT::splitClutFilename(
{
Glib::ustring basename = Glib::path_get_basename(filename);

Glib::ustring::size_type last_slash_pos = basename.rfind('/');

if (last_slash_pos == Glib::ustring::npos) {
last_slash_pos = basename.rfind('\\');
}

const Glib::ustring::size_type last_dot_pos = basename.rfind('.');

if (last_dot_pos != Glib::ustring::npos) {
Expand Down
34 changes: 16 additions & 18 deletions rtengine/improcfun.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3226,8 +3226,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
xyz2clut = iccStore->workingSpaceInverseMatrix( hald_clut->getProfile() );
xyz2work = iccStore->workingSpaceInverseMatrix( params->icm.working );
clut2xyz = iccStore->workingSpaceMatrix( hald_clut->getProfile() );
#ifdef __SSE2__

#ifdef __SSE2__
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
v_work2xyz[i][j] = F2V(work2xyz[i][j]);
Expand All @@ -3236,8 +3236,8 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
v_clut2xyz[i][j] = F2V(clut2xyz[i][j]);
}
}

#endif

}
}
}
Expand Down Expand Up @@ -4361,24 +4361,23 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
// Convert from working to clut profile
int j = jstart;
int tj = 0;
#ifdef __SSE2__

#ifdef __SSE2__
for (; j < tW - 3; j += 4, tj += 4) {
vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
vfloat sourceB = LVFU(btemp[ti * TS + tj]);
vfloat sourceR = LVF(rtemp[ti * TS + tj]);
vfloat sourceG = LVF(gtemp[ti * TS + tj]);
vfloat sourceB = LVF(btemp[ti * TS + tj]);

vfloat x;
vfloat y;
vfloat z;
Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_work2xyz);
Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2clut);

STVFU(rtemp[ti * TS + tj], sourceR);
STVFU(gtemp[ti * TS + tj], sourceG);
STVFU(btemp[ti * TS + tj], sourceB);
STVF(rtemp[ti * TS + tj], sourceR);
STVF(gtemp[ti * TS + tj], sourceG);
STVF(btemp[ti * TS + tj], sourceB);
}

#endif

for (; j < tW; j++, tj++) {
Expand Down Expand Up @@ -4428,24 +4427,23 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, PipetteBuffer
// Convert from clut to working profile
int j = jstart;
int tj = 0;
#ifdef __SSE2__

#ifdef __SSE2__
for (; j < tW - 3; j += 4, tj += 4) {
vfloat sourceR = LVFU(rtemp[ti * TS + tj]);
vfloat sourceG = LVFU(gtemp[ti * TS + tj]);
vfloat sourceB = LVFU(btemp[ti * TS + tj]);
vfloat sourceR = LVF(rtemp[ti * TS + tj]);
vfloat sourceG = LVF(gtemp[ti * TS + tj]);
vfloat sourceB = LVF(btemp[ti * TS + tj]);

vfloat x;
vfloat y;
vfloat z;
Color::rgbxyz(sourceR, sourceG, sourceB, x, y, z, v_clut2xyz);
Color::xyz2rgb(x, y, z, sourceR, sourceG, sourceB, v_xyz2work);

STVFU(rtemp[ti * TS + tj], sourceR);
STVFU(gtemp[ti * TS + tj], sourceG);
STVFU(btemp[ti * TS + tj], sourceB);
STVF(rtemp[ti * TS + tj], sourceR);
STVF(gtemp[ti * TS + tj], sourceG);
STVF(btemp[ti * TS + tj], sourceB);
}

#endif

for (; j < tW; j++, tj++) {
Expand Down