-
Notifications
You must be signed in to change notification settings - Fork 314
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
RT crashes on i686 (most likely caused by aligned SSE access on unaligned address) #4432
Comments
@heckflosse Don't we have those |
@Floessie They are used in LUT.h, but I remember that on non |
@heckflosse Well, I don't have that insight, but why might the stack be unaligned when GCC defaults to |
@Floessie iirc
where |
@Floessie Can you test this on a non #include "LUT.h"
LUTf test(65536);
std::cout << alignof(test) << std::endl; On |
@Floessie In case on diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index abc2b0bd..5824fd01 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -88,7 +88,7 @@ using LUTd = LUT<double>;
using LUTuc = LUT<uint8_t>;
template<typename T>
-class LUT :
+class alignas(16) LUT :
public rtengine::NonCopyable
{
protected:
|
@Llerr I pushed a possible fix to |
@heckflosse It reports |
At least on W10 32 bits no improvment
|
@gaaned92 André, could you also test Ingo's patch with the |
@Llerr I pushed a possible fix to dev branch. Can you please test? |
@Llerr after cloning, do |
@Floessie
beginning of LUT.h
|
This patch for diff --git a/rtengine/LUT.h b/rtengine/LUT.h
index abc2b0bd..709bf860 100644
--- a/rtengine/LUT.h
+++ b/rtengine/LUT.h
@@ -133,9 +133,15 @@ public:
maxs = size - 2;
maxsf = (float)maxs;
#ifdef __SSE2__
+#ifdef __x86_64__
maxsv = F2V( maxs );
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
+#else
+ _mm_storeu_ps((float*)&maxsv, F2V( maxs ));
+ _mm_storeu_si128(&sizeiv, _mm_set1_epi32( (int)(size - 1) ));
+ _mm_storeu_ps((float*)&sizev, F2V( size - 1 ));
+#endif
#endif
}
void operator ()(int s, int flags = 0xfffffff)
@@ -163,9 +169,16 @@ public:
maxs = size - 2;
maxsf = (float)maxs;
#ifdef __SSE2__
+#ifdef __x86_64__
maxsv = F2V( maxs );
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
+#else
+ _mm_storeu_ps((float*)&maxsv, F2V( maxs ));
+ _mm_storeu_si128(&sizeiv, _mm_set1_epi32( (int)(size - 1) ));
+ _mm_storeu_ps((float*)&sizev, F2V( size - 1 ));
+
+#endif
#endif
}
@@ -174,9 +187,15 @@ public:
data = nullptr;
reset();
#ifdef __SSE2__
+#ifdef __x86_64__
maxsv = ZEROV;
sizev = ZEROV;
sizeiv = _mm_setzero_si128();
+#else
+ _mm_storeu_ps((float*)&maxsv, ZEROV);
+ _mm_storeu_ps((float*)&sizev, ZEROV);
+ _mm_storeu_si128(&sizeiv, _mm_setzero_si128());
+#endif
#endif
}
@@ -238,9 +257,15 @@ public:
this->maxs = this->size - 2;
this->maxsf = (float)this->maxs;
#ifdef __SSE2__
+#ifdef __x86_64__
this->maxsv = F2V( this->size - 2);
this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) );
this->sizev = F2V( this->size - 1 );
+#else
+ _mm_storeu_ps((float*)&this->maxsv, F2V( this->size - 2));
+ _mm_storeu_si128(&this->sizeiv, _mm_set1_epi32( (int)(this->size - 1) ));
+ _mm_storeu_ps((float*)&this->sizev, F2V( this->size - 1 ));
+#endif
#endif
}
@@ -311,7 +336,11 @@ public:
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
+#ifdef __x86_64__
vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+#else
+ vfloat clampedIndexes = vmaxf(ZEROV, vminf(_mm_loadu_ps((float*)&maxsv), indexv));
+#endif
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -343,7 +372,11 @@ public:
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
+#ifdef __x86_64__
vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+#else
+ vfloat clampedIndexes = vmaxf(ZEROV, vminf(_mm_loadu_ps((float*)&maxsv), indexv));
+#endif
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -363,7 +396,11 @@ public:
vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
+#ifdef __x86_64__
vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes);
+#else
+ vfloat diff = vmaxf(ZEROV, vminf(_mm_loadu_ps((float*)&sizev), indexv)) - _mm_cvtepi32_ps(indexes);
+#endif
return vintpf(diff, upper, lower);
}
@@ -374,7 +411,11 @@ public:
// Clamp and convert to integer values. Extract out of SSE register because all
// lookup operations use regular addresses.
+#ifdef __x86_64__
vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
+#else
+ vfloat clampedIndexes = vmaxf(ZEROV, vminf(_mm_loadu_ps((float*)&maxsv), indexv));
+#endif
vint indexes = _mm_cvttps_epi32(clampedIndexes);
int indexArray[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
@@ -402,7 +443,11 @@ public:
vfloat operator[](vint idxv ) const
{
vfloat tempv, p1v;
+#ifdef __x86_64__
idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
+#else
+ idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, _mm_loadu_si128((__m128i*)&sizeiv)));
+#endif
// access the LUT 4 times and shuffle the values into p1v
int idx;
@@ -443,7 +488,11 @@ public:
{
vfloat tempv, p1v;
tempv = _mm_cvtepi32_ps(idxv);
+#ifdef __x86_64__
tempv = _mm_min_ps( tempv, sizev );
+#else
+ tempv = _mm_min_ps( tempv, _mm_loadu_ps((float*)&sizev ));
+#endif
idxv = _mm_cvttps_epi32(_mm_max_ps( tempv, _mm_setzero_ps( ) ));
// access the LUT 4 times and shuffle the values into p1v
@@ -691,9 +740,15 @@ public:
maxs = size - 2;
maxsf = (float)maxs;
#ifdef __SSE2__
+#ifdef __x86_64__
maxsv = F2V( size - 2);
sizeiv = _mm_set1_epi32( (int)(size - 1) );
sizev = F2V( size - 1 );
+#else
+ _mm_storeu_ps((float*)&maxsv, F2V( size - 2));
+ _mm_storeu_si128(&sizeiv, _mm_set1_epi32( (int)(size - 1) ));
+ _mm_storeu_ps((float*)&sizev, F2V( size - 1 ));
+#endif
#endif
}
|
@heckflosse Isn't unaligned heap our problem here, given the fact, that this |
@Floessie Yes, I think that's the problem. |
@heckflosse This is just a PoC (call it a dirty hack): diff --git a/rtgui/main.cc b/rtgui/main.cc
index 2ae442e5a..34e7fe68a 100644
--- a/rtgui/main.cc
+++ b/rtgui/main.cc
@@ -51,6 +51,18 @@
#include "conio.h"
#endif
+#include <stdlib.h>
+
+void* operator new(decltype(sizeof(0)) n) noexcept(false)
+{
+ return aligned_alloc(16, n);
+}
+
+void operator delete(void* p) noexcept
+{
+ free(p);
+}
+
// Set this to 1 to make RT work when started with Eclipse and arguments, at least on Windows platform
#define ECLIPSE_ARGS 0
This requires C11. |
If you want I test something, ping me. |
@gaaned92 Ping! 😄 Could you test my dirty hack, so we know if it helps at all? |
@Floessie I tried your patch on msys2/mingw64. There is no Edit: I changed |
That's a pity. Works on Linux amd64 and i686. But I don't have real hardware that would SEGV on unaligned SSE accesses, so I can't test... |
@Floessie afaik all cpus fail on |
@heckflosse for build = release crash
Not a lot of information here! |
@gaaned92 Thanks a lot for testing. Because the debug build does not crash, I guess the bug with sse access to unaligned data in LUT.h is fixed with my patch and the crash in release build is a different one (maybe specific to Win32 builds as now all the SSEFUNCTION macros, which were needed for Win32 SSE builds, are removed). But that's just a guess. Nevertheless it would be good to know how a release build on i686 Linux behaves with the patch @Llerr |
it`s crushed :(
|
Tested again with latest |
@Floessie If you want to know about your emulation handling things different, the following code should crash on all systems: float test[8] = { 1.f };
for(int i = 0; i < 4; ++i) {
_mm_store_ps(&test[i], _mm_load_ps(&test[i+1]) + _mm_set1_ps(1.f));
}
printf("%f\n",test[3]); |
@heckflosse Proof: My old QEMU 2.1.2 doesn't crash on the snippet above, but happily prints out |
@Floessie Damn. My example was so simple that the compiler most likely could predict the output. Here's a working (crashing) example: diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 0420c5b7..8dd30b05 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -1520,6 +1520,10 @@ void RawImageSource::vflip (Imagefloat* image)
int RawImageSource::load (const Glib::ustring &fname)
{
+float test[8] = { 1.f };
+LFModifier::test(test);
+std::cout << test[3] << std::endl;
+
MyTime t1, t2;
t1.set();
fileName = fname;
diff --git a/rtengine/rtlensfun.cc b/rtengine/rtlensfun.cc
index 8216f0b8..ebd8605e 100644
--- a/rtengine/rtlensfun.cc
+++ b/rtengine/rtlensfun.cc
@@ -43,6 +43,11 @@ LFModifier::operator bool() const
return data_;
}
+void LFModifier::test(float *test) {
+ for(int i = 0; i < 4; ++i) {
+ _mm_store_ps(&test[i], _mm_load_ps(&test[i+1]) + _mm_set1_ps(1.f));
+ }
+}
void LFModifier::correctDistortion(double &x, double &y, int cx, int cy, double scale) const
{
diff --git a/rtengine/rtlensfun.h b/rtengine/rtlensfun.h
index 7690ef54..ebb7b7f2 100644
--- a/rtengine/rtlensfun.h
+++ b/rtengine/rtlensfun.h
@@ -47,9 +47,9 @@ public:
void correctCA(double &x, double &y, int cx, int cy, int channel) const override;
void processVignetteLine(int width, int y, float *line) const override;
void processVignetteLine3Channels(int width, int y, float *line) const override;
+ static void test(float *test);
Glib::ustring getDisplayString() const;
-
private:
LFModifier(lfModifier *m, bool swap_xy, int flags);
And here's the same, but with unaligned access, which does not crash: diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 0420c5b7..8dd30b05 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -1520,6 +1520,10 @@ void RawImageSource::vflip (Imagefloat* image)
int RawImageSource::load (const Glib::ustring &fname)
{
+float test[8] = { 1.f };
+LFModifier::test(test);
+std::cout << test[3] << std::endl;
+
MyTime t1, t2;
t1.set();
fileName = fname;
diff --git a/rtengine/rtlensfun.cc b/rtengine/rtlensfun.cc
index 8216f0b8..938ebd7a 100644
--- a/rtengine/rtlensfun.cc
+++ b/rtengine/rtlensfun.cc
@@ -43,6 +43,11 @@ LFModifier::operator bool() const
return data_;
}
+void LFModifier::test(float *test) {
+ for(int i = 0; i < 4; ++i) {
+ _mm_storeu_ps(&test[i], _mm_loadu_ps(&test[i+1]) + _mm_set1_ps(1.f));
+ }
+}
void LFModifier::correctDistortion(double &x, double &y, int cx, int cy, double scale) const
{
diff --git a/rtengine/rtlensfun.h b/rtengine/rtlensfun.h
index 7690ef54..ebb7b7f2 100644
--- a/rtengine/rtlensfun.h
+++ b/rtengine/rtlensfun.h
@@ -47,9 +47,9 @@ public:
void correctCA(double &x, double &y, int cx, int cy, int channel) const override;
void processVignetteLine(int width, int y, float *line) const override;
void processVignetteLine3Channels(int width, int y, float *line) const override;
+ static void test(float *test);
Glib::ustring getDisplayString() const;
-
private:
LFModifier(lfModifier *m, bool swap_xy, int flags);
To test, apply the patch to dev and load a raw file. |
@heckflosse Segfaults! 🎉 |
@Llerr It seems that in your latest test my patch from #4432 (comment) was not applied. Your output shows
while with applied patch line 177 is:
|
@heckflosse But Ingo, there are a lot of other places, where unaligned loads and stores are used (not to mention 😕, |
Seek and ye shall find: kraj/glibc@4e61a6b#diff-f2db25b59456b2ed40b038a3935be358 (nota bene: the hash has no HTH, |
@Floessie maybe it's been truncated ;) |
@heckflosse In the meantime I've set up a VM with Debian 9.4 i386. Here's a patch as a starting point (must be diff --git a/rtgui/main.cc b/rtgui/main.cc
index 55d27125..b969ca8b 100644
--- a/rtgui/main.cc
+++ b/rtgui/main.cc
@@ -51,6 +51,13 @@
#include "conio.h"
#endif
+#include <malloc.h>
+
+void* malloc(size_t size)
+{
+ return memalign(16, size);
+}
+
// Set this to 1 to make RT work when started with Eclipse and arguments, at least on Windows platform
#define ECLIPSE_ARGS 0
Best, |
Here's a full integration, which fixes the problem for i386 Linux. Successfully tested on Debian 9.4 i386, Debian 9.4 amd64, and Debian 10 i386. The diff --git a/CMakeLists.txt b/CMakeLists.txt
index d57e1827..8dcf6793 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,9 @@ if(HAVE_X86_SSE_MATH)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -mfpmath=sse")
endif()
+# On i386 Linux we can fix unaligned SSE malloc (see GitHub issue #4432)
+include(FindUnalignedMalloc)
+
if(WIN32)
# Add additional paths. Look in the MinGW path first, then in the Gtkmm path.
# If you wish to build some dependent libraries, you have to install them in MinGW to use them:
diff --git a/cmake/modules/FindUnalignedMalloc.cmake b/cmake/modules/FindUnalignedMalloc.cmake
new file mode 100644
index 00000000..4ddfb2af
--- /dev/null
+++ b/cmake/modules/FindUnalignedMalloc.cmake
@@ -0,0 +1,49 @@
+# This file is part of RawTherapee.
+#
+# Copyright (C) 2018 Flössie <floessie.mail@gmail.com>
+#
+# RawTherapee is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# RawTherapee is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RawTherapee. If not, see <http://www.gnu.org/licenses/>.
+
+include(CheckCXXSourceCompiles)
+
+set(CMAKE_REQUIRED_QUIET_COPY "${CMAKE_REQUIRED_QUIET}")
+set(CMAKE_REQUIRED_QUIET ON)
+
+set(TEST_SOURCE
+"
+#include <cstddef>
+#include <type_traits>
+
+int main()
+{
+#if defined(__SSE2__) && (defined(__i386) || defined(_M_IX86)) && defined(__linux)
+ static_assert(std::alignment_of<std::max_align_t>::value >= 16, \"Unaligned heap objects possible\");
+#endif
+
+ return 0;
+}
+")
+
+CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" HAVE_ALIGNED_MALLOC)
+
+if(NOT HAVE_ALIGNED_MALLOC)
+ set(HAVE_UNALIGNED_MALLOC 1)
+else()
+ unset(HAVE_ALIGNED_MALLOC)
+endif()
+
+unset(TEST_SOURCE)
+
+set(CMAKE_REQUIRED_QUIET "${CMAKE_REQUIRED_QUIET_COPY}")
+unset(CMAKE_REQUIRED_QUIET_COPY)
diff --git a/rtgui/CMakeLists.txt b/rtgui/CMakeLists.txt
index 540b4d26..c39ed004 100644
--- a/rtgui/CMakeLists.txt
+++ b/rtgui/CMakeLists.txt
@@ -1,5 +1,6 @@
# Common source files for both CLI and non-CLI execautables
set(CLISOURCEFILES
+ alignedmalloc.cc
edit.cc
main-cli.cc
multilangmgr.cc
@@ -11,6 +12,7 @@ set(CLISOURCEFILES
set(NONCLISOURCEFILES
adjuster.cc
+ alignedmalloc.cc
batchqueue.cc
batchqueuebuttonset.cc
batchqueueentry.cc
diff --git a/rtgui/alignedmalloc.cc b/rtgui/alignedmalloc.cc
new file mode 100644
index 00000000..91fe15a8
--- /dev/null
+++ b/rtgui/alignedmalloc.cc
@@ -0,0 +1,31 @@
+/*
+ * This file is part of RawTherapee.
+ *
+ * Copyright (C) 2018 Flössie <floessie.mail@gmail.com>
+ *
+ * RawTherapee is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RawTherapee is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with RawTherapee. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_UNALIGNED_MALLOC
+
+#include <malloc.h>
+
+void* malloc(size_t size)
+{
+ return memalign(16, size);
+}
+
+#endif
diff --git a/rtgui/config.h.in b/rtgui/config.h.in
index fdf64b73..2d1f41db 100644
--- a/rtgui/config.h.in
+++ b/rtgui/config.h.in
@@ -21,6 +21,7 @@
#define __CONFIG_H__
#cmakedefine BUILD_BUNDLE
+#cmakedefine HAVE_UNALIGNED_MALLOC
#define DATA_SEARCH_PATH "${DATADIR}"
#define DOC_SEARCH_PATH "${DOCDIR}"
#define CREDITS_SEARCH_PATH "${CREDITSDIR}" @heckflosse @Beep6581 @innir Please test, thanks. HTH, |
Thanks @Floessie, builds and works fine on Win7/64 |
Built and ran fine in Sabayon |
Tested with Clang 3.9.1 on Debian 9.4 i386. ✔️ |
Tested with Clang 5.0.1 on Debian 10 i386: With patch: ✔️ ( Clang 5 needs |
Shall I push to |
@Floessie yes, please :) |
Is this for 5.5? |
imho for 5.4 |
I'm about to push... |
i'm test. it's work. |
system: ubuntu 16.04 LTS i686
The text was updated successfully, but these errors were encountered: