Skip to content

Commit

Permalink
Fix CORE-6145 - Wrong result in "similar to" with non latin characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
asfernandes committed Sep 19, 2019
1 parent 128ecdc commit 093ba00
Showing 1 changed file with 28 additions and 8 deletions.
36 changes: 28 additions & 8 deletions src/jrd/Collation.cpp
Expand Up @@ -116,6 +116,7 @@ class Re2SimilarMatcher : public PatternMatcher
Re2SimilarMatcher(thread_db* tdbb, MemoryPool& pool, TextType* textType,
const UCHAR* patternStr, SLONG patternLen, const UCHAR* escapeStr, SLONG escapeLen)
: PatternMatcher(pool, textType),
converter(INTL_convert_lookup(tdbb, CS_UTF8, textType->getCharSet()->getId())),
buffer(pool)
{
UCharBuffer patternBuffer, escapeBuffer;
Expand All @@ -131,8 +132,6 @@ class Re2SimilarMatcher : public PatternMatcher
flags |= (textType->getFlags() & TEXTTYPE_ATTR_CASE_INSENSITIVE) ?
SimilarToRegex::FLAG_CASE_INSENSITIVE : 0;

CsConvert converter = INTL_convert_lookup(tdbb, CS_UTF8, charSetId);

converter.convert(patternLen, patternStr, patternBuffer);

if (textType->getFlags() & TEXTTYPE_ATTR_ACCENT_INSENSITIVE)
Expand Down Expand Up @@ -190,13 +189,24 @@ class Re2SimilarMatcher : public PatternMatcher

// Matches the bytes accumulated in `buffer` against the compiled regex.
//
// For any character set other than NONE, BINARY and UTF8 the input is first
// run through `converter` (set up in the constructor with CS_UTF8 and the
// text type's character set) into a temporary buffer, so that the regex sees
// the same encoding the pattern was converted to. When the collation is
// accent-insensitive, the buffer that is actually matched is normalized.
//
// Returns the result of regex->matches() on the selected buffer.
virtual bool result()
{
	UCharBuffer utfBuffer;
	const auto charSetId = textType->getCharSet()->getId();

	// Points at the bytes handed to the regex: either the raw input or its
	// converted copy. Normalization and matching must both use this pointer.
	UCharBuffer* bufferPtr = &buffer;

	if (charSetId != CS_NONE && charSetId != CS_BINARY && charSetId != CS_UTF8)
	{
		converter.convert(buffer.getCount(), buffer.begin(), utfBuffer);
		bufferPtr = &utfBuffer;
	}

	// Normalize the buffer being matched, not the unconverted input.
	if (textType->getFlags() & TEXTTYPE_ATTR_ACCENT_INSENSITIVE)
		UnicodeUtil::utf8Normalize(*bufferPtr);

	return regex->matches((const char*) bufferPtr->begin(), bufferPtr->getCount());
}

private:
CsConvert converter;
AutoPtr<SimilarToRegex> regex;
UCharBuffer buffer;
};
Expand All @@ -207,6 +217,7 @@ class Re2SubstringSimilarMatcher : public BaseSubstringSimilarMatcher
Re2SubstringSimilarMatcher(thread_db* tdbb, MemoryPool& pool, TextType* textType,
const UCHAR* patternStr, SLONG patternLen, const UCHAR* escapeStr, SLONG escapeLen)
: BaseSubstringSimilarMatcher(pool, textType),
converter(INTL_convert_lookup(tdbb, CS_UTF8, textType->getCharSet()->getId())),
buffer(pool),
resultStart(0),
resultLength(0)
Expand All @@ -224,8 +235,6 @@ class Re2SubstringSimilarMatcher : public BaseSubstringSimilarMatcher
flags |= (textType->getFlags() & TEXTTYPE_ATTR_CASE_INSENSITIVE) ?
SubstringSimilarRegex::FLAG_CASE_INSENSITIVE : 0;

CsConvert converter = INTL_convert_lookup(tdbb, textType->getCharSet()->getId(), CS_UTF8);

converter.convert(patternLen, patternStr, patternBuffer);

if (textType->getFlags() & TEXTTYPE_ATTR_ACCENT_INSENSITIVE)
Expand Down Expand Up @@ -289,10 +298,20 @@ class Re2SubstringSimilarMatcher : public BaseSubstringSimilarMatcher

// Matches the bytes accumulated in `buffer` against the compiled
// substring-similar regex and records the match position.
//
// For any character set other than NONE, BINARY and UTF8 the input is first
// run through `converter` (set up in the constructor with CS_UTF8 and the
// text type's character set) into a temporary buffer, so that the regex sees
// the same encoding the pattern was converted to. When the collation is
// accent-insensitive, the buffer that is actually matched is normalized.
//
// Returns the result of regex->matches(); resultStart and resultLength are
// passed to it as output arguments.
// NOTE(review): resultStart/resultLength presumably refer to offsets in the
// converted/normalized buffer rather than the original input - confirm
// against the callers of getResultInfo().
virtual bool result()
{
	UCharBuffer utfBuffer;
	const auto charSetId = textType->getCharSet()->getId();

	// Points at the bytes handed to the regex: either the raw input or its
	// converted copy. Normalization and matching must both use this pointer.
	UCharBuffer* bufferPtr = &buffer;

	if (charSetId != CS_NONE && charSetId != CS_BINARY && charSetId != CS_UTF8)
	{
		converter.convert(buffer.getCount(), buffer.begin(), utfBuffer);
		bufferPtr = &utfBuffer;
	}

	// Normalize the buffer being matched, not the unconverted input.
	if (textType->getFlags() & TEXTTYPE_ATTR_ACCENT_INSENSITIVE)
		UnicodeUtil::utf8Normalize(*bufferPtr);

	return regex->matches((const char*) bufferPtr->begin(), bufferPtr->getCount(),
		&resultStart, &resultLength);
}

virtual void getResultInfo(unsigned* start, unsigned* length)
Expand All @@ -302,6 +321,7 @@ class Re2SubstringSimilarMatcher : public BaseSubstringSimilarMatcher
}

private:
CsConvert converter;
AutoPtr<SubstringSimilarRegex> regex;
UCharBuffer buffer;
unsigned resultStart, resultLength;
Expand Down

2 comments on commit 093ba00

@aafemt
Copy link
Contributor

@aafemt aafemt commented on 093ba00 Sep 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of time is spent here converting strings from UTF-8 to Unicode and back. Can't re2 work with wchar_t directly?

@asfernandes
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No.

Please sign in to comment.