Skip to content

Commit c6dfb62

Browse files
authored
[flang] Support \u Unicode escape sequences (llvm#76757)
Support \uNNNN and \uNNNNNNNN escape sequences for CHARACTER(KIND=2) and CHARACTER(KIND=4) literal constants for better GNU Fortran compatibility. Fixes llvm-test-suite/Fortran/gfortran/regression/achar_6.F90 and .../widechar_1.f90.
1 parent 22f6e97 commit c6dfb62

File tree

6 files changed

+77
-14
lines changed

6 files changed

+77
-14
lines changed

flang/docs/Extensions.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,8 @@ end
322322
### Extensions supported when enabled by options
323323

324324
* C-style backslash escape sequences in quoted CHARACTER literals
325-
(but not Hollerith) [-fbackslash]
325+
(but not Hollerith) [-fbackslash], including Unicode escapes
326+
with `\u` or `\U` followed by four or eight hexadecimal digits.
326327
* Logical abbreviations `.T.`, `.F.`, `.N.`, `.A.`, `.O.`, and `.X.`
327328
[-flogical-abbreviations]
328329
* `.XOR.` as a synonym for `.NEQV.` [-fxor-operator]

flang/include/flang/Parser/characters.h

+17
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,23 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
237237
}};
238238
if (ch <= 0x7f) {
239239
emitOneByte(ch);
240+
} else if (useHexadecimalEscapeSequences) {
241+
insert('\\');
242+
insert('u');
243+
if (ch > 0xffff) {
244+
unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
245+
c4{(ch >> 16) & 0xf};
246+
insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
247+
insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
248+
insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
249+
insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
250+
}
251+
unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
252+
c4{ch & 0xf};
253+
insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
254+
insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
255+
insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
256+
insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
240257
} else {
241258
EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
242259
for (int j{0}; j < encoded.bytes; ++j) {

flang/lib/Evaluate/character.h

+3-10
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@
1313
#include <string>
1414

1515
// Provides implementations of intrinsic functions operating on character
16-
// scalars. No assumption is made regarding character encodings other than they
17-
// must be compatible with ASCII (else, NEW_LINE, ACHAR and IACHAR need to be
18-
// adapted).
16+
// scalars.
1917

2018
namespace Fortran::evaluate {
2119

@@ -34,13 +32,8 @@ template <int KIND> class CharacterUtils {
3432
// contain ASCII
3533
static std::int64_t ICHAR(const Character &c) {
3634
CHECK(c.length() == 1);
37-
if constexpr (std::is_same_v<CharT, char>) {
38-
// char may be signed, so cast it first to unsigned to avoid having
39-
// ichar(char(128_4)) returning -128
40-
return static_cast<unsigned char>(c[0]);
41-
} else {
42-
return c[0];
43-
}
35+
// Convert first to an unsigned integer type to avoid sign extension
36+
return static_cast<common::HostUnsignedIntType<(8 * KIND)>>(c[0]);
4437
}
4538

4639
static Character NEW_LINE() { return Character{{NewLine()}}; }

flang/lib/Parser/characters.cpp

+24-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,30 @@ template <Encoding ENCODING>
235235
DecodedCharacter DecodeCharacter(
236236
const char *cp, std::size_t bytes, bool backslashEscapes) {
237237
if (backslashEscapes && bytes >= 2 && *cp == '\\') {
238-
return DecodeEscapedCharacters<ENCODING>(cp, bytes);
238+
if (ENCODING == Encoding::UTF_8 && bytes >= 6 &&
239+
ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) &&
240+
IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) &&
241+
IsHexadecimalDigit(cp[5])) {
242+
char32_t ch{
243+
static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) +
244+
256 * HexadecimalDigitValue(cp[3]) +
245+
16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])),
246+
};
247+
if (bytes >= 10 && IsHexadecimalDigit(cp[6]) &&
248+
IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) &&
249+
IsHexadecimalDigit(cp[9])) {
250+
return {(ch << 16) |
251+
(4096 * HexadecimalDigitValue(cp[6]) +
252+
256 * HexadecimalDigitValue(cp[7]) +
253+
16 * HexadecimalDigitValue(cp[8]) +
254+
HexadecimalDigitValue(cp[9])),
255+
10};
256+
} else {
257+
return {ch, 6};
258+
}
259+
} else {
260+
return DecodeEscapedCharacters<ENCODING>(cp, bytes);
261+
}
239262
} else {
240263
return DecodeRawCharacter<ENCODING>(cp, bytes);
241264
}

flang/runtime/edit-input.cpp

+12-2
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
976976
if (skipping) {
977977
--skipChars;
978978
} else if (auto ucs{DecodeUTF8(input)}) {
979-
*x++ = *ucs;
979+
if ((sizeof *x == 1 && *ucs > 0xff) ||
980+
(sizeof *x == 2 && *ucs > 0xffff)) {
981+
*x++ = '?';
982+
} else {
983+
*x++ = *ucs;
984+
}
980985
--lengthChars;
981986
} else if (chunkBytes == 0) {
982987
// error recovery: skip bad encoding
@@ -990,7 +995,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
990995
} else {
991996
char32_t buffer{0};
992997
std::memcpy(&buffer, input, chunkBytes);
993-
*x++ = buffer;
998+
if ((sizeof *x == 1 && buffer > 0xff) ||
999+
(sizeof *x == 2 && buffer > 0xffff)) {
1000+
*x++ = '?';
1001+
} else {
1002+
*x++ = buffer;
1003+
}
9941004
--lengthChars;
9951005
}
9961006
} else if constexpr (sizeof *x > 1) {

flang/test/Semantics/modfile60.f90

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
! RUN: %python %S/test_modfile.py %s %flang_fc1 -fbackslash
2+
! Test Unicode escape sequences
3+
module m
4+
integer, parameter :: wide = 4
5+
character(kind=wide, len=20), parameter :: ch = wide_"\u1234 \u56789abc"
6+
integer, parameter :: check(2) = [ iachar(ch(1:1)), iachar(ch(3:3)) ]
7+
logical, parameter :: valid = all(check == [int(z'1234'), int(z'56789abc')])
8+
end
9+
10+
!Expect: m.mod
11+
!module m
12+
!integer(4),parameter::wide=4_4
13+
!character(20_4,4),parameter::ch=4_"\341\210\264 \375\226\236\211\252\274 "
14+
!integer(4),parameter::check(1_8:2_8)=[INTEGER(4)::4660_4,1450744508_4]
15+
!intrinsic::iachar
16+
!logical(4),parameter::valid=.true._4
17+
!intrinsic::all
18+
!intrinsic::int
19+
!end

0 commit comments

Comments
 (0)