Skip to content

Commit 38cbe97

Browse files
committed
Handle invalid UTF-8 by converting from current code page to UTF-8
1 parent c5f0dc8 commit 38cbe97

File tree

3 files changed

+96
-3
lines changed

3 files changed

+96
-3
lines changed

doc.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,7 @@ CString strLine (lpszText, size);
19641964
switch (m_phase)
19651965
{
19661966
case HAVE_ESC: Phase_ESC (c); continue;
1967+
case HAVE_UTF8_CHARACTER: Phase_UTF8 (c); continue;
19671968

19681969
case HAVE_FOREGROUND_256_START: // these 4 are similar to Phase_ANSI
19691970
case HAVE_FOREGROUND_256_FINISH:
@@ -2107,9 +2108,32 @@ CString strLine (lpszText, size);
21072108
continue; // back to main loop
21082109
} // end of Pueblo startup
21092110

2110-
// here when phase is none
2111+
// here when phase is NONE
21112112

2112-
cOneCharacterLine [0] = c; // in case we need to use it
2113+
// do not display UTF-8 characters until they have completely arrived
2114+
// check if high-order bit is set
2115+
if (m_bUTF_8 && (c & 0x80))
2116+
{
2117+
m_UTF8Sequence [0] = c;
2118+
m_UTF8Sequence [1] = 0; // null terminator
2119+
m_phase = HAVE_UTF8_CHARACTER;
2120+
2121+
// 0x00000080 - 0x000007FF 110 xxxxx 10 xxxxxx
2122+
if ((c & 0xE0) == 0xC0)
2123+
m_iUTF8BytesLeft = 1;
2124+
// 0x00000800 - 0x0000FFFF 1110 xxxx 10 xxxxxx 10 xxxxxx
2125+
else if ((c & 0xF0) == 0xE0)
2126+
m_iUTF8BytesLeft = 2;
2127+
// 0x00010000 - 0x001FFFFF 11110 xxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
2128+
else if ((c & 0xF8) == 0xF0)
2129+
m_iUTF8BytesLeft = 3;
2130+
else
2131+
// some bogus byte with the high-order bit set
2132+
OutputBadUTF8characters ();
2133+
continue; // we are done for now with this byte
2134+
} // end of high-order bit set with UTF-8 enabled
2135+
2136+
cOneCharacterLine [0] = c; // in case we need to use it as a string
21132137

21142138
switch (c)
21152139
{

doc.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ enum { NONE, // normal text
9696
HAVE_BACKGROUND_256_START, // received ESC[48;
9797
HAVE_BACKGROUND_256_FINISH, // received ESC[48;5;
9898

99+
// utf-8 modes
100+
HAVE_UTF8_CHARACTER, // received 110 xxxxx, 1110 xxxx, or 11110 xxx
101+
99102

100103
// mxp modes
101104
HAVE_MXP_ELEMENT, // collecting element, eg. < xxxxx >. Starts on <, stops on >
@@ -972,6 +975,9 @@ class CMUSHclientDoc : public CDocument
972975
long m_iUTF8ErrorCount; // count of lines with bad UTF8
973976
long m_iOutputWindowRedrawCount; // count of times output window redrawn
974977

978+
unsigned char m_UTF8Sequence [5]; // we collect up to 4 UTF8 bytes here and a null-terminator
979+
int m_iUTF8BytesLeft; // how many UTF8 bytes to go
980+
975981
long m_iTriggersEvaluatedCount; // how many triggers we evaluated
976982
long m_iTriggersMatchedCount; // how many triggers matched
977983
long m_iAliasesEvaluatedCount; // how many aliases we evaluated
@@ -1383,7 +1389,8 @@ class CMUSHclientDoc : public CDocument
13831389
void DisplayMsg(LPCTSTR lpszText, int size, const int flags);
13841390
void AddToLine (LPCTSTR lpszText, const int flags);
13851391
void StartNewLine_KeepPreviousStyle (const int flags);
1386-
void Phase_ESC (const unsigned char c);
1392+
void Phase_ESC (const unsigned char c);
1393+
void Phase_UTF8 (const unsigned char c);
13871394
void Phase_ANSI (const unsigned char c);
13881395
void Phase_IAC (unsigned char & c);
13891396
void Phase_WILL (const unsigned char c);
@@ -1545,6 +1552,7 @@ class CMUSHclientDoc : public CDocument
15451552
void WriteToLog (const CString & strText);
15461553
void LogLineInHTMLcolour (POSITION startpos);
15471554
void LogCommand (const char * text);
1555+
void OutputBadUTF8characters (void);
15481556

15491557
BOOL OpenSession (void);
15501558
void SetUpOutputWindow (void);

telnet_phases.cpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,3 +803,64 @@ void CMUSHclientDoc::Phase_COMPRESS_WILL (const unsigned char c)
803803
m_phase = NONE;
804804
} // end of Phase_COMPRESS_WILL
805805

806+
// in UTF-8 mode, if we get a "bad" UTF-8 character we assume it is standard ANSI
807+
// and convert it from ANSI into Unicode, and then into UTF-8 and output that instead
808+
void CMUSHclientDoc::OutputBadUTF8characters (void)
809+
{
810+
for (int i = 0; m_UTF8Sequence [i]; i++)
811+
{
812+
// convert ANSI to Unicode:
813+
WCHAR sUnicode [1];
814+
MultiByteToWideChar (CP_THREAD_ACP, MB_PRECOMPOSED, (const char *) &m_UTF8Sequence [i], 1, sUnicode, 1);
815+
// now convert Unicode to UTF8
816+
char sOutput [5];
817+
memset (sOutput, 0, sizeof sOutput); // ensure trailing null
818+
WideCharToMultiByte (CP_UTF8, 0, sUnicode, 1, sOutput, sizeof sOutput, NULL, NULL);
819+
AddToLine (sOutput, 0);
820+
m_cLastChar = m_UTF8Sequence [i];
821+
}
822+
823+
m_phase = NONE;
824+
} // end of CMUSHclientDoc::OutputBadUTF8characters
825+
826+
// test data: testing \C5\87\C4\A8\C4\86\C4\B6 Gammon and now: \C6 <---
827+
828+
// here when getting second or subsequent bytes of a UTF8 character
829+
void CMUSHclientDoc::Phase_UTF8 (const unsigned char c)
830+
{
831+
832+
// append to our UTF8 sequence
833+
int i = 0;
834+
while (m_UTF8Sequence [i])
835+
i++;
836+
m_UTF8Sequence [i] = c;
837+
m_UTF8Sequence [i + 1] = 0; // null terminator
838+
839+
if ((c & 0xC0) != 0x80)
840+
{
841+
OutputBadUTF8characters ();
842+
return;
843+
}
844+
845+
// we are waiting for less of them
846+
m_iUTF8BytesLeft--;
847+
848+
// more to go
849+
if (m_iUTF8BytesLeft > 0)
850+
return;
851+
852+
// check the sequence
853+
854+
int erroroffset;
855+
int iBad = _pcre_valid_utf ((const unsigned char *) m_UTF8Sequence, strlen ((const char *) m_UTF8Sequence), &erroroffset);
856+
if (iBad > 0)
857+
{
858+
OutputBadUTF8characters ();
859+
return;
860+
}
861+
862+
// valid UTF8 sequence, add to line
863+
AddToLine ((const char *) m_UTF8Sequence, 0);
864+
m_phase = NONE;
865+
866+
} // end of CMUSHclientDoc::Phase_UTF8

0 commit comments

Comments
 (0)