@@ -803,3 +803,64 @@ void CMUSHclientDoc::Phase_COMPRESS_WILL (const unsigned char c)
803
803
m_phase = NONE;
804
804
} // end of Phase_COMPRESS_WILL
805
805
806
+ // in UTF-8 mode, if we get a "bad" UTF-8 character we assume it is standard ANSI
807
+ // and convert it from ANSI into Unicode, and then into UTF-8 and output that instead
808
+ void CMUSHclientDoc::OutputBadUTF8characters (void )
809
+ {
810
+ for (int i = 0 ; m_UTF8Sequence [i]; i++)
811
+ {
812
+ // convert ANSI to Unicode:
813
+ WCHAR sUnicode [1 ];
814
+ MultiByteToWideChar (CP_THREAD_ACP, MB_PRECOMPOSED, (const char *) &m_UTF8Sequence [i], 1 , sUnicode , 1 );
815
+ // now convert Unicode to UTF8
816
+ char sOutput [5 ];
817
+ memset (sOutput , 0 , sizeof sOutput ); // ensure trailing null
818
+ WideCharToMultiByte (CP_UTF8, 0 , sUnicode , 1 , sOutput , sizeof sOutput , NULL , NULL );
819
+ AddToLine (sOutput , 0 );
820
+ m_cLastChar = m_UTF8Sequence [i];
821
+ }
822
+
823
+ m_phase = NONE;
824
+ } // end of CMUSHclientDoc::OutputBadUTF8characters
825
+
826
+ // test data: testing \C5\87\C4\A8\C4\86\C4\B6 Gammon and now: \C6 <---
827
+
828
+ // here when getting second or subsequent bytes of a UTF8 character
829
+ void CMUSHclientDoc::Phase_UTF8 (const unsigned char c)
830
+ {
831
+
832
+ // append to our UTF8 sequence
833
+ int i = 0 ;
834
+ while (m_UTF8Sequence [i])
835
+ i++;
836
+ m_UTF8Sequence [i] = c;
837
+ m_UTF8Sequence [i + 1 ] = 0 ; // null terminator
838
+
839
+ if ((c & 0xC0 ) != 0x80 )
840
+ {
841
+ OutputBadUTF8characters ();
842
+ return ;
843
+ }
844
+
845
+ // we are waiting for less of them
846
+ m_iUTF8BytesLeft--;
847
+
848
+ // more to go
849
+ if (m_iUTF8BytesLeft > 0 )
850
+ return ;
851
+
852
+ // check the sequence
853
+
854
+ int erroroffset;
855
+ int iBad = _pcre_valid_utf ((const unsigned char *) m_UTF8Sequence, strlen ((const char *) m_UTF8Sequence), &erroroffset);
856
+ if (iBad > 0 )
857
+ {
858
+ OutputBadUTF8characters ();
859
+ return ;
860
+ }
861
+
862
+ // valid UTF8 sequence, add to line
863
+ AddToLine ((const char *) m_UTF8Sequence, 0 );
864
+ m_phase = NONE;
865
+
866
+ } // end of CMUSHclientDoc::Phase_UTF8
0 commit comments