Skip to content

Commit

Permalink
Fixed a problem with multibyte characters being split in the middle
Browse files Browse the repository at this point in the history
in mail header encoding, because (some?) mail clients do not merge
the bytes back together into the original multibyte character,
resulting in broken output.
  • Loading branch information
mmakaay committed Jan 4, 2012
1 parent e6fe80a commit 1d5dc13
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 29 deletions.
3 changes: 3 additions & 0 deletions common.php
Expand Up @@ -422,6 +422,9 @@
$PHORUM["DATA"]["HCHARSET"] = $PHORUM["DATA"]["CHARSET"];
}

// Set the internal encoding for mbstring functions.
mb_internal_encoding($PHORUM['DATA']['CHARSET']);

// HTML titles can't contain HTML code, so we strip HTML tags
// and HTML escape the title.
$PHORUM["DATA"]["HTML_TITLE"] = htmlspecialchars(strip_tags($PHORUM["DATA"]["HTML_TITLE"]), ENT_COMPAT, $PHORUM["DATA"]["HCHARSET"]);
Expand Down
56 changes: 30 additions & 26 deletions include/api/mail.php
Expand Up @@ -515,29 +515,33 @@ function phorum_api_mail_encode_header($string)
}
}

// No more characters left? Then we are done.
if ($len == 0) break;
// Encode unsafe chars.
while ($len > 0 && strcspn($string, $safe_chars, $cursor) > 0)
{
// Check how many bytes long the following character is.
//
// We found that mail clients do not handle multibyte characters
// correctly when their bytes are split over wrapped encoding
// lines. The clients do not put the separated bytes back
// together, resulting in broken characters in the output.
$mb_char = mb_substr(substr($string, $cursor), 0, 1);
$mb_len = strlen($mb_char);

// Check how many unsafe chars in a row we can find in the string
// from the current cursor position on.
$count = strcspn($string, $safe_chars, $cursor);
// From the RFC:
// "(General 8bit representation) Any octet, except a CR or LF that
// is part of a CRLF line break of the canonical (standard) form
// of the data being encoded, may be represented by an "=" followed
// by a two digit hexadecimal representation of the octet's value.
// The digits of the hexadecimal alphabet, for this purpose, are
// "0123456789ABCDEF". Uppercase letters must be used; lowercase
// letters are not allowed."

// From the RFC:
// "(General 8bit representation) Any octet, except a CR or LF that
// is part of a CRLF line break of the canonical (standard) form
// of the data being encoded, may be represented by an "=" followed
// by a two digit hexadecimal representation of the octet's value.
// The digits of the hexadecimal alphabet, for this purpose, are
// "0123456789ABCDEF". Uppercase letters must be used; lowercase
// letters are not allowed."
while ($count > 0)
{
// From the RFC:
// "(Line Breaks) A line break in a text body, represented
// as a CRLF sequence in the text canonical form, must be
// represented by a (RFC 822) line break, which is also a
// CRLF sequence"
if ($string[$cursor] == "\r" &&
if ($mb_char == "\r" &&
isset($string[$cursor+1]) &&
$string[$cursor + 1] == "\n") {
$res .= "\r\n\t";
Expand All @@ -550,24 +554,24 @@ function phorum_api_mail_encode_header($string)
else
{
// If we are at the end of the line, then wrap around with
// a soft break. We take 3 characters into account to
// a soft break. We take 3 characters into account per byte to
// take care of the "=XX" encoding.
if (($linecursor + 3) >= RFC2045_WRAPLEN) {
if (($linecursor + $mb_len * 3) >= RFC2045_WRAPLEN) {
$res .= "$postfix\r\n\t$prefix";
$linecursor = $prefixlen;
}

// Add the escaped character.
$res .= sprintf('=%02X', ord($string[$cursor]));
$cursor ++;
$linecursor += 3;
$count--;
$len--;
for ($pos = 0; $pos < $mb_len; $pos++) {
$res .= sprintf('=%02X', ord($mb_char[$pos]));
}

// Update counters.
$cursor += $mb_len;
$linecursor += $mb_len * 3;
$len -= $mb_len;
}
}

// No more characters left? Then we are done.
if ($len == 0) break;
}

// Add the closing postfix.
Expand Down
16 changes: 13 additions & 3 deletions mods/compat_mbstring/compat_mbstring.php
Expand Up @@ -2,6 +2,10 @@

if (!defined('PHORUM')) return;

// Seed the encoding used by the mbstring fallback functions in this file:
// take the configured Phorum charset when one is set, otherwise use UTF-8.
if (isset($GLOBALS['PHORUM']['DATA']['CHARSET'])) {
    $GLOBALS['PHORUM']['compat_mbstring']['encoding'] =
        $GLOBALS['PHORUM']['DATA']['CHARSET'];
} else {
    $GLOBALS['PHORUM']['compat_mbstring']['encoding'] = 'UTF-8';
}

if (!function_exists('mb_substr'))
{
function mb_substr($str, $offset, $length = NULL, $encoding = NULL)
Expand All @@ -11,9 +15,7 @@ function mb_substr($str, $offset, $length = NULL, $encoding = NULL)
if ($length !== NULL) settype($length, 'int');

if ($encoding === NULL) {
global $PHORUM;
$encoding = isset($PHORUM['DATA']['CHARSET'])
? $PHORUM['DATA']['CHARSET'] : 'utf-8';
$encoding = $GLOBALS['PHORUM']['compat_mbstring']['encoding'];
}

// For non-UTF-8 data, we fallback to substr().
Expand Down Expand Up @@ -105,4 +107,12 @@ function mb_substr($str, $offset, $length = NULL, $encoding = NULL)
}
}

if (!function_exists('mb_internal_encoding'))
{
    /**
     * Fallback for mb_internal_encoding(), defined only when no function
     * with that name exists yet.
     *
     * The encoding is kept in
     * $GLOBALS['PHORUM']['compat_mbstring']['encoding'].
     *
     * To stay call-compatible with the native function, the argument is
     * optional: without it the currently stored encoding is returned,
     * with it the encoding is stored and TRUE is returned.
     *
     * @param string|NULL $encoding
     *     The encoding to store, or NULL to only query the stored one.
     *
     * @return mixed
     *     The stored encoding (string) when $encoding is NULL,
     *     TRUE after storing a new encoding.
     */
    function mb_internal_encoding($encoding = NULL)
    {
        if ($encoding === NULL) {
            // Getter mode. Fall back to UTF-8 if nothing was stored yet,
            // so this never triggers an undefined index notice.
            return isset($GLOBALS['PHORUM']['compat_mbstring']['encoding'])
                ? $GLOBALS['PHORUM']['compat_mbstring']['encoding']
                : 'UTF-8';
        }

        $GLOBALS['PHORUM']['compat_mbstring']['encoding'] = $encoding;

        // The native function reports success with TRUE; mirror that so
        // callers that check the return value keep working.
        return TRUE;
    }
}

?>

0 comments on commit 1d5dc13

Please sign in to comment.