Skip to content

Commit

Permalink
Merge pull request #9005 from chinpei215/mb-truncate
Browse files Browse the repository at this point in the history
Add wide character support to `Text::truncate()`.
  • Loading branch information
markstory committed Jul 11, 2016
2 parents 14f39eb + 1fa7bed commit 81e8d11
Show file tree
Hide file tree
Showing 2 changed files with 376 additions and 111 deletions.
296 changes: 214 additions & 82 deletions src/Utility/Text.php
Expand Up @@ -554,6 +554,7 @@ public static function tail($text, $length = 100, array $options = [])
* - `ellipsis` Will be used as ending and appended to the trimmed string
* - `exact` If false, $text will not be cut mid-word
* - `html` If true, HTML tags would be handled correctly
* - `trimWidth` If true, $text will be truncated by display width (wide characters count as 2) instead of by character count
*
* @param string $text String to truncate.
* @param int $length Length of returned string, including ellipsis.
Expand All @@ -564,124 +565,255 @@ public static function tail($text, $length = 100, array $options = [])
public static function truncate($text, $length = 100, array $options = [])
{
    $default = [
        'ellipsis' => '...', 'exact' => true, 'html' => false, 'trimWidth' => false,
    ];
    // In HTML mode with a UTF-8 internal encoding, default to the single
    // HORIZONTAL ELLIPSIS character (U+2026) instead of three dots.
    if (!empty($options['html']) && strtolower(mb_internal_encoding()) === 'utf-8') {
        $default['ellipsis'] = "\xe2\x80\xa6";
    }
    $options += $default;

    // The result is assembled as: $prefix . <cut text node> . $suffix
    $prefix = '';
    $suffix = $options['ellipsis'];

    if ($options['html']) {
        $ellipsisLength = self::_strlen(strip_tags($options['ellipsis']), $options);

        $truncateLength = 0;
        $totalLength = 0;
        $openTags = [];
        $truncate = '';

        // Void elements never get a closing tag, so they are not tracked as open.
        // The pattern is anchored on the whole tag name: an unanchored alternation
        // would also match names that merely contain one of these (`abbr` contains
        // `br`, `colgroup` contains `col`, `iframe` contains `frame`) and their
        // closing tags would then be missing from the output.
        $voidElements = '/^(?:img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)$/i';

        // Each match is: [0] whole match, [1] optional tag, [2] tag name, [3] text after the tag.
        preg_match_all('/(<\/?([\w+]+)[^>]*>)?([^<>]*)/', $text, $tags, PREG_SET_ORDER);
        foreach ($tags as $tag) {
            $contentLength = self::_strlen($tag[3], $options);

            // Tags are only tracked until the text node that has to be cut is found.
            if ($truncate === '') {
                if (!preg_match($voidElements, $tag[2])) {
                    if (preg_match('/<[\w]+[^>]*>/', $tag[0])) {
                        array_unshift($openTags, $tag[2]);
                    } elseif (preg_match('/<\/([\w]+)[^>]*>/', $tag[0], $closeTag)) {
                        $pos = array_search($closeTag[1], $openTags, true);
                        if ($pos !== false) {
                            array_splice($openTags, $pos, 1);
                        }
                    }
                }

                $prefix .= $tag[1];

                if ($totalLength + $contentLength + $ellipsisLength > $length) {
                    // This text node is the one to cut; remember how much room is left for it.
                    $truncate = $tag[3];
                    $truncateLength = $length - $totalLength;
                } else {
                    $prefix .= $tag[3];
                }
            }

            $totalLength += $contentLength;
            if ($totalLength > $length) {
                break;
            }
        }

        // The visible text fits: return the input untouched.
        if ($totalLength <= $length) {
            return $text;
        }

        // From here on only the text node being cut is processed.
        $text = $truncate;
        $length = $truncateLength;

        // Close every tag that is still open after the ellipsis.
        foreach ($openTags as $tag) {
            $suffix .= '</' . $tag . '>';
        }
    } else {
        if (self::_strlen($text, $options) <= $length) {
            return $text;
        }
        $ellipsisLength = self::_strlen($options['ellipsis'], $options);
    }

    $result = self::_substr($text, 0, $length - $ellipsisLength, $options);

    if (!$options['exact']) {
        // If the cut did not land exactly before a space, drop the partial last word.
        if (self::_substr($text, $length - $ellipsisLength, 1, $options) !== ' ') {
            $result = self::_removeLastWord($result);
        }

        // If result is empty, then we don't need to count ellipsis in the cut.
        if ($result === '') {
            $result = self::_substr($text, 0, $length, $options);
        }
    }

    return $prefix . $result . $suffix;
}

/**
 * Truncate text by display width instead of by character count.
 *
 * Shortcut for `truncate()` with the `trimWidth` option always enabled;
 * a `trimWidth` value supplied in `$options` is overridden.
 *
 * @param string $text String to truncate.
 * @param int $length Length of returned string, including ellipsis.
 * @param array $options Options forwarded to `truncate()`.
 * @return string Trimmed string.
 * @see \Cake\Utility\Text::truncate()
 */
public static function truncateByWidth($text, $length = 100, array $options = [])
{
    $options['trimWidth'] = true;

    return static::truncate($text, $length, $options);
}

/**
 * Get string length.
 *
 * ### Options:
 *
 * - `html` If true, HTML entities will be handled as decoded characters.
 * - `trimWidth` If true, the display width (`mb_strwidth()`) is returned
 *   instead of the character count (`mb_strlen()`).
 *
 * @param string $text The string being checked for length
 * @param array $options An array of options.
 * @return int
 */
protected static function _strlen($text, array $options)
{
    $strlen = empty($options['trimWidth']) ? 'mb_strlen' : 'mb_strwidth';

    if (empty($options['html'])) {
        return $strlen($text);
    }

    // Replace every entity with as many spaces as its decoded form measures,
    // so that e.g. `&amp;` counts as one character rather than five.
    $pattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';
    $replace = preg_replace_callback(
        $pattern,
        function ($match) use ($strlen) {
            $utf8 = html_entity_decode($match[0], ENT_HTML5 | ENT_QUOTES, 'UTF-8');

            return str_repeat(' ', $strlen($utf8, 'UTF-8'));
        },
        $text
    );

    return $strlen($replace);
}

/**
 * Return part of a string.
 *
 * `$start` is always a character position. `$length` is measured in
 * characters, or in display width when `trimWidth` is enabled.
 *
 * ### Options:
 *
 * - `html` If true, HTML entities will be handled as decoded characters.
 * - `trimWidth` If true, will be truncated with specified width.
 *
 * @param string $text The input string.
 * @param int $start The position to begin extracting. Negative values count from the end.
 * @param int|null $length The desired length. Null means "up to the end";
 *   a negative value leaves that many units off the end.
 * @param array $options An array of options.
 * @return string
 */
protected static function _substr($text, $start, $length, array $options)
{
    if (empty($options['trimWidth'])) {
        $substr = 'mb_substr';
    } else {
        $substr = 'mb_strimwidth';
    }

    // Positions are counted in characters even when lengths are counted in width.
    $maxPosition = self::_strlen($text, ['trimWidth' => false] + $options);
    if ($start < 0) {
        $start += $maxPosition;
        if ($start < 0) {
            $start = 0;
        }
    }
    if ($start >= $maxPosition) {
        return '';
    }

    if ($length === null) {
        $length = self::_strlen($text, $options);
    }

    if ($length < 0) {
        // Take everything from $start, then convert the negative length
        // into a positive one relative to that remainder.
        $text = self::_substr($text, $start, null, $options);
        $start = 0;
        $length += self::_strlen($text, $options);
    }

    if ($length <= 0) {
        return '';
    }

    if (empty($options['html'])) {
        return (string)$substr($text, $start, $length);
    }

    $totalOffset = 0;
    $totalLength = 0;
    $result = '';

    // Split into alternating plain-text and entity parts so that an entity is
    // either kept whole or dropped, never cut in the middle.
    $pattern = '/(&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};)/i';
    $parts = preg_split($pattern, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($parts as $part) {
        $offset = 0;

        // Skip parts that lie entirely before $start.
        if ($totalOffset < $start) {
            $len = self::_strlen($part, ['trimWidth' => false] + $options);
            if ($totalOffset + $len <= $start) {
                $totalOffset += $len;
                continue;
            }

            $offset = $start - $totalOffset;
            $totalOffset = $start;
        }

        $len = self::_strlen($part, $options);
        if ($offset !== 0 || $totalLength + $len > $length) {
            if (strpos($part, '&') === 0 && preg_match($pattern, $part)
                && $part !== html_entity_decode($part, ENT_HTML5 | ENT_QUOTES, 'UTF-8')
            ) {
                // Entities cannot be passed substr.
                continue;
            }

            $part = $substr($part, $offset, $length - $totalLength);
            $len = self::_strlen($part, $options);
        }

        $result .= $part;
        $totalLength += $len;
        if ($totalLength >= $length) {
            break;
        }
    }

    return $result;
}

/**
 * Removes the last word from the input text.
 *
 * The text is cut at its last space. If there is no space at all, an empty
 * string is returned. If the trailing segment contains full-width characters
 * it is not treated as a word and the text is returned unchanged.
 *
 * @param string $text The input text
 * @return string
 */
protected static function _removeLastWord($text)
{
    $spacepos = mb_strrpos($text, ' ');

    if ($spacepos !== false) {
        // Extract the trailing segment (from the last space onwards). Using
        // mb_strrpos() here would yield a position instead of the text, which
        // makes the width check below always succeed.
        $lastWord = mb_substr($text, $spacepos);

        // Some languages are written without word separation.
        // We recognize a string as a word if it doesn't contain any full-width characters.
        if (mb_strwidth($lastWord) === mb_strlen($lastWord)) {
            $text = mb_substr($text, 0, $spacepos);
        }

        return $text;
    }

    return '';
}

/**
Expand Down

0 comments on commit 81e8d11

Please sign in to comment.