From 347d8252fbb7cd5289cb9da97ec17ac8b9cce63f Mon Sep 17 00:00:00 2001 From: Thomas Calvet Date: Wed, 1 Jan 2020 14:44:24 +0100 Subject: [PATCH] [String] Made AbstractString::width() follow POSIX.1-2001 Co-authored-by: Nicolas Grekas --- src/Symfony/Component/String/.gitattributes | 2 + .../Component/String/AbstractString.php | 3 + .../String/AbstractUnicodeString.php | 96 +- src/Symfony/Component/String/ByteString.php | 28 +- src/Symfony/Component/String/CHANGELOG.md | 1 + .../Resources/WcswidthDataGenerator.php | 113 ++ .../String/Resources/bin/update-data.php | 55 + .../Resources/data/wcswidth_table_wide.php | 1095 ++++++++++++++ .../Resources/data/wcswidth_table_zero.php | 1303 +++++++++++++++++ .../String/Tests/AbstractAsciiTestCase.php | 31 + .../Component/String/Tests/ByteStringTest.php | 11 + src/Symfony/Component/String/composer.json | 4 + 12 files changed, 2704 insertions(+), 38 deletions(-) create mode 100644 src/Symfony/Component/String/Resources/WcswidthDataGenerator.php create mode 100644 src/Symfony/Component/String/Resources/bin/update-data.php create mode 100644 src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php create mode 100644 src/Symfony/Component/String/Resources/data/wcswidth_table_zero.php diff --git a/src/Symfony/Component/String/.gitattributes b/src/Symfony/Component/String/.gitattributes index ebb9287043dc..4a7ef98aba42 100644 --- a/src/Symfony/Component/String/.gitattributes +++ b/src/Symfony/Component/String/.gitattributes @@ -1,3 +1,5 @@ +/Resources/bin/update-data.php export-ignore +/Resources/WcswidthDataGenerator.php export-ignore /Tests export-ignore /phpunit.xml.dist export-ignore /.gitignore export-ignore diff --git a/src/Symfony/Component/String/AbstractString.php b/src/Symfony/Component/String/AbstractString.php index ec981176d25d..122b6beb68a7 100644 --- a/src/Symfony/Component/String/AbstractString.php +++ b/src/Symfony/Component/String/AbstractString.php @@ -646,6 +646,9 @@ public function truncate(int $length, string $ellipsis = ''): self */ abstract public function upper(): self; + /** + * Returns the printable length on a terminal. + */ abstract public function width(bool $ignoreAnsiDecoration = true): int; /** diff --git a/src/Symfony/Component/String/AbstractUnicodeString.php b/src/Symfony/Component/String/AbstractUnicodeString.php index 08d87ccde8ee..e833f6f2bdb6 100644 --- a/src/Symfony/Component/String/AbstractUnicodeString.php +++ b/src/Symfony/Component/String/AbstractUnicodeString.php @@ -352,9 +352,6 @@ public function replaceMatches(string $fromRegexp, $to): parent return $str; } - /** - * {@inheritdoc} - */ public function reverse(): parent { $str = clone $this; @@ -444,22 +441,21 @@ public function width(bool $ignoreAnsiDecoration = true): int $s = str_replace(["\r\n", "\r"], "\n", $s); } + if (!$ignoreAnsiDecoration) { + $s = preg_replace('/[\p{Cc}\x7F]++/u', '', $s); + } + foreach (explode("\n", $s) as $s) { if ($ignoreAnsiDecoration) { - $s = preg_replace('/\x1B(?: + $s = preg_replace('/(?:\x1B(?: \[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E] | [P\]X^_] .*? \x1B\\\\ | [\x41-\x7E] - )/x', '', $s); + )|[\p{Cc}\x7F]++)/xu', '', $s); } - $w = substr_count($s, "\xAD") - substr_count($s, "\x08"); - $s = preg_replace('/[\x00\x05\x07\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11FF}\x{200B}]+/u', '', $s); - $s = preg_replace('/[\x{1100}-\x{115F}\x{2329}\x{232A}\x{2E80}-\x{303E}\x{3040}-\x{A4CF}\x{AC00}-\x{D7A3}\x{F900}-\x{FAFF}\x{FE10}-\x{FE19}\x{FE30}-\x{FE6F}\x{FF00}-\x{FF60}\x{FFE0}-\x{FFE6}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}]/u', '', $s, -1, $wide); - - if ($width < $w += mb_strlen($s, 'UTF-8') + ($wide << 1)) { - $width = $w; - } + // Non printable characters have been dropped, so wcswidth cannot logically return -1. + $width += $this->wcswidth($s); } return $width; @@ -503,4 +499,80 @@ private function pad(int $len, self $pad, int $type): parent throw new InvalidArgumentException('Invalid padding type.'); } } + + /** + * Based on https://github.com/jquast/wcwidth, a Python implementation of https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c. + */ + private function wcswidth(string $string): int + { + $width = 0; + + foreach (preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY) as $c) { + $codePoint = mb_ord($c, 'UTF-8'); + + if (0 === $codePoint // NULL + || 0x034F === $codePoint // COMBINING GRAPHEME JOINER + || (0x200B <= $codePoint && 0x200F >= $codePoint) // ZERO WIDTH SPACE to RIGHT-TO-LEFT MARK + || 0x2028 === $codePoint // LINE SEPARATOR + || 0x2029 === $codePoint // PARAGRAPH SEPARATOR + || (0x202A <= $codePoint && 0x202E >= $codePoint) // LEFT-TO-RIGHT EMBEDDING to RIGHT-TO-LEFT OVERRIDE + || (0x2060 <= $codePoint && 0x2063 >= $codePoint) // WORD JOINER to INVISIBLE SEPARATOR + ) { + continue; + } + + // Non printable characters + if (32 > $codePoint // C0 control characters + || (0x07F <= $codePoint && 0x0A0 > $codePoint) // C1 control characters and DEL + ) { + return -1; + } + + static $tableZero; + if (null === $tableZero) { + $tableZero = require __DIR__.'/Resources/data/wcswidth_table_zero.php'; + } + + if ($codePoint >= $tableZero[0][0] && $codePoint <= $tableZero[$ubound = \count($tableZero) - 1][1]) { + $lbound = 0; + while ($ubound >= $lbound) { + $mid = floor(($lbound + $ubound) / 2); + + if ($codePoint > $tableZero[$mid][1]) { + $lbound = $mid + 1; + } elseif ($codePoint < $tableZero[$mid][0]) { + $ubound = $mid - 1; + } else { + continue 2; + } + } + } + + static $tableWide; + if (null === $tableWide) { + $tableWide = require __DIR__.'/Resources/data/wcswidth_table_wide.php'; + } + + if ($codePoint >= $tableWide[0][0] && $codePoint <= $tableWide[$ubound = \count($tableWide) - 1][1]) { + $lbound = 0; + while ($ubound >= $lbound) { + $mid = floor(($lbound + $ubound) / 2); + + if ($codePoint > $tableWide[$mid][1]) { + $lbound = $mid + 1; + } elseif ($codePoint < $tableWide[$mid][0]) { + $ubound = $mid - 1; + } else { + $width += 2; + + continue 2; + } + } + } + + ++$width; + } + + return $width; + } } diff --git a/src/Symfony/Component/String/ByteString.php b/src/Symfony/Component/String/ByteString.php index ab44882ceade..7b83cb05c762 100644 --- a/src/Symfony/Component/String/ByteString.php +++ b/src/Symfony/Component/String/ByteString.php @@ -303,9 +303,6 @@ public function replaceMatches(string $fromRegexp, $to): parent return $str; } - /** - * {@inheritdoc} - */ public function reverse(): parent { $str = clone $this; @@ -460,29 +457,8 @@ public function upper(): parent public function width(bool $ignoreAnsiDecoration = true): int { - $width = 0; - $s = str_replace(["\x00", "\x05", "\x07"], '', $this->string); + $string = preg_match('//u', $this->string) ? $this->string : preg_replace('/[\x80-\xFF]/', '?', $this->string); - if (false !== strpos($s, "\r")) { - $s = str_replace(["\r\n", "\r"], "\n", $s); - } - - foreach (explode("\n", $s) as $s) { - if ($ignoreAnsiDecoration) { - $s = preg_replace('/\x1B(?: - \[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E] - | [P\]X^_] .*? \x1B\\\\ - | [\x41-\x7E] - )/x', '', $s); - } - - $w = substr_count($s, "\xAD") - substr_count($s, "\x08"); - - if ($width < $w += \strlen($s)) { - $width = $w; - } - } - - return $width; + return (new CodePointString($string))->width($ignoreAnsiDecoration); } } diff --git a/src/Symfony/Component/String/CHANGELOG.md b/src/Symfony/Component/String/CHANGELOG.md index 050c734f8982..819b6ef59558 100644 --- a/src/Symfony/Component/String/CHANGELOG.md +++ b/src/Symfony/Component/String/CHANGELOG.md @@ -5,6 +5,7 @@ CHANGELOG ----- * Added the `AbstractString::reverse()` method. + * Made `AbstractString::width()` follow POSIX.1-2001. 5.0.0 ----- diff --git a/src/Symfony/Component/String/Resources/WcswidthDataGenerator.php b/src/Symfony/Component/String/Resources/WcswidthDataGenerator.php new file mode 100644 index 000000000000..cd507350d6a8 --- /dev/null +++ b/src/Symfony/Component/String/Resources/WcswidthDataGenerator.php @@ -0,0 +1,113 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\Component\String\Resources; + +use Symfony\Component\HttpClient\HttpClient; +use Symfony\Component\String\Exception\RuntimeException; +use Symfony\Component\VarExporter\VarExporter; + +/** + * @internal + */ +final class WcswidthDataGenerator +{ + private $outDir; + + private $client; + + public function __construct(string $outDir) + { + $this->outDir = $outDir; + + $this->client = HttpClient::createForBaseUri('https://www.unicode.org/Public/UNIDATA/'); + } + + public function generate(): void + { + $this->writeWideWidthData(); + + $this->writeZeroWidthData(); + } + + private function writeWideWidthData(): void + { + if (!preg_match('/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/', $content = $this->client->request('GET', 'EastAsianWidth.txt')->getContent(), $matches)) { + throw new RuntimeException('The Unicode version could not be determined.'); + } + + $version = $matches[1]; + + if (!preg_match_all('/^([A-H\d]{4,})(?:\.\.([A-H\d]{4,}))?;[W|F]/m', $content, $matches, PREG_SET_ORDER)) { + throw new RuntimeException('The wide width pattern did not match anything.'); + } + + $this->write('wcswidth_table_wide.php', $version, $matches); + } + + private function writeZeroWidthData(): void + { + if (!preg_match('/^# DerivedGeneralCategory-(\d+\.\d+\.\d+)\.txt/', $content = $this->client->request('GET', 'extracted/DerivedGeneralCategory.txt')->getContent(), $matches)) { + throw new RuntimeException('The Unicode version could not be determined.'); + } + + $version = $matches[1]; + + if (!preg_match_all('/^([A-H\d]{4,})(?:\.\.([A-H\d]{4,}))? *; (?:Me|Mn)/m', $content, $matches, PREG_SET_ORDER)) { + throw new RuntimeException('The zero width pattern did not match anything.'); + } + + $this->write('wcswidth_table_zero.php', $version, $matches); + } + + private function write(string $fileName, string $version, array $rawData): void + { + $content = $this->getHeader($version).'return '.VarExporter::export($this->format($rawData)).";\n"; + + if (!file_put_contents($this->outDir.'/'.$fileName, $content)) { + throw new RuntimeException(sprintf('The "%s" file could not be written.', $fileName)); + } + } + + private function getHeader(string $version): string + { + $date = (new \DateTimeImmutable())->format('c'); + + return << + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\Component\String\Resources\WcswidthDataGenerator; + +error_reporting(E_ALL); + +set_error_handler(static function (int $type, string $msg, string $file, int $line): void { + throw new \ErrorException($msg, 0, $type, $file, $line); +}); + +set_exception_handler(static function (\Throwable $exception): void { + echo "\n"; + + $cause = $exception; + $root = true; + + while (null !== $cause) { + if (!$root) { + echo "Caused by\n"; + } + + echo get_class($cause).': '.$cause->getMessage()."\n"; + echo "\n"; + echo $cause->getFile().':'.$cause->getLine()."\n"; + echo $cause->getTraceAsString()."\n"; + + $cause = $cause->getPrevious(); + $root = false; + } +}); + +$autoload = __DIR__.'/../../vendor/autoload.php'; + +if (!file_exists($autoload)) { + echo wordwrap('You should run "composer install" in the component before running this script.', 75)." Aborting.\n"; + + exit(1); +} + +require_once $autoload; + +echo "Generating wcswidth tables data...\n"; + +(new WcswidthDataGenerator(dirname(__DIR__).'/data'))->generate(); + +echo "Done.\n"; diff --git a/src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php b/src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php new file mode 100644 index 000000000000..18370667258c --- /dev/null +++ b/src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php @@ -0,0 +1,1095 @@ +assertSame($expected, static::createFromString($origin)->width($ignoreAnsiDecoration)); + } + + public static function provideWidth(): array + { + return [ + [0, ''], + [1, 'c'], + [3, 'foo'], + [2, '⭐'], + [8, 'f⭐o⭐⭐'], + [19, 'コンニチハ, セカイ!'], + [6, "foo\u{0000}bar"], + [6, "foo\u{001b}[0mbar"], + [6, "foo\u{0001}bar"], + [6, "foo\u{0001}bar", false], + [4, '--ֿ--'], + [4, 'café'], + [1, 'А҈'], + [4, 'ᬓᬨᬮ᭄'], + [1, "\u{00AD}"], + [14, "\u{007f}\u{007f}f\u{001b}[0moo\u{0001}bar\u{007f}cccïf\u{008e}cy\u{0005}1"], // foobarcccïfcy1 + [17, "\u{007f}\u{007f}f\u{001b}[0moo\u{0001}bar\u{007f}cccïf\u{008e}cy\u{0005}1", false], // f[0moobarcccïfcy1 + ]; + } } diff --git a/src/Symfony/Component/String/Tests/ByteStringTest.php b/src/Symfony/Component/String/Tests/ByteStringTest.php index b7a47a562f25..28dedb1fb418 100644 --- a/src/Symfony/Component/String/Tests/ByteStringTest.php +++ b/src/Symfony/Component/String/Tests/ByteStringTest.php @@ -43,4 +43,15 @@ public static function provideLength(): array ] ); } + + public static function provideWidth(): array + { + return array_merge( + parent::provideWidth(), + [ + [10, "f\u{001b}[0moo\x80bar\xfe\xfe1"], // foo?bar??1 + [13, "f\u{001b}[0moo\x80bar\xfe\xfe1", false], // f[0moo?bar??1 + ] + ); + } } diff --git a/src/Symfony/Component/String/composer.json b/src/Symfony/Component/String/composer.json index 97cd66b0b460..470caf4e2683 100644 --- a/src/Symfony/Component/String/composer.json +++ b/src/Symfony/Component/String/composer.json @@ -22,6 +22,10 @@ "symfony/polyfill-mbstring": "~1.0", "symfony/translation-contracts": "^1.1|^2" }, + "require-dev": { + "symfony/http-client": "^4.4|^5.0", + "symfony/var-exporter": "^4.4|^5.0" + }, "autoload": { "psr-4": { "Symfony\\Component\\String\\": "" }, "files": [ "Resources/functions.php" ],