Skip to content

Commit

Permalink
feature #35156 [String] Made AbstractString::width() follow POSIX.1-2…
Browse files Browse the repository at this point in the history
…001 (fancyweb)

This PR was merged into the 5.1-dev branch.

Discussion
----------

[String] Made AbstractString::width() follow POSIX.1-2001

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | no
| New feature?  | yes
| Deprecations? | no
| Tickets       | -
| License       | MIT
| Doc PR        | -

This PR ports the wcswidth() function (see http://man7.org/linux/man-pages/man3/wcwidth.3.html and https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c) into the String component. This new method will be useful in the Console component to determine how many columns a character takes.

The data import strategy is largely modeled on the one used by the Intl component.

Commits
-------

347d825 [String] Made AbstractString::width() follow POSIX.1-2001
  • Loading branch information
fabpot committed Jan 30, 2020
2 parents e493752 + 347d825 commit 75fc3fa
Show file tree
Hide file tree
Showing 12 changed files with 2,704 additions and 38 deletions.
2 changes: 2 additions & 0 deletions src/Symfony/Component/String/.gitattributes
@@ -1,3 +1,5 @@
/Resources/bin/update-data.php export-ignore
/Resources/WcswidthDataGenerator.php export-ignore
/Tests export-ignore
/phpunit.xml.dist export-ignore
/.gitignore export-ignore
3 changes: 3 additions & 0 deletions src/Symfony/Component/String/AbstractString.php
Expand Up @@ -646,6 +646,9 @@ public function truncate(int $length, string $ellipsis = ''): self
*/
abstract public function upper(): self;

/**
 * Returns the printable length on a terminal, i.e. the number of columns the string occupies when displayed.
 *
 * @param bool $ignoreAnsiDecoration Presumably whether ANSI escape sequences are left out of the measured width;
 *                                   confirm against the concrete implementations
 */
abstract public function width(bool $ignoreAnsiDecoration = true): int;

/**
Expand Down
96 changes: 84 additions & 12 deletions src/Symfony/Component/String/AbstractUnicodeString.php
Expand Up @@ -352,9 +352,6 @@ public function replaceMatches(string $fromRegexp, $to): parent
return $str;
}

/**
* {@inheritdoc}
*/
public function reverse(): parent
{
$str = clone $this;
Expand Down Expand Up @@ -444,22 +441,21 @@ public function width(bool $ignoreAnsiDecoration = true): int
$s = str_replace(["\r\n", "\r"], "\n", $s);
}

if (!$ignoreAnsiDecoration) {
$s = preg_replace('/[\p{Cc}\x7F]++/u', '', $s);
}

foreach (explode("\n", $s) as $s) {
if ($ignoreAnsiDecoration) {
$s = preg_replace('/\x1B(?:
$s = preg_replace('/(?:\x1B(?:
\[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E]
| [P\]X^_] .*? \x1B\\\\
| [\x41-\x7E]
)/x', '', $s);
)|[\p{Cc}\x7F]++)/xu', '', $s);
}

$w = substr_count($s, "\xAD") - substr_count($s, "\x08");
$s = preg_replace('/[\x00\x05\x07\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11FF}\x{200B}]+/u', '', $s);
$s = preg_replace('/[\x{1100}-\x{115F}\x{2329}\x{232A}\x{2E80}-\x{303E}\x{3040}-\x{A4CF}\x{AC00}-\x{D7A3}\x{F900}-\x{FAFF}\x{FE10}-\x{FE19}\x{FE30}-\x{FE6F}\x{FF00}-\x{FF60}\x{FFE0}-\x{FFE6}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}]/u', '', $s, -1, $wide);

if ($width < $w += mb_strlen($s, 'UTF-8') + ($wide << 1)) {
$width = $w;
}
// Non printable characters have been dropped, so wcswidth cannot logically return -1.
$width += $this->wcswidth($s);
}

return $width;
Expand Down Expand Up @@ -503,4 +499,80 @@ private function pad(int $len, self $pad, int $type): parent
throw new InvalidArgumentException('Invalid padding type.');
}
}

/**
 * Computes the number of columns needed to display the given string.
 *
 * Based on https://github.com/jquast/wcwidth, a Python implementation of https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
 *
 * Every code point contributes 0 columns (explicitly ignored code points and those listed in the
 * "zero" table), 2 columns (code points listed in the "wide" table) or 1 column (everything else).
 *
 * @param string $string A string that is split into code points with a UTF-8 aware regular expression
 *
 * @return int The accumulated width, or -1 as soon as a C0/C1 control character or DEL is encountered
 */
private function wcswidth(string $string): int
{
    // The tables are loaded lazily, at most once, and only when a code point actually needs them.
    static $tableZero, $tableWide;

    $width = 0;

    foreach (preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY) as $c) {
        $codePoint = mb_ord($c, 'UTF-8');

        if (0 === $codePoint // NULL
            || 0x034F === $codePoint // COMBINING GRAPHEME JOINER
            || (0x200B <= $codePoint && 0x200F >= $codePoint) // ZERO WIDTH SPACE to RIGHT-TO-LEFT MARK
            || 0x2028 === $codePoint // LINE SEPARATOR
            || 0x2029 === $codePoint // PARAGRAPH SEPARATOR
            || (0x202A <= $codePoint && 0x202E >= $codePoint) // LEFT-TO-RIGHT EMBEDDING to RIGHT-TO-LEFT OVERRIDE
            || (0x2060 <= $codePoint && 0x2063 >= $codePoint) // WORD JOINER to INVISIBLE SEPARATOR
        ) {
            continue;
        }

        // Non printable characters
        if (32 > $codePoint // C0 control characters
            || (0x07F <= $codePoint && 0x0A0 > $codePoint) // C1 control characters and DEL
        ) {
            return -1;
        }

        if (null === $tableZero) {
            $tableZero = require __DIR__.'/Resources/data/wcswidth_table_zero.php';
        }

        if (self::isInWcswidthTable($codePoint, $tableZero)) {
            continue;
        }

        if (null === $tableWide) {
            $tableWide = require __DIR__.'/Resources/data/wcswidth_table_wide.php';
        }

        $width += self::isInWcswidthTable($codePoint, $tableWide) ? 2 : 1;
    }

    return $width;
}

/**
 * Tells whether a code point falls inside one of the ranges of a wcswidth table.
 *
 * The lookup is a binary search, so the table is expected to be a list of [first, last]
 * pairs sorted by ascending code point.
 */
private static function isInWcswidthTable(int $codePoint, array $table): bool
{
    $lower = 0;
    $upper = \count($table) - 1;

    // Cheap rejection of code points outside the span covered by the whole table.
    if (0 > $upper || $codePoint < $table[0][0] || $codePoint > $table[$upper][1]) {
        return false;
    }

    while ($lower <= $upper) {
        // Integer midpoint: keeps the array offsets and the bounds as ints.
        $middle = ($lower + $upper) >> 1;

        if ($codePoint > $table[$middle][1]) {
            $lower = $middle + 1;
        } elseif ($codePoint < $table[$middle][0]) {
            $upper = $middle - 1;
        } else {
            return true;
        }
    }

    return false;
}
}
28 changes: 2 additions & 26 deletions src/Symfony/Component/String/ByteString.php
Expand Up @@ -303,9 +303,6 @@ public function replaceMatches(string $fromRegexp, $to): parent
return $str;
}

/**
* {@inheritdoc}
*/
public function reverse(): parent
{
$str = clone $this;
Expand Down Expand Up @@ -460,29 +457,8 @@ public function upper(): parent

public function width(bool $ignoreAnsiDecoration = true): int
{
$width = 0;
$s = str_replace(["\x00", "\x05", "\x07"], '', $this->string);
$string = preg_match('//u', $this->string) ? $this->string : preg_replace('/[\x80-\xFF]/', '?', $this->string);

if (false !== strpos($s, "\r")) {
$s = str_replace(["\r\n", "\r"], "\n", $s);
}

foreach (explode("\n", $s) as $s) {
if ($ignoreAnsiDecoration) {
$s = preg_replace('/\x1B(?:
\[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E]
| [P\]X^_] .*? \x1B\\\\
| [\x41-\x7E]
)/x', '', $s);
}

$w = substr_count($s, "\xAD") - substr_count($s, "\x08");

if ($width < $w += \strlen($s)) {
$width = $w;
}
}

return $width;
return (new CodePointString($string))->width($ignoreAnsiDecoration);
}
}
1 change: 1 addition & 0 deletions src/Symfony/Component/String/CHANGELOG.md
Expand Up @@ -5,6 +5,7 @@ CHANGELOG
-----

* Added the `AbstractString::reverse()` method.
* Made `AbstractString::width()` follow POSIX.1-2001.

5.0.0
-----
Expand Down
113 changes: 113 additions & 0 deletions src/Symfony/Component/String/Resources/WcswidthDataGenerator.php
@@ -0,0 +1,113 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\Component\String\Resources;

use Symfony\Component\HttpClient\HttpClient;
use Symfony\Component\String\Exception\RuntimeException;
use Symfony\Component\VarExporter\VarExporter;

/**
 * Generates the PHP lookup tables used to compute the display width of strings.
 *
 * The raw data is downloaded from https://www.unicode.org/Public/UNIDATA/ and two files are
 * written to the output directory: "wcswidth_table_wide.php" and "wcswidth_table_zero.php".
 * Each file returns a list of [first code point, last code point] pairs sorted in ascending order.
 *
 * @internal
 */
final class WcswidthDataGenerator
{
    /**
     * Matches the lines of EastAsianWidth.txt whose property value starts with W or F
     * (wide and fullwidth in UAX #11). Padding around the semicolon is optional so that
     * both compact ("1100..115F;W") and column-aligned ("1100..115F  ; W") layouts are accepted.
     */
    private const WIDE_WIDTH_PATTERN = '/^([A-F\d]{4,})(?:\.\.([A-F\d]{4,}))? *; *[WF]/m';

    /**
     * Matches the lines of DerivedGeneralCategory.txt whose general category is Me or Mn.
     */
    private const ZERO_WIDTH_PATTERN = '/^([A-F\d]{4,})(?:\.\.([A-F\d]{4,}))? *; *(?:Me|Mn)/m';

    private $outDir;

    private $client;

    /**
     * @param string $outDir The directory the generated table files are written to
     */
    public function __construct(string $outDir)
    {
        $this->outDir = $outDir;

        $this->client = HttpClient::createForBaseUri('https://www.unicode.org/Public/UNIDATA/');
    }

    /**
     * Downloads the Unicode data files and (re)writes both tables.
     *
     * @throws RuntimeException When a data file has an unexpected format or a table cannot be written
     */
    public function generate(): void
    {
        $this->writeWideWidthData();

        $this->writeZeroWidthData();
    }

    /**
     * Builds the table of the code points matched by WIDE_WIDTH_PATTERN.
     */
    private function writeWideWidthData(): void
    {
        $content = $this->client->request('GET', 'EastAsianWidth.txt')->getContent();

        if (!preg_match('/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/', $content, $matches)) {
            throw new RuntimeException('The Unicode version could not be determined.');
        }

        $version = $matches[1];

        if (!preg_match_all(self::WIDE_WIDTH_PATTERN, $content, $matches, PREG_SET_ORDER)) {
            throw new RuntimeException('The wide width pattern did not match anything.');
        }

        $this->write('wcswidth_table_wide.php', $version, $matches);
    }

    /**
     * Builds the table of the code points matched by ZERO_WIDTH_PATTERN.
     */
    private function writeZeroWidthData(): void
    {
        $content = $this->client->request('GET', 'extracted/DerivedGeneralCategory.txt')->getContent();

        if (!preg_match('/^# DerivedGeneralCategory-(\d+\.\d+\.\d+)\.txt/', $content, $matches)) {
            throw new RuntimeException('The Unicode version could not be determined.');
        }

        $version = $matches[1];

        if (!preg_match_all(self::ZERO_WIDTH_PATTERN, $content, $matches, PREG_SET_ORDER)) {
            throw new RuntimeException('The zero width pattern did not match anything.');
        }

        $this->write('wcswidth_table_zero.php', $version, $matches);
    }

    /**
     * Dumps the formatted ranges as a PHP file returning an array.
     *
     * @param array $rawData The match sets returned by preg_match_all() with PREG_SET_ORDER
     */
    private function write(string $fileName, string $version, array $rawData): void
    {
        $content = $this->getHeader($version).'return '.VarExporter::export($this->format($rawData)).";\n";

        // Only an explicit failure is an error; the byte count itself carries no meaning here.
        if (false === file_put_contents($this->outDir.'/'.$fileName, $content)) {
            throw new RuntimeException(sprintf('The "%s" file could not be written.', $fileName));
        }
    }

    /**
     * Returns the opening tag and the comment placed at the top of every generated file.
     *
     * The header ends with a blank line so that the "return" statement appended by write()
     * starts on its own line instead of directly after the closing comment marker.
     */
    private function getHeader(string $version): string
    {
        $date = (new \DateTimeImmutable())->format('c');

        return <<<EOT
<?php

/*
 * This file has been auto-generated by the Symfony String Component for internal use.
 *
 * Unicode version: $version
 * Date: $date
 */


EOT;
    }

    /**
     * Converts regex match sets into [first, last] integer pairs sorted by first code point.
     *
     * A match without an upper bound (single code point) yields a pair whose two values are equal.
     */
    private function format(array $rawData): array
    {
        $data = array_map(static function (array $row): array {
            $start = $row[1];
            $end = $row[2] ?? $start;

            return [hexdec($start), hexdec($end)];
        }, $rawData);

        usort($data, static function (array $a, array $b): int {
            return $a[0] <=> $b[0];
        });

        return $data;
    }
}
55 changes: 55 additions & 0 deletions src/Symfony/Component/String/Resources/bin/update-data.php
@@ -0,0 +1,55 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

use Symfony\Component\String\Resources\WcswidthDataGenerator;

// Report everything and turn every reported PHP error into an exception.
error_reporting(E_ALL);

set_error_handler(static function (int $severity, string $message, string $file, int $line): void {
    throw new \ErrorException($message, 0, $severity, $file, $line);
});

// Print an uncaught exception followed by its chain of previous exceptions.
set_exception_handler(static function (\Throwable $exception): void {
    echo "\n";

    for ($current = $exception; null !== $current; $current = $current->getPrevious()) {
        // Every exception but the outermost one is introduced as a cause.
        if ($current !== $exception) {
            echo "Caused by\n";
        }

        echo get_class($current), ': ', $current->getMessage(), "\n";
        echo "\n";
        echo $current->getFile(), ':', $current->getLine(), "\n";
        echo $current->getTraceAsString(), "\n";
    }
});

// The autoloader of the component is expected two directories above this script.
$autoloadFile = __DIR__.'/../../vendor/autoload.php';

if (!file_exists($autoloadFile)) {
    echo wordwrap('You should run "composer install" in the component before running this script.', 75)." Aborting.\n";

    exit(1);
}

require_once $autoloadFile;

echo "Generating wcswidth tables data...\n";

// The tables are written to the "data" directory that sits next to this "bin" directory.
$generator = new WcswidthDataGenerator(dirname(__DIR__).'/data');
$generator->generate();

echo "Done.\n";

0 comments on commit 75fc3fa

Please sign in to comment.