Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feature #35415 Extracted code to expand an URI to
UriExpander
(lyrixx)
This PR was merged into the 5.1-dev branch. Discussion ---------- Extracted code to expand an URI to `UriExpander` | Q | A | ------------- | --- | Branch? | master | Bug fix? | no | New feature? | yes | Deprecations? | no | Tickets | | License | MIT | Doc PR | When building a crawler, we need to extract and expand all the links on a web page. At the moment, we need to create a DomDocument, attach the href, and ask for the full URL. This is a bit slow and unnecessary. This is why I extracted the minimal code needed to expand the URL into its own trait for better reusability. I benched (a specific part of) my application: * before: 2.16ms * after: 1.42ms Commits ------- 0c499c6 Extracted code to expand an URI to `UriExpanderTrait`
- Loading branch information
Showing
4 changed files
with
223 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
86 changes: 86 additions & 0 deletions
86
src/Symfony/Component/DomCrawler/Tests/UriExpanderTest.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
<?php | ||
|
||
/* | ||
* This file is part of the Symfony package. | ||
* | ||
* (c) Fabien Potencier <fabien@symfony.com> | ||
* | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
*/ | ||
|
||
namespace Symfony\Component\DomCrawler\Tests; | ||
|
||
use PHPUnit\Framework\TestCase; | ||
use Symfony\Component\DomCrawler\UriExpander; | ||
|
||
class UriExpanderTest extends TestCase
{
    /**
     * Checks that expanding $uri against $currentUri yields exactly $expected.
     *
     * @dataProvider provideExpandUriTests
     */
    public function testExpandUri(string $uri, string $currentUri, string $expected)
    {
        // assertSame() compares type and value, so the expanded URI must be
        // the exact expected string.
        $this->assertSame($expected, UriExpander::expand($uri, $currentUri));
    }

    /**
     * Provides [URI to expand, current URI, expected expanded URI] triples.
     */
    public static function provideExpandUriTests(): array
    {
        return [
            ['/foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['/foo', 'http://localhost/bar/foo', 'http://localhost/foo'],
            // leading and trailing whitespace (here a newline) around the URI is ignored
            ["\n/foo", 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ["/foo\n", 'http://localhost/bar/foo', 'http://localhost/foo'],

            ['foo', 'http://localhost/bar/foo/', 'http://localhost/bar/foo/foo'],
            ['foo', 'http://localhost/bar/foo', 'http://localhost/bar/foo'],

            ['', 'http://localhost/bar/', 'http://localhost/bar/'],
            ['#', 'http://localhost/bar/', 'http://localhost/bar/#'],
            ['#bar', 'http://localhost/bar?a=b', 'http://localhost/bar?a=b#bar'],
            ['#bar', 'http://localhost/bar/#foo', 'http://localhost/bar/#bar'],
            ['?a=b', 'http://localhost/bar#foo', 'http://localhost/bar?a=b'],
            ['?a=b', 'http://localhost/bar/', 'http://localhost/bar/?a=b'],

            ['http://login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
            ['https://login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],
            ['mailto:foo@bar.com', 'http://localhost/foo', 'mailto:foo@bar.com'],

            // tests schema relative URL (issue #7169)
            ['//login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
            ['//login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],

            ['?foo=2', 'http://localhost?foo=1', 'http://localhost?foo=2'],
            ['?foo=2', 'http://localhost/?foo=1', 'http://localhost/?foo=2'],
            ['?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'],
            ['?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'],
            ['?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'],

            ['foo', 'http://login.foo.com/bar/baz?/query/string', 'http://login.foo.com/bar/foo'],

            ['.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
            ['./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
            ['./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'],
            ['..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
            ['../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
            ['../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'],
            ['../..', 'http://localhost/foo/bar/baz', 'http://localhost/'],
            ['../../', 'http://localhost/foo/bar/baz', 'http://localhost/'],
            ['../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'],
            ['../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../../', 'http://localhost/', 'http://localhost/'],
            ['../../', 'http://localhost', 'http://localhost/'],

            ['/foo', 'http://localhost?bar=1', 'http://localhost/foo'],
            ['/foo', 'http://localhost#bar', 'http://localhost/foo'],
            ['/foo', 'file:///', 'file:///foo'],
            ['/foo', 'file:///bar/baz', 'file:///foo'],
            ['foo', 'file:///', 'file:///foo'],
            ['foo', 'file:///bar/baz', 'file:///bar/foo'],
        ];
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
<?php | ||
|
||
/* | ||
* This file is part of the Symfony package. | ||
* | ||
* (c) Fabien Potencier <fabien@symfony.com> | ||
* | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
*/ | ||
|
||
namespace Symfony\Component\DomCrawler; | ||
|
||
/**
 * Expands a URI according to a current URI.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 * @author Grégoire Pineau <lyrixx@lyrixx.info>
 */
class UriExpander
{
    /**
     * Expands a URI according to a current URI.
     *
     * For example if $uri=/foo/bar and $currentUri=https://symfony.com it will
     * return https://symfony.com/foo/bar
     *
     * If the $uri is not absolute you must pass an absolute $currentUri
     *
     * @param string      $uri        The URI to expand; surrounding whitespace is trimmed
     * @param string|null $currentUri The URI that $uri is relative to
     *
     * @return string The expanded URI
     *
     * @throws \InvalidArgumentException When $uri is relative and $currentUri is null
     */
    public static function expand(string $uri, ?string $currentUri): string
    {
        $uri = trim($uri);

        // absolute URL? (anything for which parse_url() does not report a
        // missing scheme is returned untouched)
        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
            return $uri;
        }

        if (null === $currentUri) {
            throw new \InvalidArgumentException('The URI is relative, so you must define its base URI passing an absolute URL.');
        }

        // empty URI; compared strictly because a relative link such as "0"
        // is falsy in PHP but is not empty
        if ('' === $uri) {
            return $currentUri;
        }

        // an anchor: it replaces the anchor of the current URI, the query is kept
        if ('#' === $uri[0]) {
            return self::cleanupAnchor($currentUri).$uri;
        }

        $baseUri = self::cleanupUri($currentUri);

        // a query string: it replaces both the query and the anchor of the current URI
        if ('?' === $uri[0]) {
            return $baseUri.$uri;
        }

        // absolute URL with relative schema: only the "scheme:" prefix is reused
        if (0 === strpos($uri, '//')) {
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
        }

        // keep only "scheme://authority" of the current URI
        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUri.$uri;
        }

        // relative path; parse_url() returns null when the current URI has no
        // path at all (e.g. "http://localhost?foo=1"), which is treated as ''
        $path = parse_url(substr($currentUri, \strlen($baseUri)), PHP_URL_PATH) ?? '';

        // directory part of the current path: everything before its last "/"
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);

        $path = self::canonicalizePath($directory.'/'.$uri);

        // canonicalization may have dropped the leading slash (too many ".." segments)
        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     */
    private static function canonicalizePath(string $path): string
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        $segments = explode('/', $path);

        // a path ending with a "." or ".." segment refers to a directory, so
        // the result must end with a slash; a regular segment that merely
        // ends with a dot (e.g. "foo.") is left alone
        $lastSegment = $segments[\count($segments) - 1];
        if ('.' === $lastSegment || '..' === $lastSegment) {
            $segments[] = '';
        }

        $output = [];

        foreach ($segments as $segment) {
            if ('..' === $segment) {
                // go one level up; popping an empty stack is a no-op
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Removes the query string and the anchor from the given uri.
     */
    private static function cleanupUri(string $uri): string
    {
        return self::cleanupQuery(self::cleanupAnchor($uri));
    }

    /**
     * Removes the query string from the uri.
     */
    private static function cleanupQuery(string $uri): string
    {
        $pos = strpos($uri, '?');

        return false === $pos ? $uri : substr($uri, 0, $pos);
    }

    /**
     * Removes the anchor from the uri.
     */
    private static function cleanupAnchor(string $uri): string
    {
        $pos = strpos($uri, '#');

        return false === $pos ? $uri : substr($uri, 0, $pos);
    }
}