Skip to content

Commit

Permalink
Fixed the XPath filtering to have the same behavior as Symfony 2.4
Browse files Browse the repository at this point in the history
  • Loading branch information
stof committed May 21, 2014
1 parent 711ac32 commit 80438c2
Showing 1 changed file with 101 additions and 13 deletions.
114 changes: 101 additions & 13 deletions src/Symfony/Component/DomCrawler/Crawler.php
Expand Up @@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8')

$this->addDocument($dom);

$base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

$baseHref = current($base);
if (count($base) && !empty($baseHref)) {
Expand Down Expand Up @@ -580,6 +580,11 @@ public function extract($attributes)
/**
* Filters the list of nodes with an XPath expression.
*
* The XPath expression is evaluated in the context of the crawler, which
* is considered as a fake parent of the elements inside it.
* This means that a child selector "div" or "./div" will match only
* the div elements of the current crawler, not their children.
*
* @param string $xpath An XPath expression
*
* @return Crawler A new instance of Crawler with the filtered list of nodes
Expand All @@ -588,14 +593,14 @@ public function extract($attributes)
*/
public function filterXPath($xpath)
{
$crawler = new static(null, $this->uri);
$xpath = $this->relativize($xpath);

foreach ($this as $node) {
$domxpath = new \DOMXPath($node->ownerDocument);
$crawler->add($domxpath->query($xpath, $node));
// If we dropped all expressions in the XPath while preparing it, there would be no match
if ('' === $xpath) {
return new static(null, $this->uri);
}

return $crawler;
return $this->filterRelativeXPath($xpath);
}

/**
Expand All @@ -619,7 +624,8 @@ public function filter($selector)
// @codeCoverageIgnoreEnd
}

return $this->filterXPath(CssSelector::toXPath($selector));
// The CssSelector already prefixes the selector with descendant-or-self::
return $this->filterRelativeXPath(CssSelector::toXPath($selector));
}

/**
Expand All @@ -633,10 +639,10 @@ public function filter($selector)
*/
public function selectLink($value)
{
$xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
$xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')).
sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' '));

return $this->filterXPath($xpath);
return $this->filterRelativeXPath($xpath);
}

/**
Expand All @@ -651,11 +657,11 @@ public function selectLink($value)
public function selectButton($value)
{
$translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
$xpath = sprintf('//input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
$xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', $translate, static::xpathLiteral(' '.$value.' '), $value, $value).
sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);

return $this->filterXPath($xpath);
return $this->filterRelativeXPath($xpath);
}

/**
Expand Down Expand Up @@ -771,6 +777,88 @@ public static function xpathLiteral($s)
return sprintf("concat(%s)", implode($parts, ', '));
}

/**
 * Runs an already-relative XPath expression against every node of this crawler.
 *
 * The expression is used as-is: it is evaluated once per node, with that node
 * as the context node, and all results are collected into a new crawler that
 * shares this crawler's URI. Callers are expected to pass an expression that
 * has already been prepared for per-node evaluation.
 *
 * @param string $xpath An XPath expression to evaluate relative to each node
 *
 * @return Crawler A new crawler holding the nodes matched for each context node
 */
private function filterRelativeXPath($xpath)
{
    $filtered = new static(null, $this->uri);

    foreach ($this as $contextNode) {
        // A DOMXPath is bound to one document, so build it from the node's own document.
        $evaluator = new \DOMXPath($contextNode->ownerDocument);
        $matches = $evaluator->query($xpath, $contextNode);

        $filtered->add($matches);
    }

    return $filtered;
}

/**
 * Makes the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The expression is split on top-level union operators ("|") and each part is
 * rewritten on its own so that it can be evaluated with a node of the crawler
 * as context node, emulating the fake root parent used by Symfony 2.4 and lower:
 *
 *  - "//x", ".//x" and "descendant::x" become "descendant-or-self::x"
 *  - "x", "./x", "child::x", "self::*" followed by "/x", and "/_root/x" become "self::x"
 *  - parts which could only match the fake root, or nodes outside of it, are
 *    replaced by an expression which never matches
 *
 * @param string $xpath An XPath expression written against the crawler as fake root
 *
 * @return string The rewritten expression, with its parts joined by " | "
 */
private function relativize($xpath)
{
    $expressions = array();

    // Matches a "|" which is not followed by a "]" before the next "[",
    // i.e. a union operator which is not inside a predicate
    $unionPattern = '/\|(?![^\[]*\])/';
    // An expression which will never match, used to replace expressions which cannot match in the crawler.
    // We cannot simply drop such expressions: the leading parenthesis kept below
    // still needs something valid to be attached to.
    $nonMatchingExpression = 'a[name() = "b"]';
    // Axes which select nothing when starting from the fake root: it has no parent,
    // no preceding or following nodes and no attributes (not even namespace ones)
    $unreachableAxisPattern = '/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/';

    // Split any unions into individual expressions.
    foreach (preg_split($unionPattern, $xpath) as $expression) {
        $expression = trim($expression);
        $parenthesis = '';

        // If the union is inside some braces, we need to preserve the opening braces and apply
        // the change only inside it.
        if (preg_match('/^[\(\s*]+/', $expression, $matches)) {
            $parenthesis = $matches[0];
            $expression = substr($expression, strlen($parenthesis));
        }

        // BC for Symfony 2.4 and lower where elements were added in a fake _root parent
        if (0 === strpos($expression, '/_root/')) {
            $expression = './'.substr($expression, 7);
        } elseif (0 === strpos($expression, 'self::*/')) {
            // "self::*" is the fake root itself, so this is another way to select its children
            $expression = './'.substr($expression, 8);
        }

        // add prefix before absolute element selector
        if (empty($expression)) {
            // empty() is used on purpose: substr() may return false instead of '' on older PHP versions
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, './/')) {
            // must be checked before "./", otherwise ".//x" would become the invalid "self::/x"
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (0 === strpos($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, 'child::')) {
            // explicit form of a child selector: children of the fake root are the crawler nodes
            $expression = 'self::'.substr($expression, 7);
        } elseif ('/' === $expression[0] || 0 === strpos($expression, 'self::')) {
            // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
            // so let's drop the expression entirely. The same goes for "self::", which targets the
            // fake root itself.
            $expression = $nonMatchingExpression;
        } elseif ('.' === $expression[0]) {
            // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, strlen('descendant::'));
        } elseif (preg_match($unreachableAxisPattern, $expression)) {
            // prefixing these with "self::" would produce an invalid expression such as "self::parent::x"
            $expression = $nonMatchingExpression;
        } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }
        $expressions[] = $parenthesis.$expression;
    }

    return implode(' | ', $expressions);
}

/**
* @param int $position
*
Expand Down

0 comments on commit 80438c2

Please sign in to comment.