From 80438c2dd92028a06efc88b06c73615d550b958a Mon Sep 17 00:00:00 2001 From: Christophe Coevoet Date: Sun, 18 May 2014 02:00:59 +0200 Subject: [PATCH] Fixed the XPath filtering to have the same behavior than Symfony 2.4 --- src/Symfony/Component/DomCrawler/Crawler.php | 114 ++++++++++++++++--- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index 9003b9d9a383..de65978614aa 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8') $this->addDocument($dom); - $base = $this->filterXPath('descendant-or-self::base')->extract(array('href')); + $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href')); $baseHref = current($base); if (count($base) && !empty($baseHref)) { @@ -580,6 +580,11 @@ public function extract($attributes) /** * Filters the list of nodes with an XPath expression. * + * The XPath expression is evaluated in the context of the crawler, which + * is considered as a fake parent of the elements inside it. + * This means that a child selector "div" or "./div" will match only + * the div elements of the current crawler, not their children. 
+ * * @param string $xpath An XPath expression * * @return Crawler A new instance of Crawler with the filtered list of nodes @@ -588,14 +593,14 @@ public function extract($attributes) */ public function filterXPath($xpath) { - $crawler = new static(null, $this->uri); + $xpath = $this->relativize($xpath); - foreach ($this as $node) { - $domxpath = new \DOMXPath($node->ownerDocument); - $crawler->add($domxpath->query($xpath, $node)); + // If we dropped all expressions in the XPath while preparing it, there would be no match + if ('' === $xpath) { + return new static(null, $this->uri); } - return $crawler; + return $this->filterRelativeXPath($xpath); } /** @@ -619,7 +624,8 @@ public function filter($selector) // @codeCoverageIgnoreEnd } - return $this->filterXPath(CssSelector::toXPath($selector)); + // The CssSelector already prefixes the selector with descendant-or-self:: + return $this->filterRelativeXPath(CssSelector::toXPath($selector)); } /** @@ -633,10 +639,10 @@ public function filter($selector) */ public function selectLink($value) { - $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')). - sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' ')); + $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')). + sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' 
')); - return $this->filterXPath($xpath); + return $this->filterRelativeXPath($xpath); } /** @@ -651,11 +657,11 @@ public function selectLink($value) public function selectButton($value) { $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'; - $xpath = sprintf('//input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')). + $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')). sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', $translate, static::xpathLiteral(' '.$value.' '), $value, $value). - sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value); + sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value); - return $this->filterXPath($xpath); + return $this->filterRelativeXPath($xpath); } /** @@ -771,6 +777,88 @@ public static function xpathLiteral($s) return sprintf("concat(%s)", implode($parts, ', ')); } + /** + * Filters the list of nodes with an XPath expression. + * + * The XPath expression should already be processed to apply it in the context of each node. 
+ * + * @param string $xpath + * + * @return Crawler + */ + private function filterRelativeXPath($xpath) + { + $crawler = new static(null, $this->uri); + + foreach ($this as $node) { + $domxpath = new \DOMXPath($node->ownerDocument); + $crawler->add($domxpath->query($xpath, $node)); + } + + return $crawler; + } + + /** + * Make the XPath relative to the current context. + * + * The returned XPath will match elements matching the XPath inside the current crawler + * when running in the context of a node of the crawler. + * + * @param string $xpath + * + * @return string + */ + private function relativize($xpath) + { + $expressions = array(); + + $unionPattern = '/\|(?![^\[]*\])/'; + // An expression which will never match to replace expressions which cannot match in the crawler + // We cannot simply drop such an expression: it may sit inside parentheses (preserved below) that would be left unbalanced + $nonMatchingExpression = 'a[name() = "b"]'; + + // Split any unions into individual expressions. + foreach (preg_split($unionPattern, $xpath) as $expression) { + $expression = trim($expression); + $parenthesis = ''; + + // If the union is inside some parentheses, we need to preserve the opening parentheses and apply + // the change only inside them. + if (preg_match('/^[\(\s*]+/', $expression, $matches)) { + $parenthesis = $matches[0]; + $expression = substr($expression, strlen($parenthesis)); + } + + // BC for Symfony 2.4 and lower where elements were added in a fake _root parent + if (0 === strpos($expression, '/_root/')) { + $expression = './'.substr($expression, 7); + } + + // add prefix before absolute element selector + if (empty($expression)) { + $expression = $nonMatchingExpression; + } elseif (0 === strpos($expression, '//')) { + $expression = 'descendant-or-self::' . substr($expression, 2); + } elseif (0 === strpos($expression, './')) { + $expression = 'self::' . 
substr($expression, 2); + } elseif ('/' === $expression[0]) { + // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously + // so let's drop the expression entirely + $expression = $nonMatchingExpression; + } elseif ('.' === $expression[0]) { + // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results + $expression = $nonMatchingExpression; + } elseif (0 === strpos($expression, 'descendant::')) { + $expression = 'descendant-or-self::' . substr($expression, strlen('descendant::')); + } elseif (0 !== strpos($expression, 'descendant-or-self::')) { + $expression = 'self::' .$expression; + } + $expressions[] = $parenthesis.$expression; + } + + return implode(' | ', $expressions); + } + /** * @param int $position *