Permalink
Browse files

Release 1.5.3

* added `Craur::createFromHtml($html_string[, $encoding = 'utf-8'])`
  • Loading branch information...
1 parent 5456f6e commit ac926d021905bd93f06ada1fbad96144668c0fc5 @DracoBlue committed Apr 16, 2012
View
@@ -1,7 +1,7 @@
# Craur
-* Version: 1.5.2
-* Date: 2012/04/13
+* Version: 1.5.3
+* Date: 2012/04/16
* Build Status: [![Build Status](https://secure.travis-ci.org/DracoBlue/Craur.png?branch=master)](http://travis-ci.org/DracoBlue/Craur), 100% Code Coverage
The library craur has two main purposes:
@@ -86,6 +86,14 @@ Will create and return a new craur instance for the given XML string.
$authors = $node->get('book.author[]');
assert(count($authors) == 2);
+### Craur::createFromHtml(`$html_string[, $encoding = 'utf-8']`) : `Craur`
+
+Will create and return a new craur instance for the given HTML string.
+
+ $node = Craur::createFromHtml('<html><head><title>Hans</title></head><body>Paul</body></html>');
+ assert($node->get('html.head.title') == 'Hans');
+ assert($node->get('html.body') == 'Paul');
+
### Craur::createFromCsvFile(`$file_path, array $field_mappings`) : `Craur`
Will load the csv file and fill the objects according to the given `$field_mappings`.
@@ -330,6 +338,8 @@ element, you can do this:
## Changelog
+- 1.5.3 (2012/04/16)
+ - added `Craur::createFromHtml($html_string, $encoding = 'utf-8')`
- 1.5.2 (2012/04/12)
- strip invalid utf8 characters in createFromXml
- added encoding parameter for createFromXml
View
@@ -78,6 +78,50 @@ static function createFromXml($xml_string, $encoding = 'utf-8')
return new Craur($data);
}
+ /**
+  * Create a new `Craur` from a given HTML-string.
+  *
+  * Low ascii control characters are stripped from the input first. If the
+  * given `$encoding` is not `utf-8`, the string is converted to utf-8 with
+  * `iconv` before it is handed to `DOMDocument::loadHTML`.
+  *
+  * @example
+  *   $node = Craur::createFromHtml('<html><head><title>Hans</title></head><body>Paul</body></html>');
+  *   assert($node->get('html.head.title') == 'Hans');
+  *   assert($node->get('html.body') == 'Paul');
+  *
+  * @param string $html_string The html markup to parse
+  * @param string $encoding    The encoding of `$html_string` (default: utf-8)
+  *
+  * @return Craur
+  *
+  * @throws Exception if the string cannot be converted to utf-8 or if libxml
+  *                   reports an error while parsing the html
+  */
+ static function createFromHtml($html_string, $encoding = 'utf-8')
+ {
+     $html_string = preg_replace('/[\x1-\x8\xB-\xC\xE-\x1F]/', '', $html_string);
+
+     if ($encoding != 'utf-8')
+     {
+         $converted_html_string = iconv($encoding, 'utf-8', $html_string);
+
+         /*
+          * iconv returns false on failure. Do not pass that on to the
+          * parser, report it right away instead.
+          */
+         if ($converted_html_string === false)
+         {
+             throw new Exception('Invalid html (cannot be converted from ' . $encoding . ' to utf-8)');
+         }
+
+         $html_string = $converted_html_string;
+     }
+
+     $node = new DOMDocument('1.0', 'utf-8');
+
+     /*
+      * libxml_use_internal_errors returns the previous setting, so we can
+      * restore exactly what the caller had configured when we are done.
+      */
+     $previous_use_internal_errors = libxml_use_internal_errors(true);
+
+     /*
+      * Start with an empty error buffer, otherwise an error left over from
+      * an earlier parse could be reported for this document.
+      */
+     libxml_clear_errors();
+
+     $node->loadHTML($html_string);
+     $error = libxml_get_last_error();
+
+     /*
+      * Do not leave our errors behind for the next libxml user.
+      */
+     libxml_clear_errors();
+     libxml_use_internal_errors($previous_use_internal_errors);
+
+     if ($error)
+     {
+         throw new Exception('Invalid html (' . trim($error->message) . ', line: ' . $error->line . ', col: ' . $error->column . '): ' . $html_string);
+     }
+
+     /*
+      * We don't need to parse for namespaces here (like in the xml case),
+      * because namespaces are just attributes in html!
+      */
+     $data = self::convertDomNodeToDataArray($node);
+
+     return new Craur($data);
+ }
+
static function convertDomNodeToDataArray(DomNode $node)
{
$data = array();
@@ -88,6 +132,16 @@ static function convertDomNodeToDataArray(DomNode $node)
{
foreach ($node->childNodes as $child_node)
{
+ /*
+ * A html dom node always contains one dom document type child
+ * node with no content (DOMDocumentType#internalSubset is for
+ * example <!DOCTYPE html>). Ignore it!
+ */
+ if ($child_node instanceof DOMDocumentType)
+ {
+ continue ;
+ }
+
if ($child_node->nodeType === XML_TEXT_NODE)
{
$has_value = true;
@@ -0,0 +1,7 @@
+<html>
+ <head>
+ <title> xx</title>
+ </head>
+ <body>
+ </body>
+</html>
@@ -0,0 +1,11 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en" lang="en">
+ <head>
+ <title>Test Title</title>
+ </head>
+ <body>
+ <p>test<br>test2<img width=20 height=30 src="http://example.org/image.png"></p>
+ </body>
+</html>
+
@@ -0,0 +1,23 @@
+<?php
+
+/*
+ * A document with a doctype in front of the html element.
+ */
+$craur = Craur::createFromHtml('<!DOCTYPE html>' . PHP_EOL . '<html><head><title>Test Title</title></head><body></body></html>');
+
+assert($craur->get('html.head.title') == 'Test Title');
+
+/*
+ * The xhtml strict fixture: attributes (also the xmlns ones) are read with
+ * the @-prefix and the text of an element is available as its value.
+ */
+$craur = Craur::createFromHtml(file_get_contents(dirname(__FILE__) . '/fixtures/strict_html_file.html'));
+assert($craur->get('html.head.title') == 'Test Title');
+assert($craur->get('html.@xmlns:atom') == 'http://www.w3.org/2005/Atom');
+assert($craur->get('html.body.p.img.@width') == '20');
+assert($craur->get('html.body.p.img.@height') == '30');
+assert($craur->get('html.body.p.img.@src') == 'http://example.org/image.png');
+assert($craur->get('html.body.p') == 'testtest2');
+
+/*
+ * Broken markup (a second <html> instead of the closing </html>) must end in
+ * an exception, so the assert(false) may never be reached.
+ */
+try
+{
+    $craur = Craur::createFromHtml('<!DOCTYPE html>' . PHP_EOL . '<html><head><title>Test Title</title></head><body></body><html>');
+    assert(false);
+}
+catch (Exception $exception)
+{
+    /*
+     * strpos returns false if the needle is missing, check for that
+     * explicitly instead of relying on a loose comparison with -1.
+     */
+    assert(strpos($exception->getMessage(), 'Invalid html') !== false);
+}
@@ -0,0 +1,7 @@
+<?php
+
+/*
+ * Load the fixture as iso-8859-1 and expect the title to be readable.
+ * NOTE(review): the fixture presumably contains a control character in the
+ * title which has to be stripped before parsing - confirm against the file.
+ */
+$craur = Craur::createFromHtml(
+    file_get_contents(dirname(__FILE__) . '/fixtures/invalid_character_in_html.html'),
+    'iso-8859-1'
+);
+
+assert($craur->get('html.head.title') == 'xx');
@@ -13,6 +13,13 @@
assert(count($authors) == 2);
+/* Craur#createFromHtml */
+
+$node = Craur::createFromHtml('<html><head><title>Hans</title></head><body>Paul</body></html>');
+assert($node->get('html.head.title') == 'Hans');
+assert($node->get('html.body') == 'Paul');
+
+
/* Craur#createFromCsvFile */
// If the file looks like this:

0 comments on commit ac926d0

Please sign in to comment.