Skip to content

Latest commit

 

History

History
206 lines (187 loc) · 6.64 KB

6.HTMLParsing.md

File metadata and controls

206 lines (187 loc) · 6.64 KB

FXP supports parsing HTML documents. Here is an example:

Input HTML Document

<!DOCTYPE html>
        <html lang="en">
            <head>
                <script>
                    window.dataLayer = window.dataLayer || [];
                    function gtag(){dataLayer.push(arguments);}
                    gtag('js', new Date());
        
                    gtag('config', 'UA-80202630-2');
                </script>
        
                <title>Fast XMl Parser</title>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1">
                <link rel="stylesheet" href="static/css/bootstrap.min.css">
                <link rel="stylesheet" href="static/css/jquery-confirm.min.css">
                <link rel="stylesheet" type="text/css" href="style.css">
        
                <script src="static/js/jquery-3.2.1.min.js"></script>
                <style>
                    .CodeMirror{
                        height: 100%;
                        width: 100%;
                    }
                </style>
            </head>
            <body role="document" style="background-color: #2c3e50;">
            <h1>Heading</h1>
            <hr>
            <h2>&inr;</h2>
                <pre>
                    <h1>Heading</h1>
                    <hr>
                    <h2>&inr;</h2>
                </pre>
                <script>
                  let highlightedLine = null;
                  let editor;
                    <!-- this should not be parsed separately -->
                  function updateLength(){
                      const xmlData = editor.getValue();
                      $("#lengthxml")[0].innerText = xmlData.replace(/>\s*</g, "><").length;
                  }
                </script>
            </body>
        </html>

The code and configuration needed to parse it into a JS object:

const parsingOptions = {
    ignoreAttributes: false,
    // preserveOrder: true,
    unpairedTags: ["hr", "br", "link", "meta"],
    stopNodes : [ "*.pre", "*.script"],
    processEntities: true,
    htmlEntities: true
  };
  const parser = new XMLParser(parsingOptions);
  parser.parse(html);

JS Object

{
    "html": {
        "head": {
            "script": [
                "\n                    window.dataLayer = window.dataLayer || [];\n                    function gtag(){dataLayer.push(arguments);}\n                    gtag('js', new Date());\n        \n                    gtag('config', 'UA-80202630-2');\n                ",
                {
                    "@_src": "static/js/jquery-3.2.1.min.js"
                }
            ],
            "title": "Fast XMl Parser",
            "meta": [
                {
                    "@_charset": "UTF-8"
                },
                {
                    "@_name": "viewport",
                    "@_content": "width=device-width, initial-scale=1"
                }
            ],
            "link": [
                {
                    "@_rel": "stylesheet",
                    "@_href": "static/css/bootstrap.min.css"
                },
                {
                    "@_rel": "stylesheet",
                    "@_href": "static/css/jquery-confirm.min.css"
                },
                {
                    "@_rel": "stylesheet",
                    "@_type": "text/css",
                    "@_href": "style.css"
                }
            ],
            "style": ".CodeMirror{\n                        height: 100%;\n                        width: 100%;\n                    }"
        },
        "body": {
            "h1": "Heading",
            "hr": "",
            "h2": "",
            "pre": "\n                    <h1>Heading</h1>\n                    <hr>\n                    <h2>&inr;</h2>\n                ",
            "script": "\n                  let highlightedLine = null;\n                  let editor;\n                    <!-- this should not be parsed separately -->\n                  function updateLength(){\n                      const xmlData = editor.getValue();\n                      $(\"#lengthxml\")[0].innerText = xmlData.replace(/>s*</g, \"><\").length;\n                  }\n                ",
            "@_role": "document",
            "@_style": "background-color: #2c3e50;"
        },
        "@_lang": "en"
    }
}

To build the HTML document back from the JS object, uncomment `preserveOrder: true` in the code above and pass the parser's output to the XML builder:

const parsingOptions = {
    ignoreAttributes: false,
    preserveOrder: true,
    unpairedTags: ["hr", "br", "link", "meta"],
    stopNodes : [ "*.pre", "*.script"],
    processEntities: true,
    htmlEntities: true
  };
  const parser = new XMLParser(parsingOptions);
  let result = parser.parse(html);

  const builderOptions = {
    ignoreAttributes: false,
    format: true,
    preserveOrder: true,
    suppressEmptyNode: true,
    unpairedTags: ["hr", "br", "link", "meta"],
    stopNodes : [ "*.pre", "*.script"],
  }
  const builder = new XMLBuilder(builderOptions);
  const output = builder.build(result);

Output

<html lang="en">
  <head>
    <script>
      
                    window.dataLayer = window.dataLayer || [];
                    function gtag(){dataLayer.push(arguments);}
                    gtag('js', new Date());
        
                    gtag('config', 'UA-80202630-2');
                
    </script>
    <title>
      Fast XMl Parser
    </title>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" href="static/css/bootstrap.min.css">
    <link rel="stylesheet" href="static/css/jquery-confirm.min.css">
    <link rel="stylesheet" type="text/css" href="style.css">
    <script src="static/js/jquery-3.2.1.min.js">
      
    </script>
    <style>
      .CodeMirror{
                        height: 100%;
                        width: 100%;
                    }
    </style>
  </head>
  <body role="document" style="background-color: #2c3e50;">
    <h1>
      Heading
    </h1>
    <hr>
    <h2></h2>
    <pre>
      
                    <h1>Heading</h1>
                    <hr>
                    <h2>&inr;</h2>
                
    </pre>
    <script>
      
                  let highlightedLine = null;
                  let editor;
                    <!-- this should not be parsed separately -->
                  function updateLength(){
                      const xmlData = editor.getValue();
                      $("#lengthxml")[0].innerText = xmlData.replace(/>\s*</g, "><").length;
                  }
                
    </script>
  </body>
</html>

> Next: PI Tag processing