In [2]:
# file: format_validator.py
import json
from collections import Counter
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import xmldiff.main
import jsonschema
import html5lib

In [3]:
class FormatValidator:
    def __init__(self):
        self.errors = []
        self.suggestions = []

    def validate(self, original_content, reconstructed_content, format_type):
        self.errors = []
        self.suggestions = []
        
        if format_type == 'html':
            return self._validate_html(original_content, reconstructed_content)
        elif format_type == 'json':
            return self._validate_json(original_content, reconstructed_content)
        elif format_type == 'xml':
            return self._validate_xml(original_content, reconstructed_content)
        else:
            self.errors.append(f"Unsupported format: {format_type}")
            return False

    def _validate_html(self, original_html, reconstructed_html):
        original_soup = BeautifulSoup(original_html, 'html5lib')
        reconstructed_soup = BeautifulSoup(reconstructed_html, 'html5lib')

        # Check 1: Basic HTML parsing
        if original_soup.find() is None:
            self.errors.append("Original HTML is not parseable")
            return False
        if reconstructed_soup.find() is None:
            self.errors.append("Reconstructed HTML is not parseable")
            self.suggestions.append("Ensure the reconstructed HTML has a valid structure with proper opening and closing tags.")
            return False

        # Check 2: Compare tag counts
        original_tags = Counter(tag.name for tag in original_soup.find_all())
        reconstructed_tags = Counter(tag.name for tag in reconstructed_soup.find_all())
        if original_tags != reconstructed_tags:
            self.errors.append("Tag count mismatch")
            self.suggestions.append(f"Adjust the number of HTML tags to match the original. Original counts: {dict(original_tags)}, Reconstructed counts: {dict(reconstructed_tags)}")

        # Check 3: Structure comparison
        if not self._compare_html_structure(original_soup, reconstructed_soup):
            self.errors.append("Overall HTML structure mismatch")
            self.suggestions.append("Ensure the hierarchical structure of HTML elements matches the original.")

        # Check 4: Attribute preservation
        if not self._compare_html_attributes(original_soup, reconstructed_soup):
            self.errors.append("HTML attribute mismatch")
            self.suggestions.append("Preserve all original HTML attributes, including their values.")

        # Check 5: Content length comparison
        original_content_length = len(original_soup.get_text())
        reconstructed_content_length = len(reconstructed_soup.get_text())
        if abs(original_content_length - reconstructed_content_length) > original_content_length * 0.2:  # 20% tolerance
            self.errors.append("Significant content length difference")
            self.suggestions.append(f"Adjust the content length to be within 20% of the original. Original length: {original_content_length}, Reconstructed length: {reconstructed_content_length}")

        return len(self.errors) == 0

    def _validate_json(self, original_json, reconstructed_json):
        try:
            original_dict = json.loads(original_json)
            reconstructed_dict = json.loads(reconstructed_json)
        except json.JSONDecodeError as e:
            self.errors.append(f"Invalid JSON format: {str(e)}")
            self.suggestions.append("Ensure the JSON is properly formatted with correct syntax.")
            return False

        # Check 1: Schema validation
        schema = self._generate_json_schema(original_dict)
        try:
            jsonschema.validate(instance=reconstructed_dict, schema=schema)
        except jsonschema.exceptions.ValidationError as ve:
            self.errors.append(f"JSON schema validation failed: {ve}")
            self.suggestions.append("Adjust the JSON structure to match the original schema.")
            return False

        # Check 2: Structure comparison
        if not self._compare_json_structure(original_dict, reconstructed_dict):
            self.errors.append("JSON structure mismatch")
            self.suggestions.append("Ensure all keys and nested structures in the JSON match the original.")

        # Check 3: Value type preservation
        type_mismatches = self._check_json_value_types(original_dict, reconstructed_dict)
        if type_mismatches:
            self.errors.append("JSON value type mismatch")
            self.suggestions.extend(type_mismatches)

        return len(self.errors) == 0

    def _validate_xml(self, original_xml, reconstructed_xml):
        try:
            original_root = ET.fromstring(original_xml)
            reconstructed_root = ET.fromstring(reconstructed_xml)
        except ET.ParseError as e:
            self.errors.append(f"Invalid XML format: {str(e)}")
            self.suggestions.append("Ensure the XML is well-formed with proper opening and closing tags.")
            return False

        # Check 1: Structure comparison
        diff = xmldiff.main.diff_trees(original_root, reconstructed_root)
        if diff:
            self.errors.append("XML structure mismatch")
            self.suggestions.append("Ensure the XML element structure and hierarchy match the original.")

        # Check 2: Attribute preservation
        if not self._compare_xml_attributes(original_root, reconstructed_root):
            self.errors.append("XML attributes mismatch")
            self.suggestions.append("Preserve all original XML attributes, including their values.")

        # Check 3: Element count
        original_count = self._count_xml_elements(original_root)
        reconstructed_count = self._count_xml_elements(reconstructed_root)
        if original_count != reconstructed_count:
            self.errors.append("XML element count mismatch")
            self.suggestions.append(f"Adjust the number of XML elements to match the original. Original count: {original_count}, Reconstructed count: {reconstructed_count}")

        return len(self.errors) == 0

    def _compare_html_structure(self, soup1, soup2):
        def get_structure(soup):
            return ''.join(element.name for element in soup.descendants if element.name)
        return get_structure(soup1) == get_structure(soup2)

    def _compare_html_attributes(self, soup1, soup2):
        elements1 = soup1.find_all()
        elements2 = soup2.find_all()
        if len(elements1) != len(elements2):
            return False
        return all(e1.attrs == e2.attrs for e1, e2 in zip(elements1, elements2))

    def _generate_json_schema(self, json_dict):
        schema = {"type": "object", "properties": {}}
        for key, value in json_dict.items():
            if isinstance(value, dict):
                schema["properties"][key] = self._generate_json_schema(value)
            elif isinstance(value, list):
                schema["properties"][key] = {"type": "array"}
            else:
                schema["properties"][key] = {"type": type(value).__name__}
        return schema

    def _compare_json_structure(self, dict1, dict2):
        if not isinstance(dict1, type(dict2)):
            return False
        if isinstance(dict1, dict):
            return set(dict1.keys()) == set(dict2.keys()) and all(self._compare_json_structure(dict1[k], dict2[k]) for k in dict1)
        if isinstance(dict1, list):
            return len(dict1) == len(dict2) and all(self._compare_json_structure(v1, v2) for v1, v2 in zip(dict1, dict2))
        return True

    def _check_json_value_types(self, dict1, dict2, path=""):
        mismatches = []
        if isinstance(dict1, dict):
            for k in dict1:
                new_path = f"{path}.{k}" if path else k
                if k in dict2:
                    mismatches.extend(self._check_json_value_types(dict1[k], dict2[k], new_path))
                else:
                    mismatches.append(f"Missing key at {new_path}")
        elif isinstance(dict1, list):
            if len(dict1) != len(dict2):
                mismatches.append(f"Array length mismatch at {path}")
            else:
                for i, (v1, v2) in enumerate(zip(dict1, dict2)):
                    mismatches.extend(self._check_json_value_types(v1, v2, f"{path}[{i}]"))
        elif type(dict1) != type(dict2):
            mismatches.append(f"Type mismatch at {path}: expected {type(dict1).__name__}, got {type(dict2).__name__}")
        return mismatches

    def _compare_xml_attributes(self, elem1, elem2):
        if elem1.tag != elem2.tag:
            return False
        if elem1.attrib != elem2.attrib:
            return False
        if len(elem1) != len(elem2):
            return False
        return all(self._compare_xml_attributes(c1, c2) for c1, c2 in zip(elem1, elem2))

    def _count_xml_elements(self, elem):
        return 1 + sum(self._count_xml_elements(child) for child in elem)

    def get_feedback(self):
        return {
            "errors": self.errors,
            "suggestions": self.suggestions
        }

In [4]:
# Usage example
if __name__ == "__main__":
    validator = FormatValidator()

    # HTML example
    # original_html = "<html><body><h1>Hello</h1><p class='test'>World</p></body></html>"
    # reconstructed_html = "<html><body><h1>Bonjour</h1><p>Monde</p></body></html>"
    # print("HTML Validation:", validator.validate(original_html, reconstructed_html, 'html'))
    # print("Feedback:", validator.get_feedback())

    # JSON example
    # original_json = '{"name": "John", "age": 30, "city": "New York"}'
    # reconstructed_json = '{"name": "Jean", "age": "30", "town": "Paris"}'
    # print("\nJSON Validation:", validator.validate(original_json, reconstructed_json, 'json'))
    # print("Feedback:", validator.get_feedback())

    # XML example
    original_xml = '<root><person><name>John</name><age>30</age></person></root>'
    reconstructed_xml = '<root><person><name>Jean</name><age>30</age><city>Paris</city></person></root>'
    print("\nXML Validation:", validator.validate(original_xml, reconstructed_xml, 'xml'))
    print("Feedback:", validator.get_feedback())

TypeError: The 'left' and 'right' parameters must be lxml Elements.

In [2]:
%pip install xmldiff

Collecting xmldiff
  Downloading xmldiff-2.7.0-py3-none-any.whl.metadata (9.0 kB)
Downloading xmldiff-2.7.0-py3-none-any.whl (43 kB)
Installing collected packages: xmldiff
Successfully installed xmldiff-2.7.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install jsonschema


Collecting jsonschema
  Downloading jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema)
  Downloading jsonschema_specifications-2024.10.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema)
  Downloading referencing-0.35.1-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema)
  Downloading rpds_py-0.20.0-cp310-none-win_amd64.whl.metadata (4.2 kB)
Downloading jsonschema-4.23.0-py3-none-any.whl (88 kB)
Downloading jsonschema_specifications-2024.10.1-py3-none-any.whl (18 kB)
Downloading referencing-0.35.1-py3-none-any.whl (26 kB)
Downloading rpds_py-0.20.0-cp310-none-win_amd64.whl (213 kB)
Installing collected packages: rpds-py, referencing, jsonschema-specifications, jsonschema
Successfully installed jsonschema-4.23.0 jsonschema-specifications-2024.10.1 referencing-0.35.1 rpds-py-0.20.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting webencodings (from html5lib)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Downloading webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: webencodings, html5lib
Successfully installed html5lib-1.1 webencodings-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install html5lib lxml

Note: you may need to restart the kernel to use updated packages.


In [None]:
# file: format_validator.py

import json
from collections import Counter
from bs4 import BeautifulSoup
from lxml import etree
import xmldiff.main
import jsonschema

class FormatValidator:
    def __init__(self):
        self.errors = []
        self.suggestions = []

    def validate(self, original_content, reconstructed_content, format_type):
        self.errors = []
        self.suggestions = []
        
        if format_type == 'html':
            return self._validate_html(original_content, reconstructed_content)
        elif format_type == 'json':
            return self._validate_json(original_content, reconstructed_content)
        elif format_type == 'xml':
            return self._validate_xml(original_content, reconstructed_content)
        else:
            self.errors.append(f"Unsupported format: {format_type}")
            return False

    def _validate_html(self, original_html, reconstructed_html):
        parsers = ['html5lib', 'lxml', 'html.parser']
        original_soup = None
        reconstructed_soup = None

        for parser in parsers:
            try:
                original_soup = BeautifulSoup(original_html, parser)
                reconstructed_soup = BeautifulSoup(reconstructed_html, parser)
                break
            except ImportError:
                continue

        if original_soup is None or reconstructed_soup is None:
            self.errors.append("Failed to parse HTML. Please install 'html5lib' or 'lxml' for better HTML parsing.")
            return False

        # Check 1: Basic HTML parsing
        if original_soup.find() is None:
            self.errors.append("Original HTML is not parseable")
            return False
        if reconstructed_soup.find() is None:
            self.errors.append("Reconstructed HTML is not parseable")
            self.suggestions.append("Ensure the reconstructed HTML has a valid structure with proper opening and closing tags.")
            return False

        # Check 2: Compare tag counts
        original_tags = Counter(tag.name for tag in original_soup.find_all())
        reconstructed_tags = Counter(tag.name for tag in reconstructed_soup.find_all())
        if original_tags != reconstructed_tags:
            self.errors.append("Tag count mismatch")
            self.suggestions.append(f"Adjust the number of HTML tags to match the original. Original counts: {dict(original_tags)}, Reconstructed counts: {dict(reconstructed_tags)}")

        # Check 3: Structure comparison
        if not self._compare_html_structure(original_soup, reconstructed_soup):
            self.errors.append("Overall HTML structure mismatch")
            self.suggestions.append("Ensure the hierarchical structure of HTML elements matches the original.")

        # Check 4: Attribute preservation
        if not self._compare_html_attributes(original_soup, reconstructed_soup):
            self.errors.append("HTML attribute mismatch")
            self.suggestions.append("Preserve all original HTML attributes, including their values.")

        # Check 5: Content length comparison
        original_content_length = len(original_soup.get_text())
        reconstructed_content_length = len(reconstructed_soup.get_text())
        if abs(original_content_length - reconstructed_content_length) > original_content_length * 0.2:  # 20% tolerance
            self.errors.append("Significant content length difference")
            self.suggestions.append(f"Adjust the content length to be within 20% of the original. Original length: {original_content_length}, Reconstructed length: {reconstructed_content_length}")

        return len(self.errors) == 0

    def _validate_json(self, original_json, reconstructed_json):
        try:
            original_dict = json.loads(original_json)
            reconstructed_dict = json.loads(reconstructed_json)
        except json.JSONDecodeError as e:
            self.errors.append(f"Invalid JSON format: {str(e)}")
            self.suggestions.append("Ensure the JSON is properly formatted with correct syntax.")
            return False

        # Check 1: Schema validation
        schema = self._generate_json_schema(original_dict)
        try:
            jsonschema.validate(instance=reconstructed_dict, schema=schema)
        except jsonschema.exceptions.ValidationError as ve:
            self.errors.append(f"JSON schema validation failed: {ve}")
            self.suggestions.append("Adjust the JSON structure and types to match the original schema.")
            return False

        # Check 2: Structure comparison
        if not self._compare_json_structure(original_dict, reconstructed_dict):
            self.errors.append("JSON structure mismatch")
            self.suggestions.append("Ensure all keys and nested structures in the JSON match the original.")

        # Check 3: Value type preservation
        type_mismatches = self._check_json_value_types(original_dict, reconstructed_dict)
        if type_mismatches:
            self.errors.append("JSON value type mismatch")
            self.suggestions.extend(type_mismatches)

        return len(self.errors) == 0

    def _validate_xml(self, original_xml, reconstructed_xml):
        try:
            original_root = etree.fromstring(original_xml.encode())
            reconstructed_root = etree.fromstring(reconstructed_xml.encode())
        except etree.XMLSyntaxError as e:
            self.errors.append(f"Invalid XML format: {str(e)}")
            self.suggestions.append("Ensure the XML is well-formed with proper opening and closing tags.")
            return False

        # Check 1: Structure comparison
        try:
            diff = xmldiff.main.diff_trees(original_root, reconstructed_root)
            if diff:
                self.errors.append("XML structure mismatch")
                self.suggestions.append("Ensure the XML element structure and hierarchy match the original.")
        except Exception as e:
            self.errors.append(f"Error in XML comparison: {str(e)}")
            self.suggestions.append("There was an issue comparing the XML structures. Please check the XML content.")

        # Check 2: Attribute preservation
        if not self._compare_xml_attributes(original_root, reconstructed_root):
            self.errors.append("XML attributes mismatch")
            self.suggestions.append("Preserve all original XML attributes, including their values.")

        # Check 3: Element count
        original_count = self._count_xml_elements(original_root)
        reconstructed_count = self._count_xml_elements(reconstructed_root)
        if original_count != reconstructed_count:
            self.errors.append("XML element count mismatch")
            self.suggestions.append(f"Adjust the number of XML elements to match the original. Original count: {original_count}, Reconstructed count: {reconstructed_count}")

        return len(self.errors) == 0

    def _compare_html_structure(self, soup1, soup2):
        def get_structure(soup):
            return ''.join(element.name for element in soup.descendants if element.name)
        return get_structure(soup1) == get_structure(soup2)

    def _compare_html_attributes(self, soup1, soup2):
        elements1 = soup1.find_all()
        elements2 = soup2.find_all()
        if len(elements1) != len(elements2):
            return False
        return all(e1.attrs == e2.attrs for e1, e2 in zip(elements1, elements2))

    def _generate_json_schema(self, json_dict):
        def get_type(value):
            if isinstance(value, str):
                return "string"
            elif isinstance(value, bool):
                return "boolean"
            elif isinstance(value, int):
                return "integer"
            elif isinstance(value, float):
                return "number"
            elif isinstance(value, list):
                return "array"
            elif isinstance(value, dict):
                return "object"
            else:
                return "null"

        schema = {"type": "object", "properties": {}}
        for key, value in json_dict.items():
            if isinstance(value, dict):
                schema["properties"][key] = self._generate_json_schema(value)
            elif isinstance(value, list):
                schema["properties"][key] = {
                    "type": "array",
                    "items": self._generate_json_schema(value[0]) if value else {}
                }
            else:
                schema["properties"][key] = {"type": get_type(value)}
        return schema

    def _compare_json_structure(self, dict1, dict2):
        if not isinstance(dict1, type(dict2)):
            return False
        if isinstance(dict1, dict):
            return set(dict1.keys()) == set(dict2.keys()) and all(self._compare_json_structure(dict1[k], dict2[k]) for k in dict1)
        if isinstance(dict1, list):
            return len(dict1) == len(dict2) and all(self._compare_json_structure(v1, v2) for v1, v2 in zip(dict1, dict2))
        return True

    def _check_json_value_types(self, dict1, dict2, path=""):
        mismatches = []
        if isinstance(dict1, dict):
            for k in dict1:
                new_path = f"{path}.{k}" if path else k
                if k in dict2:
                    mismatches.extend(self._check_json_value_types(dict1[k], dict2[k], new_path))
                else:
                    mismatches.append(f"Missing key at {new_path}")
        elif isinstance(dict1, list):
            if len(dict1) != len(dict2):
                mismatches.append(f"Array length mismatch at {path}")
            else:
                for i, (v1, v2) in enumerate(zip(dict1, dict2)):
                    mismatches.extend(self._check_json_value_types(v1, v2, f"{path}[{i}]"))
        elif type(dict1) != type(dict2):
            mismatches.append(f"Type mismatch at {path}: expected {type(dict1).__name__}, got {type(dict2).__name__}")
        return mismatches

    def _compare_xml_attributes(self, elem1, elem2):
        if elem1.tag != elem2.tag:
            return False
        if elem1.attrib != elem2.attrib:
            return False
        if len(elem1) != len(elem2):
            return False
        return all(self._compare_xml_attributes(c1, c2) for c1, c2 in zip(elem1, elem2))

    def _count_xml_elements(self, elem):
        return 1 + sum(self._count_xml_elements(child) for child in elem)

    def get_feedback(self):
        return {
            "errors": self.errors,
            "suggestions": self.suggestions
        }


In [1]:
# file: format_validator.py

import json
from collections import Counter
from bs4 import BeautifulSoup
from lxml import etree
import xmldiff.main
import jsonschema

In [2]:
class FormatValidator:
    def __init__(self):
        self.errors = []
        self.suggestions = []

    def validate(self, original_content, reconstructed_content, format_type):
        self.errors = []
        self.suggestions = []
        
        if format_type == 'html':
            return self._validate_html(original_content, reconstructed_content)
        elif format_type == 'json':
            return self._validate_json(original_content, reconstructed_content)
        elif format_type == 'xml':
            return self._validate_xml(original_content, reconstructed_content)
        else:
            self.errors.append(f"Unsupported format: {format_type}")
            return False

    def _validate_html(self, original_html, reconstructed_html):
        parsers = ['html5lib', 'lxml', 'html.parser']
        original_soup = None
        reconstructed_soup = None

        for parser in parsers:
            try:
                original_soup = BeautifulSoup(original_html, parser)
                reconstructed_soup = BeautifulSoup(reconstructed_html, parser)
                break
            except ImportError:
                continue

        if original_soup is None or reconstructed_soup is None:
            self.errors.append("Failed to parse HTML. Please install 'html5lib' or 'lxml' for better HTML parsing.")
            return False

        # Check 1: Basic HTML parsing
        if original_soup.find() is None:
            self.errors.append("Original HTML is not parseable")
            return False
        if reconstructed_soup.find() is None:
            self.errors.append("Reconstructed HTML is not parseable")
            self.suggestions.append("Ensure the reconstructed HTML has a valid structure with proper opening and closing tags.")
            return False

        # Check 2: Compare tag counts
        original_tags = Counter(tag.name for tag in original_soup.find_all())
        reconstructed_tags = Counter(tag.name for tag in reconstructed_soup.find_all())
        if original_tags != reconstructed_tags:
            self.errors.append("Tag count mismatch")
            self.suggestions.append(f"Adjust the number of HTML tags to match the original. Original counts: {dict(original_tags)}, Reconstructed counts: {dict(reconstructed_tags)}")

        # Check 3: Structure comparison
        if not self._compare_html_structure(original_soup, reconstructed_soup):
            self.errors.append("Overall HTML structure mismatch")
            self.suggestions.append("Ensure the hierarchical structure of HTML elements matches the original.")

        # Check 4: Attribute preservation
        if not self._compare_html_attributes(original_soup, reconstructed_soup):
            self.errors.append("HTML attribute mismatch")
            self.suggestions.append("Preserve all original HTML attributes, including their values.")

        # Check 5: Content length comparison
        original_content_length = len(original_soup.get_text())
        reconstructed_content_length = len(reconstructed_soup.get_text())
        if abs(original_content_length - reconstructed_content_length) > original_content_length * 0.2:  # 20% tolerance
            self.errors.append("Significant content length difference")
            self.suggestions.append(f"Adjust the content length to be within 20% of the original. Original length: {original_content_length}, Reconstructed length: {reconstructed_content_length}")

        return len(self.errors) == 0

    def _validate_json(self, original_json, reconstructed_json):
        try:
            original_dict = json.loads(original_json)
            reconstructed_dict = json.loads(reconstructed_json)
        except json.JSONDecodeError as e:
            self.errors.append(f"Invalid JSON format: {str(e)}")
            self.suggestions.append("Ensure the JSON is properly formatted with correct syntax.")
            return False

        # Check 1: Schema validation
        schema = self._generate_json_schema(original_dict)
        try:
            jsonschema.validate(instance=reconstructed_dict, schema=schema)
        except jsonschema.exceptions.ValidationError as ve:
            self.errors.append(f"JSON schema validation failed: {ve}")
            self.suggestions.append("Adjust the JSON structure and types to match the original schema.")
            return False

        # Check 2: Structure comparison
        if not self._compare_json_structure(original_dict, reconstructed_dict):
            self.errors.append("JSON structure mismatch")
            self.suggestions.append("Ensure all keys and nested structures in the JSON match the original.")

        # Check 3: Value type preservation
        type_mismatches = self._check_json_value_types(original_dict, reconstructed_dict)
        if type_mismatches:
            self.errors.append("JSON value type mismatch")
            self.suggestions.extend(type_mismatches)

        return len(self.errors) == 0

    def _validate_xml(self, original_xml, reconstructed_xml):
        try:
            original_root = etree.fromstring(original_xml.encode())
            reconstructed_root = etree.fromstring(reconstructed_xml.encode())
        except etree.XMLSyntaxError as e:
            self.errors.append(f"Invalid XML format: {str(e)}")
            self.suggestions.append("Ensure the XML is well-formed with proper opening and closing tags.")
            return False

        # Check 1: Structure comparison
        try:
            diff = xmldiff.main.diff_trees(original_root, reconstructed_root)
            if diff:
                self.errors.append("XML structure mismatch")
                self.suggestions.append("Ensure the XML element structure and hierarchy match the original.")
        except Exception as e:
            self.errors.append(f"Error in XML comparison: {str(e)}")
            self.suggestions.append("There was an issue comparing the XML structures. Please check the XML content.")

        # Check 2: Attribute preservation
        if not self._compare_xml_attributes(original_root, reconstructed_root):
            self.errors.append("XML attributes mismatch")
            self.suggestions.append("Preserve all original XML attributes, including their values.")

        # Check 3: Element count
        original_count = self._count_xml_elements(original_root)
        reconstructed_count = self._count_xml_elements(reconstructed_root)
        if original_count != reconstructed_count:
            self.errors.append("XML element count mismatch")
            self.suggestions.append(f"Adjust the number of XML elements to match the original. Original count: {original_count}, Reconstructed count: {reconstructed_count}")

        return len(self.errors) == 0

    def _compare_html_structure(self, soup1, soup2):
        def get_structure(soup):
            return ''.join(element.name for element in soup.descendants if element.name)
        return get_structure(soup1) == get_structure(soup2)

    def _compare_html_attributes(self, soup1, soup2):
        elements1 = soup1.find_all()
        elements2 = soup2.find_all()
        if len(elements1) != len(elements2):
            return False
        return all(e1.attrs == e2.attrs for e1, e2 in zip(elements1, elements2))

    def _generate_json_schema(self, json_dict):
        def get_type(value):
            if isinstance(value, str):
                return "string"
            elif isinstance(value, bool):
                return "boolean"
            elif isinstance(value, int):
                return "integer"
            elif isinstance(value, float):
                return "number"
            elif isinstance(value, list):
                return "array"
            elif isinstance(value, dict):
                return "object"
            else:
                return "null"

        schema = {"type": "object", "properties": {}}
        for key, value in json_dict.items():
            if isinstance(value, dict):
                schema["properties"][key] = self._generate_json_schema(value)
            elif isinstance(value, list):
                schema["properties"][key] = {
                    "type": "array",
                    "items": self._generate_json_schema(value[0]) if value else {}
                }
            else:
                schema["properties"][key] = {"type": get_type(value)}
        return schema

    def _compare_json_structure(self, dict1, dict2):
        if not isinstance(dict1, type(dict2)):
            return False
        if isinstance(dict1, dict):
            return set(dict1.keys()) == set(dict2.keys()) and all(self._compare_json_structure(dict1[k], dict2[k]) for k in dict1)
        if isinstance(dict1, list):
            return len(dict1) == len(dict2) and all(self._compare_json_structure(v1, v2) for v1, v2 in zip(dict1, dict2))
        return True

    def _check_json_value_types(self, dict1, dict2, path=""):
        mismatches = []
        if isinstance(dict1, dict):
            for k in dict1:
                new_path = f"{path}.{k}" if path else k
                if k in dict2:
                    mismatches.extend(self._check_json_value_types(dict1[k], dict2[k], new_path))
                else:
                    mismatches.append(f"Missing key at {new_path}")
        elif isinstance(dict1, list):
            if len(dict1) != len(dict2):
                mismatches.append(f"Array length mismatch at {path}")
            else:
                for i, (v1, v2) in enumerate(zip(dict1, dict2)):
                    mismatches.extend(self._check_json_value_types(v1, v2, f"{path}[{i}]"))
        elif type(dict1) != type(dict2):
            mismatches.append(f"Type mismatch at {path}: expected {type(dict1).__name__}, got {type(dict2).__name__}")
        return mismatches

    def _compare_xml_attributes(self, elem1, elem2):
        if elem1.tag != elem2.tag:
            return False
        if elem1.attrib != elem2.attrib:
            return False
        if len(elem1) != len(elem2):
            return False
        return all(self._compare_xml_attributes(c1, c2) for c1, c2 in zip(elem1, elem2))

    def _count_xml_elements(self, elem):
        return 1 + sum(self._count_xml_elements(child) for child in elem)

    def get_feedback(self):
        return {
            "errors": self.errors,
            "suggestions": self.suggestions
        }


In [8]:
# Usage example
if __name__ == "__main__":
    validator = FormatValidator()

    # HTML example
    original_html = "<html><body><h1>Hello</h1><p class='test'>World</p></body></html>"
    reconstructed_html = "<html><body><h1>Bonjour</h1><p>Monde</p></body></html>"
    print("HTML Validation:", validator.validate(original_html, reconstructed_html, 'html'))
    print("Feedback:", validator.get_feedback())

    # JSON example
    original_json = '{"name": "John", "age": 30, "city": "New York"}'
    reconstructed_json = '{"name": "Jean", "age": 30, "town": "Paris"}'
    print("\nJSON Validation:", validator.validate(original_json, reconstructed_json, 'json'))
    print("Feedback:", validator.get_feedback())

    # XML example
    original_xml = '<root><person><name>John</name><age>30</age></person></root>'
    reconstructed_xml = '<root><person><name>John</name><age>30</age></person></root>'
    print("\nXML Validation:", validator.validate(original_xml, reconstructed_xml, 'xml'))
    print("Feedback:", validator.get_feedback())


XML Validation: False
Feedback: {'errors': ['XML structure mismatch'], 'suggestions': ['Ensure the XML element structure and hierarchy match the original.']}


In [9]:
import json
from collections import Counter
from bs4 import BeautifulSoup
from lxml import etree
import xmldiff.main
import jsonschema

In [10]:
class ComprehensiveStructureValidator:
    def __init__(self):
        self.errors = []
        self.suggestions = []

    def validate(self, original_content, reconstructed_content, format_type):
        self.errors = []
        self.suggestions = []
        
        if format_type == 'html':
            return self._validate_html(original_content, reconstructed_content)
        elif format_type == 'json':
            return self._validate_json(original_content, reconstructed_content)
        elif format_type == 'xml':
            return self._validate_xml(original_content, reconstructed_content)
        else:
            self.errors.append(f"Unsupported format: {format_type}")
            return False

    def _validate_html(self, original_html, reconstructed_html):
        parsers = ['html5lib', 'lxml', 'html.parser']
        original_soup = None
        reconstructed_soup = None

        for parser in parsers:
            try:
                original_soup = BeautifulSoup(original_html, parser)
                reconstructed_soup = BeautifulSoup(reconstructed_html, parser)
                break
            except ImportError:
                continue

        if original_soup is None or reconstructed_soup is None:
            self.errors.append("Failed to parse HTML. Please install 'html5lib' or 'lxml' for better HTML parsing.")
            return False

        # Check 1: Basic HTML parsing
        if original_soup.find() is None or reconstructed_soup.find() is None:
            self.errors.append("HTML is not parseable")
            self.suggestions.append("Ensure the HTML has a valid structure with proper opening and closing tags.")
            return False

        # Check 2: Compare tag counts
        original_tags = Counter(tag.name for tag in original_soup.find_all())
        reconstructed_tags = Counter(tag.name for tag in reconstructed_soup.find_all())
        if original_tags != reconstructed_tags:
            self.errors.append("Tag count mismatch")
            self.suggestions.append(f"Adjust the number of HTML tags to match the original. Original counts: {dict(original_tags)}, Reconstructed counts: {dict(reconstructed_tags)}")

        # Check 3: Structure comparison
        if not self._compare_html_structure(original_soup, reconstructed_soup):
            self.errors.append("Overall HTML structure mismatch")
            self.suggestions.append("Ensure the hierarchical structure of HTML elements matches the original.")

        # Check 4: Attribute preservation
        if not self._compare_html_attributes(original_soup, reconstructed_soup):
            self.errors.append("HTML attribute mismatch")
            self.suggestions.append("Preserve all original HTML attributes, including their values.")

        # Check 5: Class and ID preservation
        if not self._compare_html_classes_and_ids(original_soup, reconstructed_soup):
            self.errors.append("Mismatch in HTML classes or IDs")
            self.suggestions.append("Ensure all class names and IDs are preserved in the reconstructed HTML.")

        # Check 6: Form structure preservation
        if not self._compare_html_forms(original_soup, reconstructed_soup):
            self.errors.append("Mismatch in HTML form structures")
            self.suggestions.append("Ensure all form elements and their attributes are preserved.")

        return len(self.errors) == 0

    def _validate_json(self, original_json, reconstructed_json):
        try:
            original_dict = json.loads(original_json)
            reconstructed_dict = json.loads(reconstructed_json)
        except json.JSONDecodeError as e:
            self.errors.append(f"Invalid JSON format: {str(e)}")
            self.suggestions.append("Ensure the JSON is properly formatted with correct syntax.")
            return False

        # Check 1: Schema validation
        schema = self._generate_json_schema(original_dict)
        try:
            jsonschema.validate(instance=reconstructed_dict, schema=schema)
        except jsonschema.exceptions.ValidationError as ve:
            self.errors.append(f"JSON schema validation failed: {ve}")
            self.suggestions.append("Adjust the JSON structure to match the original schema.")
            return False

        # Check 2: Structure comparison
        if not self._compare_json_structure(original_dict, reconstructed_dict):
            self.errors.append("JSON structure mismatch")
            self.suggestions.append("Ensure all keys and nested structures in the JSON match the original.")

        # Check 3: Array length preservation
        if not self._compare_json_array_lengths(original_dict, reconstructed_dict):
            self.errors.append("JSON array length mismatch")
            self.suggestions.append("Ensure all arrays in the JSON have the same length as in the original.")

        return len(self.errors) == 0

    def _validate_xml(self, original_xml, reconstructed_xml):
        try:
            original_root = etree.fromstring(original_xml.encode())
            reconstructed_root = etree.fromstring(reconstructed_xml.encode())
        except etree.XMLSyntaxError as e:
            self.errors.append(f"Invalid XML format: {str(e)}")
            self.suggestions.append("Ensure the XML is well-formed with proper opening and closing tags.")
            return False

        # Check 1: Structure comparison
        try:
            diff = xmldiff.main.diff_trees(original_root, reconstructed_root)
            if diff:
                self.errors.append("XML structure mismatch")
                self.suggestions.append("Ensure the XML element structure and hierarchy match the original.")
        except Exception as e:
            self.errors.append(f"Error in XML comparison: {str(e)}")
            self.suggestions.append("There was an issue comparing the XML structures. Please check the XML content.")

        # Check 2: Attribute preservation
        if not self._compare_xml_attributes(original_root, reconstructed_root):
            self.errors.append("XML attributes mismatch")
            self.suggestions.append("Preserve all original XML attributes, including their values.")

        # Check 3: Element count
        original_count = self._count_xml_elements(original_root)
        reconstructed_count = self._count_xml_elements(reconstructed_root)
        if original_count != reconstructed_count:
            self.errors.append("XML element count mismatch")
            self.suggestions.append(f"Adjust the number of XML elements to match the original. Original count: {original_count}, Reconstructed count: {reconstructed_count}")

        # Check 4: Namespace preservation
        if not self._compare_xml_namespaces(original_root, reconstructed_root):
            self.errors.append("XML namespace mismatch")
            self.suggestions.append("Ensure all XML namespaces are preserved in the reconstructed XML.")

        return len(self.errors) == 0

    def _compare_html_structure(self, soup1, soup2):
        def get_structure(soup):
            return ''.join(element.name for element in soup.descendants if element.name)
        return get_structure(soup1) == get_structure(soup2)

    def _compare_html_attributes(self, soup1, soup2):
        elements1 = soup1.find_all()
        elements2 = soup2.find_all()
        if len(elements1) != len(elements2):
            return False
        return all(e1.attrs == e2.attrs for e1, e2 in zip(elements1, elements2))

    def _compare_html_classes_and_ids(self, soup1, soup2):
        elements1 = soup1.find_all(class_=True) + soup1.find_all(id=True)
        elements2 = soup2.find_all(class_=True) + soup2.find_all(id=True)
        if len(elements1) != len(elements2):
            return False
        return all(e1.get('class') == e2.get('class') and e1.get('id') == e2.get('id') 
                   for e1, e2 in zip(elements1, elements2))

    def _compare_html_forms(self, soup1, soup2):
        forms1 = soup1.find_all('form')
        forms2 = soup2.find_all('form')
        if len(forms1) != len(forms2):
            return False
        for f1, f2 in zip(forms1, forms2):
            if f1.attrs != f2.attrs:
                return False
            inputs1 = f1.find_all(['input', 'select', 'textarea'])
            inputs2 = f2.find_all(['input', 'select', 'textarea'])
            if len(inputs1) != len(inputs2):
                return False
            if any(i1.attrs != i2.attrs for i1, i2 in zip(inputs1, inputs2)):
                return False
        return True

    def _generate_json_schema(self, json_dict):
        schema = {"type": "object", "properties": {}}
        for key, value in json_dict.items():
            if isinstance(value, dict):
                schema["properties"][key] = self._generate_json_schema(value)
            elif isinstance(value, list):
                schema["properties"][key] = {
                    "type": "array",
                    "items": self._generate_json_schema(value[0]) if value else {}
                }
            else:
                schema["properties"][key] = {"type": "string"}  # Treat all non-structural elements as strings
        return schema

    def _compare_json_structure(self, dict1, dict2):
        if not isinstance(dict1, type(dict2)):
            return False
        if isinstance(dict1, dict):
            return set(dict1.keys()) == set(dict2.keys()) and all(self._compare_json_structure(dict1[k], dict2[k]) for k in dict1)
        if isinstance(dict1, list):
            return len(dict1) == len(dict2) and all(self._compare_json_structure(v1, v2) for v1, v2 in zip(dict1, dict2))
        return True

    def _compare_json_array_lengths(self, dict1, dict2):
        if isinstance(dict1, dict) and isinstance(dict2, dict):
            return all(self._compare_json_array_lengths(dict1[k], dict2[k]) for k in dict1 if k in dict2)
        if isinstance(dict1, list) and isinstance(dict2, list):
            return len(dict1) == len(dict2)
        return True

    def _compare_xml_attributes(self, elem1, elem2):
        if elem1.tag != elem2.tag:
            return False
        if elem1.attrib != elem2.attrib:
            return False
        if len(elem1) != len(elem2):
            return False
        return all(self._compare_xml_attributes(c1, c2) for c1, c2 in zip(elem1, elem2))

    def _count_xml_elements(self, elem):
        return 1 + sum(self._count_xml_elements(child) for child in elem)

    def _compare_xml_namespaces(self, elem1, elem2):
        return elem1.nsmap == elem2.nsmap and all(self._compare_xml_namespaces(c1, c2) for c1, c2 in zip(elem1, elem2))

    def get_feedback(self):
        return {
            "errors": self.errors,
            "suggestions": self.suggestions
        }

In [11]:

# Usage example
if __name__ == "__main__":
    validator = ComprehensiveStructureValidator()

    # HTML example
    original_html = "<html><body><h1 class='title'>Hello</h1><p id='content'>World</p></body></html>"
    reconstructed_html = "<html><body><h1 class='title'>Bonjour</h1><p id='content'>Monde</p></body></html>"
    print("HTML Validation:", validator.validate(original_html, reconstructed_html, 'html'))
    print("Feedback:", validator.get_feedback())

    # JSON example
    original_json = '{"name": "John", "age": 30, "city": {"name": "New York", "population": 8400000}}'
    reconstructed_json = '{"name": "Jean", "age": "35", "city": {"name": "Paris", "population": "2161000"}}'
    print("\nJSON Validation:", validator.validate(original_json, reconstructed_json, 'json'))
    print("Feedback:", validator.get_feedback())

    # XML example
    original_xml = '<root xmlns:x="http://example.com"><person><name>John</name><age>30</age></person></root>'
    reconstructed_xml = '<root xmlns:x="http://example.com"><person><name>Jean</name><age>35</age></person></root>'
    print("\nXML Validation:", validator.validate(original_xml, reconstructed_xml, 'xml'))
    print("Feedback:", validator.get_feedback())

HTML Validation: True
Feedback: {'errors': [], 'suggestions': []}

JSON Validation: False
Feedback: {'errors': ['JSON structure mismatch'], 'suggestions': ['Ensure all keys and nested structures in the JSON match the original.']}

XML Validation: False
Feedback: {'errors': ['XML structure mismatch'], 'suggestions': ['Ensure the XML element structure and hierarchy match the original.']}


In [12]:
%pip install beautifulsoup4 lxml html5lib jsonschema xmlschema

Collecting xmlschema
  Downloading xmlschema-3.4.2-py3-none-any.whl.metadata (8.3 kB)
Collecting elementpath<5.0.0,>=4.4.0 (from xmlschema)
  Downloading elementpath-4.5.0-py3-none-any.whl.metadata (7.0 kB)
Downloading xmlschema-3.4.2-py3-none-any.whl (417 kB)
Downloading elementpath-4.5.0-py3-none-any.whl (228 kB)
Installing collected packages: elementpath, xmlschema
Successfully installed elementpath-4.5.0 xmlschema-3.4.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
from collections import Counter
from bs4 import BeautifulSoup
from lxml import etree
import xmlschema
import jsonschema
import html5lib
from urllib.parse import urlparse

In [5]:

class EnhancedStructureValidator:
    def __init__(self, config=None):
        self.config = config or {
            'validation_level': 'normal',
            'check_html5': True,
            'check_accessibility': True,
            'check_xml_schema': True,
            'check_json_schema': True
        }
        self.errors = []
        self.suggestions = []

    def validate(self, original_content, reconstructed_content, format_type):
        self.errors = []
        self.suggestions = []
        
        if format_type == 'html':
            return self._validate_html(original_content, reconstructed_content)
        elif format_type == 'json':
            return self._validate_json(original_content, reconstructed_content)
        elif format_type == 'xml':
            return self._validate_xml(original_content, reconstructed_content)
        else:
            self.errors.append(f"Unsupported format: {format_type}")
            return False

    def _validate_html(self, original_html, reconstructed_html):
        original_soup = BeautifulSoup(original_html, 'html5lib')
        reconstructed_soup = BeautifulSoup(reconstructed_html, 'html5lib')

        # Basic structure checks
        if not self._compare_html_structure(original_soup, reconstructed_soup):
            self.errors.append("HTML structure mismatch")
            self.suggestions.append("Ensure the hierarchical structure of HTML elements matches the original.")

        # Tag count check
        if not self._compare_html_tag_counts(original_soup, reconstructed_soup):
            self.errors.append("HTML tag count mismatch")
            self.suggestions.append("Ensure the number of HTML tags matches the original.")

        # Attribute preservation check
        if not self._compare_html_attributes(original_soup, reconstructed_soup):
            self.errors.append("HTML attribute mismatch")
            self.suggestions.append("Preserve all original HTML attributes, including their values.")

        # HTML5 specific checks
        if self.config['check_html5']:
            self._check_html5_specific(reconstructed_soup)

        # Accessibility checks
        if self.config['check_accessibility']:
            self._check_accessibility(reconstructed_soup)

        return len(self.errors) == 0

    def _validate_json(self, original_json, reconstructed_json):
        try:
            original_dict = json.loads(original_json)
            reconstructed_dict = json.loads(reconstructed_json)
        except json.JSONDecodeError as e:
            self.errors.append(f"Invalid JSON format: {str(e)}")
            self.suggestions.append("Ensure the JSON is properly formatted with correct syntax.")
            return False

        # Structure comparison
        if not self._compare_json_structure(original_dict, reconstructed_dict):
            self.errors.append("JSON structure mismatch")
            self.suggestions.append("Ensure all keys and nested structures in the JSON match the original.")

        # JSON Schema validation
        if self.config['check_json_schema']:
            schema = self._generate_json_schema(original_dict)
            try:
                jsonschema.validate(instance=reconstructed_dict, schema=schema)
            except jsonschema.exceptions.ValidationError as ve:
                self.errors.append(f"JSON schema validation failed: {ve}")
                self.suggestions.append("Adjust the JSON structure to match the original schema.")

        return len(self.errors) == 0

    def _validate_xml(self, original_xml, reconstructed_xml):
        try:
            original_root = etree.fromstring(original_xml.encode())
            reconstructed_root = etree.fromstring(reconstructed_xml.encode())
        except etree.XMLSyntaxError as e:
            self.errors.append(f"Invalid XML format: {str(e)}")
            self.suggestions.append("Ensure the XML is well-formed with proper opening and closing tags.")
            return False

        # Structure comparison
        if not self._compare_xml_structure(original_root, reconstructed_root):
            self.errors.append("XML structure mismatch")
            self.suggestions.append("Ensure the XML element structure and hierarchy match the original.")

        # Namespace check
        if not self._compare_xml_namespaces(original_root, reconstructed_root):
            self.errors.append("XML namespace mismatch")
            self.suggestions.append("Ensure all XML namespaces are preserved in the reconstructed XML.")

        # XML Schema validation
        if self.config['check_xml_schema']:
            schema = xmlschema.XMLSchema(original_xml)
            if not schema.is_valid(reconstructed_xml):
                self.errors.append("XML schema validation failed")
                self.suggestions.append("Ensure the reconstructed XML conforms to the schema of the original XML.")

        return len(self.errors) == 0

    def _compare_html_structure(self, soup1, soup2):
        def get_structure(soup):
            return [
                (elem.name, elem.get('id', ''), ' '.join(elem.get('class', [])))
                for elem in soup.descendants if elem.name
            ]
        return get_structure(soup1) == get_structure(soup2)

    def _compare_html_tag_counts(self, soup1, soup2):
        count1 = Counter(tag.name for tag in soup1.find_all())
        count2 = Counter(tag.name for tag in soup2.find_all())
        return count1 == count2

    def _compare_html_attributes(self, soup1, soup2):
        elements1 = soup1.find_all()
        elements2 = soup2.find_all()
        if len(elements1) != len(elements2):
            return False
        return all(e1.attrs == e2.attrs for e1, e2 in zip(elements1, elements2))

    def _check_html5_specific(self, soup):
        html5_elements = {'article', 'aside', 'figcaption', 'figure', 'footer', 'header', 'main', 'mark', 'nav', 'section', 'time'}
        for elem in soup.find_all(html5_elements):
            if not self._is_properly_used_html5_element(elem):
                self.errors.append(f"Improper use of HTML5 element: {elem.name}")
                self.suggestions.append(f"Ensure the {elem.name} element is used correctly according to HTML5 specifications.")

    def _is_properly_used_html5_element(self, elem):
        # This is a simplified check. In a real-world scenario, you'd want more comprehensive rules.
        if elem.name == 'nav' and not elem.find_all('a'):
            return False
        if elem.name == 'figure' and not elem.find('figcaption'):
            return False
        return True

    def _check_accessibility(self, soup):
        # Check for alt text on images
        for img in soup.find_all('img'):
            if not img.get('alt'):
                self.errors.append("Image without alt text")
                self.suggestions.append("Add descriptive alt text to all images for accessibility.")

        # Check for proper heading structure
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        for i, heading in enumerate(headings):
            if i > 0 and int(heading.name[1]) > int(headings[i-1].name[1]) + 1:
                self.errors.append("Improper heading structure")
                self.suggestions.append("Ensure heading levels are properly nested without skipping levels.")

        # Check for ARIA roles
        if not soup.find_all(attrs={"role": True}):
            self.suggestions.append("Consider adding ARIA roles to improve accessibility.")

    def _compare_json_structure(self, dict1, dict2):
        if type(dict1) != type(dict2):
            if (isinstance(dict1, (int, float, str, bool)) and 
                isinstance(dict2, (int, float, str, bool))):
                return True  # Allow type flexibility for leaf nodes
            return False
        if isinstance(dict1, dict):
            return set(dict1.keys()) == set(dict2.keys()) and all(self._compare_json_structure(dict1[k], dict2[k]) for k in dict1)
        if isinstance(dict1, list):
            return len(dict1) == len(dict2) and all(self._compare_json_structure(v1, v2) for v1, v2 in zip(dict1, dict2))
        return True

    def _generate_json_schema(self, json_dict):
        schema = {"type": "object", "properties": {}}
        for key, value in json_dict.items():
            if isinstance(value, dict):
                schema["properties"][key] = self._generate_json_schema(value)
            elif isinstance(value, list):
                schema["properties"][key] = {
                    "type": "array",
                    "items": self._generate_json_schema(value[0]) if value else {}
                }
            else:
                schema["properties"][key] = {"type": self._json_type(value)}
        return schema

    def _json_type(self, value):
        if isinstance(value, int):
            return "integer"
        elif isinstance(value, float):
            return "number"
        elif isinstance(value, bool):
            return "boolean"
        elif isinstance(value, str):
            return "string"
        else:
            return "null"

    def _compare_xml_structure(self, elem1, elem2):
        if elem1.tag != elem2.tag:
            return False
        if elem1.attrib != elem2.attrib:
            return False
        if len(elem1) != len(elem2):
            return False
        return all(self._compare_xml_structure(c1, c2) for c1, c2 in zip(elem1, elem2))

    def _compare_xml_namespaces(self, elem1, elem2):
        return elem1.nsmap == elem2.nsmap and all(self._compare_xml_namespaces(c1, c2) for c1, c2 in zip(elem1, elem2))

    def get_feedback(self):
        return {
            "errors": self.errors,
            "suggestions": self.suggestions
        }


In [6]:
# Usage example
if __name__ == "__main__":
    validator = EnhancedStructureValidator()

    # HTML example
    original_html = """
    <html>
    <head><title>Test</title></head>
    <body>
        <h1>Hello</h1>
        <nav><a href="#">Link</a></nav>
        <img src="test.jpg" alt="Test image">
        <p>World</p>
    </body>
    </html>
    """
    reconstructed_html = """
    <html>
    <head><title>Test</title></head>
    <body>
        <h1>Bonjour</h1>
        <nav><a href="#">Lien</a></nav>
        <img src="test.jpg">
        <p>Monde</p>
    </body>
    </html>
    """
    print("HTML Validation:", validator.validate(original_html, reconstructed_html, 'html'))
    print("Feedback:", validator.get_feedback())

    # JSON example
    original_json = '{"name": "John", "age": 30, "city": {"name": "New York", "population": 8400000}}'
    reconstructed_json = '{"name": "Jean", "age": 35, "city": {"name": "Paris", "population": 2161000}}'
    print("\nJSON Validation:", validator.validate(original_json, reconstructed_json, 'json'))
    print("Feedback:", validator.get_feedback())

    # XML example
    original_xml = """
    <?xml version="1.0" encoding="UTF-8"?>
    <root xmlns:x="http://example.com">
        <person>
            <name>John</name>
            <age>30</age>
        </person>
    </root>
    """
    reconstructed_xml = """
    <?xml version="1.0" encoding="UTF-8"?>
    <root xmlns:x="http://example.com">
        <person>
            <name>Jean</name>
            <age>35</age>
        </person>
    </root>
    """
    print("\nXML Validation:", validator.validate(original_xml, reconstructed_xml, 'xml'))
    print("Feedback:", validator.get_feedback())

HTML Validation: False
Feedback: {'errors': ['HTML attribute mismatch', 'Image without alt text'], 'suggestions': ['Preserve all original HTML attributes, including their values.', 'Add descriptive alt text to all images for accessibility.', 'Consider adding ARIA roles to improve accessibility.']}

JSON Validation: True
Feedback: {'errors': [], 'suggestions': []}

XML Validation: False
Feedback: {'errors': ['Invalid XML format: XML declaration allowed only at the start of the document, line 2, column 10 (<string>, line 2)'], 'suggestions': ['Ensure the XML is well-formed with proper opening and closing tags.']}
