<a href="https://colab.research.google.com/github/Aryanp018/CSCI_226/blob/main/validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# File Download Here

In [1]:
# Download the three input files from Google Drive using the `gdown` CLI.
# Each `!gdown <file-id> -O <name>` call below uses the file id taken from the
# matching shareable link and writes the download to <name> in the working directory.

# Sample XML - `https://drive.google.com/file/d/1THaqLPVR5U7ZBU23MEKydJWiLyUu98x7/view?usp=sharing`
# Broken XML - `https://drive.google.com/file/d/1yumSaIFIy0cYnq_Y6XyF1Xg1mBWW_yz0/view?usp=sharing`
# DTD File - `https://drive.google.com/file/d/1rh78_kc9cJGifFaj5q1N4bob8ptrIvmN/view?usp=sharing`

# Valid XML document -> sample.xml
!gdown 1THaqLPVR5U7ZBU23MEKydJWiLyUu98x7 -O sample.xml
# Broken XML document -> broken.xml
!gdown 1yumSaIFIy0cYnq_Y6XyF1Xg1mBWW_yz0 -O broken.xml
# DTD used to validate both documents -> domain.dtd
!gdown 1rh78_kc9cJGifFaj5q1N4bob8ptrIvmN -O domain.dtd

Downloading...
From (original): https://drive.google.com/uc?id=1THaqLPVR5U7ZBU23MEKydJWiLyUu98x7
From (redirected): https://drive.google.com/uc?id=1THaqLPVR5U7ZBU23MEKydJWiLyUu98x7&confirm=t&uuid=c51a6dac-5b73-4624-b12a-a47fb6c0fb22
To: /content/sample.xml
100% 700/700 [00:00<00:00, 2.45MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1yumSaIFIy0cYnq_Y6XyF1Xg1mBWW_yz0
From (redirected): https://drive.google.com/uc?id=1yumSaIFIy0cYnq_Y6XyF1Xg1mBWW_yz0&confirm=t&uuid=c1e1f86a-861e-436e-a0e3-6512c75347ca
To: /content/broken.xml
100% 565/565 [00:00<00:00, 2.85MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rh78_kc9cJGifFaj5q1N4bob8ptrIvmN
To: /content/domain.dtd
100% 525/525 [00:00<00:00, 1.50MB/s]


# Manage Dependencies Here

In [2]:
!pip -q install lxml pygments

# XML Validation [Valid XML]

In [3]:
from lxml import etree

# Build the DTD validator from the downloaded DTD file
with open("domain.dtd", "rb") as dtd_file:
    dtd = etree.DTD(dtd_file)

# Read the document that will be checked against the DTD
tree = etree.parse("sample.xml")

# Report the verdict; on failure also print the error-level log entries
if not dtd.validate(tree):
    print("XML is NOT valid ❌")
    print(dtd.error_log.filter_from_errors())
else:
    print("XML is valid ✅")

XML is valid ✅


# XML Validation [Invalid XML]

In [4]:
from lxml import etree

# Build the validator from the same DTD file used for sample.xml
with open("domain.dtd", "rb") as dtd_stream:
    dtd = etree.DTD(dtd_stream)

# Parse the document for the "invalid XML" demonstration
tree = etree.parse("broken.xml")

# Failure branch first: status line, then the error-level log entries
if not dtd.validate(tree):
    print("XML is NOT valid ❌", dtd.error_log.filter_from_errors(), sep="\n")
else:
    print("XML is valid ✅")

XML is NOT valid ❌
broken.xml:3:0:ERROR:VALID:DTD_CONTENT_MODEL: Element article content does not follow the DTD, expecting (title , summary , category , publishTime , company , marketData , source , sentiment), got (title category publishTime company marketData source sentiment )


# Inspect Files

In [5]:
from lxml import etree
from IPython.display import HTML, display
from pygments import highlight
from pygments.lexers import XmlLexer, DtdLexer
from pygments.formatters import HtmlFormatter

def pretty_xml_text(path: str) -> str:
    """Parse the XML file at ``path`` and return it re-serialized with indentation.

    The file is parsed with ``remove_blank_text=True`` and serialized with
    ``pretty_print=True``; ``encoding="unicode"`` makes the result a ``str``.
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(path, blank_stripping_parser)
    return etree.tostring(document, encoding="unicode", pretty_print=True)

def show_code(text: str, lexer, title: str = None, max_height: str = "480px"):
    """Render syntax-highlighted code with line numbers in a scrollable box.

    Args:
        text: Source text to highlight.
        lexer: Pygments lexer instance handed to ``highlight``.
        title: Optional caption shown above the box. It is HTML-escaped before
            insertion, so characters such as ``<`` or ``&`` are shown literally
            instead of being interpreted as markup.
        max_height: CSS ``max-height`` value for the scrollable box.

    Returns:
        None. The generated HTML is sent to the notebook via ``display``.
    """
    # Local import: keeps the cell self-contained and avoids clashing with
    # variables named ``html``.
    from html import escape

    formatter = HtmlFormatter(linenos="table", style="friendly")
    css = formatter.get_style_defs('.highlight')
    # Pygments CSS plus the two helper classes used by the fragments below
    parts = [f"<style>{css}.codebox{{border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:{max_height}}}.title{{font-weight:600;margin:4px 0 8px}}</style>"]
    if title:
        parts.append(f'<div class="title">{escape(title)}</div>')
    parts.append(f'<div class="codebox">{highlight(text, lexer, formatter)}</div>')
    display(HTML("".join(parts)))

# Show the pretty-printed XML, then the DTD (two consecutive calls)
show_code(pretty_xml_text("sample.xml"), XmlLexer(), "sample.xml (pretty-printed)")
# Read the DTD inside a context manager so the file handle is closed
with open("domain.dtd", encoding="utf-8") as dtd_file:
    show_code(dtd_file.read(), DtdLexer(), "domain.dtd")

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20,"<!DOCTYPE article SYSTEM ""domain.dtd""> <article>  <title>Apple Reports Quarterly Earnings Beat Expectations</title>  <summary>Apple Inc. posted stronger-than-expected results for Q2, with revenue growth driven by iPhone sales and services.</summary>  <category>Finance</category>  <publishTime>2025-09-01T14:30:00Z</publishTime>  <company>  <name>Apple Inc.</name>  <ticker>AAPL</ticker>  </company>  <marketData>  <stockPrice>225.50</stockPrice>  <changePercent>+2.4%</changePercent>  </marketData>  <source>  <name>Wall Street Journal</name>  <credibility>high</credibility>  </source>  <sentiment>positive</sentiment> </article>"


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14,"<!ELEMENT article (title, summary, category, publishTime, company, marketData, source, sentiment)> <!ELEMENT title (#PCDATA)> <!ELEMENT summary (#PCDATA)> <!ELEMENT category (#PCDATA)> <!ELEMENT publishTime (#PCDATA)> <!ELEMENT company (name, ticker)> <!ELEMENT name (#PCDATA)> <!ELEMENT ticker (#PCDATA)> <!ELEMENT marketData (stockPrice, changePercent)> <!ELEMENT stockPrice (#PCDATA)> <!ELEMENT changePercent (#PCDATA)> <!ELEMENT source (name, credibility)> <!ELEMENT credibility (#PCDATA)> <!ELEMENT sentiment (#PCDATA)>"


In [6]:
# Render the pretty-printed broken.xml with the same viewer used for sample.xml
show_code(
    pretty_xml_text("broken.xml"),
    XmlLexer(),
    title="broken.xml (pretty-printed)",
)

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19,"<!DOCTYPE article SYSTEM ""domain.dtd""> <article>  <title>Apple Reports Quarterly Earnings Beat Expectations</title>  <category>Finance</category>  <publishTime>2025-09-01T14:30:00Z</publishTime>  <company>  <name>Apple Inc.</name>  <ticker>AAPL</ticker>  </company>  <marketData>  <stockPrice>225.50</stockPrice>  <changePercent>+2.4%</changePercent>  </marketData>  <source>  <name>Wall Street Journal</name>  <credibility>high</credibility>  </source>  <sentiment>positive</sentiment> </article>"


# Display Variant

In [8]:
def show_collapsible(title: str, text: str, lexer, open_default=False, max_height: str = "520px"):
    """Render syntax-highlighted code inside a collapsible ``<details>`` element.

    Args:
        title: Caption used as the clickable ``<summary>``. It is HTML-escaped
            before insertion, so markup characters are shown literally.
        text: Source text to highlight.
        lexer: Pygments lexer instance handed to ``highlight``.
        open_default: When true, the section is rendered expanded.
        max_height: CSS ``max-height`` value for the scrollable code area.

    Returns:
        None. The generated HTML is sent to the notebook via ``display``.
    """
    # Local import: keeps the cell self-contained and avoids clashing with
    # variables named ``html``.
    from html import escape

    fmt = HtmlFormatter(linenos="table", style="friendly")
    css = fmt.get_style_defs('.highlight')
    # The bare ``open`` attribute is what makes <details> start expanded
    details_attr = "open" if open_default else ""
    markup = f"""
    <style>{css}</style>
    <details {details_attr} style="margin:6px 0">
      <summary style="cursor:pointer;font-weight:600">{escape(title)}</summary>
      <div style="border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:{max_height};margin-top:8px">
        {highlight(text, lexer, fmt)}
      </div>
    </details>
    """
    display(HTML(markup))

# Expanded by default: the pretty-printed sample document
show_collapsible("sample.xml (pretty-printed)", pretty_xml_text("sample.xml"), XmlLexer(), open_default=True)
# Read the DTD inside a context manager so the file handle is closed
with open("domain.dtd", encoding="utf-8") as dtd_file:
    show_collapsible("domain.dtd", dtd_file.read(), DtdLexer())

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20,"<!DOCTYPE article SYSTEM ""domain.dtd""> <article>  <title>Apple Reports Quarterly Earnings Beat Expectations</title>  <summary>Apple Inc. posted stronger-than-expected results for Q2, with revenue growth driven by iPhone sales and services.</summary>  <category>Finance</category>  <publishTime>2025-09-01T14:30:00Z</publishTime>  <company>  <name>Apple Inc.</name>  <ticker>AAPL</ticker>  </company>  <marketData>  <stockPrice>225.50</stockPrice>  <changePercent>+2.4%</changePercent>  </marketData>  <source>  <name>Wall Street Journal</name>  <credibility>high</credibility>  </source>  <sentiment>positive</sentiment> </article>"


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14,"<!ELEMENT article (title, summary, category, publishTime, company, marketData, source, sentiment)> <!ELEMENT title (#PCDATA)> <!ELEMENT summary (#PCDATA)> <!ELEMENT category (#PCDATA)> <!ELEMENT publishTime (#PCDATA)> <!ELEMENT company (name, ticker)> <!ELEMENT name (#PCDATA)> <!ELEMENT ticker (#PCDATA)> <!ELEMENT marketData (stockPrice, changePercent)> <!ELEMENT stockPrice (#PCDATA)> <!ELEMENT changePercent (#PCDATA)> <!ELEMENT source (name, credibility)> <!ELEMENT credibility (#PCDATA)> <!ELEMENT sentiment (#PCDATA)>"


In [9]:
from lxml import etree

# Build the validator once and reuse it for both documents
with open("domain.dtd","rb") as f:
    dtd = etree.DTD(f)

tree_ok = etree.parse("sample.xml")
tree_bad = etree.parse("broken.xml")

print("sample.xml →", "VALID ✅" if dtd.validate(tree_ok) else "NOT valid ❌")

# Validate broken.xml a single time and reuse the verdict for both the status
# line and the error listing, instead of re-running the validation.
bad_is_valid = dtd.validate(tree_bad)
print("broken.xml →", "VALID ✅" if bad_is_valid else "NOT valid ❌")
if not bad_is_valid:
    # Show the last few errors for teaching
    for e in list(dtd.error_log)[-5:]:
        print("•", e)

sample.xml → VALID ✅
broken.xml → NOT valid ❌
• broken.xml:3:0:ERROR:VALID:DTD_CONTENT_MODEL: Element article content does not follow the DTD, expecting (title , summary , category , publishTime , company , marketData , source , sentiment), got (title category publishTime company marketData source sentiment )
