# XML Alto Data Validation

In [None]:
import glob
from lxml import etree

alto_files = glob.glob("Editions/Padova_1618_Cesare_Ripa_processed/*.xml")

In [None]:
for file in alto_files:
    try:
        etree.parse(file)
        print(f"✓ {file} is valid XML")
    except etree.XMLSyntaxError as e:
        print(f"✗ {file} has XML error: {e}")

# Validate ALTO

In [114]:
#!/usr/bin/env python3
import subprocess
import glob
import requests
import os

# Download schema if needed
if not os.path.exists("alto.xsd"):
    print("Downloading ALTO schema...")
    with open("alto.xsd", 'w') as f:
        f.write(requests.get("https://www.loc.gov/standards/alto/v4/alto.xsd").text)

# Validate using xmllint (handles dependencies automatically)
valid = invalid = 0
for xml_file in glob.glob("Editions/Padova_1618_Cesare_Ripa_processed/*.xml"):
    result = subprocess.run(['xmllint', '--schema', 'alto.xsd', '--noout', xml_file], 
                          capture_output=True, text=True)
    
    if result.returncode == 0:
        print(f"✓ {os.path.basename(xml_file)}")
        valid += 1
    else:
        print(f"✗ {os.path.basename(xml_file)}")
        invalid += 1

print(f"\nValid: {valid}, Invalid: {invalid}")

✗ page_0117.xml
✗ page_0671.xml
✗ page_0665.xml
✗ page_0103.xml
✗ page_0659.xml
✗ page_0498.xml
✗ page_0473.xml
✗ page_0315.xml
✗ page_0301.xml
✗ page_0467.xml
✗ page_0329.xml
✗ page_0507.xml
✗ page_0261.xml
✗ page_0275.xml
✗ page_0513.xml
✗ page_0249.xml
✗ page_0088.xml
✗ page_0063.xml
✗ page_0077.xml
✗ page_0076.xml
✗ page_0062.xml
✓ page_0704.xml
✗ page_0089.xml
✗ page_0248.xml
✗ page_0274.xml
✗ page_0512.xml
✗ page_0506.xml
✗ page_0260.xml
✗ page_0328.xml
✗ page_0300.xml
✗ page_0466.xml
✗ page_0472.xml
✗ page_0314.xml
✗ page_0499.xml
✗ page_0658.xml
✗ page_0664.xml
✗ page_0102.xml
✗ page_0116.xml
✗ page_0670.xml
✗ page_0699.xml
✗ page_0100.xml
✗ page_0666.xml
✗ page_0672.xml
✗ page_0114.xml
✗ page_0128.xml
✗ page_0464.xml
✗ page_0302.xml
✗ page_0316.xml
✗ page_0470.xml
✗ page_0458.xml
✗ page_0289.xml
✗ page_0510.xml
✗ page_0276.xml
✗ page_0262.xml
✗ page_0504.xml
✗ page_0538.xml
✗ page_0074.xml
✗ page_0060.xml
✗ page_0048.xml
✗ page_0049.xml
✗ page_0061.xml
✗ page_0075.xml
✗ page_0

In [117]:
!xmllint --schema alto.xsd --noout /Users/carboni/Downloads/Balzac1624_Lettres_btv1b86262420_corrected_0011.xml

/Users/carboni/Downloads/Balzac1624_Lettres_btv1b86262420_corrected_0011.xml validates


In [None]:
!rm alto.xsd