This notebook exports the processed review phrases (`phrases.json`) to a neat XML format, as specified by `book_schema.rng`, and then validates the result against that schema.

In [4]:
import json, os
import xml.etree.ElementTree as ET

# Project paths, all relative to the notebook's working directory.
DATA_DIR = os.path.join('..', 'data')
IN_FILE = os.path.join(DATA_DIR, 'processed', 'phrases.json')            # JSON input read below
SCHEMA = os.path.join('..', 'schema', 'xml', 'book_schema.rng')          # Relax NG schema used for validation
OUT_XML = os.path.join(DATA_DIR, 'output', 'xml', 'all_reviews.xml')     # XML file this notebook writes

# Sanity check: both inputs must be present before anything else runs.
# Each check gets its own label so the two output lines can be told apart.
print("Metadata input exists:", os.path.exists(IN_FILE))
print("Schema exists:", os.path.exists(SCHEMA))


Metadata input exists: True
Metadata input exists: True


In [None]:
# Read the JSON input file and keep the decoded content in `data`
with open(IN_FILE, mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [2]:
# Build the XML element tree.
# Layout: <reviews> contains one <review> per JSON entry; each <review> holds a
# <book> element (metadata as attributes) and a <sentiments> element with one
# <phrase polarity="..."> child per phrase.
def format_price(raw_price):
    """Return ``raw_price`` formatted like ``$12.34``.

    A missing price (``None`` or an empty string) yields ``""`` instead of
    raising, and a price of 0 is rendered as ``$0.00``.
    """
    if raw_price is None or raw_price == "":
        # NOTE(review): confirm that book_schema.rng accepts an empty price attribute
        return ""
    return f"${float(raw_price):.2f}"


root = ET.Element("reviews")
for entry in data:
    review_el = ET.SubElement(root, "review")

    # metadata
    ET.SubElement(
        review_el,
        "book",
        {
            "asin": str(entry.get("asin") or ""),
            "title": str(entry.get("title") or ""),
            # Read the author from its own key (previously this copied the title).
            # NOTE(review): assumes the JSON records carry an "author" key; confirm
            "author": str(entry.get("author") or ""),
            "genre": str(entry.get("genre") or ""),
            "price": format_price(entry.get("price")),
        },
    )

    # review data
    sents_el = ET.SubElement(review_el, "sentiments")
    for phrase in entry.get("phrases", []):
        ph_el = ET.SubElement(
            sents_el, "phrase", {"polarity": str(phrase.get("polarity") or "")}
        )
        ph_el.text = str(phrase.get("text") or "")

In [3]:
# Write the element tree to the XML output file.
# Create the output directory first so a fresh checkout (where it may not exist
# yet) does not fail with FileNotFoundError.
out_dir = os.path.dirname(OUT_XML)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

tree = ET.ElementTree(root)
tree.write(OUT_XML, encoding="utf-8", xml_declaration=True)
print(f"XML successfully written to {OUT_XML}")

XML successfully written to output/xml/all_reviews.xml


In [11]:
from lxml import etree

# Parse the Relax NG schema file and build a validator from it
with open(SCHEMA, "rb") as schema_file:
    relaxng_doc = etree.parse(schema_file)
relaxng = etree.RelaxNG(relaxng_doc)

# Parse the exported XML document
with open(OUT_XML, "rb") as xml_file:
    xml_doc = etree.parse(xml_file)

# Report the outcome; on failure also print the validator's error log
is_valid = relaxng.validate(xml_doc)
if not is_valid:
    print("XML is invalid ")
    print(relaxng.error_log)
else:
    print("XML is valid ")


XML is valid 
