# In [11]:
from lxml import etree

import csv

import json

# Location of the GROBID TEI XML document that the code below reads.
xml_file_path = (
    '/Users/shubh/Desktop/PyPDF & GROBID/GROBID/grobid_xml_files/'
    'Grobid_RR_2024_LevelIII_combined.xml'
)

# Function to parse the TEI XML and extract structured metadata
def parse_tei_xml(file_path):
    """Parse a TEI XML file and return its metadata and sections as a dict.

    Args:
        file_path: Source of the TEI XML document, passed straight to
            ``etree.parse``.

    Returns:
        A dict with the keys:
          * ``'title'`` - first text node of ``titleStmt/title``, or ``''``.
          * ``'md5'`` - text of the ``idno[@type="MD5"]`` element under
            ``sourceDesc/biblStruct``, or ``''``.
          * ``'grobid_version'`` - ``version`` attribute of
            ``encodingDesc/appInfo/application``, or ``''``.
          * ``'abstract'`` - list of ``{'heading', 'paragraphs'}`` dicts, one
            per ``div`` under ``profileDesc/abstract``.
          * ``'content'`` - list of ``{'heading', 'paragraphs'}`` dicts, one
            per ``div`` under ``text/body``.

    Raises:
        Whatever ``etree.parse`` raises for an unreadable or malformed file.
    """
    tree = etree.parse(file_path)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    def first_or_empty(expression):
        # XPath always returns a list here; fall back to '' when nothing matched.
        matches = tree.xpath(expression, namespaces=ns)
        return matches[0] if matches else ''

    return {
        'title': first_or_empty('//tei:titleStmt/tei:title/text()'),
        'md5': first_or_empty(
            '//tei:sourceDesc/tei:biblStruct/tei:idno[@type="MD5"]/text()'
        ),
        'grobid_version': first_or_empty(
            '//tei:encodingDesc/tei:appInfo/tei:application/@version'
        ),
        'abstract': _collect_sections(
            tree, '//tei:profileDesc/tei:abstract/tei:div', ns
        ),
        'content': _collect_sections(tree, '//tei:text/tei:body/tei:div', ns),
    }


def _collect_sections(tree, section_xpath, ns):
    """Return one ``{'heading', 'paragraphs'}`` dict per element matching *section_xpath*."""
    sections = []
    for section in tree.xpath(section_xpath, namespaces=ns):
        head_texts = section.xpath('.//tei:head/text()', namespaces=ns)

        paragraphs = []
        for paragraph in section.xpath('.//tei:p', namespaces=ns):
            # Join every text node inside the <p>. Selecting `tei:p/text()`
            # would return only the direct text children, splitting a
            # paragraph at each inline element (e.g. <ref>) and dropping the
            # inline element's own text.
            text = ''.join(paragraph.itertext())
            if text.strip():  # skip empty / whitespace-only paragraphs
                paragraphs.append(text)

        sections.append({
            'heading': head_texts[0] if head_texts else '',
            'paragraphs': paragraphs,
        })
    return sections

# Parse the XML file and convert to structured metadata
structured_metadata = parse_tei_xml(xml_file_path)

# Define the output JSON file path
output_json_path = '2024-l3-topics-combined_structured-metadata.json'

# Save the structured metadata to a JSON file.
# ensure_ascii=False writes non-ASCII characters as-is (the file is opened as
# UTF-8) instead of escaping them to \uXXXX sequences; the parsed JSON data is
# the same either way.
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(structured_metadata, json_file, ensure_ascii=False, indent=4)

# Bare expression: echoes the output path when run as a notebook cell.
output_json_path

# FOR CSV FILE


# Function to parse the TEI XML and write its sections to a CSV file (one row per paragraph)
def parse_tei_xml_to_csv(file_path, output_csv_path):
    """Parse a TEI XML file and write its paragraphs to a two-column CSV.

    The CSV starts with the header row ``Heading, Paragraph`` and then has one
    row per paragraph: first for every ``div`` under ``profileDesc/abstract``,
    then for every ``div`` under ``text/body``. Sections with no heading text
    use ``"Abstract"`` or ``"Content"`` respectively as the heading.

    Args:
        file_path: Source of the TEI XML document, passed straight to
            ``etree.parse``.
        output_csv_path: Path of the CSV file to create (overwritten if it
            already exists).

    Returns:
        None. The result is the file written at *output_csv_path*.

    Raises:
        Whatever ``etree.parse`` or ``open`` raise for bad input/output paths.
    """
    # Parse first so that a bad XML file does not leave an empty CSV behind.
    tree = etree.parse(file_path)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # (XPath selecting the section <div>s, heading used when a section has none)
    section_specs = (
        ('//tei:profileDesc/tei:abstract/tei:div', "Abstract"),
        ('//tei:text/tei:body/tei:div', "Content"),
    )

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Heading', 'Paragraph'])

        for section_xpath, default_heading in section_specs:
            for section in tree.xpath(section_xpath, namespaces=ns):
                # Evaluate the heading XPath once per section.
                head_texts = section.xpath('.//tei:head/text()', namespaces=ns)
                heading = head_texts[0] if head_texts else default_heading

                for paragraph in section.xpath('.//tei:p', namespaces=ns):
                    # Join every text node inside the <p>. Selecting
                    # `tei:p/text()` would return only the direct text
                    # children, producing one row per fragment around each
                    # inline element (e.g. <ref>) and dropping its text.
                    text = ''.join(paragraph.itertext())
                    if text.strip():  # skip empty / whitespace-only paragraphs
                        writer.writerow([heading, text])

# Destination file for the heading/paragraph table.
output_csv_path = '2024-l3-topics-combined_structured-metadata.csv'

# Convert the TEI XML into the CSV file named above.
parse_tei_xml_to_csv(
    file_path=xml_file_path,
    output_csv_path=output_csv_path,
)

# Bare expression: echoes the CSV path when run as a notebook cell.
output_csv_path


# Out: '2024-l3-topics-combined_structured-metadata.csv'