In [1]:
# Import necessary libraries
from pydantic import BaseModel, Field, ValidationError  # For data validation and constraints
from lxml import etree  # For parsing XML files
import csv  # For CSV file operations

# Define a Pydantic model for validating each row of the output CSV file
class RowModel(BaseModel):
    """Schema used to validate one data row before it is written to the CSV.

    The CSV writer in this file constructs a ``RowModel`` for every candidate
    row and treats a ``ValidationError`` raised here as a failed validation.
    """

    Level: int = Field(..., ge=1, le=3)  # Required integer constrained to 1 <= Level <= 3
    Category: str  # Category label assigned to the row
    Topic: str  # Topic (section heading) of the row
    LearningOutcomes: str  # Learning-outcomes text of the row

# Input XML documents, one per level. Keys have the form 'Level <n>'; the
# number after the space is later parsed out and used as the row's Level.
xml_file_paths = {
    f'Level {n}': f'/Users/shubh/Desktop/Assignment 3/GROBID_RR_2024_Level{n}_combined.xml'
    for n in (1, 2, 3)
}

# Destination of the generated CSV file (overwritten on every run)
output_csv_path = '/Users/shubh/Desktop/Aww/ContentClass_Cleaned.csv'

# Map each category name to the row-index ranges that belong to it.
# Every tuple is an inclusive (start_index, end_index) pair of 1-based output
# row indices; a category may own several disjoint ranges. Together the
# ranges cover indices 1..161 without gaps or overlaps.
# NOTE: rows 133-136 were previously filed under a separate 'Economies' key,
# a misspelling of 'Economics' that produced a spurious extra category in the
# CSV; that range now lives under 'Economics'.
combined_category_ranges = {
    'Quantitative Methods': [(1, 11), (88, 95)],
    'Economics': [(12, 19), (96, 98), (133, 136)],
    'Portfolio Management': [(20, 25), (137, 147)],
    'Corporate Issuers': [(26, 32), (106, 109)],
    'Financial Statement Analysis': [(33, 44), (99, 105)],
    'Equity Investments': [(45, 52), (154, 157)],
    'Fixed Income': [(53, 70), (116, 120), (150, 153)],
    'Derivatives': [(71, 80), (121, 122), (148, 149)],
    'Alternative Investments': [(81, 87), (123, 126), (158, 161)],
    'Equity Valuation': [(110, 115)],
    'Analysis of Active Portfolio': [(127, 132)],
}

def find_category(row_index):
    """Return the category whose ranges contain ``row_index``.

    Categories in ``combined_category_ranges`` are checked in dictionary
    order and the first one with an inclusive (start, end) range containing
    the index wins. ``"Unknown"`` is returned when no range matches.
    """
    matching_categories = (
        name
        for name, spans in combined_category_ranges.items()
        if any(low <= row_index <= high for low, high in spans)
    )
    return next(matching_categories, "Unknown")

def parse_tei_xml_and_merge_paragraphs_to_csv(output_csv_path):
    """Convert the TEI XML files listed in ``xml_file_paths`` into one CSV.

    For each file, every ``tei:div`` under the abstract and then under the
    body yields at most one row: the first ``tei:head`` text is the topic
    (falling back to "Abstract" / "Content"), and the direct text nodes of
    all ``tei:p`` descendants are joined with newlines as the learning
    outcomes. Sections with no paragraph text, or whose heading contains
    "LEARNING OUTCOMES" (case-insensitive), are skipped. The category is
    looked up from the running row index via ``find_category`` and each row
    is validated with ``RowModel`` before being written.

    Args:
        output_csv_path: Path of the CSV file to create (overwritten if it
            already exists).

    Returns:
        None. The numbers of successful and unsuccessful validations are
        printed to stdout.

    Raises:
        OSError: If the output file cannot be opened.
        ValueError, IndexError: If a key of ``xml_file_paths`` does not have
            the form ``'<word> <integer>'``.
        Errors raised by ``etree.parse`` for unreadable or malformed XML
        propagate unchanged.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}  # Namespace prefix used in every XPath
    row_index = 1  # 1-based position used for the category lookup
    successful_validations = 0
    unsuccessful_validations = 0

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Level', 'Category', 'Topic', 'Learning Outcomes'])

        def write_merged_paragraphs(sections, default_heading, level_number):
            # Validate and write one CSV row per non-empty section.
            nonlocal row_index, successful_validations, unsuccessful_validations
            for section in sections:
                # Evaluate the heading XPath once and reuse the result.
                heads = section.xpath('.//tei:head/text()', namespaces=ns)
                heading = heads[0] if heads else default_heading
                paragraphs = section.xpath('.//tei:p/text()', namespaces=ns)
                merged_paragraphs = '\n'.join(paragraphs)
                if not merged_paragraphs.strip() or "LEARNING OUTCOMES" in heading.upper():
                    continue

                category = find_category(row_index)
                try:
                    valid_data = RowModel(
                        Level=level_number,
                        Category=category,
                        Topic=heading,
                        LearningOutcomes=merged_paragraphs,
                    )
                except ValidationError as exc:
                    unsuccessful_validations += 1
                    # Report the failure instead of dropping the row silently.
                    print(f"Row {row_index} ({heading!r}) failed validation and was skipped: {exc}")
                else:
                    writer.writerow([
                        valid_data.Level,
                        valid_data.Category,
                        valid_data.Topic,
                        valid_data.LearningOutcomes,
                    ])
                    successful_validations += 1
                # The index advances for every candidate row, valid or not,
                # because the category ranges are keyed on this position.
                row_index += 1

        for level, file_path in xml_file_paths.items():
            # Keys look like 'Level 2'; the number is the same for every row
            # of the file, so parse it once per file.
            level_number = int(level.split(' ')[1])
            tree = etree.parse(file_path)

            # Abstract sections are emitted before body sections.
            abstract_sections = tree.xpath('//tei:profileDesc/tei:abstract/tei:div', namespaces=ns)
            write_merged_paragraphs(abstract_sections, "Abstract", level_number)
            content_sections = tree.xpath('//tei:text/tei:body/tei:div', namespaces=ns)
            write_merged_paragraphs(content_sections, "Content", level_number)

    print(f"Successful validations: {successful_validations}")
    print(f"Unsuccessful validations: {unsuccessful_validations}")

# Run the conversion immediately when this code executes (there is no
# `if __name__ == "__main__"` guard), writing to the module-level output_csv_path
parse_tei_xml_and_merge_paragraphs_to_csv(output_csv_path)


Successful validations: 161
Unsuccessful validations: 0
