### ContentPDFClass: Stores the extracted content from each PDF file

In [33]:
import pandas as pd
from lxml import etree

# File paths
# Every input lives under one dataset directory and is named after its
# extraction tool plus a level tag, so both lists are derived from the tags.
_DATASETS_DIR = "/Users/riyasingh/Downloads/Datasets"
_LEVELS = ("l1", "l2", "l3")

# Plain-text dumps produced by PyPDF, one per level.
txt_files = [
    f"{_DATASETS_DIR}/PyPDF/PyPDF_RR_2024_{level}_combined.txt"
    for level in _LEVELS
]

# TEI XML produced by Grobid, in the same level order as txt_files.
xml_files = [
    f"{_DATASETS_DIR}/Grobid/Grobid_RR_2024_{level}_combined.xml"
    for level in _LEVELS
]

# Read text file content up to the first newline character
def read_first_line(file_path):
    """Return the first line of a UTF-8 text file without surrounding whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The text up to the first newline, stripped of leading and trailing
        whitespace. An empty file yields an empty string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_line = handle.readline()
    return raw_line.strip()

# Group the <head> titles of a TEI XML file into topic lists, split at each "LEARNING OUTCOMES" head
def extract_data_from_xml(file_path):
    """Group the head titles of a TEI XML file into topic lists.

    Every ``tei:div`` in the document is visited in document order and the
    stripped text of its direct ``tei:head`` child is collected. A head whose
    text is exactly "LEARNING OUTCOMES" acts as a separator: the head collected
    immediately before it is recorded as a heading, and the titles gathered
    before that one are closed off as one topic group.

    Args:
        file_path: Path of the TEI XML file to parse.

    Returns:
        A ``(topics, heading)`` tuple. ``topics`` is a list of lists of head
        titles, one list per group. ``heading`` holds the title that preceded
        each "LEARNING OUTCOMES" head. ``topics`` always has exactly one more
        entry than ``heading``, because the titles before the first separator
        form a group with no heading of their own. When a separator has no
        preceding title in its group, an empty string is recorded as the
        heading so the two lists stay aligned.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    separator = "LEARNING OUTCOMES"
    topics = []
    heading = []
    subtopics = []

    with open(file_path, 'rb') as file:
        tree = etree.parse(file)

    for div in tree.findall('.//tei:div', namespaces=ns):
        head = div.find('tei:head', namespaces=ns)
        if head is None or not head.text:
            continue

        title = head.text.strip()
        if title != separator:
            subtopics.append(title)
            continue

        # The title seen just before the separator names the group that
        # starts here, so it is moved out of the group that is being closed.
        # Guard against a separator with nothing before it, which would
        # otherwise raise IndexError on the empty list.
        heading.append(subtopics.pop() if subtopics else "")
        topics.append(subtopics)
        subtopics = []

    # Whatever was collected after the last separator is the final group.
    topics.append(subtopics)
    return topics, heading



# Process files and collect data
# One output row per topic group of every (text file, XML file) pair.
data = []
for txt_file, xml_file in zip(txt_files, xml_files):
    first_line = read_first_line(txt_file)
    topics, heading = extract_data_from_xml(xml_file)
    # The text file's first line is used as the heading of the first topic
    # group; prepending it makes heading[i] the heading of topics[i].
    heading.insert(0, first_line)
    # Only the file name itself (last path component) goes into the table.
    xml_name = xml_file.split('/')[-1]
    for i, group in enumerate(topics):
        row = [xml_name, heading[i], '|'.join(group), len(group)]
        data.append(row)

# Convert to DataFrame
column_names = ['File Name', 'Headings', 'Topics', 'Topics Count']
df = pd.DataFrame(data, columns=column_names)

# Write to CSV (without the DataFrame index column)
output_csv_path = "/Users/riyasingh/Downloads/Datasets/final_output.csv"
df.to_csv(output_csv_path, index=False)
