In [1]:
# Installing and importing required libraries
!pip install PyPDF2
import PyPDF2
import re




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Defining a function to process PDFs and extract topic outlines and learning outcomes
def process_pdf(input_pdf_path, output_txt_path):
    pdfFileObj = open(input_pdf_path, 'rb')
    pdfReader = PyPDF2.PdfReader(pdfFileObj)

    print(f"Processing {input_pdf_path}")
    print("Total number of pages:", len(pdfReader.pages))
    
# Initializing dictionaries to store extracted content
    content = dict()
    topic = ""
    topic_dict = dict()

    # Iterating through each page of the PDF and extracting text from the current page and split into lines
    for page_num in range(len(pdfReader.pages)):
        t = pdfReader.pages[page_num].extract_text().split('\n')
        line_num = 0

        # Skipping header lines containing 'topic outlines'
        while line_num < len(t):
            if line_num == 0:
                if 'topic outlines' in t[line_num].strip().lower():
                    line_num += 1
                topic_new = re.sub(r'[^A-Za-z ]+', '', t[line_num]).strip()
            
                # Checking if the topic already exists in the content dictionary
                all_keys = [x.lower().strip().replace(" ", "") for x in content.keys()]
                if topic_new.lower().strip().replace(" ", "") in all_keys:
                    topic_new = list(filter(lambda x: x.lower().strip().replace(" ", "") == topic_new.lower().strip().replace(" ", ""), content.keys()))[0]

                # Updating the topic if it has changed
                if topic == topic_new:
                    pass
                else:
                    subtopic = ""
                    subtopic_dict = []
                    topic = topic_new
            topic_dict = content.get(topic, dict())

            # Identifying subtopics i.e. Learning outcoomes for Topics
            subtopic_loc = t[line_num].find("The candidate should be able to:")
            if subtopic_loc != -1:
                subtopic = t[line_num - 1] if subtopic_loc == 0 else t[line_num][:subtopic_loc + 1]
                subtopic_dict = topic_dict.get(subtopic, [])
                tab_loc = t[line_num].find("\t")

                # Extract learning outcomes and appending to the subtopic dictionary
                append_list = t[line_num][tab_loc + 1:] + t[line_num + 1]
                if append_list.find("\t") == -1:
                    subtopic_dict.append(append_list)

                    line_num += 2
                if line_num >= len(t):
                    break
            # Handeling corcer cases, extract learning outcomes from lines with tabs
            tab_loc = t[line_num].find("\t")

            if tab_loc != -1:
                subtopic_dict.append(t[line_num][tab_loc + 1:])

            # Updating the topic dictionary
            topic_dict[subtopic] = subtopic_dict
            content[topic] = topic_dict

            line_num += 1
    
    # Writing the extracted data to an output text files
    with open(output_txt_path, 'w', encoding='utf-8') as output_file:
        for topic, subtopics in content.items():
            output_file.write(f"\nTopic: {topic}\n")
            output_file.write("\t\n\tLearning Outcomes: \n")
            output_file.write("\t\t(For the below Learning Outcomes, The candidate should be able to: )\n")
            for subtopic, learning_outcomes in subtopics.items():
                if subtopic == '':
                    continue
                output_file.write(f"\t{subtopic}\n")
                for outcome in learning_outcomes:
                    output_file.write(f"\t\t- {outcome}\n")

    print(f"Output saved to: {output_txt_path}")


In [10]:
# Processing all 3 PDF files for three different levels and storing extracted  data in 3 different text files
process_pdf('..\\..\\data\\raw-pdf-data\\2024-l1-topics-combined-2.pdf', '..\\..\\data\\extracted-pdf-data_PyPDF2\\PyPDF_RR_2024_l1_combined.txt')
process_pdf('..\\..\\data\\raw-pdf-data\2024-l2-topics-combined-2.pdf', '..\..\data\extracted-pdf-data_PyPDF2\PyPDF_RR_2024_l2_combined.txt')
process_pdf('..\\..\\data\\raw-pdf-data\2024-l3-topics-combined-2.pdf', '..\..\data\extracted-pdf-data_PyPDF2\PyPDF_RR_2024_l3_combined.txt')

Processing ..\..\data\raw-pdf-data\2024-l1-topics-combined-2.pdf
Total number of pages: 27
Output saved to: ..\..\data\extracted-pdf-data_PyPDF2\PyPDF_RR_2024_l1_combined.txt


Grobid PDF Extraction

In [15]:
from grobid_client_python.grobid_client.grobid_client import GrobidClient
import os
import xml.etree.ElementTree as ET
import csv
import datetime

In [16]:
def process_pdfs(input_directory, output_directory):
    client = GrobidClient(config_path="./config.json")
    client.process("processFulltextDocument", input_directory, output_directory, n=1, consolidate_header=False,consolidate_citations=True,force=True,segment_sentences=True)

In [17]:
def xml_to_text(xml_string):
    root = ET.fromstring(xml_string)
    text = ""
    for elem in root.iter():
        if elem.text:
            text += elem.text + "\n"
    return text.strip()

In [18]:
input_directory = "../../data/raw-pdf-data"  
output_directory = "../../data/extracted-pdf-data_Grobid/"

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

process_pdfs(input_directory, output_directory)
i=0
for filename in os.listdir(output_directory):
    if filename.endswith(".xml"):
        i+=1
        xml_file_path = os.path.join(output_directory, filename)
        print(xml_file_path)
        with open(xml_file_path, "r", encoding="utf-8") as xml_file:
            xml_content = xml_file.read()
            text_content = xml_to_text(xml_content)
            txt_file_path = output_directory + "Grobid_RR_" + str(datetime.date.today().year) + "_" +  str(i)+ "_combined"+ ".txt"
            with open(txt_file_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(text_content)
            print(f"Converted {filename} to {os.path.basename(txt_file_path)}")

GROBID server is up and running
../../data/extracted-pdf-data_Grobid/2024-l2-topics-combined-2.grobid.tei.xml
Converted 2024-l2-topics-combined-2.grobid.tei.xml to Grobid_RR_2024_1_combined.txt
../../data/extracted-pdf-data_Grobid/2024-l1-topics-combined-2.grobid.tei.xml
Converted 2024-l1-topics-combined-2.grobid.tei.xml to Grobid_RR_2024_2_combined.txt
../../data/extracted-pdf-data_Grobid/2024-l3-topics-combined-2.grobid.tei.xml
Converted 2024-l3-topics-combined-2.grobid.tei.xml to Grobid_RR_2024_3_combined.txt


In [19]:
# Specify the directory containing XML files
xml_directory = '../../data/extracted-pdf-data_Grobid'

# Create a CSV file for storing metadata
csv_file_path = '../../data/extracted-pdf-data_Grobid/grobid_metadata.csv'
if os.path.exists(csv_file_path):
    os.remove(csv_file_path)
    
csv_file = open(csv_file_path, 'w', newline='', encoding='utf-8')

csv_writer = csv.writer(csv_file)

# Write header row to CSV file
csv_writer.writerow(['Filename','Title', 'Time','MD5 Identifier', 'Encoding Version','Lang', 'Application Identifier',
                     'Application Description', 'Application Version', 'Application Reference URL'])


# Iterate through XML files in the directory
for filename in os.listdir(xml_directory):
    if filename.endswith('.xml'):
        # Parse the XML document
        tree = ET.parse(os.path.join(xml_directory, filename))
        root = tree.getroot()

        # Extract metadata from TEI header
        title = root.find('.//{http://www.tei-c.org/ns/1.0}titleStmt/{http://www.tei-c.org/ns/1.0}title')
        title = title.text if title is not None else ""

        when_attribute = root.find('.//{http://www.tei-c.org/ns/1.0}appInfo/{http://www.tei-c.org/ns/1.0}application[@ident="GROBID"]')
        when_attribute = when_attribute.get('when') if when_attribute is not None else ""

        md5_identifier = root.find('.//{http://www.tei-c.org/ns/1.0}sourceDesc/{http://www.tei-c.org/ns/1.0}biblStruct/{http://www.tei-c.org/ns/1.0}idno[@type="MD5"]')
        md5_identifier = md5_identifier.text if md5_identifier is not None else ""

        version_attribute = root.get('encoding') if 'encoding' in root.attrib else "UTF-8"

        lang_attribute = root.get('lang') if 'lang' in root.attrib else "en"

        application_identifier = root.find('.//{http://www.tei-c.org/ns/1.0}appInfo/{http://www.tei-c.org/ns/1.0}application[@ident="GROBID"]')
        application_identifier = application_identifier.get('ident') if application_identifier is not None else ""

        application_description = root.find('.//{http://www.tei-c.org/ns/1.0}appInfo/{http://www.tei-c.org/ns/1.0}application[@ident="GROBID"]/{http://www.tei-c.org/ns/1.0}desc')
        application_description = application_description.text if application_description is not None else ""

        application_version = root.find('.//{http://www.tei-c.org/ns/1.0}appInfo/{http://www.tei-c.org/ns/1.0}application[@ident="GROBID"]')
        application_version = application_version.get('version') if application_version is not None else ""

        application_reference_url = root.find('.//{http://www.tei-c.org/ns/1.0}appInfo/{http://www.tei-c.org/ns/1.0}application[@ident="GROBID"]/{http://www.tei-c.org/ns/1.0}ref')
        application_reference_url = application_reference_url.get('target') if application_reference_url is not None else ""

        # Write metadata to CSV file
        csv_writer.writerow([filename, title, when_attribute, md5_identifier, version_attribute, lang_attribute, application_identifier,
                             application_description, application_version, application_reference_url])

# Close CSV file
csv_file.close()