### PDF EXTRACTION USING PyPDF2

In [145]:
from PyPDF2 import PdfReader
import os

# Directory that holds the source PDFs.
# Change the path below to the correct path for your computer.
pdf_dir = './input-files/'

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and the printed log) identical across runs and machines.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.endswith('.pdf'))

In [146]:
def pypdf_extraction(pdf_files: list):
    """
    Extract the text of each PDF with PyPDF2 and save it as a UTF-8 text file.

    Each file name is split on '-' and its first two parts are used as the
    year and the level. The output is written to
    ``./pypdf/PyPDF_RR_<year>_<level>_combined.txt`` and starts with a
    "Title / Total Pages" header followed by the text of every page.

    A file that cannot be processed is reported with ``print`` and skipped;
    the remaining files are still processed.

    Args:
        pdf_files: Names of the PDF files, located in the module-level
            ``pdf_dir`` directory.

    Returns:
        str: The text of the last file that was extracted and saved
        successfully, or an empty string when no file was processed.
    """
    # Define the directory where output files will be saved
    output_directory = './pypdf/'

    # Bound up front so the final return is valid even when the list is empty
    # or every file fails.
    extracted_text = ""

    # Iterate over each PDF file
    for pdf_file in pdf_files:
        try:
            # Extract year and level from the file name
            parts = pdf_file.split('-')
            year = parts[0]
            level = parts[1]

            # Construct full path to PDF file
            pdf_path = os.path.join(pdf_dir, pdf_file)

            # Read everything that is needed while the PDF is open
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)

                # reader.metadata is None for PDFs without an info dictionary
                metadata = reader.metadata or {}
                title = metadata.get('/Title', 'Unknown Title')
                total_pages = len(reader.pages)

                # extract_text() is costly, so call it once per page
                page_texts = [page.extract_text() or '' for page in reader.pages]

            # Metadata header followed by one newline-terminated chunk per page
            file_text = (
                f"Title: {title}\nTotal Pages: {total_pages}\n\n"
                + "".join(text + "\n" for text in page_texts)
            )

            # Define the output text file name based on the year and level
            text_file_name = f"PyPDF_RR_{year}_{level}_combined.txt"
            output_path = os.path.join(output_directory, text_file_name)

            # Make sure the output directory exists before writing to it
            os.makedirs(output_directory, exist_ok=True)

            # Save the extracted text to a text file
            with open(output_path, 'w', encoding='utf-8') as text_file:
                text_file.write(file_text)

            extracted_text = file_text
            print(f"Extracted text from {pdf_file} to {output_path}")

        except FileNotFoundError:
            print(f"File {pdf_file} not found in {pdf_dir}.")
        except PermissionError:
            print(f"Permission denied to access or write to file {pdf_file}.")
        except Exception as e:
            print(f"An error occurred while processing {pdf_file}: {e}")

    return extracted_text

In [147]:
# Run the PyPDF2 extraction over every file listed in pdf_files; the function
# prints one status line per file.
pypdf_extraction(pdf_files)

Extracted text from 2024-l3-topics-combined-2.pdf to ./pypdf/PyPDF_RR_2024_l3_combined.txt
Extracted text from 2024-l1-topics-combined-2.pdf to ./pypdf/PyPDF_RR_2024_l1_combined.txt
Extracted text from 2024-l2-topics-combined-2.pdf to ./pypdf/PyPDF_RR_2024_l2_combined.txt




### PDF EXTRACTION USING GROBID

In [148]:
from grobid_client.grobid_client import GrobidClient
import os
import re
import csv
import xml.etree.ElementTree as ET
from lxml import etree 

In [149]:
def process_pdfs(input_directory, output_directory):
    """
    Send the PDFs in ``input_directory`` through GROBID's full-text service.

    A ``GrobidClient`` is created from ``./config.json`` and its ``process``
    method is called with the "processFulltextDocument" service name, the two
    directories and the options listed below.

    Args:
        input_directory: Directory handed to the client as the input path.
        output_directory: Directory handed to the client as the output path.
    """
    # Options forwarded to GrobidClient.process as keyword arguments
    grobid_options = {
        "n": 1,
        "consolidate_header": True,
        "consolidate_citations": True,
        "include_raw_citations": True,
        "include_raw_affiliations": True,
        "force": True,
    }

    grobid = GrobidClient(config_path="./config.json")
    grobid.process(
        "processFulltextDocument",
        input_directory,
        output_directory,
        **grobid_options,
    )

In [150]:
def xml_to_text_and_metadata(xml_string):
    """
    Flatten an XML document into plain text, one text fragment per line.

    Despite its name, this function returns only text; no metadata is
    extracted here.

    Fragments are emitted in document order. Both the text at the start of an
    element and the text that follows a child element (its "tail") are kept,
    so a sentence that continues after an inline element such as a reference
    is not cut short.

    Args:
        xml_string: The XML document as a string.

    Returns:
        str: The fragments joined with newlines, with leading and trailing
        whitespace removed.
    """

    def _collect(element, pieces):
        # Text that appears before the first child of this element
        if element.text:
            pieces.append(element.text)
        for child in element:
            _collect(child, pieces)
            # Text between this child's closing tag and the next sibling.
            # Whitespace-only tails are just indentation and are skipped.
            if child.tail and child.tail.strip():
                pieces.append(child.tail)

    root = ET.fromstring(xml_string)
    pieces = []
    _collect(root, pieces)
    return "\n".join(pieces).strip()

In [151]:
def extract_tei_metadata(tei_file):
    """
    Collect basic metadata from a TEI XML file.

    The returned dict may contain:
      - 'language': the xml:lang attribute of the first teiHeader element
        found (None when the attribute is absent);
      - 'version' and 'encoding': taken from the XML declaration, when the
        first line of the file matches it.

    Any exception is printed and whatever was gathered so far is returned.
    """
    tei_header_path = ".//{http://www.tei-c.org/ns/1.0}teiHeader"
    xml_lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    declaration_pattern = r'^<\?xml\s+version\s*=\s*(["\'])(.*?)\1\s+encoding\s*=\s*(["\'])(.*?)\3.*\?>'

    metadata = {}
    try:
        # Language is read from the parsed document
        header = etree.parse(tei_file).getroot().find(tei_header_path)
        if header is not None:
            metadata['language'] = header.get(xml_lang_attr)

        # Version and encoding come from the raw first line of the file
        with open(tei_file, 'r', encoding='utf-8') as handle:
            declaration = re.match(declaration_pattern, handle.readline())
        if declaration:
            metadata['version'], metadata['encoding'] = declaration.group(2, 4)

    except Exception as e:
        print(f"Error occurred while processing {tei_file}: {e}")

    return metadata

In [152]:
def write_metadata_to_csv(metadata_list, output_file):
    """
    Save a list of metadata dicts as a UTF-8 CSV file.

    The file gets a header row with the columns filename, language, version
    and encoding, followed by one row per dict in ``metadata_list``.
    """
    columns = ['filename', 'language', 'version', 'encoding']
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        for record in metadata_list:
            csv_writer.writerow(record)

In [153]:
import os

# Driver: run GROBID over the input PDFs, convert each TEI result to plain
# text, and record the TEI metadata of every converted file in a CSV.
if __name__ == "__main__":
    input_directory = "./input-files/"
    output_directory = "./grobid/"
    metadata_csv_path = '../cloud-storage-integration/metadata-grobid.csv'

    try:
        os.makedirs(output_directory, exist_ok=True)

        # Process PDF files using Grobid
        process_pdfs(input_directory, output_directory)

        metadata_list = []  # Initialize an empty list to store metadata

        # Convert XML output to text and metadata for each PDF file
        for pdf_filename in os.listdir(input_directory):
            if not pdf_filename.endswith(".pdf"):
                continue

            # Handle errors per file so one bad PDF does not abort the others
            # or prevent the metadata CSV from being written.
            try:
                # Extract year and level from the PDF file name
                year, level = pdf_filename.split("-")[:2]

                # Generate file names
                xml_file_name = f"Grobid_RR_{year}_{level}_combined.xml"
                txt_file_name = f"Grobid_RR_{year}_{level}_combined.txt"

                # Derive the TEI file name from the PDF's own base name rather
                # than a hard-coded suffix. NOTE(review): assumes the client
                # writes "<pdf base name>.grobid.tei.xml"; confirm for the
                # installed grobid_client version.
                tei_file_name = os.path.splitext(pdf_filename)[0] + ".grobid.tei.xml"
                xml_file_path = os.path.join(output_directory, tei_file_name)

                if not os.path.isfile(xml_file_path):
                    print(f"No GROBID output found for {pdf_filename} (expected {xml_file_path})")
                    continue

                # Extract metadata
                metadata = extract_tei_metadata(xml_file_path)
                metadata['filename'] = xml_file_name

                # Read the TEI content
                with open(xml_file_path, "r", encoding="utf-8") as xml_file:
                    xml_content = xml_file.read()

                # Convert XML to text
                text_content = xml_to_text_and_metadata(xml_content)

                # Write text content to .txt file
                with open(os.path.join(output_directory, txt_file_name), "w", encoding="utf-8") as txt_file:
                    txt_file.write(text_content)

                # Remove XML file after converting to text
                os.remove(xml_file_path)

                # Record metadata only for files that were converted
                metadata_list.append(metadata)

                # Print information
                print(f"Extracted text from {pdf_filename} to {txt_file_name}")

            except Exception as e:
                print(f"An error occurred while processing {pdf_filename}: {e}")

        # Store metadata in a .csv file
        write_metadata_to_csv(metadata_list, metadata_csv_path)

    except Exception as e:
        print(f"An error occurred: {e}")

GROBID server is up and running
Extracted text from 2024-l3-topics-combined-2.pdf to Grobid_RR_2024_l3_combined.txt
Extracted text from 2024-l1-topics-combined-2.pdf to Grobid_RR_2024_l1_combined.txt
Extracted text from 2024-l2-topics-combined-2.pdf to Grobid_RR_2024_l2_combined.txt
