In [None]:
%pip install -r requirements.txt

In [None]:
import os
import PyPDF2
import re
from pathlib import Path
import xml.etree.ElementTree as ET
import re
import os
from dotenv import load_dotenv

In [None]:
# Load environment variables from .env file
def load_env():
    """Read the pipeline configuration from the environment.

    ``load_dotenv()`` is called first, then each setting is looked up with
    ``os.getenv``.

    Returns:
        tuple: ``(pdf_directory, output_dir, S3_BUCKET_NAME,
        AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)``. A variable that is not
        set comes back as ``None``.
    """
    load_dotenv()
    # Order matters: callers unpack the returned tuple positionally.
    setting_names = (
        'PDF_DIR_PATH',      # Store the downloaded PDF files from S3
        'OUTPUT_DIR_PATH',   # Store the extracted txt files
        'S3_BUCKET_NAME',
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
    )
    return tuple(os.getenv(name) for name in setting_names)

# Unpack the configuration into notebook-level names; download_files_from_s3()
# reads pdf_directory, S3_BUCKET_NAME and the AWS credentials as globals.
pdf_directory, output_dir, S3_BUCKET_NAME, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = load_env()


# Show the resolved PDF directory (prints None when PDF_DIR_PATH is unset).
print(pdf_directory)

Downloading PDFs from S3 to the local PDF directory

In [None]:
import boto3


def download_files_from_s3():
    """Download every object in the configured S3 bucket into ``pdf_directory``.

    Reads the notebook-level names ``S3_BUCKET_NAME``, ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``pdf_directory``. Each object is saved
    under its base name, so the bucket's prefix ("folder") structure is
    flattened and objects that share a base name overwrite one another.

    A failure to download one object is reported and does not stop the
    remaining downloads. Errors raised while creating the client, creating
    the local directory or listing the bucket propagate to the caller.
    """
    s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(pdf_directory, exist_ok=True)

    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows the continuation tokens so larger buckets are listed completely.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=S3_BUCKET_NAME):
        # A page without 'Contents' simply contributes no objects.
        for obj in page.get('Contents', []):
            key = obj['Key']
            print(key)

            file_name = os.path.basename(key)
            if not file_name:
                # Keys ending in "/" have an empty base name; the target
                # would be the directory itself, so there is nothing to save.
                print(f"Skipping folder placeholder: {key}")
                continue

            local_file_path = os.path.join(pdf_directory, file_name)
            print(local_file_path)

            try:
                s3.download_file(S3_BUCKET_NAME, key, local_file_path)
                print(f"Downloaded: {key} to {local_file_path}")
            except Exception as e:
                # Best effort: report the failure and continue with the next key.
                print(f"Failed to download {key}: {e}")
            

# Per-object download errors are already reported inside
# download_files_from_s3(); this handler covers whatever still escapes it
# (e.g. creating the client or listing the bucket). It only prints the error,
# so the following cells still run even if nothing was downloaded.
try:
    download_files_from_s3()
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
## Extract text using PyPDF2


def extract_text_pypdf(pdf_path, output_folder):
    """Extract the text of one PDF with PyPDF2 and save it as a text file.

    The text of all pages is concatenated in page order, with no separator
    added between pages, and written as UTF-8 to
    ``<output_folder>/PyPDF_RR_<pdf name without extension>_combined.txt``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory for the text file; created if it is missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages must be read while the file is still open. `or ""` keeps the
        # join safe should a page yield no text object at all.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    text = "".join(page_texts)

    # splitext removes whatever extension is present instead of assuming
    # that it is exactly four characters long.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(output_folder, f'PyPDF_RR_{stem}_combined.txt')

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent characters found in the PDF text.
    with open(output_filename, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"Text extracted from {pdf_path} using PyPDF2 and saved to {output_filename}")


In [None]:
pwd

In [None]:
cd grobid_client_python

In [None]:
## Use Grobid to convert PDF to XML

import subprocess

def convert_to_xml_with_grobid(input_folder, output_folder):
    """Run the Grobid Python client over a folder of PDF files.

    Executes ``grobid_client.grobid_client`` as a module (``-m``) with the
    ``processFulltextDocument`` service, passing ``input_folder`` as
    ``--input`` and ``output_folder`` as ``--output``.

    Args:
        input_folder: Folder handed to the client as its input.
        output_folder: Folder handed to the client as its output.

    Raises:
        subprocess.CalledProcessError: If the client exits with a non-zero
            status; the success message is then not printed.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Use the interpreter that runs this notebook so the client sees the same
    # installed packages; fall back to the original 'python3' if unknown.
    interpreter = sys.executable or 'python3'
    command = [
        interpreter, '-m', 'grobid_client.grobid_client',
        '--input', input_folder,
        '--output', output_folder,
        'processFulltextDocument',
    ]
    # check=True turns a failed run into an exception instead of letting the
    # success message below be printed regardless of the outcome.
    subprocess.run(command, check=True)
    print(f"PDF files in {input_folder} converted to XML and saved to {output_folder}")


In [None]:
## Convert XML to text using Grobid

    
def extract_text_from_element(element):
    """Return all text contained in an XML element as one flat string.

    Walks the element and its descendants in document order and collects
    both the text inside each element and the text that follows a child
    element (its ``tail``). Without the tail, the words after any inline
    child would be lost. Every fragment is stripped; fragments that are
    empty after stripping are skipped.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        str: The fragments in document order, each followed by one space
        (so a non-empty result ends with a space); ``''`` if there is no text.
    """
    # itertext() yields .text and the children's .tail in document order,
    # but never the tail of `element` itself.
    fragments = (fragment.strip() for fragment in element.itertext())
    return ''.join(f'{fragment} ' for fragment in fragments if fragment)

def convert_xml_to_txt(xml_folder, txt_folder):
    """Convert every ``.xml`` file in a folder into a plain-text file.

    Each ``<name>.xml`` in ``xml_folder`` is parsed, flattened with
    ``extract_text_from_element`` and written as UTF-8 to
    ``<txt_folder>/Grobid_RR_<name>_combined.txt``. Entries that do not end
    in ``.xml`` are ignored. Files are processed in sorted name order.

    Args:
        xml_folder: Folder containing the XML files.
        txt_folder: Folder for the text files; created if it is missing.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well formed.
    """
    os.makedirs(txt_folder, exist_ok=True)  # Create the directory if it doesn't exist

    # sorted() makes the processing order independent of the file system.
    for xml_file in sorted(os.listdir(xml_folder)):
        if not xml_file.endswith('.xml'):
            continue

        xml_path = os.path.join(xml_folder, xml_file)
        root = ET.parse(xml_path).getroot()
        # Extract before opening the output so that a failure here does not
        # leave an empty text file behind.
        text = extract_text_from_element(root)

        txt_filename = f"Grobid_RR_{xml_file[:-len('.xml')]}_combined.txt"
        txt_path = os.path.join(txt_folder, txt_filename)

        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters found in the document text.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)
        print(f"XML file {xml_file} converted to TXT and saved to {txt_path}")


In [None]:
pwd

In [None]:
## Run the extraction process

# All paths are relative to the current working directory, which an earlier
# cell changes to grobid_client_python.
input_folder = '../Input'
output_folder_pypdf = '../PyPDF/'
xml_folder = '../GROBID/xml'
txt_folder = '../GROBID/txt'

# List input_folder itself: a bare os.listdir() lists the current working
# directory, while the names found are opened from input_folder.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith('.pdf'):
        extract_text_pypdf(os.path.join(input_folder, filename), output_folder_pypdf)



In [None]:
# Step 1: run the Grobid client on the PDFs in input_folder, writing XML to xml_folder.
# NOTE(review): presumably needs a reachable Grobid server — confirm.
convert_to_xml_with_grobid(input_folder, xml_folder)
# Step 2: flatten every .xml file in xml_folder to a text file in txt_folder.
convert_xml_to_txt(xml_folder, txt_folder)
