### Extracting PDF to JSON

<p>This function processes PDF files in the input folder, extracts words and their coordinates from them, and saves the<br>
extracted data as JSON files (one file per page) in the output folder.</p>

In [None]:
#pip install pdfplumber

In [6]:
import os
import pdfplumber
import json

def extract_words_and_coordinates(pdf_path):
    """Collect every word and its bounding box from each page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``'page_number'`` (counted from 1) and ``'words'``, a list of dicts
        with the keys ``'word'``, ``'x0'``, ``'y0'``, ``'x1'`` and ``'y1'``.
    """
    pages = []

    with pdfplumber.open(pdf_path) as document:
        for number, page in enumerate(document.pages, start=1):
            # The 'top'/'bottom' fields returned by extract_words() are
            # stored under the keys 'y0'/'y1'.
            word_boxes = [
                {
                    'word': entry['text'],
                    'x0': entry['x0'],
                    'y0': entry['top'],
                    'x1': entry['x1'],
                    'y1': entry['bottom'],
                }
                for entry in page.extract_words()
            ]
            pages.append({'page_number': number, 'words': word_boxes})

    return pages

def save_to_json(data, json_path):
    """Serialize ``data`` to ``json_path`` as JSON indented by two spaces.

    Args:
        data: JSON-serializable object to write.
        json_path: Destination file path; the file is opened in ``'w'``
            mode, so existing content is replaced.
    """
    with open(json_path, mode='w') as out_file:
        json.dump(data, out_file, indent=2)

def process_pdfs(input_folder, output_folder):
    """Extract words from every PDF in a folder and write one JSON file per page.

    Each page of ``<name>.pdf`` is saved to
    ``<output_folder>/<name>_page<N>.json``, where N is the page number
    reported by ``extract_words_and_coordinates``.

    Args:
        input_folder: Directory scanned (non-recursively) for PDF files.
        output_folder: Directory that receives the JSON files; it is
            created if it does not exist yet.
    """
    # Create the destination up front so the function works on its own
    # instead of failing on the first write to a missing folder. An empty
    # string means the current directory, so there is nothing to create.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Sort the listing because os.listdir() returns entries in arbitrary order.
    for pdf_file in sorted(os.listdir(input_folder)):
        # Compare case-insensitively so names such as 'REPORT.PDF' are included.
        if not pdf_file.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(input_folder, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]

        for page_data in extract_words_and_coordinates(pdf_path):
            page_num = page_data['page_number']
            output_json_path = os.path.join(output_folder, f'{pdf_name}_page{page_num}.json')
            save_to_json(page_data, output_json_path)

# Folder paths for the input PDFs and the generated JSON files
PDF_folder = '01_PDF_Sample'
JSON_folder = '02_JSON_Sample'

# Create the JSON folder if it does not exist; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(JSON_folder, exist_ok=True)

In [7]:
# Run the extraction: each page of every PDF in PDF_folder is written to its
# own JSON file in JSON_folder.
process_pdfs(PDF_folder, JSON_folder)