# Text Extraction From PDFs using `pytesseract`
---

In [None]:
import os
import json
import pytesseract
from pdf2image import convert_from_path

def pdf_to_json(pdf_folder, output_folder, tesseract_cmd=None):
    """OCR every PDF in *pdf_folder* and write one JSON file per PDF.

    Each PDF is rasterised with ``convert_from_path`` and every page image is
    passed to ``pytesseract.image_to_string``. The result is saved in
    *output_folder* as ``<pdf stem>.json``, mapping the 1-based page number to
    the text extracted from that page.

    A PDF whose JSON file already exists is skipped, so the function can be
    re-run to resume an interrupted batch.

    Args:
        pdf_folder: Directory that is scanned (non-recursively) for PDFs.
        output_folder: Directory receiving the JSON files; created if missing.
        tesseract_cmd: Optional path to the tesseract executable. When given,
            it is assigned to ``pytesseract.pytesseract.tesseract_cmd``.
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Match the extension case-insensitively so files such as "SCAN.PDF" are
    # not silently ignored.
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        json_file_name = os.path.splitext(pdf_file)[0] + '.json'
        json_path = os.path.join(output_folder, json_file_name)

        if os.path.exists(json_path):
            print(f"Skipping {pdf_file} as it is already processed.")
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        images = convert_from_path(pdf_path)

        # Page numbers start at 1. json.dump serialises the integer keys as
        # strings ("1", "2", ...), because JSON object keys are always strings.
        pages_text = {
            page_number: pytesseract.image_to_string(image)
            for page_number, image in enumerate(images, start=1)
        }

        # Write to a temporary file and rename it into place. The existence of
        # json_path is what marks a PDF as processed, so a partially written
        # file must never appear under that name.
        tmp_path = json_path + '.tmp'
        try:
            with open(tmp_path, 'w', encoding='utf-8') as json_file:
                json.dump(pages_text, json_file, ensure_ascii=False, indent=4)
            os.replace(tmp_path, json_path)
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up when the dump or the rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

# OCR the PDFs under 1.pdf_data into per-file JSON under 2.json_data, using the
# tesseract executable at the given path.
pdf_to_json(
    pdf_folder='../../data/official_journal/1.pdf_data',
    output_folder='../../data/official_journal/2.json_data',
    tesseract_cmd='/opt/homebrew/bin/tesseract',
)

Skipping F1962001.pdf as it is already processed.
Skipping F1962002.pdf as it is already processed.
Skipping F1962003.pdf as it is already processed.
Skipping F1962004.pdf as it is already processed.
Skipping F1962005.pdf as it is already processed.
Skipping F1962006.pdf as it is already processed.
Skipping F1962007.pdf as it is already processed.
Skipping F1962008.pdf as it is already processed.
Skipping F1962009.pdf as it is already processed.
Skipping F1962010.pdf as it is already processed.
Skipping F1962011.pdf as it is already processed.
Skipping F1962012.pdf as it is already processed.
Skipping F1962013.pdf as it is already processed.
Skipping F1962014.pdf as it is already processed.
Skipping F1962015.pdf as it is already processed.
Skipping F1962016.pdf as it is already processed.
Skipping F1962017.pdf as it is already processed.
Skipping F1962018.pdf as it is already processed.
Skipping F1962019.pdf as it is already processed.
Skipping F1962020.pdf as it is already processed.
