# **Text Extraction From PDFs using `pytesseract`**

---


In [None]:
import os
import json
import pytesseract
from pdf2image import convert_from_path


def pdf_to_json(pdf_folder, output_folder, tesseract_cmd):
    """OCR every PDF in ``pdf_folder`` and save one JSON file per PDF.

    Each PDF is converted to page images with ``convert_from_path`` and every
    image is passed to ``pytesseract.image_to_string``. The result is written
    to ``<output_folder>/<pdf name without extension>.json`` as an object
    mapping the 1-based page number to that page's text. A PDF whose JSON file
    already exists is skipped, so an interrupted run can be resumed simply by
    calling the function again.

    Args:
        pdf_folder: Directory containing the PDFs (not searched recursively).
        output_folder: Directory for the JSON files; created if missing.
        tesseract_cmd: Path of the Tesseract executable pytesseract should run.
    """
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    os.makedirs(output_folder, exist_ok=True)

    # Compare the extension case-insensitively so that files such as
    # "scan.PDF" are not silently ignored.
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")
    )

    for pdf_file in pdf_files:
        json_file_name = os.path.splitext(pdf_file)[0] + ".json"
        json_path = os.path.join(output_folder, json_file_name)

        # An existing JSON file marks the PDF as already processed.
        if os.path.exists(json_path):
            print(f"Skipping {pdf_file}")
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        images = convert_from_path(pdf_path)

        # Pages are numbered from 1. Note that json.dump serialises these
        # integer keys as strings ("1", "2", ...), so readers must convert
        # them back with int() if they need numeric ordering.
        pages_text = {
            page_number: pytesseract.image_to_string(image)
            for page_number, image in enumerate(images, start=1)
        }

        try:
            with open(json_path, "w", encoding="utf-8") as json_file:
                json.dump(pages_text, json_file, ensure_ascii=False, indent=4)
        except BaseException:
            # A half-written file would be mistaken for a finished one by the
            # "skip if exists" check above, so remove it before re-raising
            # (BaseException also covers a kernel interrupt).
            if os.path.exists(json_path):
                os.remove(json_path)
            raise


# Run the OCR step: every PDF in 1.pdf_data yields one JSON file in 2.json_data.
# NOTE: tesseract_cmd looks like a Homebrew install location; adjust it on
# machines where Tesseract lives elsewhere.
pdf_to_json(
    pdf_folder="../../data/official_journal/1.pdf_data",
    output_folder="../../data/official_journal/2.json_data",
    tesseract_cmd="/opt/homebrew/bin/tesseract",
)

# **Turn the JSON Files into a CSV File for Import into the PostgreSQL Database**

---


In [11]:
import os
import json
import csv

# Path to the folder containing the JSON files
folder_path = "../../data/official_journal/2.json_data"

# Path for the CSV output file
csv_file_path = "../../data/official_journal/db.csv"

# Keep only the JSON files *before* parsing their names, so stray directory
# entries (for example ".DS_Store") cannot crash the integer parsing below.
# Each JSON file name is expected to be one leading character, a 4-digit year
# and a 3-digit number; any other ".json" name raises ValueError here.
# Sorting the (year, number, filename) tuples yields ascending order by year,
# then number. Listing the folder first also means a missing input folder
# fails before the existing CSV is truncated.
json_files = sorted(
    (int(filename[1:5]), int(filename[5:8]), filename)
    for filename in os.listdir(folder_path)
    if filename.endswith(".json")
)

# newline="" lets the csv module control line endings itself
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file, quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Header row
    csv_writer.writerow(["year", "number", "page", "content"])

    for year, number, filename in json_files:
        # Read explicitly as UTF-8 instead of relying on the platform default
        # encoding, which may not be able to decode non-ASCII text.
        with open(
            os.path.join(folder_path, filename), "r", encoding="utf-8"
        ) as file:
            data = json.load(file)

        # JSON object keys are strings, so convert to int to order pages
        # numerically ("10" must come after "9").
        sorted_pages = sorted(data.items(), key=lambda item: int(item[0]))

        # One CSV row per page, with newlines flattened to spaces
        for page, content in sorted_pages:
            cleaned_content = content.replace("\n", " ")
            csv_writer.writerow([year, number, page, cleaned_content])

print(f"Data has been written and cleaned in {csv_file_path}")

Data has been written and cleaned in ../../data/official_journal/db.csv
