In [1]:
# HYBRID OF TEXT EXTRACTION TOOLS WITH TESSERACT OCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz  
from pdf2image import convert_from_path
import pytesseract
import pypandoc

def extract_text_or_perform_ocr(pdf_path, page_number):
    """Extracts text from a PDF page; performs OCR if the page is image-based."""
    doc = fitz.open(pdf_path)
    page = doc[page_number]
    text = page.get_text().strip()
    doc.close()

    if not text:  # If no text, perform OCR on the page
        images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
        text = pytesseract.image_to_string(images[0])
    
    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Converts a PDF file to Markdown format using Pandoc."""
    doc = fitz.open(pdf_path)
    # Extract text from each page (with OCR for image-based pages)
    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(len(doc))]
    doc.close()

    full_text = '\n\n'.join(texts)

    # Writing the full extracted text to a temporary .txt file
    temp_txt_path = "temp_extracted_text.txt"
    with open(temp_txt_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(full_text)

    # Convert the text file to Markdown using Pandoc
    output_md = pypandoc.convert_file(temp_txt_path, 'md', format='markdown', outputfile=output_path)

    os.remove(temp_txt_path)  # Clean up the temporary file

    return output_md

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Converts all PDF files in a folder to Markdown format."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            md_filename = os.path.splitext(pdf_file)[0] + '.txt'
            output_path = os.path.join(output_folder, md_filename)
            convert_pdf_to_md(pdf_path, output_path)
            print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage
pdf_folder = 'pdf'
output_folder = 'txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


Converted 'AYKHN462376203698894.pdf' to Markdown format as 'AYKHN462376203698894.txt'.
Converted 'BVTCH887433695074115.pdf' to Markdown format as 'BVTCH887433695074115.txt'.
Converted 'CPCCU328169921838008.pdf' to Markdown format as 'CPCCU328169921838008.txt'.
Converted 'DCDWF474695898403043.pdf' to Markdown format as 'DCDWF474695898403043.txt'.
Converted 'EPCLN350055445701339.pdf' to Markdown format as 'EPCLN350055445701339.txt'.
Converted 'GCYDJ613756872589094.pdf' to Markdown format as 'GCYDJ613756872589094.txt'.
Converted 'GKKMF460375310283237.pdf' to Markdown format as 'GKKMF460375310283237.txt'.
Converted 'GWINA640121462959159.pdf' to Markdown format as 'GWINA640121462959159.txt'.
Converted 'GYBHM412085956672202.pdf' to Markdown format as 'GYBHM412085956672202.txt'.
Converted 'JDCOO910192708199279.pdf' to Markdown format as 'JDCOO910192708199279.txt'.
Converted 'JEPFU743554593716474.pdf' to Markdown format as 'JEPFU743554593716474.txt'.
Converted 'JUEHQ807721548413263.pdf' to Mar

In [1]:
# ALTEWRNATIVE EXTRACTION TOOLS WITH EASYOCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import easyocr
import pypandoc
import numpy as np  # Add this import at the beginning of your script

def extract_text_or_perform_ocr(pdf_path, page_number):
    """Extracts text from a PDF page; performs OCR if the page is image-based."""
    doc = fitz.open(pdf_path)
    page = doc[page_number]
    text = page.get_text().strip()
    doc.close()

    if not text:  # If no text, perform OCR on the page
        images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
        reader = easyocr.Reader(['en'])  # Initialize EasyOCR reader, specify language as needed
        image_np = np.array(images[0])  # Convert PIL image to NumPy array
        results = reader.readtext(image_np, paragraph=True)  # Use `paragraph` for better grouping
        text = ' '.join([result[1] for result in results])
    
    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Converts a PDF file to Markdown format using Pandoc."""
    doc = fitz.open(pdf_path)
    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(len(doc))]
    doc.close()

    full_text = '\n\n'.join(texts)
    temp_txt_path = "temp_extracted_text.txt"
    with open(temp_txt_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(full_text)

    output_md = pypandoc.convert_file(temp_txt_path, 'md', format='markdown', outputfile=output_path)
    os.remove(temp_txt_path)  # Clean up the temporary file

    return output_md

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Converts all PDF files in a folder to Markdown format."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            md_filename = os.path.splitext(pdf_file)[0] + '.txt'
            output_path = os.path.join(output_folder, md_filename)
            convert_pdf_to_md(pdf_path, output_path)
            print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage
pdf_folder = 'pdf'
output_folder = 'txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'BFDWJ294443998167406.pdf' to Markdown format as 'BFDWJ294443998167406.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'BSVLC438504446863059.pdf' to Markdown format as 'BSVLC438504446863059.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'BUOZN419077603018294.pdf' to Markdown format as 'BUOZN419077603018294.txt'.
Converted 'BYNXL764719245224585.pdf' to Markdown format as 'BYNXL764719245224585.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'CVIKU112930131656550.pdf' to Markdown format as 'CVIKU112930131656550.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'DDHNA406989948611301.pdf' to Markdown format as 'DDHNA406989948611301.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

Converted 'DHHFY474803707097567.pdf' to Markdown format as 'DHHFY474803707097567.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'DSGEE791709887334906.pdf' to Markdown format as 'DSGEE791709887334906.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'DSIMB636936690227237.pdf' to Markdown format as 'DSIMB636936690227237.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'EGFOU600623631189800.pdf' to Markdown format as 'EGFOU600623631189800.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

Converted 'FXYHY821521327363899.pdf' to Markdown format as 'FXYHY821521327363899.txt'.
Converted 'IUTSO416473275051279.pdf' to Markdown format as 'IUTSO416473275051279.txt'.
Converted 'JCGWM909548503708677.pdf' to Markdown format as 'JCGWM909548503708677.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'KGVJX821721351433602.pdf' to Markdown format as 'KGVJX821721351433602.txt'.
Converted 'KTHMN904959102109693.pdf' to Markdown format as 'KTHMN904959102109693.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'LDVNK910184062431924.pdf' to Markdown format as 'LDVNK910184062431924.txt'.
Converted 'NEBJE430008161977425.pdf' to Markdown format as 'NEBJE430008161977425.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'NHZRW299126219096885.pdf' to Markdown format as 'NHZRW299126219096885.txt'.
Converted 'NLECU103437326390728.pdf' to Markdown format as 'NLECU103437326390728.txt'.
Converted 'NSOCA568808643052061.pdf' to Markdown format as 'NSOCA568808643052061.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'OEEQA680128060024161.pdf' to Markdown format as 'OEEQA680128060024161.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'ONFYL351908871143354.pdf' to Markdown format as 'ONFYL351908871143354.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'PJFLS954259074905912.pdf' to Markdown format as 'PJFLS954259074905912.txt'.
Converted 'PPKDS702702030080664.pdf' to Markdown format as 'PPKDS702702030080664.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'PTLEV196172633452326.pdf' to Markdown format as 'PTLEV196172633452326.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'QCDNQ839041005188583.pdf' to Markdown format as 'QCDNQ839041005188583.txt'.
Converted 'RIJIU327567670100008.pdf' to Markdown format as 'RIJIU327567670100008.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'SDSRU104057636809556.pdf' to Markdown format as 'SDSRU104057636809556.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'STTHD419887220378078.pdf' to Markdown format as 'STTHD419887220378078.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'UOKVK340759535733471.pdf' to Markdown format as 'UOKVK340759535733471.txt'.
Converted 'UYYHG105771761160375.pdf' to Markdown format as 'UYYHG105771761160375.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'VMXBI813917592052114.pdf' to Markdown format as 'VMXBI813917592052114.txt'.
Converted 'VRNQG588242250317811.pdf' to Markdown format as 'VRNQG588242250317811.txt'.
Converted 'VTAJM960621684114851.pdf' to Markdown format as 'VTAJM960621684114851.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

Converted 'WBBNH258798780470062.pdf' to Markdown format as 'WBBNH258798780470062.txt'.
Converted 'WIMFJ997717701583693.pdf' to Markdown format as 'WIMFJ997717701583693.txt'.
Converted 'WLGRR909529393679199.pdf' to Markdown format as 'WLGRR909529393679199.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'XKNTF585826236398212.pdf' to Markdown format as 'XKNTF585826236398212.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'YRDJN367111294925160.pdf' to Markdown format as 'YRDJN367111294925160.txt'.
Converted 'ZMMIQ461945328311158.pdf' to Markdown format as 'ZMMIQ461945328311158.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

Converted 'ZRFJG636933917014684.pdf' to Markdown format as 'ZRFJG636933917014684.txt'.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Converted 'ZTCTL912131463521557.pdf' to Markdown format as 'ZTCTL912131463521557.txt'.


In [2]:
import os
import re

def process_text(text):
    text = re.sub(r"([a-zA-Z])(\n)([A-Z])", r"\1. \3", text)  # Adds missing dot between sentences split across lines.
    text = re.sub(r"([a-zA-Z])\n", r"\1 ", text)  # Removes inappropriate line breaks within paragraphs.
    #text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with a single space, if needed.
    return text

def process_files(directory_path):
    if not os.path.exists(directory_path):
        print("Directory does not exist:", directory_path)
        return
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    content = file.read()

                processed_content = process_text(content)

                # Change the file extension from .md to .txt
                txt_filename = os.path.splitext(filename)[0] + '.txt'
                txt_filepath = os.path.join(directory_path, txt_filename)

                with open(txt_filepath, 'w', encoding='utf-8') as file:
                    file.write(processed_content)
                print(f"Processed and saved {txt_filename}")
            except Exception as e:
                print(f"Error processing {filename}: {e}")

directory_path = 'txt'
process_files(directory_path)


Processed and saved BFDWJ294443998167406.txt
Processed and saved BSVLC438504446863059.txt
Processed and saved BUOZN419077603018294.txt
Processed and saved BYNXL764719245224585.txt
Processed and saved CVIKU112930131656550.txt
Processed and saved DDHNA406989948611301.txt
Processed and saved DHHFY474803707097567.txt
Processed and saved DSGEE791709887334906.txt
Processed and saved DSIMB636936690227237.txt
Processed and saved EGFOU600623631189800.txt
Processed and saved FXYHY821521327363899.txt
Processed and saved IUTSO416473275051279.txt
Processed and saved JCGWM909548503708677.txt
Processed and saved KGVJX821721351433602.txt
Processed and saved KTHMN904959102109693.txt
Processed and saved LDVNK910184062431924.txt
Processed and saved NEBJE430008161977425.txt
Processed and saved NHZRW299126219096885.txt
Processed and saved NLECU103437326390728.txt
Processed and saved NSOCA568808643052061.txt
Processed and saved OEEQA680128060024161.txt
Processed and saved ONFYL351908871143354.txt
Processed 