# Optical Character Recognition - OCR

**OCR**:
Text extraction from scanned medical records using the **Marker library**.

How it works:
Marker is a pipeline of deep learning models:

- Extracts text, using OCR when needed (heuristics, Surya)

- Detects page layout and determines reading order (Surya)

- Cleans and formats each block (heuristics, Texify, Surya)

- Optionally uses an LLM to enhance text quality

- Combines blocks and post-processes the complete text

It only applies models when necessary, improving both speed and accuracy.

For more details, see:
[marker-github page](https://github.com/datalab-to/marker?tab=readme-ov-file)

## Imports and environment setup

In [None]:
# Install the notebook's dependencies: pinned torch and torchvision versions,
# plus marker-pdf, PyPDF2 and google-genai.
!pip install torch==2.7.1 -q
!pip install marker-pdf -q
!pip install PyPDF2 -q
!pip install google-genai -q
!pip install torchvision==0.22.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m134.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m109.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
from google import genai
from google.colab import userdata

import os
import PyPDF2
import os
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed

In [None]:
# Mount Google Drive at /content/drive (a remount is requested via
# force_remount=True) so the /content/drive/... paths used below resolve.
drive.mount('/content/drive' , force_remount=True)

Mounted at /content/drive


## Constants

In [None]:
# Input folder on Drive: passed as input_dir to the conversion functions
# (the 2019 scanned clinical records, judging by the path).
DATA_PATH = '/content/drive/MyDrive/SanRaffaele/Data/Dataset/2019/cartelle_cliniche_2019'

In [None]:
# Output folder on Drive: receives the Markdown written by marker_single and
# holds the "processed.txt" resume log.
OUTPUT_PATH = '/content/drive/MyDrive/SanRaffaele/Data/Dataset/2019/cartelle_cliniche_MD_2019'

In [None]:
# Sanity check: number of entries found in the input folder.
print(len(os.listdir(DATA_PATH)))

461


## One at a time

In [None]:
def process_pdfs_with_resume(input_dir, output_dir, log_file="processed.txt"):
    """Convert the PDFs in ``input_dir`` to Markdown one at a time, with resume.

    Each PDF is handed to the ``marker_single`` command-line tool in its own
    subprocess. After every successful conversion the file name is appended to
    a plain-text log inside ``output_dir``; names already present in that log
    are skipped, so an interrupted run can simply be started again.

    Args:
        input_dir: Folder containing the PDFs to convert.
        output_dir: Folder passed to ``marker_single`` as ``--output_dir``;
            created if missing. The resume log lives here too.
        log_file: Name of the resume log inside ``output_dir``
            (one processed file name per line).

    Returns:
        None. Progress and errors are reported on stdout.
    """
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, log_file)

    # Load the names of the PDFs that were already converted.
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as f:
            processed = {line.strip() for line in f if line.strip()}
    else:
        processed = set()

    # Keep only regular *.pdf files: sub-folders and stray files in the input
    # folder must not be handed to marker_single nor counted as PDFs.
    files_list = [
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_dir, name))
    ]
    print(f"Trovati {len(files_list)} PDF.")

    for file in files_list:
        if file in processed:
            print(f" Già processato: {file}")
            continue

        file_path = os.path.join(input_dir, file)
        print(f"Processing: {file}")
        command = [
            "marker_single",
            file_path,
            "--output_format", "markdown",
            "--output_dir", output_dir,
            "--paginate_output",
            "--format_lines",
            "--force_ocr",
            "--strip_existing_ocr",
            "--pdftext_workers", "2",
        ]
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # CalledProcessError: the tool exited non-zero or was killed by a
            # signal. OSError: the tool could not be launched at all.
            # Either way, report it and move on to the next file.
            print(f" Errore su {file}: {e}")
            continue

        # Record the success right away so an interruption keeps the progress.
        with open(log_path, "a", encoding="utf-8") as f:
            f.write(file + "\n")
        print(f"Completato: {file}")

    print("Fine! Tutti i PDF non ancora processati sono stati elaborati.")

In [None]:
# Display the marker_single command-line help. Because --help is included,
# the recorded output below shows only the usage text, so the remaining
# flags just mirror the ones used by the conversion functions.
!marker_single {file_path} \
  --help\
  --output_format markdown \
  --output_dir {OUTPUT_PATH} \
  --paginate_output \
  --format_lines \
  --force_ocr \
  --strip_existing_ocr


2025-06-22 22:35:35.000916: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750631735.019935    2473 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750631735.025920    2473 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-22 22:35:35.045257: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Usage: marker_single [OPTIONS] FPATH

  Convert a single PDF to markdown.

Options:
  --llm_service TEXT             

In [None]:
log_path = os.path.join(OUTPUT_PATH, "processed.txt")

# Collect the names of the PDFs already converted (one name per non-blank
# log line); the set stays empty when no log exists yet.
processed = set()
if os.path.exists(log_path):
    with open(log_path, "r") as f:
        processed = {entry for entry in map(str.strip, f) if entry}
    print(f"PDF già processati: {len(processed)}")

In [None]:
# Run the batch conversion from DATA_PATH into OUTPUT_PATH, using the default
# "processed.txt" resume log.
process_pdfs_with_resume(DATA_PATH,OUTPUT_PATH)

Trovati 461 PDF.
 Già processato: 2019034139.pdf
 Già processato: 2019041344.pdf
 Già processato: 2019035849.pdf
 Già processato: 2019041302.pdf
 Già processato: 2019043154.pdf
 Già processato: 0100 - C - 2019-034727-ORD - Cartella.pdf
 Già processato: 2019050418.pdf
 Già processato: 0067 - C - 2019-009711-ORD - Cartella.pdf
 Già processato: 2019009480.pdf
 Già processato: 2019029653.pdf
 Già processato: 2019038787.pdf
 Già processato: 2019051284.pdf
Processing: 2019037036.pdf
 Errore su 2019037036.pdf: Command '['marker_single', '/content/drive/MyDrive/SanRaffaele/Data/Dataset/2019/cartelle_cliniche_2019/2019037036.pdf', '--output_format', 'markdown', '--output_dir', '/content/drive/MyDrive/SanRaffaele/Data/Dataset/2019/cartelle_cliniche_MD_2019', '--paginate_output', '--format_lines', '--force_ocr', '--strip_existing_ocr', '--pdftext_workers', '2']' died with <Signals.SIGKILL: 9>.
 Già processato: 0064 - C - 2019-008904-ORD - Cartella.pdf
 Già processato: 2019014609.pdf
 Già processa

KeyboardInterrupt: 

## In parallel
Use this variant when plenty of GPU and CPU resources are available.

In [None]:
def process_file(file_path, output_dir, pdftext_workers):
    """Convert one PDF to Markdown by running ``marker_single`` in a subprocess.

    Args:
        file_path: Path of the PDF to convert.
        output_dir: Folder passed to ``marker_single`` as ``--output_dir``.
        pdftext_workers: Value passed as ``--pdftext_workers``
            (converted to ``str`` for the command line).

    Returns:
        A ``(file_name, success)`` tuple, where ``file_name`` is the base name
        of ``file_path`` and ``success`` is ``True`` only when the command
        exited with status 0. Failures are reported on stdout, never raised,
        so a caller collecting results from a pool always gets a tuple back.
    """
    file_name = os.path.basename(file_path)
    print(f"Inizio elaborazione: {file_name}")
    command = [
        "marker_single",
        file_path,
        "--output_format", "markdown",
        "--output_dir", output_dir,
        "--paginate_output",
        "--format_lines",
        "--force_ocr",
        "--strip_existing_ocr",
        "--pdftext_workers", str(pdftext_workers),
    ]

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: non-zero exit status or death by signal.
        # OSError: the executable could not be started (e.g. not installed).
        print(f" Errore su {file_name}: {e}")
        return file_name, False

    print(f"Completato: {file_name}")
    return file_name, True

In [None]:
def process_pdfs_with_resume(input_dir, output_dir, log_file="processed.txt", max_workers=2, pdftext_workers=2):
    """Convert the PDFs in ``input_dir`` to Markdown in parallel, with resume.

    Unprocessed PDFs are submitted to a ``ProcessPoolExecutor``; each task
    calls ``process_file`` for one PDF. As tasks finish, the names of the
    successfully converted files are appended to a plain-text log inside
    ``output_dir``. Names already present in that log are skipped, so an
    interrupted run can simply be started again.

    Args:
        input_dir: Folder containing the PDFs to convert.
        output_dir: Output folder handed to ``process_file``; created if
            missing. The resume log lives here too.
        log_file: Name of the resume log inside ``output_dir``
            (one processed file name per line).
        max_workers: Number of worker processes in the pool.
        pdftext_workers: Forwarded to ``process_file`` for every PDF.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, log_file)

    # Read the names of the PDFs that were already converted.
    processed = set()
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as f:
            processed = {line.strip() for line in f if line.strip()}
        # Report the count only; the full set can hold hundreds of names.
        print(f"PDF già processati: {len(processed)}")

    # Keep only regular *.pdf files, then drop the ones already in the log.
    all_files = [
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_dir, name))
    ]
    to_process = [f for f in all_files if f not in processed]
    print(f"Trovati {len(all_files)} PDF totali. Da processare: {len(to_process)}")

    if not to_process:
        print("Tutti i PDF sono già stati processati.")
        return

    # Fan the remaining PDFs out over the process pool.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                process_file,
                os.path.join(input_dir, file),
                output_dir,
                pdftext_workers,
            ): file
            for file in to_process
        }

        with open(log_path, "a", encoding="utf-8") as log_f:
            for future in as_completed(futures):
                file = futures[future]
                try:
                    _, success = future.result()
                except Exception as e:
                    # A worker that raised (or a broken pool) must not stop
                    # the results of the other PDFs from being logged.
                    print(f"Errore su {file}: {e}")
                    continue

                if success:
                    log_f.write(file + "\n")
                    # Flush per file so a killed runtime keeps the progress.
                    log_f.flush()
                    print(f"Aggiunto al log: {file}")
                else:
                    print(f"Errore su {file}")

    print("Fine! Tutti i PDF non ancora processati sono stati elaborati.")
