# Scanned PDF Acquisition and Cleaning Notebook
## How to use this notebook

Enter the connection string and the PDF path in the first code cell, then run all cells.

In [None]:
# DECLARATIONS & CONSTANTS      #
#################################

# Requirements : 
# 1. A working installation of Tesseract with the french package ('fra').
# 2. A working installation of Ollama with the selected model downloaded.
# 3. A SQL Server up and running with the right database.

#pip install pymupdf
#pip install pymongo
#pip install pyodbc        # needed for the "SQL" save mode (imported below)
#pip install pdfminer.six
#pip install pytesseract
#pip install opencv-python
#pip install pyspellchecker
#pip install ollama

import numpy as oNumPy
import pandas as oPandas
import fitz  # PyMuPDF
import pytesseract
import pyodbc
import base64
import io
from PIL import Image
from pdfminer.high_level import extract_text
import cv2
import matplotlib.pyplot as plt
import re
import ollama
from datetime import datetime, timedelta
import time
import os

############# SOURCE PDF ############################################
# Path of the PDF to process. Its base name (without extension) is used
# as the book title and as the output sub-folder name in "FILES" mode.
_sPDF_Path = "Coléoptères Carabiques - Subset_Mistral.pdf"
#####################################################################

_sSave_Mode = "FILES"                 # Options: "SQL" or "FILES" (any value other than "SQL" saves to files)
_sBase_Output_Dir = "ExtractedBooks"  # Root folder for file-based saving
_bUse_OCR = True                      # Toggle usage of Tesseract; when False the page's embedded text is used instead
_bUse_Ollama = True                   # Toggle usage of Ollama AI (clean-up and explanation passes)
_sTarget_Language = "français"        # Language requested in the Ollama prompts; or "english", "dutch", etc.
                                      # NOTE: the Tesseract language is fixed to "fra" in Extract_Text_and_Images_from_Page.


# ODBC connection string, only used when _sSave_Mode == "SQL".
# NOTE(review): credentials are embedded in plain text here - avoid committing real values.
_sConnectString = "DRIVER={SQL Server};SERVER=XXX;DATABASE=XXX;UID=XXX;PWD=XXX;"
_sOllamaModel = "mistral"     # Ollama model name; examples: deepseek-r1:32b, deepseek-r1:8b, mistral

# OCR & Text Cleaning Functions

In [None]:
# OCR of a book (or PDF Document) #
###################################

def Is_Text_PDF(sPDF_Path):
    """Tell whether the PDF yields any extractable text.

    Runs ``extract_text`` on the file at ``sPDF_Path`` and returns True when
    the result contains at least one non-whitespace character, False
    otherwise (e.g. a PDF from which no text could be extracted).
    """
    sExtracted = extract_text(sPDF_Path)
    bHasText = bool(sExtracted.strip())
    return bHasText


def Clean_Text(sText):
    """Return ``sText`` with every run of whitespace reduced to one space.

    Newlines, tabs and repeated spaces all act as word separators; leading
    and trailing whitespace is dropped.
    """
    # str.split() with no separator splits on any whitespace run and
    # ignores leading/trailing whitespace, so no prior replace/strip is needed.
    lWords = sText.split()
    return " ".join(lWords)


def Clean_Text_Advanced(sText):
    """Reduce ``sText`` to letters, digits, basic punctuation and single spaces.

    Keeps ASCII letters, characters in the range À-ÿ, digits, the
    punctuation marks ``, . ! ?`` and spaces; everything else is removed.
    Whitespace runs are collapsed to one space and the result carries no
    leading or trailing space.

    Returns an empty string for ``None`` or empty input.
    """
    if not sText:
        return ""

    # Turn every whitespace run (newlines, tabs, ...) into one space first,
    # so line breaks become word separators instead of being deleted below.
    sText = re.sub(r'\s+', ' ', sText)

    # Drop every character outside the allowed set.
    sText = re.sub(r'[^a-zA-ZÀ-ÿ0-9,.!? ]+', '', sText)

    # Removing a symbol that stood between two spaces ("a - b") or at an
    # edge ("« x »") leaves doubled or dangling spaces: collapse and trim again.
    return re.sub(r' {2,}', ' ', sText).strip()


def Correct_Text_with_Ollama(sModel, sText, sPrompt):
    """Send ``sPrompt`` followed by ``sText`` to an Ollama model and return its reply.

    Parameters:
        sModel  -- name of the Ollama model to query.
        sText   -- text to submit; appended to the prompt after one space.
        sPrompt -- instruction placed in front of the text.

    Returns the content of the model's answer. When ``sText`` is None, empty
    or whitespace-only it is returned unchanged without calling the model.
    If the call fails for any reason, the error is printed and the original
    ``sText`` is returned.
    """
    # Nothing worth sending: hand the input back untouched.
    bIsBlank = not sText or not sText.strip()
    if bIsBlank:
        return sText

    try:
        dUserMessage = {"role": "user", "content": f"{sPrompt} {sText}"}
        dReply = ollama.chat(model=sModel, messages=[dUserMessage])
        sAnswer = dReply["message"]["content"]
    except Exception as oError:
        # Best effort: keep the uncorrected text rather than abort the page.
        print(f"Error with Ollama : {oError}")
        sAnswer = sText

    return sAnswer


def Extract_Drawings_from_Page(oImage):
    """Crop candidate illustrations out of a page image.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and its external contours are located. Every contour whose
    bounding box covers more than 5000 pixels is cropped from ``oImage``.

    Parameter:
        oImage -- page image; it is converted with ``oNumPy.array`` and
                  cropped with ``.crop`` (the caller passes a PIL RGB image).

    Returns a list of base64 strings, one per crop, each holding the crop
    encoded as PNG, in the order the contours were reported.
    """
    iMinArea = 5000  # Bounding boxes at or below this area are ignored

    # Grayscale -> light blur (noise reduction) -> inverted adaptive threshold
    oGray = cv2.cvtColor(oNumPy.array(oImage), cv2.COLOR_RGB2GRAY)
    oSmoothed = cv2.GaussianBlur(oGray, (5, 5), 0)
    oMask = cv2.adaptiveThreshold(
        oSmoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
    )

    # Only the outermost contours are kept
    lContours, _ = cv2.findContours(oMask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def _Encode_Region(iLeft, iTop, iWidth, iHeight):
        # Crop the box from the page and return it as a base64 PNG string
        oBuffer = io.BytesIO()
        oRegion = oImage.crop((iLeft, iTop, iLeft + iWidth, iTop + iHeight))
        oRegion.save(oBuffer, format="PNG")
        return base64.b64encode(oBuffer.getvalue()).decode("utf-8")

    lBoxes = (cv2.boundingRect(oContour) for oContour in lContours)
    return [
        _Encode_Region(iLeft, iTop, iWidth, iHeight)
        for iLeft, iTop, iWidth, iHeight in lBoxes
        if iWidth * iHeight > iMinArea
    ]


def Extract_Text_and_Images_from_Page(oPage, bIsTextPDF):
    """Extract the text and the illustrations of one PDF page.

    Parameters:
        oPage      -- page object providing ``get_pixmap()`` and ``get_text("text")``.
        bIsTextPDF -- True when the page's own text should be used as the
                      main text instead of the OCR result.

    Returns a tuple ``(sText, sOCRText, oImages)``:
        sText    -- cleaned page text (page's own text if ``bIsTextPDF``,
                    otherwise identical to ``sOCRText``).
        sOCRText -- cleaned Tesseract output (language "fra") when
                    ``_bUse_OCR`` is set, otherwise the cleaned page's own text.
        oImages  -- illustrations found by ``Extract_Drawings_from_Page``.
    """
    # Render the page to a bitmap and wrap it as an RGB image
    oPixmap = oPage.get_pixmap()
    oPageImage = Image.frombytes("RGB", [oPixmap.width, oPixmap.height], oPixmap.samples)

    # Text for the "OCR" slot: Tesseract if enabled, else the page's own text
    sRecognised = (
        pytesseract.image_to_string(oPageImage, lang="fra")
        if _bUse_OCR
        else oPage.get_text("text")
    )
    sOCRText = Clean_Text(sRecognised)

    lDrawings = Extract_Drawings_from_Page(oPageImage)

    # Main text: read directly from the page when flagged as a text PDF
    sText = Clean_Text(oPage.get_text("text")) if bIsTextPDF else sOCRText

    return sText, sOCRText, lDrawings

In [None]:
# MAIN PROCESS DEFINITION            #
######################################

def Do_Process_PDF(sPDF_Path):
    """Process every page of a PDF and store the results.

    For each page the text, OCR text and illustrations are extracted, the
    text is cleaned, optionally sent to Ollama twice (clean-up prompt and
    explanation prompt), and everything is saved either to SQL Server or to
    a folder tree, depending on ``_sSave_Mode``.

    Parameter:
        sPDF_Path -- path of the PDF file; its base name without extension
                     is used as the book title / output folder name.

    Module-level names used: ``_sSave_Mode``, ``_sBase_Output_Dir``,
    ``_bUse_Ollama``, ``_sTarget_Language``, ``_sOllamaModel`` and, in "SQL"
    mode, the already-open ``conn`` / ``cursor``.

    Side effects:
        * "SQL" mode: inserts into Books, Pages and Images, then closes
          ``conn`` when done (also on error).
        * otherwise: writes ``<_sBase_Output_Dir>/<title>/Page_NNN/`` with
          text files and one PNG file per illustration.

    Any exception is caught and printed; nothing is returned.
    """
    oDoc = None
    try:
        oDoc = fitz.open(sPDF_Path)
        file_size = os.path.getsize(sPDF_Path) / (1024 * 1024)  # Size in MB

        print(f"File Size : {file_size:.2f} MB")
        print(f"Page(s) : {len(oDoc)}")

        sBook_Title = os.path.splitext(os.path.basename(sPDF_Path))[0]  # PDF Name without extension

        # The text/image nature is a property of the whole file: determine it
        # once here rather than re-parsing the complete PDF for every page.
        bIsTextPDF = Is_Text_PDF(sPDF_Path)

        # The prompts do not depend on the page either; build them once.
        if _bUse_Ollama:
            prompt_clean = (
                f"Ce texte provient d'un OCR d'une page d'un livre ou d'un document scientifique. "
                f"Saurais-tu nettoyer le résultat (lettres manquantes, orthographe, mots ou phrases incomplètes, etc.) "
                f"tout en modifiant le minimum et en conservant son sens, le tout toujours en {_sTarget_Language} ? "
                f"N'ajoute aucun autre commentaire :"
            )

            prompt_explain = (
                f"Ce texte provient d'un OCR d'une page d'un livre ou d'un document scientifique. "
                f"Peux-tu reprendre les éléments de la page, les expliquer, annoter et compléter un maximum "
                f"tout en conservant le sens original, le tout toujours en {_sTarget_Language} ? "
                f"N'ajoute aucun autre commentaire :"
            )

        if _sSave_Mode == "SQL":
            # Register the book and keep its generated primary key for the pages
            cursor.execute("""
                INSERT INTO Books (book_title, book_date_added, book_pages_count)
                OUTPUT INSERTED.book_id_pkey
                VALUES (?, GETDATE(), ?)
            """, (sBook_Title, len(oDoc)))
            iBook_ID = cursor.fetchone()[0]
            conn.commit()
        else:
            output_dir = os.path.join(_sBase_Output_Dir, sBook_Title)
            os.makedirs(output_dir, exist_ok=True)

        for iPage_Num, oPage in enumerate(oDoc):
            dPage_Start_Time = time.time()

            print(f"Page {iPage_Num+1} : Begin OCR and Image extraction.")

            sText, sOCRText, oImages = Extract_Text_and_Images_from_Page(oPage, bIsTextPDF)
            sTextClean = Clean_Text_Advanced(sText)

            print(f"Page {iPage_Num+1} : Begin AI enhancement step.")

            # Stay empty when Ollama is disabled (stored as-is in SQL mode)
            sText_Ollama = ""
            sText_OllamaExp = ""
            if _bUse_Ollama:
                sText_Ollama = Correct_Text_with_Ollama(_sOllamaModel, sText, prompt_clean)
                sText_OllamaExp = Correct_Text_with_Ollama(_sOllamaModel, sText, prompt_explain)

            if _sSave_Mode == "SQL":
                cursor.execute("""
                    INSERT INTO Pages (page_book_id_fkey, page_number, page_raw_text, page_ocr_text, page_raw_text_cleaned, page_raw_text_llm, page_raw_text_llm_explain)
                    OUTPUT INSERTED.page_id_pkey
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, (iBook_ID, iPage_Num + 1, sText, sOCRText, sTextClean, sText_Ollama, sText_OllamaExp))
                page_id = cursor.fetchone()[0]
                conn.commit()

                # Illustrations are stored as raw bytes, numbered from 1
                for iImageCount, img_base64 in enumerate(oImages):
                    img_data = base64.b64decode(img_base64)
                    cursor.execute("""
                        INSERT INTO Images (image_page_id_fkey, image_data, image_index)
                        VALUES (?, ?, ?)
                    """, (page_id, img_data, iImageCount + 1))
                conn.commit()
            else:
                # File saving mode
                page_folder = os.path.join(output_dir, f"Page_{iPage_Num+1:03d}")
                os.makedirs(page_folder, exist_ok=True)

                with open(os.path.join(page_folder, "raw_text.txt"), "w", encoding="utf-8") as f:
                    f.write(sText)
                with open(os.path.join(page_folder, "ocr_text.txt"), "w", encoding="utf-8") as f:
                    f.write(sOCRText)
                with open(os.path.join(page_folder, "cleaned_text.txt"), "w", encoding="utf-8") as f:
                    f.write(sTextClean)
                if _bUse_Ollama:
                    with open(os.path.join(page_folder, "llm_text.txt"), "w", encoding="utf-8") as f:
                        f.write(sText_Ollama)
                    with open(os.path.join(page_folder, "llm_explanation.txt"), "w", encoding="utf-8") as f:
                        f.write(sText_OllamaExp)

                for iImageCount, img_base64 in enumerate(oImages):
                    img_data = base64.b64decode(img_base64)
                    # Extract_Drawings_from_Page encodes the crops as PNG, so
                    # the file extension must be .png to match the content.
                    img_path = os.path.join(page_folder, f"image_{iImageCount + 1:02d}.png")
                    with open(img_path, "wb") as img_file:
                        img_file.write(img_data)

            PageElapsed_Time = round(time.time() - dPage_Start_Time, 3)
            print(f"Page {iPage_Num+1} Processed and Stored in {PageElapsed_Time} seconds (Illustrations Count : {len(oImages)})")
    except Exception as e:
        # Best effort: report the problem and let the notebook continue
        print(f"Error : {e}")
    finally:
        # Release the PDF handle whether or not processing succeeded
        if oDoc is not None:
            oDoc.close()
        if _sSave_Mode == "SQL":
            conn.close()
            print("Connection closed.")

# Process launch...

In [None]:
# LAUNCH THE IMPORT AND CLEANING     #
######################################

# Timestamp layout shared by the start and end banners
_sStamp_Format = "%Y-%m-%d %H:%M:%S"

# Wall-clock start, used for the total duration printed at the end
start_time = time.time()
print ("Current Time :", datetime.now().strftime(_sStamp_Format), "\n ")

# In SQL mode Do_Process_PDF relies on the module-level `conn` / `cursor`
# created here, and closes `conn` itself once it has finished.
if _sSave_Mode == "SQL":
    conn = pyodbc.connect(_sConnectString)
    cursor = conn.cursor()

Do_Process_PDF(_sPDF_Path)
print("\nDone !")

print ("\nCurrent Time :", datetime.now().strftime(_sStamp_Format))

# Total duration of the cell, rounded to the millisecond
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")