# Scanned PDF Acquisition and Cleaning Notebook
## How to use this notebook

Set the connection string (`_sConnectString`) and the PDF path (`_sPDF_Path`) in the first code cell, then run all cells.

In [None]:
# DECLARATIONS & CONSTANTS      #
#################################

# Requirements : 
# 1. A working installation of Tesseract with the french package ('fra').
# 2. A working installation of Ollama with the selected model downloaded.
# 3. A SQL Server up and running with the right database.

#pip install pymupdf
#pip install pymongo
#pip install pyodbc
#pip install pdfminer.six
#pip install pytesseract
#pip install pillow
#pip install opencv-python
#pip install matplotlib
#pip install pyspellchecker
#pip install ollama

import numpy as oNumPy
import pandas as oPandas

from datetime import datetime, timedelta
import time
import os

# ODBC connection string handed to pyodbc.connect() by the launch cell.
# Replace the XXX placeholders (server, user, password) before running.
# NOTE(review): avoid saving real credentials in the notebook; consider loading
# this value from an environment variable instead.
_sConnectString = "DRIVER={SQL Server};SERVER=XXX;DATABASE=Digital_Library;UID=XXX;PWD=XXX;"
# PDF file to import; Do_Process_PDF stores its file name as the book title.
_sPDF_Path = "The Spiders of Great Britain and Ireland.pdf"
# Name of the Ollama model passed to Correct_Text_with_Ollama for every page.
_sOllamaModel = "deepseek-r1:8b"     # Models : deepseek-r1:32b, deepseek-r1:8b, mistral #



In [None]:
# OCR + Data storage of a book #
################################

import fitz  # PyMuPDF
import pytesseract
import pyodbc
import base64
import io
from PIL import Image
from pdfminer.high_level import extract_text
import cv2
import matplotlib.pyplot as plt
import re
import ollama
from datetime import datetime
import time


def Display_Images(oImages):
    """Show every base64-encoded image in its own matplotlib figure.

    Args:
        oImages: Iterable of base64-encoded image payloads. Each one is
            decoded and opened with PIL before being displayed.

    Returns:
        None. The figures are rendered as a side effect.
    """
    for iIndex, sEncodedImage in enumerate(oImages):
        # Decode the payload and let PIL read it from an in-memory buffer.
        oBuffer = io.BytesIO(base64.b64decode(sEncodedImage))
        oPicture = Image.open(oBuffer)

        # One 5x5 figure per illustration, without axes, titled with its 1-based rank.
        plt.figure(figsize=(5, 5))
        plt.imshow(oPicture)
        plt.axis("off")
        plt.title(f"Illustration {iIndex + 1}")
        plt.show()

def Is_Text_PDF(sPDF_Path):
    """Tell whether the PDF carries an extractable text layer.

    Args:
        sPDF_Path: Path of the PDF file, passed to pdfminer's extract_text.

    Returns:
        True when the extracted text contains at least one non-whitespace
        character, False otherwise.
    """
    sExtracted = extract_text(sPDF_Path)
    return bool(sExtracted.strip())

def Clean_Text(sText):
    """Collapse every run of whitespace into a single space and trim both ends.

    Args:
        sText: Text to normalise (newlines count as whitespace).

    Returns:
        The words of sText joined by single spaces.
    """
    # str.split() without arguments splits on any whitespace, newlines included,
    # and drops leading/trailing blanks, so no prior replace/strip is needed.
    oWords = sText.split()
    return " ".join(oWords)

def Clean_Text_Advanced(sText):
    """Keep only letters, digits and basic punctuation, with normalised spacing.

    Only ASCII letters, characters in the À-ÿ range, digits, the punctuation
    marks , . ! ? and spaces are kept; everything else is removed.

    Args:
        sText: Text to clean. None or an empty value yields "".

    Returns:
        The cleaned text, with single spaces between tokens and no leading or
        trailing space.
    """
    if not sText:
        return ""

    # Turn every whitespace run (newlines, tabs, ...) into one plain space.
    sText = re.sub(r'\s+', ' ', sText)
    # Drop every character outside the allowed set.
    sText = re.sub(r'[^a-zA-ZÀ-ÿ0-9,.!? ]+', '', sText)
    # Removing characters can leave doubled or dangling spaces behind
    # (e.g. "a - b" -> "a  b"), so spacing is normalised after the removal.
    sText = re.sub(r' {2,}', ' ', sText).strip()

    return sText

def Extract_Drawings_from_Page(oImage, iMinSize=30):
    """Crop candidate illustrations out of a page image using edge contours.

    The page is converted to grayscale, Canny edges are computed, and the
    bounding box of each external contour is cropped when it is large enough.
    This is a rough heuristic that still needs improvement.

    Args:
        oImage: PIL image of the page. It is converted with COLOR_RGB2GRAY,
            so an RGB image is expected.
        iMinSize: Bounding boxes must be strictly wider AND strictly taller
            than this many pixels to be kept. Defaults to 30.

    Returns:
        List of base64 strings, one PNG-encoded crop per retained contour.
    """
    oGray = cv2.cvtColor(oNumPy.array(oImage), cv2.COLOR_RGB2GRAY)
    oEdges = cv2.Canny(oGray, 50, 150)
    oContours, _ = cv2.findContours(oEdges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    oDrawings = []
    for oContour in oContours:
        x, y, w, h = cv2.boundingRect(oContour)
        # Ignore small elements.
        if w > iMinSize and h > iMinSize:
            oCropped = oImage.crop((x, y, x + w, y + h))
            # Serialise the crop as PNG in memory, then base64-encode it as text.
            oBuffer = io.BytesIO()
            oCropped.save(oBuffer, format="PNG")
            oDrawings.append(base64.b64encode(oBuffer.getvalue()).decode("utf-8"))
    return oDrawings

def Extract_Text_and_Images_from_Page(oPage, bIsTextPDF):
    """Rasterise a page, OCR it, and collect its text and candidate drawings.

    OCR is always performed (French language pack), whatever bIsTextPDF is,
    because the OCR text is returned alongside the main text.

    Args:
        oPage: Page object providing get_pixmap() and get_text().
        bIsTextPDF: When truthy, the main text comes from the page's text
            layer; otherwise the OCR text is used.

    Returns:
        Tuple (sText, sOCRText, oImages): the main text, the OCR text (both
        passed through Clean_Text) and the list returned by
        Extract_Drawings_from_Page.
    """
    # Render the page and wrap the raw samples in a PIL image.
    oPixmap = oPage.get_pixmap()
    oPageImage = Image.frombytes("RGB", [oPixmap.width, oPixmap.height], oPixmap.samples)

    sRecognized = pytesseract.image_to_string(oPageImage, lang="fra")
    sOCRText = Clean_Text(sRecognized)
    oImages = Extract_Drawings_from_Page(oPageImage)

    sText = Clean_Text(oPage.get_text("text")) if bIsTextPDF else sOCRText

    return sText, sOCRText, oImages


def Correct_Text_with_Ollama(sModel, sText, sPrompt):
    """Send the prompt followed by the text to an Ollama model and return its reply.

    Args:
        sModel: Name of the Ollama model to query.
        sText: Text to process. Empty, None or whitespace-only values are
            returned unchanged without calling the model.
        sPrompt: Instruction placed before the text in a single user message.

    Returns:
        The content of the model's reply, or sText unchanged when the call
        fails (the error is printed, not raised).
    """
    # Nothing to correct: skip the model call.
    if not sText or not sText.strip():
        return sText

    try:
        oMessages = [{"role": "user", "content": f"{sPrompt} {sText}"}]
        oReply = ollama.chat(model=sModel, messages=oMessages)
        return oReply["message"]["content"]
    except Exception as e:
        # Best effort: report the failure and fall back to the input text.
        print(f"Erreur avec Ollama : {e}")
        return sText


In [None]:
# MAIN PROCESS DEFINITION            #
######################################


def Do_Process_PDF(sPDF_Path):
    """Import a PDF into the database: one Books row, then Pages and Images rows.

    For each page, the text and candidate drawings are extracted, the text is
    cleaned, two LLM passes are run (clean-up and explanation), and the
    results are stored and committed.

    Relies on the module-level `conn` and `cursor` objects and on
    `_sOllamaModel`. The connection is closed when the function ends, whether
    it succeeded or not. Errors are printed, not raised.

    Args:
        sPDF_Path: Path of the PDF file; its file name is stored as the book title.
    """
    oDoc = None
    try:
        oDoc = fitz.open(sPDF_Path)
        file_size = os.path.getsize(sPDF_Path) / (1024 * 1024)  # Size in Mo
        iPage_Count = len(oDoc)

        print(f"📂 File Size : {file_size:.2f} Mo")
        print(f"📂 Page(s) : {iPage_Count}")

        # PDF name as title. basename also handles the platform's own separators.
        sBook_Title = os.path.basename(sPDF_Path)

        # Is_Text_PDF parses the whole document, so it is evaluated once here
        # instead of once per page. Doing it before the first INSERT also means
        # a parsing failure does not leave a Books row without pages.
        bIsTextPDF = Is_Text_PDF(sPDF_Path)

        cursor.execute("""
            INSERT INTO Books (book_title, book_date_added, book_pages_count)
            OUTPUT INSERTED.book_id_pkey
            VALUES (?, GETDATE(), ?)
        """, (sBook_Title, iPage_Count))
        iBook_ID = cursor.fetchone()[0]
        conn.commit()

        for iPage_Num, oPage in enumerate(oDoc):
            dPage_Start_Time = time.time()

            print(f"Page {iPage_Num+1} : Begin OCR and Image extraction.")

            sText, sOCRText, oImages = Extract_Text_and_Images_from_Page(oPage, bIsTextPDF)
            sTextClean = Clean_Text_Advanced(sText)

            print(f"Page {iPage_Num+1} : Begin using AI to enhance content.")

            # Both LLM passes work on the raw page text, not on the cleaned version.
            sText_Ollama = Correct_Text_with_Ollama(_sOllamaModel, sText, "Ce texte provient d'un OCR d'une page d'un vieux livre de biologie. Saurais tu nettoyer le résultat pour que cela soit exploitable tout en modifiant le minimum et en conservant son sens, le tout toujours en français ? N'ajoute aucun autre commentaire : ")
            sText_OllamaExp = Correct_Text_with_Ollama(_sOllamaModel, sText, "Ce texte provient d'un OCR d'une page d'un vieux livre de biologie. Sais tu reprendre les éléments de la page et me les expliquer en conservant son sens, le tout toujours en français ? N'ajoute aucun autre commentaire :  ")

            cursor.execute("""
                INSERT INTO Pages (page_book_id_fkey, page_number, page_raw_text, page_ocr_text, page_raw_text_cleaned, page_raw_text_llm, page_raw_text_llm_explain)
                OUTPUT INSERTED.page_id_pkey
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (iBook_ID, iPage_Num + 1, sText, sOCRText, sTextClean, sText_Ollama, sText_OllamaExp))
            page_id = cursor.fetchone()[0]
            conn.commit()

            # Images are stored as raw bytes with a 1-based index within the page.
            for iImage_Index, img_base64 in enumerate(oImages, start=1):
                img_data = base64.b64decode(img_base64)
                cursor.execute("""
                       INSERT INTO Images (image_page_id_fkey, image_data, image_index)
                       VALUES (?, ?, ?)
                """, (page_id, img_data, iImage_Index))
            conn.commit()

            PageElapsed_Time = round(time.time() - dPage_Start_Time, 3)
            print(f"Page {iPage_Num+1} Processed and Stored in {PageElapsed_Time} seconds (Illustrations Count : {len(oImages)})")
    except Exception as e:
        print(f"Error : {e}")
    finally:
        try:
            # Release the PDF file handle; oDoc is None when fitz.open itself failed.
            if oDoc is not None:
                oDoc.close()
        finally:
            conn.close()
            print("Connection closed.")

In [None]:
# LAUNCH THE IMPORT AND CLEANING     #
######################################

# Wall-clock start of the whole import, used for the timing summary printed at the end.
start_time = time.time()
print ("Current Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "\n ")

# Connecting SQL Server
# conn and cursor are module-level on purpose: Do_Process_PDF reads them as globals.
conn = pyodbc.connect(_sConnectString)
cursor = conn.cursor()

# Do_Process_PDF closes conn in its finally block, so this cell has to be re-run
# (to reconnect) before another PDF can be imported. It also prints its own
# errors instead of raising them, so "Done !" is shown even after a failure.
Do_Process_PDF(_sPDF_Path)
print("\nDone !")

print ("\nCurrent Time :", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
elapsed_time = round(time.time() - start_time, 3)
print(f"\nCell execution time : {elapsed_time} seconds")