# In [None]:
import easyocr
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the image
image_path = r"C:\Users\emnad\Downloads\datasetsplit1\test\Other\3790--8882413--20230914_page_4.jpg"
image = cv2.imread(image_path)
# cv2.imread reports failure by returning None instead of raising; stop here
# with a clear message rather than failing later inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Normalization and binarization:
#   - min-max stretch of the intensities to the 0..255 range,
#   - 5x5 Gaussian blur,
#   - inverted adaptive Gaussian threshold (block size 31, constant 5).
normalized = cv2.normalize(gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
blurred = cv2.GaussianBlur(normalized, (5, 5), 0)
binary = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 5)

# Save the preprocessed image
preprocessed_image_path = "preprocessed_image.jpg"
cv2.imwrite(preprocessed_image_path, binary)

# OCR with EasyOCR (English + Arabic, CPU only)
# NOTE(review): readtext() is given the original file (image_path), not the
# preprocessed image saved above -- confirm this is intended.
reader = easyocr.Reader(['en', 'ar'], gpu=False)
results = reader.readtext(image_path)

# Sort the results by the top-left corner of each box: vertical position (y)
# first, then horizontal position (x)
sorted_results = sorted(results, key=lambda x: (x[0][0][1], x[0][0][0]))

# Group the OCR items into text lines
def group_by_lines(results, line_threshold=10):
    """Group OCR items into text lines using their vertical position.

    Items are visited in the order given. An item stays on the current line
    when the y coordinate of its top-left corner differs from the previous
    item's y by strictly less than ``line_threshold``; otherwise it opens a
    new line. The comparison is made against the previous item, not against
    the first item of the line. The function does not sort by y itself.

    Args:
        results: Iterable of ``(bbox, text, prob)`` triples where
            ``bbox[0]`` is the ``(x, y)`` top-left corner.
        line_threshold: Maximum (exclusive) y distance between two
            consecutive items of the same line.

    Returns:
        A list of lines; each line is a list of ``(bbox, text, prob)``
        tuples sorted by the x coordinate of their top-left corner.
    """
    def left_edge(entry):
        # x coordinate of the top-left corner of the bounding box
        return entry[0][0][0]

    grouped = []
    pending = []
    previous_y = None

    for bbox, text, prob in results:
        top = bbox[0][1]  # y coordinate of the top-left corner
        same_line = previous_y is None or abs(top - previous_y) < line_threshold
        if not same_line:
            # Close the line being built, ordered left to right
            pending.sort(key=left_edge)
            grouped.append(pending)
            pending = []
        pending.append((bbox, text, prob))
        previous_y = top

    # Flush the last line, if any item was seen at all
    if pending:
        pending.sort(key=left_edge)
        grouped.append(pending)

    return grouped

# Identify key-value pairs
def identify_key_value_pairs(line):
    """Return the first ``(key, value)`` pair found in an OCR line.

    A pair is typically laid out as "Key : Value": the key is the first item
    whose text contains a colon and that has an item after it; the value is
    the text of that following item. Both are stripped of surrounding
    whitespace. The key keeps its colon. The last item of the line is never
    used as a key because nothing follows it.

    Args:
        line: Sequence of items whose element at index 1 is the text.

    Returns:
        ``(key, value)`` for the first match, or ``(None, None)`` when no
        item other than the last contains a colon.
    """
    texts = [entry[1] for entry in line]
    # Walk consecutive (item, next item) pairs
    for candidate, following in zip(texts, texts[1:]):
        if ":" in candidate:
            return candidate.strip(), following.strip()
    return None, None

# Detect a table (based on the density of aligned items)
def detect_table(lines):
    """Locate the first run of consecutive lines that look like table rows.

    A line counts as a table row when it holds at least 3 items. The table
    starts at the first such line and ends at the first later line with
    fewer than 3 items, or at the end of ``lines``.

    Args:
        lines: Sequence of lines, each a sized collection of items.

    Returns:
        A tuple ``(table_start, table_end, table_lines, max_columns)``:
        index of the first table row, index one past the last table row,
        the table rows themselves, and the largest item count among them.
        When no line qualifies the result is ``(None, None, [], 0)``.
    """
    first_row = None
    rows = []
    widest = 0

    for index, line in enumerate(lines):
        width = len(line)
        if width < 3:
            if first_row is None:
                # Table not started yet: keep scanning
                continue
            # First short line after the table: the table ends here
            return first_row, index, rows, widest
        if first_row is None:
            first_row = index
        widest = max(widest, width)
        rows.append(line)

    # Either no table at all, or the table runs to the last line
    end_row = len(lines) if first_row is not None else None
    return first_row, end_row, rows, widest

# Structure the results: group the sorted OCR items into text lines
lines = group_by_lines(sorted_results)

# Detect the table
table_start, table_end, table_lines, max_columns = detect_table(lines)

# detect_table() returns None for both bounds when no line has 3 or more
# items. Ordering comparisons between an int and None raise TypeError, so in
# that case use an empty range placed after the last line: every line is then
# classified as header / key-value and the table and footer stay empty.
if table_start is None:
    first_table_line = end_table_line = len(lines)
else:
    first_table_line, end_table_line = table_start, table_end

# Organize the results
structured_output = {
    "header": [],
    "key_value_pairs": [],
    "table": [],
    "footer": []
}

# Process each line
for i, line in enumerate(lines):
    line_text = [item[1] for item in line]
    joined_text = " ".join(line_text)

    if i < first_table_line:
        # Before the table: headers or key-value pairs
        key, value = identify_key_value_pairs(line)
        if key and value:
            structured_output["key_value_pairs"].append((key, value))
        else:
            structured_output["header"].append(joined_text)
    elif i < end_table_line:
        # Inside the table: keep the cells of the row separate
        structured_output["table"].append(line_text)
    else:
        # After the table: footer or other information
        structured_output["footer"].append(joined_text)

# Display the structured results
# NOTE(review): the accented characters and the emoji in the string literals
# below look mis-encoded (UTF-8 decoded with another charset) -- confirm the
# file encoding. They are kept unchanged here.
print("\nüìÑ Texte extrait organis√© :\n")

# Headers
print("=== En-t√™tes ===")
for header_text in structured_output["header"]:
    print(header_text)

# Key-value pairs, printed as "key value" on one line each
print("\n=== Paires Cl√©-Valeur ===")
for label, content in structured_output["key_value_pairs"]:
    print("{} {}".format(label, content))

# Table
print("\n=== Tableau ===")
table_rows = structured_output["table"]
if not table_rows:
    print("Aucun tableau d√©tect√©.")
else:
    # A DataFrame gives an aligned, column-wise rendering of the rows
    table_df = pd.DataFrame(table_rows)
    print(table_df.to_string(index=False))

# Footer
print("\n=== Pied de page ===")
for footer_text in structured_output["footer"]:
    print(footer_text)