Test RAG raw vs Praline
------------------------

In [1]:
from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import fitz
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain_text_splitters import RecursiveCharacterTextSplitter

from textpraline.cleaner.clean import praline, PralineReport


# =============================================================================
# Config
# =============================================================================

# Directory scanned for the evaluation corpus; load_pdf_corpus() globs "*.pdf" in it.
CORPUS_DIR = Path("./corpus")

# Sentence-transformers model used to embed both the chunks and the queries.
MODEL_NAME = "BAAI/bge-base-en-v1.5"  # or "sentence-transformers/all-MiniLM-L6-v2"
# Cut-offs k for Recall@k; the largest value is also the number of chunks fetched per query.
TOP_KS = (1, 5, 10)

# Chunk length and overlap between consecutive chunks, both in characters.
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 150

# Names of the two Chroma collections being compared (raw text vs. praline-cleaned text).
COL_RAW = "docs_raw"
COL_PRALINE = "docs_praline"

# Optional: remove academic boilerplate that hurts retrieval (generic, safe-ish).
# Applied line by line to BOTH variants, after praline when praline is enabled.
DROP_LINE_PATTERNS = [
    re.compile(r"^\s*send offprint requests to\s*:\s*.*$", re.IGNORECASE),
    re.compile(r"^\s*article number,\s*page\s*\d+\s*of\s*\d+.*$", re.IGNORECASE),
]


# =============================================================================
# Dataset format
# =============================================================================

# Start simple: list of (question, expected_doc_id).
# expected_doc_id must equal the file stem of the matching PDF in CORPUS_DIR,
# because load_pdf_corpus() uses the stem as the doc_id stored in chunk metadata.
EVAL: List[Tuple[str, str]] = [
    ("What is the splashback radius studied with?", "docu_astro"),
    ("What does PEP 8 recommend about line length?", "pep8"),
    ("What are ESG priorities discussed in the report?", "Morgan_Stanley_2023_ESG_Report"),
    ("What are the main risks mentioned in the IMF report?", "imf_report"),
    ("What is discussed about large language models?", "llm_pdf"),
]


# =============================================================================
# Helpers
# =============================================================================

def load_pdf_corpus(corpus_dir: Path) -> List[Tuple[str, str]]:
    """
    Extract the plain text of every ``*.pdf`` located directly in ``corpus_dir``.

    Files are processed in sorted path order, so the result order is stable.

    Returns:
      list of (doc_id, extracted_text), where doc_id is the file stem and
      extracted_text is the text of every page, each followed by a newline.

    Raises:
      RuntimeError: if the directory contains no ``.pdf`` file.
    """
    items: List[Tuple[str, str]] = []

    for pdf_path in sorted(corpus_dir.glob("*.pdf")):
        # The context manager closes the document (and its file handle) even
        # when text extraction fails half-way through.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text("text") + "\n" for page in doc)

        items.append((pdf_path.stem, text))

    if not items:
        raise RuntimeError(f"No .pdf files found in {corpus_dir.resolve()}")

    return items


def drop_lines(text: str, patterns: Iterable[re.Pattern]) -> str:
    """
    Remove lines matching any of the given regex patterns.

    Each line is stripped of surrounding whitespace before being tested with
    ``pattern.match`` (anchored at the start of the stripped line). Kept lines
    are returned unmodified and re-joined with "\\n"; a trailing newline of the
    input is therefore not preserved.
    """
    # `patterns` is only promised to be an Iterable, so it may be a one-shot
    # iterator/generator. Materialise it once; otherwise it would be exhausted
    # while testing the first line and no later line would ever be dropped.
    compiled = tuple(patterns)
    kept = [
        ln
        for ln in text.splitlines()
        if not any(p.match(ln.strip()) for p in compiled)
    ]
    return "\n".join(kept)

def chunk_text(text: str, *, chunk_size: int, chunk_overlap: int) -> List[str]:
    """
    Split ``text`` into fixed-size character windows that overlap.

    Windows start every ``chunk_size - chunk_overlap`` characters and are at
    most ``chunk_size`` characters long (the last ones may be shorter). An
    empty text yields an empty list.

    Raises:
      ValueError: if chunk_size <= 0, chunk_overlap < 0, or
        chunk_overlap >= chunk_size (checked in that order).
    """
    checks = (
        (chunk_size <= 0, "chunk_size must be > 0"),
        (chunk_overlap < 0, "chunk_overlap must be >= 0"),
        (chunk_overlap >= chunk_size, "chunk_overlap must be < chunk_size"),
    )
    for failed, message in checks:
        if failed:
            raise ValueError(message)

    stride = chunk_size - chunk_overlap
    return [
        text[start : start + chunk_size]
        for start in range(0, len(text), stride)
    ]



def build_collection(
    client: chromadb.ClientAPI,
    name: str,
    embed_fn: SentenceTransformerEmbeddingFunction,
    docs: List[Tuple[str, str]],
    *,
    apply_praline: bool,
    chunk_size: int,
    chunk_overlap: int,
    min_chunk_chars: int = 40,
) -> Tuple[int, Dict[str, Optional[PralineReport]]]:
    """
    Create/recreate a collection and index all documents as chunks.

    For every (doc_id, text) pair the text is optionally cleaned with
    ``praline``, filtered through ``DROP_LINE_PATTERNS``, chunked, and each
    chunk is stored with metadata ``{"doc_id", "chunk_idx"}`` under the id
    ``"<doc_id>__<chunk_idx:05d>"``.

    Args:
      client: Chroma client that owns the collection.
      name: collection name; an existing collection of that name is dropped.
      embed_fn: embedding function attached to the collection.
      docs: (doc_id, raw_text) pairs.
      apply_praline: clean each document with ``praline`` before chunking.
      chunk_size / chunk_overlap: forwarded to ``chunk_text``.
      min_chunk_chars: chunks shorter than this after stripping are skipped.

    Returns:
      - chunks_added (int)
      - per_doc_reports: Dict[doc_id, PralineReport|None] (None when praline
        was not applied)
    """
    # Best-effort reset: the collection may simply not exist yet, and the
    # exception type raised in that case is not visible from this file, so the
    # broad catch is deliberate.
    try:
        client.delete_collection(name)
    except Exception:
        pass
    col = client.get_or_create_collection(name=name, embedding_function=embed_fn)

    ids: List[str] = []
    texts: List[str] = []
    metas: List[Dict[str, object]] = []
    per_doc_reports: Dict[str, Optional[PralineReport]] = {}

    for doc_id, raw in docs:
        if apply_praline:
            txt, rep = praline(raw, report=True)  # (cleaned, report)
        else:
            txt, rep = raw, None
        per_doc_reports[doc_id] = rep

        txt = drop_lines(txt, DROP_LINE_PATTERNS)

        chunks = chunk_text(txt, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        for idx, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < min_chunk_chars:
                continue
            # idx is the position in the unfiltered chunk list, so ids stay
            # unique and stable even when short chunks are skipped.
            ids.append(f"{doc_id}__{idx:05d}")
            texts.append(ch)
            metas.append({"doc_id": doc_id, "chunk_idx": idx})

    # Skip the add when nothing survived filtering, matching index_docs();
    # presumably Chroma rejects empty id lists (TODO confirm against its docs).
    if ids:
        col.add(ids=ids, documents=texts, metadatas=metas)
    return len(ids), per_doc_reports


def query_doc_ids(
    col,
    question: str,
    *,
    top_k: int,
) -> List[str]:
    """
    Query ``col`` with one question and return the ``doc_id`` of each hit.

    The list follows the order in which the collection returned the chunks
    and may contain the same doc_id several times (one entry per chunk).
    """
    response = col.query(
        query_texts=[question],
        n_results=top_k,
        include=["metadatas"],
    )
    # A single query text was sent, so only the first result row is relevant.
    return [meta["doc_id"] for meta in response["metadatas"][0]]


def query_debug(col, question: str, *, top_k: int = 5) -> None:
    """
    Print the top_k hits for ``question``: rank, distance, doc_id, chunk index
    and a one-line snippet (first 260 characters, newlines shown as spaces).
    """
    hits = col.query(
        query_texts=[question],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
    # Only one query text was sent, hence row 0 of every returned field.
    rows = zip(hits["documents"][0], hits["metadatas"][0], hits["distances"][0])

    print("\nQ:", question)
    for rank, (body, meta, distance) in enumerate(rows, start=1):
        print(f"{rank}. dist={distance:.4f} doc_id={meta['doc_id']} chunk={meta['chunk_idx']}")
        preview = f"{body[:260]}…".replace("\n", " ")
        print("   ", preview)


def recall_at_k(retrieved_doc_ids: List[str], expected_doc_id: str) -> int:
    """Return 1 if the expected doc_id is among the retrieved ids, else 0."""
    return 1 if expected_doc_id in retrieved_doc_ids else 0


def reciprocal_rank(retrieved_doc_ids: List[str], expected_doc_id: str) -> float:
    """
    Return 1/rank of the first occurrence of the expected doc_id (ranks start
    at 1), or 0.0 when it was not retrieved at all.
    """
    return next(
        (
            1.0 / rank
            for rank, candidate in enumerate(retrieved_doc_ids, start=1)
            if candidate == expected_doc_id
        ),
        0.0,
    )


def evaluate(
    col,
    eval_set: List[Tuple[str, str]],
    ks: Tuple[int, ...],
) -> Dict[str, Dict[str, float]]:
    """
    Multi-doc retrieval evaluation over (question, expected_doc_id) pairs.

    Each question is queried once with ``top_k = max(ks)``; the retrieved list
    holds one doc_id per returned chunk, so ranks are chunk ranks.

    Returns a dict with:
    - one entry per question: ``{"recall@k": 0|1 for k in ks, "mrr": float}``
    - ``"__global__"``: the same keys averaged over all questions
      (all 0.0 when ``eval_set`` is empty).

    Raises:
      ValueError: if ``ks`` is empty.
    """
    if not ks:
        raise ValueError("ks must contain at least one cut-off")
    max_k = max(ks)

    results: Dict[str, Dict[str, float]] = {}
    hits_at_k = {k: 0 for k in ks}
    rr_total = 0.0

    for q, expected_doc in eval_set:
        retrieved = query_doc_ids(col, q, top_k=max_k)

        per_question: Dict[str, float] = {}
        for k in ks:
            hit = recall_at_k(retrieved[:k], expected_doc)
            hits_at_k[k] += hit
            per_question[f"recall@{k}"] = hit

        rr = reciprocal_rank(retrieved, expected_doc)
        rr_total += rr
        per_question["mrr"] = rr

        results[q] = per_question

    # With no questions every sum is 0, so dividing by 1 yields 0.0 averages
    # instead of a ZeroDivisionError.
    denom = len(eval_set) or 1

    results["__global__"] = {
        **{f"recall@{k}": hits_at_k[k] / denom for k in ks},
        "mrr": rr_total / denom,
    }

    return results


# =============================================================================
# Main
# =============================================================================

def main() -> None:
    """
    Index the PDF corpus twice (raw vs. praline-cleaned) and compare retrieval.

    Prints, in order: the chunk counts of both collections, a top-5 debug
    listing for every EVAL question on both collections, the metrics of both
    collections, and the praline report of every document.
    """
    corpus = load_pdf_corpus(CORPUS_DIR)

    # In-memory Chroma for fast iteration, with local embeddings
    client = chromadb.Client()
    embed_fn = SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)

    # Both variants share the same chunking so only the cleaning step differs.
    chunking = {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
    n_raw, _ = build_collection(
        client, COL_RAW, embed_fn, corpus, apply_praline=False, **chunking
    )
    n_prl, praline_reports = build_collection(
        client, COL_PRALINE, embed_fn, corpus, apply_praline=True, **chunking
    )

    col_raw = client.get_collection(COL_RAW, embedding_function=embed_fn)
    col_prl = client.get_collection(COL_PRALINE, embedding_function=embed_fn)

    print(f"Indexed RAW:     docs={len(corpus)}, chunks={n_raw}")
    print(f"Indexed PRALINE: docs={len(corpus)}, chunks={n_prl}")

    def banner(title: str) -> None:
        # Three-line section header framed by "=" rules.
        print("\n==============================")
        print(title)
        print("==============================")

    banner("FULL QUESTION DEBUG")

    variants = (("\nRAW:", col_raw), ("\nPRALINE:", col_prl))
    for question, expected_doc in EVAL:
        print("\n-----------------------------------")
        print(f"Question: {question}")
        print(f"Expected doc: {expected_doc}")

        for heading, collection in variants:
            print(heading)
            query_debug(collection, question, top_k=5)

    # Metrics: compute both before printing anything
    m_raw = evaluate(col_raw, EVAL, TOP_KS)
    m_prl = evaluate(col_prl, EVAL, TOP_KS)

    for heading, metrics in (("\n--- RAW ---", m_raw), ("\n--- PRALINE ---", m_prl)):
        print(heading)
        for key, value in metrics.items():
            print(key, ":", value)

    print("\n--- Summary (doc-level) ---")
    print("RAW    :", m_raw)
    print("PRALINE:", m_prl)

    banner("PRALINE REPORTS")

    for doc_id, report in praline_reports.items():
        print(f"\nDocument: {doc_id}")
        print("No report." if report is None else report)


# Run the comparison only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 921.68it/s, Materializing param=pooler.dense.weight]                                
[1mBertModel LOAD REPORT[0m from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Indexed RAW:     docs=5, chunks=1238
Indexed PRALINE: docs=5, chunks=1222

FULL QUESTION DEBUG

-----------------------------------
Question: What is the splashback radius studied with?
Expected doc: docu_astro

RAW:

Q: What is the splashback radius studied with?
1. dist=0.1736 doc_id=docu_astro chunk=41
    y Duffy et al. (2008). 4.5. Splashback radius estimator The main goal of this study consists in constraining the following relation (introduced by More et al. 2015): ⟨Rsp(∆λ∗ ob, ∆zob)⟩= Asp " 1 + Bsp exp   −⟨ν200m(∆λ∗ ob, ∆zob)⟩ 2.44 !# , (33) where Asp and B…
2. dist=0.2292 doc_id=docu_astro chunk=7
    n 2023), which is about 2 times larger than rsp and corresponds to the scale above which the matter density decreases with time. Pizzardo et al. (2024) demonstrated that the inflection point in the radial velocity pro- file of cluster galaxies agrees with rsp …
3. dist=0.2580 doc_id=docu_astro chunk=1
    etti 101, 40129 Bologna, Italy 8 Ruhr University Bochum, Faculty of Physics

Test Praline on HTML
--------------------

In [2]:
import re
import time
import math
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import feedparser
import requests
from bs4 import BeautifulSoup
import trafilatura
from tqdm import tqdm
from textpraline.cleaner.clean import praline, PralineReport

In [3]:
# Le Monde RSS feed used as the source of article URLs.
RSS_URL = "https://www.lemonde.fr/rss/une.xml"

# One shared session so every request carries the same User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Notebook; +https://example.com)"})

def fetch_html(url: str, timeout: int = 20) -> str:
    """
    Download ``url`` through the shared module-level session and return the
    response body as text.

    ``raise_for_status()`` is called on the response, so an HTTP error status
    propagates to the caller as an exception.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def extract_text_raw(html: str) -> str:
    """
    RAW = main-content extraction (trafilatura) when possible,
    otherwise a BeautifulSoup fallback.

    The fallback removes script/style/noscript elements, takes the page text
    with one newline between nodes, collapses runs of 3+ newlines to a blank
    line and runs of 2+ spaces/tabs to one space. The result is stripped in
    both cases.
    """
    extracted = trafilatura.extract(
        html,
        include_comments=False,
        include_links=False,
        include_tables=False,
        favor_recall=False,
    )
    if extracted:
        stripped = extracted.strip()
        if stripped:
            return stripped

    # trafilatura produced nothing usable: fall back to the whole visible page.
    soup = BeautifulSoup(html, "lxml")
    for node in soup.find_all(["script", "style", "noscript"]):
        node.decompose()

    text = soup.get_text("\n")
    for pattern, replacement in ((r"\n{3,}", "\n\n"), (r"[ \t]{2,}", " ")):
        text = re.sub(pattern, replacement, text)
    return text.strip()

def make_doc_id(url: str) -> str:
    """
    Derive a short, stable-ish id from a URL.

    Every run of non-word characters becomes one underscore, underscores at
    both ends are removed, and only the last 60 characters are kept (so the
    id may start mid-word or with an underscore).
    """
    slug = re.sub(r"\W+", "_", url)
    slug = slug.strip("_")
    return slug[-60:]

# ---- 1) Read RSS
feed = feedparser.parse(RSS_URL)
entries = list(feed.entries)
print("RSS:", getattr(feed.feed, "title", ""))
print("entries:", len(entries))

N = 20  # raise this to process more entries
entries = entries[:N]

# One dict per successfully processed article: ids, metadata, raw text,
# praline-cleaned text and the praline report.
docs: List[Dict] = []
for e in tqdm(entries, desc="Fetch + extract + praline"):
    url = e.link
    title = getattr(e, "title", "").strip()
    published = getattr(e, "published", "")

    # Any failure (download, extraction, cleaning) skips this article only.
    # NOTE: raw_text stays bound after the loop and is read again by a later
    # cell (detect_text_profile), so it holds the last text extracted here.
    try:
        html = fetch_html(url)
        raw_text = extract_text_raw(html)
        praline_text, rep = praline(raw_text, report=True, normalize_extracted=True, drop_repeated_lines="off", drop_layout_noise="off")
    except Exception as ex:
        print("skip:", url, ex)
        continue

    docs.append({
        "doc_id": make_doc_id(url),
        "url": url,
        "title": title,
        "published": published,
        "raw": raw_text,
        "praline": praline_text,
        "report": rep,
    })

print("built docs:", len(docs))

# ---- 2) Summary table (light, readable)
def summarize_reports(docs: List[Dict]) -> None:
    """
    Print aggregate praline statistics over ``docs``.

    Every dict in ``docs`` must carry a ``"report"`` object; its numeric
    counters are summed, documents with ``normalized_extracted`` set are
    counted, and the output/input length ratio is reported (0.0 when the
    total input length is 0).
    """
    reports = [d["report"] for d in docs]

    def total(attr: str):
        # Sum one numeric report attribute across all documents.
        return sum(getattr(rep, attr) for rep in reports)

    total_in = total("input_len")
    total_out = total("output_len")
    ratio = (total_out / total_in) if total_in else 0.0

    totals = {
        "docs": len(docs),
        "input_len_sum": total_in,
        "output_len_sum": total_out,
        "removed_toc_lines": total("removed_toc_lines"),
        "normalized_extracted_true": sum(1 for rep in reports if rep.normalized_extracted),
        "removed_layout_noise_lines": total("removed_layout_noise_lines"),
        "removed_repeated_lines": total("removed_repeated_lines"),
        "removed_boilerplate_lines": total("removed_boilerplate_lines"),
        "compression_ratio_out_in": ratio,
    }

    print("\n=== PRALINE REPORT SUMMARY (aggregate) ===")
    for label, value in totals.items():
        print(f"{label:28} {value}")

# Print the aggregate praline statistics for the articles fetched above.
summarize_reports(docs)

# ---- 3) Inspect a few examples side-by-side
def preview(i: int, n: int = 900):
    """
    Print article ``docs[i]``: title, URL, publication date, praline report,
    then the first ``n`` characters of the raw and of the praline text.
    """
    entry = docs[i]
    sections = (
        ("\n--- RAW (head) ---", "raw"),
        ("\n--- PRALINE (head) ---", "praline"),
    )

    print("\n" + "=" * 100)
    print(entry["title"])
    print(entry["url"])
    print("published:", entry["published"])
    print("\nreport:", entry["report"])
    for heading, field in sections:
        print(heading)
        print(entry[field][:n])


# Show at most the first three articles.
for i in range(min(3, len(docs))):
    preview(i)

RSS: Le Monde.fr - Actualités et Infos en France et dans le monde
entries: 19


Fetch + extract + praline: 100%|██████████| 19/19 [00:03<00:00,  6.18it/s]

built docs: 19

=== PRALINE REPORT SUMMARY (aggregate) ===
docs                         19
input_len_sum                93333
output_len_sum               93341
removed_toc_lines            0
normalized_extracted_true    19
removed_layout_noise_lines   0
removed_repeated_lines       0
removed_boilerplate_lines    0
compression_ratio_out_in     1.0000857145918378

EN DIRECT, droits de douane : la Cour suprême américaine déclare illégale une grande partie des taxes de Donald Trump
https://www.lemonde.fr/international/live/2026/02/20/en-direct-droits-de-douane-la-cour-supreme-americaine-declare-illegale-une-grande-partie-des-taxes-de-donald-trump_6667585_3210.html
published: Fri, 20 Feb 2026 16:11:49 +0100

report: PralineReport(input_len=7293, output_len=7293, removed_toc_lines=0, normalized_extracted=True, removed_layout_noise_lines=0, removed_repeated_lines=0, removed_boilerplate_lines=0, text_profile='unknown', detail_enabled=False)

--- RAW (head) ---
Retrouvez tous nos articles sur 




In [4]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# --------- Embedding
# Sentence-transformers model used for both chunks and queries in this cell.
EMBED_MODEL = "all-MiniLM-L6-v2"
embed_fn = SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)

# Module-level Chroma client shared by rebuild_collection() below.
client = chromadb.Client()

def rebuild_collection(name: str):
    """
    Return an empty collection called ``name`` on the module-level client,
    bound to the module-level embedding function.

    Any existing collection of that name is deleted first; a failure of that
    delete is ignored.
    """
    try:
        client.delete_collection(name)
    except Exception:
        # Best effort: whatever the delete raised, carry on and (re)create.
        pass

    fresh = client.get_or_create_collection(name=name, embedding_function=embed_fn)
    return fresh

def chunk_text(text: str, chunk_size: int = 900, overlap: int = 120) -> List[str]:
    """
    Normalise blank lines, then cut ``text`` into overlapping character windows.

    Runs of 3+ newlines are collapsed to one blank line and the text is
    stripped; empty text yields ``[]``. Windows are ``chunk_size`` characters
    long and start every ``chunk_size - overlap`` characters, with a minimum
    advance of 1 so the loop always terminates.
    """
    cleaned = re.sub(r"\n{3,}", "\n\n", text).strip()
    if not cleaned:
        return []

    stride = max(chunk_size - overlap, 1)
    return [
        cleaned[start : start + chunk_size]
        for start in range(0, len(cleaned), stride)
    ]

def index_docs(col, docs: List[Dict], field: str) -> int:
    """
    Chunk ``d[field]`` for every document and add all chunks to ``col``.

    Chunk ids have the form ``"<doc_id>__<field>__<chunk_idx>"``; metadata
    carries doc_id, url, title, chunk_idx and field. Nothing is added when no
    chunk was produced. Returns the number of chunks indexed.
    """
    ids, texts, metas = [], [], []

    for d in docs:
        for k, ch in enumerate(chunk_text(d[field])):
            ids.append(f'{d["doc_id"]}__{field}__{k}')
            texts.append(ch)
            metas.append(
                {
                    "doc_id": d["doc_id"],
                    "url": d["url"],
                    "title": d["title"],
                    "chunk_idx": k,
                    "field": field,
                }
            )

    if ids:
        col.add(ids=ids, documents=texts, metadatas=metas)
    # One id per chunk, so this equals the total chunk count.
    return len(ids)

# Two fresh collections: one for the raw extraction, one for the praline output.
raw_col = rebuild_collection("lemonde_raw")
pra_col = rebuild_collection("lemonde_praline")

raw_chunks = index_docs(raw_col, docs, "raw")
pra_chunks = index_docs(pra_col, docs, "praline")

print("Indexed RAW chunks    :", raw_chunks)
print("Indexed PRALINE chunks:", pra_chunks)

# --------- Retrieval eval (doc-level): queries = titles (baseline)
# Articles with an empty title are left out of the evaluation set.
eval_set = [{"q": d["title"], "expected_doc": d["doc_id"]} for d in docs if d["title"]]

def top_docs(col, query: str, top_k: int = 10) -> List[str]:
    """
    Return the distinct doc_ids of the ``top_k`` retrieved chunks, best first.

    Hits without a (truthy) ``doc_id`` are ignored. Because duplicates are
    removed after retrieving ``top_k`` chunks, the list can be shorter than
    ``top_k``.
    """
    res = col.query(query_texts=[query], n_results=top_k, include=["metadatas"])
    candidates = (md.get("doc_id") for md in res["metadatas"][0])
    # dict.fromkeys keeps the first occurrence of each id in rank order.
    return list(dict.fromkeys(doc_id for doc_id in candidates if doc_id))

def compute_metrics(col, eval_set, ks=(1,5,10)) -> Dict[str, float]:
    """
    Compute doc-level Recall@k and MRR of ``col`` over ``eval_set``.

    Args:
      col: collection to query.
      eval_set: iterable of ``{"q": query, "expected_doc": doc_id}`` dicts.
      ks: cut-offs for Recall@k; the largest is used as retrieval depth.

    Returns:
      ``{"recall@k": ... for k in ks, "mrr": ..., "n_queries": n}``. With an
      empty ``eval_set`` every score is 0.0 and ``n_queries`` is 0.
    """
    depth = max(ks)  # same for every query, so computed once
    recalls = {k: 0 for k in ks}
    rr_sum = 0.0
    n = 0
    for ex in eval_set:
        q, gold = ex["q"], ex["expected_doc"]
        ranked = top_docs(col, q, top_k=depth)
        n += 1
        for k in ks:
            if gold in ranked[:k]:
                recalls[k] += 1
        if gold in ranked:
            rr_sum += 1.0 / (ranked.index(gold) + 1)

    # No queries means all sums are 0; divide by 1 to report 0.0 rather than
    # raising ZeroDivisionError (e.g. when every fetch failed upstream).
    denom = n or 1
    out = {f"recall@{k}": recalls[k] / denom for k in ks}
    out["mrr"] = rr_sum / denom
    out["n_queries"] = n
    return out

# Same evaluation set and default cut-offs for both collections.
raw_metrics = compute_metrics(raw_col, eval_set)
pra_metrics = compute_metrics(pra_col, eval_set)

print("\nRAW metrics   :", raw_metrics)
print("PRALINE metrics:", pra_metrics)

# --------- Debug few queries side-by-side
def debug_query(q: str, expected_doc: str, top_k: int = 5):
    """
    Print the ``top_k`` chunks retrieved for ``q`` from the RAW and then the
    PRALINE collection: rank, distance, doc_id, chunk index and the first 160
    characters of the chunk on one line.
    """
    print("\n" + "-" * 90)
    print("Q:", q)
    print("Expected doc:", expected_doc)

    collections = (("RAW", raw_col), ("PRALINE", pra_col))
    for label, col in collections:
        hits = col.query(
            query_texts=[q],
            n_results=top_k,
            include=["metadatas", "distances", "documents"],
        )
        print(f"\n{label}:")
        rows = zip(hits["metadatas"][0], hits["distances"][0], hits["documents"][0])
        for rank, (md, dist, doc) in enumerate(rows, start=1):
            print(f"{rank}. dist={dist:.4f} doc_id={md['doc_id']} chunk={md['chunk_idx']}")
            print("   ", doc[:160].replace("\n", " ") + " ...")


# Inspect the first three evaluation queries.
for ex in eval_set[:3]:
    debug_query(ex["q"], ex["expected_doc"])

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1520.11it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Indexed RAW chunks    : 129
Indexed PRALINE chunks: 129

RAW metrics   : {'recall@1': 0.6842105263157895, 'recall@5': 0.7894736842105263, 'recall@10': 0.8421052631578947, 'mrr': 0.7456140350877193, 'n_queries': 19}
PRALINE metrics: {'recall@1': 0.6842105263157895, 'recall@5': 0.7894736842105263, 'recall@10': 0.8421052631578947, 'mrr': 0.7324561403508771, 'n_queries': 19}

------------------------------------------------------------------------------------------
Q: EN DIRECT, droits de douane : la Cour suprême américaine déclare illégale une grande partie des taxes de Donald Trump
Expected doc: ne_grande_partie_des_taxes_de_donald_trump_6667585_3210_html

RAW:
1. dist=0.2249 doc_id=ne_grande_partie_des_taxes_de_donald_trump_6667585_3210_html chunk=0
    Retrouvez tous nos articles sur les droits de douane et sur Donald Trump. EN DIRECT, droits de douane : la Cour suprême américaine déclare illégale une grande p ...
2. dist=0.3642 doc_id=ne_grande_partie_des_taxes_de_donald_trump_6667585

In [5]:
# Pick the most "compressed" document: largest input_len - output_len.
d = max(docs, key=lambda x: x["report"].input_len - x["report"].output_len)

# Non-empty lines of both versions, stripped for comparison.
raw_lines = [ln.strip() for ln in d["raw"].splitlines() if ln.strip()]
pra_lines = [ln.strip() for ln in d["praline"].splitlines() if ln.strip()]

# Build the lookup set once; creating it inside the comprehension rebuilt it
# for every raw line. A raw line counts as "removed" when no praline line is
# exactly equal to it, so lines that praline merely rewrote are listed too.
pra_line_set = set(pra_lines)
removed = [ln for ln in raw_lines if ln not in pra_line_set]

print("URL:", d["url"])
print("delta chars:", d["report"].input_len - d["report"].output_len)
print("raw lines:", len(raw_lines), "praline lines:", len(pra_lines), "removed lines:", len(removed))
print("\n--- sample removed lines ---")
for ln in removed[:30]:
    print("-", ln[:180])

URL: https://www.lemonde.fr/international/live/2026/02/20/en-direct-droits-de-douane-la-cour-supreme-americaine-declare-illegale-une-grande-partie-des-taxes-de-donald-trump_6667585_3210.html
delta chars: 0
raw lines: 110 praline lines: 110 removed lines: 68

--- sample removed lines ---
- Selon la décision rendue vendredi à une majorité de six juges contre trois, le président américain ne pouvait pas justifier ces droits de douane par la nécessité d’urgence économiq
- Cette décision concerne les droits de douane présentés comme « réciproques » par Donald Trump mais pas ceux appliqués à des secteurs d’activités particuliers, comme l’automobile ou
- N’hésitez pas à nous poser vos questions, nous essaierons d’y répondre, dès que nous aurons les éléments nécessaires pour le faire.
- Ce message s’affichera sur l’autre appareil.
- Ajouter un compte Découvrir l’offre Famille Découvrir les offres multicomptes-
- Parce qu’une autre personne (ou vous) est en train de lire Le Monde avec ce compte

In [6]:
from collections import Counter
import re

# Document with the largest character reduction (input_len - output_len).
d = max(docs, key=lambda x: x["report"].input_len - x["report"].output_len)

def norm_line(ln: str) -> str:
    """Strip the line and collapse every run of spaces/tabs into one space."""
    pieces = re.split(r"[ \t]+", ln.strip())
    return " ".join(pieces)

# Normalised, non-empty raw lines and how often each one occurs.
raw_norm = [ln for ln in map(norm_line, d["raw"].splitlines()) if ln]
c = Counter(raw_norm)

# "Suspicious" lines: those that occur at least 5 times
suspects = [ln for ln, count in c.items() if count >= 5]
suspects[:30], len(suspects)

(['Parce qu’une autre personne (ou vous) est en train de lire Le Monde avec ce compte sur un autre appareil.',
  'Vous ne pouvez lire Le Monde que sur un seul appareil à la fois (ordinateur, téléphone ou tablette).',
  '-',
  'Comment ne plus voir ce message ?',
  'Nous vous conseillons de modifier votre mot de passe.',
  'Que se passera-t-il si vous continuez à lire ici ?',
  'Ce message s’affichera sur l’autre appareil. Ce dernier restera connecté avec ce compte.',
  'Y a-t-il d’autres limites ?',
  'Non. Vous pouvez vous connecter avec votre compte sur autant d’appareils que vous le souhaitez, mais en les utilisant à des moments différents.'],
 9)

In [7]:
from textpraline.cleaner import clean

# Profile of the last article text extracted by the RSS loop.
print(clean.detect_text_profile(raw_text))

# Profile of the first PDF of the corpus (load_pdf_corpus returns a non-empty,
# path-sorted list or raises).
path = Path(CORPUS_DIR)
doc_id, text = load_pdf_corpus(path)[0]
doc_text = text
print(clean.detect_text_profile(doc_text))

unknown
ocr_like


In [8]:
# Clean the first corpus PDF with detailed reporting; as the last expression of
# the cell, the returned tuple is displayed by the notebook.
# NOTE(review): presumably (cleaned_text, detailed report) — confirm against textpraline.
praline(doc_text, report="detail")

('2023 ESG Report\n\nTABLE OF CONTENTS\n 3\nFrom Our CEO\n 4\nIntroduction\n4\nOur Business\n5\nAbout This Report\n6\nOur Approach to ESG\n7\n2023 ESG Highlights\n 8\nSustainable Finance\n8\nSustainable Finance Target\n11 Institutional Securities\n15 Wealth Management\n20 Investment Management\n24 Community Development Finance\n26 Inclusive Ventures Lab\n27\nInstitute for Sustainable Investing\n 28\nHuman Capital\n28 Introduction\n29 Building Our Workforce\n30 Investing in Our Workforce\n33 Developing Our Workforce\n\n34 Workforce Diversity Data\n38 Supporting Our Communities\n\n40 Institute for Inclusion\n 43\nClimate\n43 Climate Strategy\n45 Our Climate Ambitions\n\n45 Mobilize Capital Toward\nLow-Carbon and Green Solutions\n\n45 Strive Toward Our Financed\nEmissions Targets\n\n47\nMaintain Carbon Neutral Operations\n48 Climate-Related Engagements\n50 Climate Risk Management\n54 Climate Metrics and Targets\n\n54 Financed Emissions\n\n58\nOperational Emissions\n59 Looking Ahead\n 60\n