In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
!pip install spacy

In [None]:
!pip install pytesseract

In [None]:
import pytesseract
from PIL import Image

# Run Tesseract OCR on the scanned invoice and keep the raw text in `text`.
# NOTE(review): absolute Colab path — adjust if the notebook runs elsewhere.
image_path = "/content/0088-understand-invoices.jpg"

# Open the image in a context manager so the file handle is released once OCR
# is done instead of staying open for the lifetime of the kernel.
# NOTE(review): lang='eng' is kept from the original; the regexes in this
# notebook look for French keywords ("Facture"), so confirm whether 'fra'
# (requires the French Tesseract language pack) is what is actually wanted.
with Image.open(image_path) as image:
    text = pytesseract.image_to_string(image, lang='eng')
print(text)


In [None]:
import re

# Example clean-up step: collapse every run of consecutive newlines into one.
newline_runs = re.compile(r'\n+')
text_clean = newline_runs.sub('\n', text)
print(text_clean)

In [None]:
# Pull the invoice number, date and total out of the OCR text with regexes.
# Each field falls back to None when its pattern is not found, so the names
# are always bound and later cells never hit a NameError or a stale value
# left over from a previous run.

# Invoice number: "Facture", optional "#", then digits
match = re.search(r"Facture\s*#?\s*(\d+)", text)
numero_facture = match.group(1) if match else None

# Date: first d/m/y-looking token in the text
match_date = re.search(r"(\d{1,2}/\d{1,2}/\d{2,4})", text)
date_facture = match_date.group(1) if match_date else None

# Amount: number following "Total" (optional ':' or '-') and followed by '€'
match_montant = re.search(r"Total\s*[:\-]?\s*(\d+[\.,]?\d*)\s*€", text)
montant = match_montant.group(1) if match_montant else None


In [None]:
import spacy

# Load the English spaCy pipeline. The original name "en_core_news_md" does
# not exist (English pipelines are named en_core_web_*), so spacy.load()
# could only fail here.
SPACY_MODEL = "en_core_web_md"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # Model package not installed yet: fetch it, then retry.
    # NOTE(review): on some environments a kernel restart is needed before a
    # freshly downloaded pipeline becomes importable — confirm on Colab.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

doc = nlp(text)

# Print every named entity found in the OCR text with its label
for ent in doc.ents:
    print(ent.text, ent.label_)


In [None]:
!python -m spacy download en_core_web_md


In [None]:

# Load spaCy's English pipeline
nlp = spacy.load("en_core_web_md")  # or "en_core_web_sm" for a lighter model

# Sample invoice text (e.g. as extracted with pytesseract).
# Note: this rebinds `text`, replacing whatever an earlier cell stored in it.
# NOTE(review): the sample is French OCR output while the pipeline is English,
# so entity labels may be unreliable — confirm whether a French pipeline is intended.
text = """

Holi Industrial Estate
Hoty
Cork
République dtlande
© trormaions cont
(oTOD Media
1000 ve fetve
Vile, Pays
Identiiant du compte 000000
‘Apple Distribution InternationalLtd.
@ feeture
‘Numéro du document :0000000000
Date du document: 31 anv. 2023 1:11:15 (UTC)
[Ne pas effectuerle paiement



Détalis des frais ® @ @ © © © ©
© Pande detransacion —Modileda,.TOdeinea,. Nomdslncompaone Tigon Unda Monta des dipenses.. Moan acta (ER)
1 atin: 2023-24020. CHL 0000000 Base Anpromeon a * me 00
2 ahjna.2028-21amc20.. COTW 0000000 yin Sachets, ategny ow 2370 00
2. atin 2022-220. CRTST 0000000 Ander Saar ab fe 5 10 00
44 btm 2022-3iom20. CPTE 000000 rider Pod age € 2 ww 00
sous um 200
watacnrie 20.00% 200
rest am 200
3) Mode depsioment
Umontat de 0 EUR sera delacrte Mastercard se temioant ar 0727
@ Uncastrmetomel ae 3774UR« 8 epee atpeses ete capone.

"""

# Run the spaCy pipeline on the text
doc = nlp(text)

# Print each named entity together with its label
print("Extracted Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} --> {ent.label_}")



Now, we're going to use the EasyOCR package to "read" the titles. Modern OCR solutions may still produce a handful of errors, which is why you're encouraged to apply a language model or an alternative OCR model to fix possible misspellings.

In [None]:
# Three labelled sample documents (contract, invoice, CV), built from
# (text, label) pairs into the same list-of-dicts shape.
documents = [
    {"text": body, "label": kind}
    for body, kind in (
        (
            "This contract is between ACME Corp and John Doe. Start date: 01/01/2023. End date: 31/12/2023. Role: Data Scientist.",
            "contract",
        ),
        (
            "Invoice Number: INV-2024-001. Client: John Doe. Amount: $5,000. Date: 15/06/2024.",
            "invoice",
        ),
        (
            "John Doe is a Machine Learning Engineer with 5 years of experience in Python and TensorFlow.",
            "cv",
        ),
    )
]


In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import joblib

# Toy dataset (to be extended for real use): short French insurance snippets,
# each tagged with its document type.
data = pd.DataFrame({
    "text": [
        "Contrat n°5432. M. Jean Dupont. Assurance auto. Début: 01/01/2024. Fin: 31/12/2024.",
        "Déclaration de sinistre : accident le 03/03/2024 à Bordeaux avec ma Renault Clio.",
        "Relevé d’information : 2 sinistres survenus en 2022 et 2023. Bonus actuel : 0.85.",
        "Police n°2345. Assurance habitation. Début: 01/02/2024. Fin: 31/01/2025.",
        "Accident survenu le 15/05/2024 à Marseille. Véhicule : Citroën C3.",
        "Historique des sinistres de 2021 à 2023. Bonus-malus : 0.90."
    ],
    "label": ["contrat", "sinistre", "releve", "contrat", "sinistre", "releve"]
})

# One seed for both the split and the forest, so the score below is the same
# on every Restart & Run All (the forest was previously unseeded).
RANDOM_STATE = 42

# Train/test split
# NOTE(review): with only 6 rows the held-out set is tiny and may not contain
# every class, so the macro F1 is illustrative only.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["label"], test_size=0.3, random_state=RANDOM_STATE
)

# Pipeline: TF-IDF features fed into a random forest
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("rfc", RandomForestClassifier(random_state=RANDOM_STATE))
])

# Training
pipe.fit(X_train, y_train)

# Predict on the held-out set and report the macro-averaged F1 score
y_pred = pipe.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
f1





In [None]:
!pip install scikit-learn pandas spacy
!python -m spacy download en_core_web_md


In [None]:

import re

# Load the English spaCy pipeline; extract_info (defined below) reads this global.
nlp = spacy.load("en_core_web_md")

def extract_info(text, doc_type):
    """Extract a few structured fields from an insurance document.

    Args:
        text: Raw document text.
        doc_type: "sinistre" (claim) or "contrat" (contract). Any other
            value yields an empty dict.

    Returns:
        A dict holding only the fields that were found:
        - "sinistre": "date_incident", "lieu", "véhicule"
        - "contrat": "date_debut", "date_fin"
    """
    infos = {}

    if doc_type == "sinistre":
        # Only the claim branch needs named entities, so the (comparatively
        # expensive) pipeline is not run for other document types.
        doc = nlp(text)
        for ent in doc.ents:
            # When several entities share a label, the last one wins.
            if ent.label_ == "DATE":
                infos["date_incident"] = ent.text
            # English spaCy pipelines tag cities/countries as GPE and keep
            # LOC for other locations; accept both so towns are captured.
            elif ent.label_ in ("GPE", "LOC"):
                infos["lieu"] = ent.text
        # The keyword is matched case-insensitively, but the captured name
        # must start with a real capital letter. A global IGNORECASE flag
        # would let [A-Z] match lowercase letters too.
        match = re.search(r"(?i:véhicule).*?([A-Z].*)", text)
        if match:
            # Drop the sentence-final punctuation from the captured name.
            infos["véhicule"] = match.group(1).rstrip(" .;,")

    elif doc_type == "contrat":
        match_debut = re.search(r"Début\s*:\s*(\d{2}/\d{2}/\d{4})", text)
        match_fin = re.search(r"Fin\s*:\s*(\d{2}/\d{2}/\d{4})", text)
        if match_debut:
            infos["date_debut"] = match_debut.group(1)
        if match_fin:
            infos["date_fin"] = match_fin.group(1)

    return infos


In [None]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [None]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.1
    Uninstalling transformers-4.53.1:
      Successfully uninstalled transformers-4.53.1
Successfully installed transformers-4.53.2


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import torch

# 1. Documents to index
documents = [
    "Contrat n°5432. M. Jean Dupont. Assurance auto. Début: 01/01/2024. Fin: 31/12/2024.",
    "Déclaration de sinistre : accident le 03/03/2024 à Bordeaux avec ma Renault Clio.",
    "Relevé d’information : 2 sinistres survenus en 2022 et 2023. Bonus actuel : 0.85."
]

# 2. Embeddings with sentence-transformers
embedder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = embedder.encode(documents, convert_to_tensor=True)

# 3. Build the FAISS index (exact L2 search over the document vectors)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings.cpu().detach().numpy())

# 4. Load the tokenizer and the Mistral 7B model
# NOTE(review): this checkpoint may require Hugging Face authentication and a
# GPU with enough memory for fp16 weights — confirm for the target runtime.
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

# Text-generation pipeline. The model is already placed by device_map="auto",
# so no `device` argument is passed: asking the pipeline to move an
# accelerate-dispatched model to a specific device conflicts with that placement.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# 5. Fonction recherche + génération
def rag_answer(question, top_k=2):
    """Answer a question by retrieving context documents and generating text.

    Reads the module-level `embedder`, `index`, `documents` and `generator`.

    Args:
        question: The user question.
        top_k: Maximum number of indexed documents to use as context.
            Values larger than the index size are clamped.

    Returns:
        The generated text as returned by the generator for the prompt
        "Contexte : ... / Question : ... / Réponse :".
    """
    # Encode the question as a numpy array for the index search
    q_embedding = embedder.encode([question], convert_to_tensor=True).cpu().detach().numpy()

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with id -1, which would silently select documents[-1].
    k = min(top_k, index.ntotal)
    if k > 0:
        _, neighbour_ids = index.search(q_embedding, k)
        contexts = [documents[i] for i in neighbour_ids[0] if i >= 0]
    else:
        contexts = []

    # Concatenate the retrieved context and the question into the prompt
    prompt = "Contexte : " + " ".join(contexts) + "\nQuestion : " + question + "\nRéponse :"

    # max_new_tokens bounds only the generated continuation. max_length would
    # also count the prompt tokens, leaving little or no room for the answer
    # when the context is long.
    outputs = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
    return outputs[0]['generated_text']

# 6. Smoke test: ask for the start date of Jean Dupont's contract, which
#    appears in the first indexed document.
question = "Quelle est la date de début du contrat de Jean Dupont ?"
print(rag_answer(question))
