In [2]:
import os
import glob
import joblib
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

ModuleNotFoundError: No module named 'pandas'

In [None]:
import os
import glob
import joblib
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Root folder of the labelled corpus; the collection loop below joins it with
# each label name, so it is expected to hold one sub-folder per label.
DATA_DIR = "data"
# Destination for training checkpoints and the final saved model/tokenizer.
MODEL_DIR = "models"
# Create the output folder up front; exist_ok avoids failing on re-runs.
os.makedirs(MODEL_DIR, exist_ok=True)

# Image file suffixes collected for OCR (".pdf" is appended separately by the
# collection loop).
SUPPORTED_IMG_EXT = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}

def ocr_image(img: Image.Image, lang="eng+fra") -> str:
    """Return the text Tesseract extracts from *img*.

    The image is converted to mode "L" before being handed to
    ``pytesseract.image_to_string``. A falsy OCR result is normalised to
    the empty string, so the caller always gets a ``str``.

    Args:
        img: Image to recognise.
        lang: Language string forwarded to Tesseract.
    """
    grayscale = img.convert("L")
    extracted = pytesseract.image_to_string(grayscale, lang=lang)
    if not extracted:
        return ""
    return extracted

def ocr_file(path: str, max_pages: int = 3, dpi: int = 200) -> str:
    """OCR a document on disk and return its text.

    Paths ending in ".pdf" (case-insensitive) are rasterised with
    ``convert_from_path`` and each rendered page is OCR'd; the page texts
    are joined with newlines. Any other path is opened as an image.

    Args:
        path: File to read.
        max_pages: Last PDF page (1-based) to rasterise; pages after it are
            ignored. Defaults to 3, the previously hard-coded value.
        dpi: Rasterisation resolution for PDFs. Defaults to 200.

    Returns:
        The recognised text (possibly empty).

    Raises:
        Whatever ``convert_from_path``, ``Image.open`` or the OCR call
        raise for unreadable input; the caller is expected to handle it.
    """
    if path.lower().endswith(".pdf"):
        pages = convert_from_path(
            path, dpi=dpi, first_page=1, last_page=max_pages
        )
        return "\n".join(ocr_image(page) for page in pages)

    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle once OCR is done so a large
    # corpus does not exhaust file descriptors.
    with Image.open(path) as img:
        return ocr_image(img)

# Build the training table: one row per successfully OCR'd document.
rows = []
# Folder name under DATA_DIR -> integer class id used as the training label.
labels_map = {"identity": 0, "invoice": 1, "mail": 2, "other": 3}

# Lower-cased suffixes to collect. Matching is done on the lower-cased suffix
# so "SCAN.PDF" or "photo.JPG" are picked up on case-sensitive filesystems,
# consistent with the lower-casing done in ocr_file.
ocr_extensions = {ext.lower() for ext in SUPPORTED_IMG_EXT} | {".pdf"}

for label, label_id in labels_map.items():
    folder = os.path.join(DATA_DIR, label)
    if not os.path.isdir(folder):
        continue
    # sorted(): glob order is filesystem-dependent and iterating a set of
    # strings varies between interpreter runs; a stable row order is needed
    # for random_state below to give a reproducible split.
    for f in sorted(glob.glob(os.path.join(folder, "**", "*"), recursive=True)):
        if os.path.splitext(f)[1].lower() not in ocr_extensions:
            continue
        if not os.path.isfile(f):
            continue
        try:
            text = ocr_file(f)
        except Exception as e:
            # Best effort: one unreadable document must not abort the run.
            print("[WARN] OCR failed:", f, e)
        else:
            rows.append({"text": text, "label": label_id})

if not rows:
    # Without this guard the failure surfaces as an opaque KeyError on
    # df["text"] because an empty DataFrame has no columns.
    raise RuntimeError(
        f"No documents could be OCR'd under {DATA_DIR!r}; "
        f"expected sub-folders: {', '.join(labels_map)}"
    )

df = pd.DataFrame(rows)

# 80/20 split, stratified so each class keeps its proportion in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# PyTorch Dataset wrapping the tokenized texts for the Hugging Face Trainer
class TextDataset(Dataset):
    """Map-style dataset of pre-tokenized texts and their labels.

    All texts are tokenized once in the constructor; ``__getitem__`` only
    slices the stored encodings and wraps them in tensors.

    Args:
        texts: Object exposing ``tolist()`` that yields the raw strings.
        labels: Object exposing ``tolist()`` that yields the labels.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, truncation=True, padding=True,
            max_length=max_len)``; its result must expose ``items()``
            returning per-field sequences indexable by sample position.
        max_len: Value forwarded as ``max_length`` to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        raw_texts = texts.tolist()
        self.encodings = tokenizer(
            raw_texts,
            max_length=max_len,
            padding=True,
            truncation=True,
        )
        self.labels = labels.tolist()

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample *idx* as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pretrained CamemBERT tokenizer shared by both datasets.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

train_dataset = TextDataset(X_train, y_train, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)

# Size the classification head from labels_map rather than a literal, so
# adding or removing a class cannot leave the model out of sync with the
# labels. The name<->id mappings are stored in the model config so the saved
# model reports class names instead of generic LABEL_<n> placeholders.
id2label = {class_id: name for name, class_id in labels_map.items()}
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(labels_map),
    id2label=id2label,
    label2id=dict(labels_map),
)

# Fine-tuning hyper-parameters, collected in one mapping and expanded into
# TrainingArguments. Evaluation and checkpointing are both set to "epoch";
# checkpoints go to MODEL_DIR.
training_kwargs = dict(
    output_dir=MODEL_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)
training_args = TrainingArguments(**training_kwargs)

# Wire the model, hyper-parameters and both datasets into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

# Persist the final model first, then the tokenizer, side by side in MODEL_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_DIR)
print("Saved CamemBERT model to", MODEL_DIR)