In [1]:
# ─── Cell 1: Imports & Pretrained Model Setup ─────────────────────────────────
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import torch
import pandas as pd
from datasets import Dataset as HFDataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import json
import cv2
import os

# Load the pretrained TrOCR (printed) model and its processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")

# Most of these are required for generation to work
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id           = processor.tokenizer.pad_token_id
model.config.vocab_size            = model.config.decoder.vocab_size


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# ─── Cell 2: Build DataFrame & Split ──────────────────────────────────────────
images_folder    = r"C:\Users\user\Downloads\TrOCR_Data\extracted"
annotations_file = r"C:\Users\user\Downloads\TrOCR_Data\merged_json.json"

with open(annotations_file, "r", encoding="utf-8") as f:
    annotations_list = json.load(f)

annotations = {
    os.path.basename(item["image_path"]): item["text"]
    for item in annotations_list
}

data = []
for image_file in os.listdir(images_folder):
    if image_file.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(images_folder, image_file)
        label      = annotations.get(image_file, "")
        data.append({"image_path": image_path, "text": label})

df = pd.DataFrame(data)
print(df.head())

# No test set, just train/validation
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

# Create Hugging Face datasets
hf_dsets = DatasetDict({
    "train":      HFDataset.from_pandas(train_df).rename_columns({"image_path": "image", "text": "text"}),
    "validation": HFDataset.from_pandas(val_df).rename_columns({"image_path": "image", "text": "text"}),
})



                                          image_path      text
0  C:\Users\user\Downloads\TrOCR_Data\extracted\+...    +Bilan
1  C:\Users\user\Downloads\TrOCR_Data\extracted\0...      0002
2  C:\Users\user\Downloads\TrOCR_Data\extracted\0...  000pA157
3  C:\Users\user\Downloads\TrOCR_Data\extracted\0...  00632506
4  C:\Users\user\Downloads\TrOCR_Data\extracted\0...  00737966


In [3]:
# ─── Cell 3: Preprocessing Function & Map ────────────────────────────────────
image_transform = transforms.Compose([
    transforms.Resize((384, 384)),   # TrOCR backbones expect 384×384
    transforms.ToTensor()
])

def preprocess_fn(example):
    text = example.get("text", "").strip()
    if not text:
        return None

    # Load & denoise
    img = Image.open(example["image"]).convert("RGB")
    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    img  = Image.fromarray(cv2.cvtColor(blur, cv2.COLOR_GRAY2RGB))

    # Resize with padding
    w, h   = img.size
    scale  = 384 / max(w, h)
    new_w, new_h = int(w*scale), int(h*scale)
    img = img.resize((new_w, new_h), Image.BILINEAR)
    canvas = Image.new("RGB", (384, 384), (255,255,255))
    canvas.paste(img, ((384-new_w)//2, (384-new_h)//2))
    img = canvas

    # Tokenize & prepare pixel‐values
    pixel_values = processor(images=img, return_tensors="pt").pixel_values[0]
    labels = processor.tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128,
    ).input_ids[0]
    labels[labels == processor.tokenizer.pad_token_id] = -100

    return {"pixel_values": pixel_values, "labels": labels}

In [4]:
# Custom Trainer Class
class CustomTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"]
        pixel_values = inputs["pixel_values"]
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Ensure the datasets are processed correctly and the columns are named as expected
train_dataset = hf_dsets["train"].map(preprocess_fn, remove_columns=hf_dsets["train"].column_names, batched=False)
val_dataset = hf_dsets["validation"].map(preprocess_fn, remove_columns=hf_dsets["validation"].column_names, batched=False)

# Remove None values from the dataset
train_dataset = train_dataset.filter(lambda ex: ex is not None)
val_dataset = val_dataset.filter(lambda ex: ex is not None)

import evaluate
import numpy as np
import torch

# Load the WER and CER metrics
wer = evaluate.load("wer")
cer = evaluate.load("cer")

def compute_metrics_fn(eval_pred):
    logits, labels = eval_pred

    # If logits are raw, take argmax to get predicted token IDs
    if isinstance(logits, tuple):  # safety check
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # Replace -100 in labels before decoding
    labels = np.where(labels == -100, processor.tokenizer.pad_token_id, labels)

    # Decode
    pred_str = processor.tokenizer.batch_decode(predictions, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute WER and CER
    wer_score = wer.compute(predictions=pred_str, references=label_str)
    cer_score = cer.compute(predictions=pred_str, references=label_str)

    return {
        "wer": wer_score,
        "cer": cer_score
    }

# 3) Set up training arguments with tuned hyperparams
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-large-finetuned",
    per_device_train_batch_size=4,           
    gradient_accumulation_steps=4,           
    learning_rate=5e-5,                      
    weight_decay=0.01,                       
    num_train_epochs=20,                     
    warmup_ratio=0.1,                        
    fp16=True,                               
    evaluation_strategy="steps",             
    eval_steps=100,                          
    save_strategy="steps",                   
    save_steps=250,                          
    save_total_limit=3,
    predict_with_generate=True,              
    generation_max_length=128,               
    label_smoothing_factor=0.0,              
)

# 4) Instantiate the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics_fn,
    data_collator=lambda batch: {
        "pixel_values": torch.stack([torch.tensor(x["pixel_values"]) for x in batch]),
        "labels": torch.stack([torch.tensor(x["labels"]) for x in batch]),
    },
)

# 5) Launch training
trainer.train()

# 6) Save final model & processor
model.save_pretrained("./trocr-large-finetuned")
processor.save_pretrained("./trocr-large-finetuned")


Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Filter:   0%|          | 0/744 [00:00<?, ? examples/s]

Filter:   0%|          | 0/132 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler()
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss,Validation Loss,Wer,Cer
100,No log,2.546263,1.0,0.988024
200,No log,2.460259,1.0,0.988772
300,No log,2.624455,1.0,0.986527
400,No log,2.43636,1.0,0.98503
500,1.129700,2.464883,1.0,0.987275


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


KeyboardInterrupt: 

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch


tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


training_data = []
with open("training_data.txt", "r", encoding="utf-8") as f:
    for line in f:
        input_text, output_text = line.strip().split("|||")
        training_data.append({"input": input_text, "output": output_text})


class LabelCorrectionDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return {"input": self.data[idx]["input"], "output": self.data[idx]["output"]}

dataset = LabelCorrectionDataset(training_data)

# Tokenize data
def tokenize_batch(batch):
    inputs = tokenizer([b["input"] for b in batch], padding=True, truncation=True, return_tensors="pt")
    outputs = tokenizer([b["output"] for b in batch], padding=True, truncation=True, return_tensors="pt")
    return {"input_ids": inputs.input_ids, "labels": outputs.input_ids}

# Create DataLoader to handle batching
dataloader = DataLoader(dataset, batch_size=4, collate_fn=tokenize_batch)

# Fine-tune
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(3):
    for batch_idx, tokenized in enumerate(dataloader):
        # Get tokenized input and labels
        input_ids = tokenized["input_ids"]
        labels = tokenized["labels"]
        
        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}")

model.save_pretrained("label_correction_model")
tokenizer.save_pretrained("label_correction_model")


In [None]:
trocr_model = VisionEncoderDecoderModel.from_pretrained("./trocr-finetuned")
trocr_processor = TrOCRProcessor.from_pretrained("./trocr-finetuned")

In [None]:
# Charger le modèle YOLOv8
yolo_model = YOLO(r"C:\Users\user\Downloads\train6\weights\best.pt")

# Charger l'image
image_path =  r"C:\Users\user\Downloads\dataset\test\Bulletin_de_soin\5246--6555004--20230914_page_0.jpg"
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Conversion pour Matplotlib

# Détection avec un seuil de confiance bas
results = yolo_model(image_path, conf=0.05)

# Paramètres personnalisés pour les boîtes
box_alpha = 0.2  # Transparence du remplissage
line_width = 1    # Épaisseur des contours
font_scale = 0.6  # Taille du texte
text_color = (255, 0, 0)  # Couleur du texte (bleu)
box_color = (0, 255, 0)   # Couleur des boîtes (vert)

# Créer une copie pour l'annotation
annotated_image = image.copy()
# Extraire les coordonnées, scores et classes
boxes = []
scores = []
classes = []

for result in results:
    for box in result.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        boxes.append([x1, y1, x2 - x1, y2 - y1])  # Format [x, y, w, h]
        scores.append(float(box.conf[0]))
        classes.append(int(box.cls))

# Paramètres NMS
nms_threshold = 0.4  # Ajustable
confidence_threshold = 0.3  # Ne garder que les détections pertinentes

# Appliquer NMS
indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_threshold, nms_threshold)
indices = [i[0] if isinstance(i, (list, tuple, np.ndarray)) else i for i in indices]

# Annoter l’image uniquement avec les boîtes gardées
annotated_image = image.copy()

for i in indices:
    x, y, w, h = boxes[i]
    x2, y2 = x + w, y + h
    label = f"{result.names[classes[i]]} {scores[i]:.2f}"

    overlay = annotated_image.copy()
    cv2.rectangle(overlay, (x, y), (x2, y2), box_color, -1)
    annotated_image = cv2.addWeighted(overlay, box_alpha, annotated_image, 1 - box_alpha, 0)
    cv2.rectangle(annotated_image, (x, y), (x2, y2), box_color, line_width)
    cv2.putText(annotated_image, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)


# Affichage
plt.figure(figsize=(12, 10))
plt.imshow(annotated_image)
plt.axis("off")

# Enregistrer le résultat si besoin
output_path = "detection_result.jpg"
cv2.imwrite(output_path, cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))
print(f"Résultat sauvegardé sous : {output_path}")

plt.show()

In [None]:
from PIL import Image

for result in results:
    for i, box in enumerate(result.boxes):
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        image_pil = Image.fromarray(image)
        cropped = image_pil.crop((x1, y1, x2, y2))

        if np.array(cropped).size == 0:
            continue

        # Préparation pour le modèle TrOCR
        inputs = trocr_processor(images=cropped, return_tensors="pt").pixel_values
        inputs = inputs.to(trocr_model.device)

        with torch.no_grad():
            output = trocr_model.generate(inputs)

        text = trocr_processor.batch_decode(output, skip_special_tokens=True)[0]
        text = text.strip() if text.strip() != "" else "No text detected"

        plt.figure()
        plt.imshow(cropped)
        plt.axis("off")
        plt.title(f"OCR Output: {text}", fontsize=10)
        plt.show()