<a href="https://colab.research.google.com/github/Fairuza12/MSProject/blob/main/TrOCR_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW
from PIL import Image
import pandas as pd
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Step 1: Load the dataset from Excel file and clean null values
excel_file = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/Image_text_Mapping_1.xlsx"
df = pd.read_excel(excel_file)
df = df.dropna(subset=['Text'])  # Remove rows with missing text

In [4]:
# Step 2: Define a simple custom dataset class for Bangla handwritten images and text
class BanglaDataset(Dataset):
    def __init__(self, dataframe, image_folder, processor):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_folder, row['Image'])
        image = Image.open(image_path).convert("L")  # Convert to grayscale (black and white)

        # Preprocess the image using TrOCR processor
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)

        # Get the actual Bangla text as label
        text = row['Text']

        return pixel_values, text

In [11]:
# Step 3: Training function
def train_model(dataset, batch_size, epochs, learning_rate):
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

    # Fine-tune on Bangla dataset
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model.train()  # Set the model to training mode

    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader):
            images, texts = batch

            # Move tensors to GPU (if available)
            pixel_values = images.to(device)

            # Tokenize the text using the processor (Bangla-specific text tokenization)
            inputs = processor(text=texts, padding="max_length", return_tensors="pt", truncation=True).input_ids.to(device)
            labels = inputs.clone()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

    # Save the fine-tuned model
    model.save_pretrained('./fine_tuned_trocr_bangla')

In [6]:
# Step 4: Prediction function
def predict_text(image_tensor, model, processor):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        generated_ids = model.generate(image_tensor.unsqueeze(0).to(device))
        predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return predicted_text

In [7]:
# Step 5: Calculate accuracy
def calculate_accuracy(predicted, true):
    return predicted.strip() == true.strip()

In [8]:
# Step 6: Evaluation of the model's accuracy on the dataset
def evaluate_model(df_subset, model, processor):
    total_accuracy = 0
    for index, row in df_subset.iterrows():
        image_path = os.path.join("/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/images", row['Image'])
        true_text = row['Text']  # True Bangla text

        # Preprocess the image
        image_tensor = processor(Image.open(image_path).convert("L"))
        image = Image.merge("RGB", (image, image, image))  # Convert grayscale to RGB
        image_tensor = processor(image, return_tensors="pt").pixel_values.squeeze(0)

        # Predict text using the model
        predicted_text = predict_text(image_tensor, model, processor)

        # Calculate accuracy for this sample
        accuracy = calculate_accuracy(predicted_text, true_text)
        total_accuracy += accuracy

    # Return the average accuracy over the dataset
    return total_accuracy / len(df_subset)

In [9]:
# Step 7: Setup device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# Step 8: Test model with different dataset sizes
dataset_sizes = [50, 100, 150, 200, 250, 300]
accuracy_results = []

image_folder = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/images"

for size in dataset_sizes:
    df_subset = df[:size]  # Subset of the dataframe
    dataset = BanglaDataset(df_subset, image_folder, processor)

    # Fine-tune the model on the current dataset size
    train_model(dataset, batch_size=4, epochs=3, learning_rate=5e-5)

    # Load the fine-tuned model and evaluate
    model = VisionEncoderDecoderModel.from_pretrained('./fine_tuned_trocr_bangla').to(device)
    avg_accuracy = evaluate_model(df_subset, model, processor)

    # Store accuracy results
    accuracy_results.append((size, avg_accuracy))
    print(f"Dataset size: {size}, Average Accuracy: {avg_accuracy * 100:.2f}%")

# Final accuracy report
print("\nFinal Accuracy Results for Different Dataset Sizes:")
for size, accuracy in accuracy_results:
    print(f"Dataset size: {size}, Accuracy: {accuracy * 100:.2f}%")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/13 [00:01<?, ?it/s]


ValueError: Unsupported number of image dimensions: 2