<a href="https://colab.research.google.com/github/Fairuza12/MSProject/blob/main/TrOCR_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW
from PIL import Image
import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive into the Colab runtime; every dataset path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Step 1: Read the image -> annotation-file mapping sheet and keep only rows that name a text file
# (alternative Drive layout: /content/drive/MyDrive/HCR_DATASET/VHD/Image_text_Mapping_1.xlsx)
excel_file = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/Image_text_Mapping_1.xlsx"
df = pd.read_excel(excel_file).dropna(subset=['Text'])  # rows whose 'Text' cell is empty are discarded

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Step 2: Define a simple custom dataset class for Bangla handwritten images and text
class BanglaDataset(Dataset):
    """Pairs each handwritten Bangla image with its ground-truth transcription.

    Args:
        dataframe: table with an 'Image' column (image file name) and a
            'Text' column (annotation file name).
        image_folder: directory that contains the image files.
        processor: callable that turns a PIL image into an object exposing
            ``pixel_values`` (a TrOCRProcessor in this notebook).
    """

    def __init__(self, dataframe, image_folder,processor):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        """Number of rows (samples) in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(pixel_values, text)`` for the row at position ``idx``."""
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_folder, row['Image'])

        # Open inside a context manager so the file handle is released right
        # away instead of lingering until garbage collection; convert() gives
        # back a separate RGB image that stays usable after the file is closed.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Preprocess the image and drop the leading batch axis added by the processor
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)

        # The 'Text' column holds the annotation file name, not the text itself
        text = read_annotation_file(row['Text'])

        return pixel_values, text

In [None]:
# Folder that holds the annotation (.txt) files referenced by the mapping sheet
annotation_folder = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/annotations"


def read_annotation_file(filename):
    """Return the UTF-8 contents of ``filename`` inside ``annotation_folder``.

    Leading and trailing whitespace is stripped from the returned text.
    """
    annotation_path = os.path.join(annotation_folder, filename)
    with open(annotation_path, 'r', encoding='utf-8') as handle:
        return handle.read().strip()

In [None]:
# Step 3: Training function
def train_model(dataset, batch_size, epochs, learning_rate):
    """Fine-tune the notebook-level TrOCR ``model`` on ``dataset`` and save it.

    Relies on the module-level names ``model``, ``processor`` and ``device``;
    all three must be defined before this function is called.

    Args:
        dataset: dataset yielding ``(pixel_values, text)`` pairs.
        batch_size: number of samples per optimisation step.
        epochs: number of full passes over ``dataset``.
        learning_rate: learning rate handed to AdamW.

    Side effects:
        Updates ``model`` in place, prints the mean loss per epoch and writes
        the fine-tuned weights to './fine_tuned_trocr_bangla'.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # The encoder-decoder wrapper needs both ids to derive decoder inputs from the labels
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = pad_token_id

    # Fine-tune on Bangla dataset
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = max(len(train_loader), 1)  # avoid dividing by zero on an empty dataset

    model.train()  # Set the model to training mode

    for epoch in range(epochs):
        total_loss = 0.0
        for images, texts in tqdm(train_loader):
            # Move tensors to GPU (if available)
            pixel_values = images.to(device)

            # Tokenize the target text
            input_ids = processor(text=texts, padding="max_length", return_tensors="pt", truncation=True).input_ids.to(device)

            # Padding positions must be -100 so the cross-entropy loss ignores
            # them; otherwise the loss is dominated by predicting pad tokens.
            labels = input_ids.masked_fill(input_ids == pad_token_id, -100)

            # Clear gradients first so nothing stale leaks into this step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / num_batches:.4f}")

    # Save the fine-tuned model
    model.save_pretrained('./fine_tuned_trocr_bangla')

In [None]:
# Step 4: Prediction function
def predict_text(image_tensor, model, processor):
    """Generate the transcription for one preprocessed image.

    Args:
        image_tensor: pixel values for the image, either unbatched (3-D) or
            already carrying a leading batch axis (4-D). Only the first
            decoded sequence is returned.
        model: model exposing ``eval()`` and ``generate()``.
        processor: object exposing ``batch_decode()``.

    Returns:
        The decoded text with special tokens removed.

    Raises:
        ValueError: if ``image_tensor`` is neither 3-D nor 4-D.
    """
    # Add the batch axis only when it is missing. Unconditionally unsqueezing
    # an already batched tensor produces a 5-D input, which the vision encoder
    # rejects with "too many values to unpack (expected 4)".
    if image_tensor.dim() == 3:
        image_tensor = image_tensor.unsqueeze(0)
    elif image_tensor.dim() != 4:
        raise ValueError(
            f"image_tensor must have 3 or 4 dimensions, got {image_tensor.dim()}"
        )

    # Follow the model's own device rather than a notebook-level global
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = next(model.parameters()).device

    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        generated_ids = model.generate(image_tensor.to(model_device))
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
# Step 5: Calculate accuracy
def calculate_accuracy(predicted, true):
    """Exact-match check: True when both strings are equal after stripping outer whitespace."""
    cleaned_prediction = predicted.strip()
    cleaned_reference = true.strip()
    return cleaned_prediction == cleaned_reference

In [None]:
# Step 6: Evaluation of the model's accuracy on the dataset
def evaluate_model(df_subset, model, processor,
                   image_folder='/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/images'):
    """Return the exact-match accuracy of ``model`` over the rows of ``df_subset``.

    Args:
        df_subset: dataframe with an 'Image' column (image file name) and a
            'Text' column (annotation file name).
        model: model passed through to ``predict_text``.
        processor: processor used for image preprocessing and decoding.
        image_folder: directory containing the images; defaults to the
            dataset location used throughout this notebook.

    Returns:
        Fraction of samples whose prediction equals the annotation, as judged
        by ``calculate_accuracy``; 0.0 when ``df_subset`` is empty.
    """
    num_samples = len(df_subset)
    if num_samples == 0:
        # Nothing to score; avoid dividing by zero
        return 0.0

    correct = 0
    for _, row in df_subset.iterrows():
        image_path = os.path.join(image_folder, row['Image'])
        true_text = read_annotation_file(row['Text'])  # True Bangla text

        # Context manager releases the file handle as soon as the RGB copy exists
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # The processor output carries a leading batch axis. Drop it so
        # predict_text receives one unbatched image; passing the batched tensor
        # made it 5-D downstream ("too many values to unpack (expected 4)").
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.squeeze(0)

        # Predict text using the model (predict_text handles device placement)
        predicted_text = predict_text(pixel_values, model, processor)
        print(predicted_text)

        # A match counts as 1, a miss as 0
        correct += calculate_accuracy(predicted_text, true_text)

    # Average accuracy over the dataset
    return correct / num_samples

In [None]:
# Step 7: Measure accuracy after fine-tuning on progressively larger subsets
dataset_sizes = [50, 100, 150, 200, 250, 300]
accuracy_results = []

base_checkpoint = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(base_checkpoint)

image_folder = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/images"
# (alternative Drive layout: /content/drive/MyDrive/HCR_DATASET/VHD/images)

for size in dataset_sizes:
    df_subset = df.iloc[:size]  # First `size` rows of the dataframe
    dataset = BanglaDataset(df_subset, image_folder,processor)

    # train_model() fine-tunes the notebook-level `model`, so it has to exist
    # before the call (previously it only existed through leftover kernel state).
    # Reloading the pretrained checkpoint for every size keeps the runs
    # independent; otherwise each size would continue training the weights
    # left over from the previous, smaller subset.
    model = VisionEncoderDecoderModel.from_pretrained(base_checkpoint).to(device)

    # Fine-tune the model on the current dataset size
    train_model(dataset, batch_size=4, epochs=3, learning_rate=5e-5)

    # Load the fine-tuned model back from disk and evaluate it.
    # NOTE: evaluation uses the same rows the model was trained on, so this is
    # training-set accuracy rather than a held-out estimate.
    model = VisionEncoderDecoderModel.from_pretrained('./fine_tuned_trocr_bangla').to(device)
    avg_accuracy = evaluate_model(df_subset, model, processor)

    # Store accuracy results
    accuracy_results.append((size, avg_accuracy))
    print(f"Dataset size: {size}, Average Accuracy: {avg_accuracy * 100:.2f}%")

# Final accuracy report
print("\nFinal Accuracy Results for Different Dataset Sizes:")
for size, accuracy in accuracy_results:
    print(f"Dataset size: {size}, Accuracy: {accuracy * 100:.2f}%")

100%|██████████| 13/13 [00:37<00:00,  2.85s/it]


Epoch 1/3, Loss: 4.1711


100%|██████████| 13/13 [00:39<00:00,  3.03s/it]


Epoch 2/3, Loss: 3.0077


100%|██████████| 13/13 [00:40<00:00,  3.15s/it]


Epoch 3/3, Loss: 2.9550


ValueError: too many values to unpack (expected 4)