<a href="https://colab.research.google.com/github/Fairuza12/MSProject/blob/main/TrOCR_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers torch torchvision pandas pillow



In [2]:
!pip install transformers[sentencepiece]



In [3]:
!pip install transformers datasets accelerate



In [4]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW
from PIL import Image
import pandas as pd
from tqdm import tqdm

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Step 1: Load the dataset from Excel file and clean null values
excel_file = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/Image_text_Mapping_1.xlsx"
#excel_file = "/content/drive/MyDrive/HCR_DATASET/VHD/Image_text_Mapping_1.xlsx"
df = pd.read_excel(excel_file)
df = df.dropna(subset=['Text'])  # Remove rows with missing text
print(df.head())

         Image    Text Remarks
0  image_1.png  23.txt     NaN
1  image_2.png   4.txt     NaN
2  image_3.png   6.txt     NaN
3  image_4.png   7.txt     NaN
4  image_5.png  19.txt     NaN


In [7]:
image_folder = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/images"
annotation_folder = "/content/drive/MyDrive/MS_Project/Datasets/HCR_DATASET/VHD/annotations"

In [8]:
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [10]:
from torch.utils.data import Dataset

class BanglaDataset(Dataset):
    def __init__(self, dataframe, image_folder, processor):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_folder, row['Image'])
        image = Image.open(image_path).convert("RGB")

        # Preprocess the image
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Read the corresponding text
        text = row['Text']

        return pixel_values, text

In [11]:
from torch.utils.data import DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW, AutoTokenizer
from tqdm import tqdm
import torch

def train_model(dataset, batch_size, epochs, learning_rate):
    # Load the tokenizer for Bangla from Indic-BERT
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

    # Load the TrOCR processor with a suitable configuration
    # (e.g., 'microsoft/trocr-base-handwritten')
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

    # Update the processor's tokenizer with the Indic-BERT tokenizer
    processor.tokenizer = tokenizer


    # Load the model (you may need to change this if a Bangla-specific model is available)
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

    # Set the decoder_start_token_id and pad_token_id
    model.config.decoder_start_token_id = processor.tokenizer.bos_token_id  # Use the appropriate start token
    model.config.pad_token_id = processor.tokenizer.pad_token_id  # Set the pad token

    # Ensure the model's vocabulary size is aligned with the tokenizer's vocabulary size
    model.config.vocab_size = model.decoder.config.vocab_size = len(processor.tokenizer)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader):
            images, texts = batch
            pixel_values = images.squeeze(1).to(device)
             # Process the text for the labels
            with processor.as_target_processor():  # Use as_target_processor context
                labels = processor(texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt").input_ids.to(device)

            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

    # Save the model
    model.save_pretrained('./fine_tuned_trocr_bangla')

In [12]:
def predict_text(image_tensor, model, processor):
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(image_tensor).to(device)
        predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return predicted_text

In [13]:
def calculate_accuracy(predicted, true):
    return predicted.strip() == true.strip()

def evaluate_model(df_subset, model, processor):
    total_accuracy = 0.0
    num_samples = len(df_subset)

    for index, row in df_subset.iterrows():
        image_path = os.path.join(image_folder, row['Image'])
        true_text = row['Text']

        image = Image.open(image_path).convert("RGB")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

        predicted_text = predict_text(pixel_values, model, processor)
        accuracy = calculate_accuracy(predicted_text, true_text)
        total_accuracy += accuracy

    return total_accuracy / num_samples

In [14]:
dataset_sizes = [50, 100, 150, 200, 250, 300]
accuracy_results = []

for size in dataset_sizes:
    df_subset = df[:size]  # Subset of the dataframe
    dataset = BanglaDataset(df_subset, image_folder, processor)

    train_model(dataset, batch_size=4, epochs=3, learning_rate=5e-5)

    model = VisionEncoderDecoderModel.from_pretrained('./fine_tuned_trocr_bangla').to(device)
    avg_accuracy = evaluate_model(df_subset, model, processor)

    accuracy_results.append((size, avg_accuracy))
    print(f"Dataset size: {size}, Average Accuracy: {avg_accuracy * 100:.2f}%")

# Print final results
print("\nFinal Accuracy Results for Different Dataset Sizes:")
for size, accuracy in accuracy_results:
    print(f"Dataset size: {size}, Accuracy: {accuracy * 100:.2f}%")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/13 [00:02<?, ?it/s]


RuntimeError: shape '[-1, 200000]' is invalid for input of size 102942720