In [None]:
!pip install -q datasets jiwer

In [None]:
!pip install transformers -U

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch] -U

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and output paths used in this notebook live under it.
drive.mount('/content/drive')

In [None]:
# Location of the Excel workbook that is read with pd.read_excel in the next cell.
Path = '/content/drive/MyDrive/ImgToLatex/First_15K_of_30K_LCDataset.xlsx'

In [None]:
import pandas as pd

# Load the workbook at `Path` into a DataFrame.
SourceExcelFile = pd.read_excel(Path)

# Preview the first rows.
SourceExcelFile.head()

We split up the data into training + testing, using sklearn's `train_test_split` function.

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every kernel restart produces a different 70/30 split,
# so a freshly built evaluation table could contain rows that ended up in a
# previously saved training CSV.
train_df, test_df = train_test_split(SourceExcelFile, test_size=0.3, random_state=42)

# we reset the indices to start from zero
# (reassignment instead of inplace=True, so the split results are not mutated in place)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

Each element of the dataset should return 2 things:
* `pixel_values`, which serve as input to the model.
* `labels`, which are the `input_ids` of the corresponding text in the image.

We use `TrOCRProcessor` to prepare the data for the model. `TrOCRProcessor` is actually just a wrapper around a `ViTFeatureExtractor` (which can be used to resize + normalize images) and a `RobertaTokenizer` (which can be used to encode and decode text into/from `input_ids`).

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

def Loader(df, maxe):
  """Build a table of (image path, LaTeX text) pairs from the rows of `df`.

  For every row, each of the columns at positions 2..5 of `df` is read as an
  image file stem. The image path is built as
  ImgSourceDir + DirectoryName + '/' + stem + '.png' and the file is opened
  once to check that it exists.

  Args:
    df: DataFrame with 'DirectoryName' and 'Latex Code' columns; the columns
      at positions 2..5 hold the image file stems.
    maxe: return as soon as this many pairs have been collected.

  Returns:
    (FinalDF, NotFoundFiles): FinalDF has the columns 'Image' (full path) and
    'Text' (the row's LaTeX code); NotFoundFiles lists the file stems whose
    image raised FileNotFoundError.
  """
  ImgSourceDir = '/content/drive/My Drive/ImgToLatex/LatexImages/'

  FileNames = list(df.columns[2 : 6])

  # Rows are accumulated in a list and turned into a DataFrame once;
  # concatenating a DataFrame per row is quadratic in the number of rows.
  records = []
  NotFoundFiles = []

  for i in range(len(df)):
    row = df.iloc[i]
    DirectoryName = row['DirectoryName']
    text = row['Latex Code']

    for FileDpi in FileNames:
      ImgFile = ImgSourceDir + DirectoryName + '/' + row[FileDpi] + '.png'
      try:
        # Opened only as an existence check; the context manager releases
        # the file handle immediately instead of leaking one per image.
        with Image.open(ImgFile):
          pass
      except FileNotFoundError:
        NotFoundFiles.append(row[FileDpi])
        continue

      records.append({"Image": ImgFile, "Text": text})
      if len(records) == maxe:
        return (pd.DataFrame(records, columns=['Image', 'Text']), NotFoundFiles)

  return (pd.DataFrame(records, columns=['Image', 'Text']), NotFoundFiles)

In [None]:
# Collect up to 10000 image/LaTeX pairs from the training split;
# `files` receives the file names whose image could not be found.
train, files = Loader(train_df, 10000)

In [None]:
#Dont Run This
# One-off export of the training table. index=False keeps the row index out of
# the file, so reading it back does not add a spurious 'Unnamed: 0' column.

train.to_csv('/content/drive/My Drive/ImgToLatex/train.csv', index=False)

In [None]:
#Run from here..........
# Load the training table from a CSV on Drive instead of rebuilding it with Loader.
# NOTE(review): this reads .../ImgToLatex/First10000/train.csv, while the export cell
# writes .../ImgToLatex/train.csv — confirm the file was moved to this folder.

import pandas as pd
train = pd.read_csv('/content/drive/My Drive/ImgToLatex/First10000/train.csv')

In [None]:
# Collect up to 2000 image/LaTeX pairs from the test split;
# `efiles` receives the file names whose image could not be found.
# NOTE(review): the name `eval` shadows Python's built-in eval() from here on.
eval, efiles = Loader(test_df, 2000)

In [None]:
#Dont Run This
# One-off export of the evaluation table. index=False keeps the row index out of
# the file, so reading it back does not add a spurious 'Unnamed: 0' column.

eval.to_csv('/content/drive/My Drive/ImgToLatex/eval.csv', index=False)

In [None]:
# Load the evaluation table from a CSV on Drive; this rebinds `eval`.
# NOTE(review): read from .../First10000/eval.csv, not the .../ImgToLatex/eval.csv
# path the export cell writes — confirm the file location.
eval = pd.read_csv('/content/drive/My Drive/ImgToLatex/First10000/eval.csv')

Let's initialize the training and evaluation datasets:

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Map-style dataset that turns rows of a DataFrame into model inputs.

    Each item is a dict with:
      * "pixel_values": the image at the row's 'Image' path, run through the processor.
      * "labels": the token ids of the row's 'Text', padded/truncated to
        `max_target_length`, with padding positions replaced by -100.

    Args:
        root_dir: stored on the instance; the paths in `df['Image']` are opened as-is.
        df: DataFrame with 'Image' (file path) and 'Text' columns.
        processor: callable image processor that also exposes a `.tokenizer`.
        max_target_length: fixed length of every label sequence.
    """

    def __init__(self, root_dir, df, processor, max_target_length=256):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup: samplers hand out 0..len-1, which only matches the
        # index labels when the frame happens to carry a fresh RangeIndex.
        file_name = self.df['Image'].iloc[idx]
        text = self.df['Text'].iloc[idx]
        # prepare image (i.e. resize + normalize); the context manager closes the
        # file handle once the RGB copy has been made
        with Image.open(file_name) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text; truncation keeps every
        # label at exactly max_target_length so samples can be stacked into a batch
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # drop only the leading batch dimension added by return_tensors="pt"
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}
        return encoding

In [None]:
from transformers import TrOCRProcessor
from transformers import convert_slow_tokenizer

# Passed to IAMDataset as `root_dir` in the next cell.
# NOTE(review): this folder differs from the image directory hard-coded in Loader — confirm which is intended.
ImgFilePath = '/content/drive/MyDrive/LatexDataset_02_01_2024/LatexImages'

# Load the processor that ships with the pretrained handwritten TrOCR checkpoint.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

In [None]:
# Wrap the training and evaluation tables in datasets that share one processor.
train_dataset = IAMDataset(
    root_dir=ImgFilePath,
    df=train,
    processor=processor,
)
eval_dataset = IAMDataset(
    root_dir=ImgFilePath,
    df=eval,
    processor=processor,
)

Let's verify an example from the training dataset:

In [None]:
# Fetch the first training sample and print the shape of each entry it contains.
encoding = train_dataset[0]
for name, tensor in encoding.items():
  print(name, tensor.shape)

We can also check the original image and decode the labels:

In [None]:
# Show the image behind the first training sample. The dataset's 'Image' column
# already holds the complete file path; joining root_dir with a bare file stem
# (no separator, sub-directory or '.png') does not point at an existing file.

image = Image.open(train_dataset.df['Image'].iloc[0]).convert("RGB")
image

In [None]:
# Decode the label ids back to text. Work on a copy so that `encoding['labels']`
# keeps its -100 mask if this cell is re-run or the sample is inspected again.
labels = encoding['labels'].clone()
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

## Train a model

Here, we initialize the TrOCR model from its pretrained weights. Note that the weights of the language modeling head are already initialized from pre-training, as the model was already trained to generate text during its pre-training stage. Refer to the paper for details.

In [None]:
from transformers import VisionEncoderDecoderModel

# Load the encoder-decoder weights from the same checkpoint the processor was loaded from.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

Importantly, we need to set a couple of attributes, namely:
* the attributes required for creating the `decoder_input_ids` from the `labels` (the model will automatically create the `decoder_input_ids` by shifting the `labels` one position to the right and prepending the `decoder_start_token_id`, as well as replacing ids which are -100 by the pad_token_id)
* the vocabulary size of the model (for the language modeling head on top of the decoder)
* beam-search related parameters which are used when generating text.

In [None]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
# Generation length matches the label length used by IAMDataset
# (max_target_length=256). A shorter cap cuts long formulas off during
# evaluation and inflates the reported CER. Keep the two values in sync.
model.config.max_length = 256
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
# NOTE(review): `load_metric` is not available in every `datasets` release — confirm the
# installed version still provides it (the metrics moved to the separate `evaluate` package).
from datasets import load_metric

# Character Error Rate metric object; used by compute_metrics via `.compute(...)`.
cer_metric = load_metric("cer")

The compute_metrics function takes an `EvalPrediction` (which is a NamedTuple) as input, and should return a dictionary. The model will return an EvalPrediction at evaluation, which consists of 2 things:
* predictions: the predictions by the model.
* label_ids: the actual ground-truth labels.

In [None]:
def compute_metrics(pred):
    """Compute the character error rate for one evaluation pass.

    Args:
        pred: object with `predictions` (generated token ids) and `label_ids`
            (reference token ids, with -100 at ignored positions).

    Returns:
        dict: {"cer": <value returned by cer_metric.compute>}.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # Work on copies so the arrays owned by `pred` keep their -100 mask.
    labels_ids = pred.label_ids.copy()
    pred_ids = pred.predictions.copy()

    # -100 is not a valid token id; map it to PAD before decoding. Predictions are
    # handled too, in case they were padded with -100 when batches were gathered.
    labels_ids[labels_ids == -100] = pad_token_id
    pred_ids[pred_ids == -100] = pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

We will evaluate the model on the Character Error Rate (CER), which is available in HuggingFace Datasets (see [here](https://huggingface.co/metrics/cer)).

In [None]:
import os
from google.colab import drive
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
import accelerate

# Specify the directory path in your Google Drive
# (used both as the Trainer's output_dir and as the final save location below)
drive_output_dir = "/content/drive/MyDrive/ImgToLatex/TrOcr"

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    num_train_epochs = 25,
    predict_with_generate=True,  # evaluate on generated sequences, which compute_metrics decodes
    evaluation_strategy="steps",  # evaluation cadence is given by eval_steps below
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    output_dir=drive_output_dir,
    logging_steps=2,
    save_steps=1000,
    eval_steps=200,
)

# Wire the model, the datasets, the collator and the CER metric into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.image_processor,  # NOTE(review): an image processor is passed in the tokenizer slot — confirm this is intended
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

try:
    # Train the model
    trainer.train()
except (KeyboardInterrupt, RuntimeError) as e:
    # Handle keyboard interrupt and runtime disconnect errors:
    # report them and fall through to the save step (best effort).
    print(f"Error during training: {e}")
finally:
    # Save the model even if there's an interruption or error
    print("Saving model...")
    model.save_pretrained(drive_output_dir)
    # Save the processor next to the weights so the directory is self-contained
    # and both can later be restored with from_pretrained(drive_output_dir).
    processor.save_pretrained(drive_output_dir)
    print("Model saved in Google Drive.")


## Inference

Note that after training, you can easily load the model using the `.from_pretrained(output_dir)` method.

In [None]:
# Single image used for the inference examples below.
TestPath = '/content/drive/My Drive/ImgToLatex/LatexImages/1-1500/latex_image_750_dpi_200_normal.png'

# Open it and convert to RGB, as is done for the training images.
image = Image.open(TestPath).convert("RGB")

# Display the image as the cell output.
image

In [None]:
# Inference with the in-memory model (the one that was just trained).

pixel_values = processor(images=image, return_tensors="pt").pixel_values

# The processor returns CPU tensors; move them to the model's device so that
# generation also works when the model is on an accelerator after training.
generated_ids = model.generate(pixel_values.to(model.device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(generated_text)

In [None]:
# Reload the saved model from the Drive output directory.
Model = VisionEncoderDecoderModel.from_pretrained(drive_output_dir)

# Preprocess the test image into a batch of pixel values.
pixel_values = processor(images=image, return_tensors="pt").pixel_values

# Generate token ids and decode the first (only) sequence to text.
generated_ids = Model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(generated_text)