In [4]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 592 kB/s eta 0:00:01
Collecting tqdm>=4.62.1
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 1.4 MB/s eta 0:00:01
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 670 kB/s eta 0:00:01
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow>=6.0.0
  Downloading pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.0 MB)
[K     |████████████████████████████████| 36.0 MB 167 kB/s eta 0:00:01    |████▋                           | 5.2 MB 143 kB/s eta 0:03:34     |██████████████████████▌         | 25.3 MB 190 kB/s eta 0:00:57     |████████████████████████

In [3]:
#!pip install accelerate
# !pip install datasets
# !pip install jiwer
#!pip install evaluate
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from torch.utils.data import DataLoader
import torch.nn as nn
import pandas as pd
import glob
import os
import random
from tqdm import tqdm
from datasets import load_metric
from transformers import AdamW
import matplotlib.pyplot as plt

import torch.optim as optim


# Locations of the IAM ground-truth folders (line, word and sentence crops).
# Each path keeps its trailing slash because callers append glob patterns to it.
IAM_gt_root = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/tesseract-training/training/IAM/gt/'
IAM_lines = IAM_gt_root + 'lines/'
IAM_words = IAM_gt_root + 'words/'
IAM_sentences = IAM_gt_root + 'sentences/'

# Folder that receives the fine-tuned TrOCR checkpoints.
model_directory = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/'
def get_lists(directory, directory_percentage):
    """Randomly sample (image path, transcription) pairs from one folder.

    Every ``*.png`` file in ``directory`` is paired with the text stored in the
    sibling ``<name>.gt.txt`` file, then a random subset of the pairs is drawn.

    Args:
        directory: Folder containing the ``.png`` images and their
            ``.gt.txt`` ground-truth files.
        directory_percentage: Despite its name, the number of pairs to sample
            (an absolute count, not a percentage). Values larger than the
            number of available pairs are clamped to "all pairs".

    Returns:
        A pair ``(image_list, text_list)`` of equally long tuples, where
        ``text_list[i]`` is the transcription of ``image_list[i]``. Both are
        empty tuples when nothing can be sampled.

    Raises:
        FileNotFoundError: If an image has no matching ``.gt.txt`` file.
        ValueError: If ``directory_percentage`` is negative.
    """
    # Sorting makes the sample reproducible once `random` has been seeded.
    image_paths = sorted(glob.glob(os.path.join(directory, "*.png")))

    pairs = []
    for image_path in image_paths:
        # splitext only strips the final extension, so dots elsewhere in the
        # path (e.g. a folder called "iam.v1") no longer truncate the name.
        gt_path = os.path.splitext(image_path)[0] + '.gt.txt'
        with open(gt_path, 'r', encoding='utf-8') as gt_file:
            # Exactly one text per image: multi-line files are joined and empty
            # files yield "", so images and texts can never drift out of step.
            transcription = " ".join(gt_file.read().splitlines())
        pairs.append((image_path, transcription))

    # random.sample raises when asked for more items than exist; clamp instead.
    sample_size = min(directory_percentage, len(pairs))
    if sample_size == 0:
        return (), ()

    image_list, text_list = zip(*random.sample(pairs, sample_size))
    return image_list, text_list


# Taken from https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb
class IAMDataset(Dataset):
    """Torch dataset yielding TrOCR-ready samples from a DataFrame.

    Args:
        df: DataFrame with an ``'image'`` column (path to an image file) and a
            ``'text'`` column (its transcription).
        processor: Object that is callable on an image (returning something
            with ``.pixel_values``) and exposes a ``.tokenizer`` used to encode
            the transcription, e.g. a ``TrOCRProcessor``.
        max_target_length: Fixed length of the label sequence; shorter texts
            are padded and longer ones are truncated to this length.
    """

    def __init__(self, df, processor, max_target_length=128):
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for row ``idx``.

        ``idx`` is a position (0 .. len-1), not an index label.
        """
        # Positional lookup: a DataLoader passes positions, which only match
        # index labels when the frame's index happens to be 0..n-1.
        image_path = self.df['image'].iloc[idx]
        text = self.df['text'].iloc[idx]

        # prepare image (i.e. resize + normalize); the context manager closes
        # the underlying file once the RGB copy has been made
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text; truncation keeps every
        # label sequence at exactly max_target_length so samples can be batched
        label_ids = self.processor.tokenizer(text,
                                             padding="max_length",
                                             truncation=True,
                                             max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        label_ids = [token if token != pad_token_id else -100 for token in label_ids]

        # squeeze(0) removes only the batch axis added by return_tensors="pt"
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(label_ids)}


# How many samples to draw from each IAM folder. Despite the "_percentage"
# suffix these values are handed to get_lists() as absolute sample counts.
lines_percentage = 1000
words_percentage = 1000
sentences_percentage = 1000

# Accumulate the sampled image paths and transcriptions across all folders.
image_list = []
text_list = []
sampling_plan = [
    (IAM_lines, lines_percentage),
    (IAM_words, words_percentage),
    (IAM_sentences, sentences_percentage),
]
for directory, percentage in sampling_plan:
    images, text = get_lists(directory, percentage)
    image_list += images
    text_list += text

# One row per sample: image path plus its ground-truth text.
df = pd.DataFrame({'image': image_list, 'text': text_list})


ModuleNotFoundError: No module named 'datasets'

In [48]:

from accelerate import Accelerator
from sklearn.model_selection import train_test_split

# Setting up the accelerator
accelerator = Accelerator()

# Hold out 20% of the samples for validation (fixed seed), then renumber both
# frames from 0 so that positional and label-based lookups agree.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Load the processor and the model from the same pretrained checkpoint
trocr_checkpoint = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(trocr_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(trocr_checkpoint)

# Special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# Keep the top-level vocab size in sync with the decoder's
model.config.vocab_size = model.config.decoder.vocab_size

# Beam search parameters, applied in this order to the model config
beam_search_settings = {
    "eos_token_id": processor.tokenizer.sep_token_id,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for setting_name, setting_value in beam_search_settings.items():
    setattr(model.config, setting_name, setting_value)

# Datasets and dataloaders for training and validation; only the training
# loader shuffles its samples.
train_dataset = IAMDataset(df=train_df, processor=processor)
val_dataset = IAMDataset(df=val_df, processor=processor)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Hand model, optimizer and both loaders to the accelerator
model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, val_dataloader
)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer, TrainingArguments
from accelerate import Accelerator
import evaluate

# Load the "cer" metric through the `evaluate` library; compute_cer() below
# calls metric.compute() on decoded prediction/reference strings.
metric = evaluate.load("cer")

def compute_cer(pred_ids, label_ids):
    """Decode predictions and labels and score them with the global ``metric``.

    Args:
        pred_ids: Batch of predicted token ids, decoded with
            ``processor.batch_decode``.
        label_ids: Batch of reference token ids in which ignored positions are
            marked with -100. The caller's object is left untouched.

    Returns:
        Whatever ``metric.compute`` returns for the decoded strings.
    """
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Work on a copy: overwriting -100 in place would silently change the
    # caller's batch, which still needs the -100 mask for the loss.
    label_ids = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()
    # -100 is not a valid token id, so swap it for the pad id before decoding.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    return metric.compute(predictions=pred_str, references=label_str)
# Training with per-epoch validation and early stopping.
#   avg_loss_list / avg_cer_list : one entry per completed epoch
#   min_val                      : best (lowest) average validation CER so far
#   patience                     : epochs without improvement before stopping
avg_cer_list = []
avg_loss_list = []
min_val = float('inf')
patience = 2
EPOCH = 1
counter = 0  # consecutive epochs without improvement
for epoch in range(EPOCH):
    model.train()
    avg_loss = 0
    for batch in tqdm(train_dataloader):
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        # Accumulate inside the loop so every batch contributes to the mean
        avg_loss += loss.item() / len(train_dataloader)
    avg_loss_list.append(avg_loss)

    # Evaluation
    val_cer = 0
    model.eval()
    for batch in val_dataloader:
        with torch.no_grad():
            # Gather all predictions and targets
            outputs = model.generate(batch["pixel_values"])
            outputs = accelerator.gather(outputs)
            labels = accelerator.gather(batch["labels"])

            # Score the gathered predictions against the gathered labels so
            # both sides cover the same samples
            val_cer += compute_cer(pred_ids=outputs, label_ids=labels)
    last_val = val_cer / len(val_dataloader)
    avg_cer_list.append(last_val)

    # Report before the early-stopping check so the last epoch is also printed
    print("Epoch: {}, Avg Loss: {}, Avg Validation CER: {}".format(epoch, avg_loss, last_val))

    # Early stopping if the avg_cer does not decrease for patience epochs.
    # The decision is taken on every process (the metric is computed from
    # gathered tensors), so all processes leave the loop together.
    if last_val > min_val:
        counter += 1
    else:
        counter = 0
        min_val = last_val
        if accelerator.is_main_process:
            # saving the best model so far (unwrapped, so the state dict keys
            # are those of the plain model)
            torch.save(accelerator.unwrap_model(model).state_dict(),
                       model_directory + 'best_model.pt')
    if counter == patience:
        break

100%|██████████| 600/600 [03:19<00:00,  3.01it/s]


0.8028169014084507
0.6236559139784946
0.5027624309392266
0.5597014925373134
0.5675675675675675
0.5741935483870968
0.6351351351351351
0.20754716981132076
0.4845360824742268
0.46825396825396826
0.5333333333333333
0.32954545454545453
0.5047619047619047
0.47305389221556887
0.3163265306122449
0.5375
0.8125
0.41964285714285715
0.6885245901639344
0.673469387755102
0.29850746268656714
0.5547445255474452
0.6134453781512605
0.4647887323943662
0.44776119402985076
0.42990654205607476
0.7216494845360825
0.7816091954022989
0.5407407407407407
0.7272727272727273
0.36231884057971014
0.47474747474747475
0.42857142857142855
0.6530612244897959
0.504424778761062
0.6629213483146067
0.425
0.3697478991596639
0.42574257425742573
0.5714285714285714
0.5
0.3076923076923077
0.5845070422535211
0.625
0.5135135135135135
0.53125
0.6933333333333334
0.5373134328358209
0.618421052631579
0.55
0.5189873417721519
0.75
0.6627906976744186
0.7361111111111112
0.6764705882352942
0.5137614678899083
0.6744186046511628
0.5945945945

In [45]:
# Sanity check: decode one sample's label ids back into text.
# NOTE(review): `encoding` is not defined in any cell visible here — presumably a
# single dataset sample such as `train_dataset[0]`; confirm before relying on
# Restart & Run All.
labels = encoding['labels']
# Swap the -100 entries for the tokenizer's pad id so the ids can be decoded.
# This assignment modifies `labels` in place.
labels[labels == -100] = processor.tokenizer.pad_token_id
# Decode the ids to a string, skipping special tokens.
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)



<class 'str'>


AttributeError: 'str' object has no attribute 'numpy'