In [1]:
import os
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoTokenizer
from PIL import Image
import torch
device = torch.device('cpu')

model = VisionEncoderDecoderModel.from_pretrained("checkpoints/checkpoint-310000/")
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

for file in os.listdir("sample_imgs"):
    if not file.endswith((".jpg", ".png")):
        continue

    image = Image.open(os.path.join("sample_imgs", file)).convert("RGB")

    pixel_values = (processor(image, return_tensors="pt").pixel_values).to(device)
    generated_ids = model.generate(pixel_values)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


李 文 晋 就 走 来 了 。 他 告 诉 大 家 说 ： 啊 才 我 从 瘦
教 不 好 ， 才 是 我 仍 好 惹 任 · 今 天 礼 讲 到 这 里 为
育 原 发 现 24 个 新 物 种 ， 其 中 包 括 一 种 能 发 荧 光 的 紫 色 蛙 。
家 在 这 片 死 亡 之 海 中 视 角 不 同 的 观 察 ， 导 致 了 一 个 世 纪 的 学 术 争 论 ，
是 可 表 行 业 内 知 名 度 和 市 场 占 有 份 额 。 而 其


In [1]:
import evaluate
import numpy as np
from transformers import BasicTokenizer

cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Predictions come with space between
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=labels_str)
    wer = wer_metric.compute(predictions=pred_str, references=labels_str)

    return {"cer": cer, "wer": wer}

# pred = {
#     "predictions": np.array([[ 101,  101, 3330, 3152, 3232, 2218, 6624, 1092,  749,  511,  800, 1440, 6401, 1920, 2157, 6432, 8038, 1157, 2798, 2769,  794, 2445,  102]]),
#     "label_ids": np.array([[101, 3330, 3152, 3232, 2218, 1139, 3341, 749, 511, 800, 1440, 6401, 1920, 2157, 6432, 8038, 1157, 2798, 2769, 794, 2445, 102]])
# }

# compute_metrics(pred)

In [14]:
import os
from os import path

import torch
from torch.utils.data import Dataset
import torchvision.io as io
import pandas as pd
from PIL import Image

from data_aug import build_data_aug
from torch.utils.data import Subset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoTokenizer
from transformers import default_data_collator

model = VisionEncoderDecoderModel.from_pretrained("checkpoints/checkpoint-310000/")
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")


class OCRDataset(Dataset):
    def __init__(self, dataset_dir, labels_dir, transform, processor, tokenizer, mode="train", max_target_length=32, device=None):
        self.dataset_dir = dataset_dir
        self.labels_dir = labels_dir
        self.transform = transform
        self.device = device
        self.processor = processor
        self.mode = mode
        self.max_target_length = max_target_length
        self.tokenizer = tokenizer
        self.df = self.build_df()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # get file name + text
        file_name = self.df["file_name"][idx]
        text = self.df['text'][idx]
        # prepare image (i.e. resize + normalize)
        image = Image.open(path.join(self.dataset_dir, file_name)).convert("RGB")
        if self.mode == "train" and self.transform:
            image = self.transform(image)

        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.tokenizer(text, padding="max_length",
                                stride=32,
                                truncation=True,
                                max_length=self.max_target_length).input_ids
        
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

    def build_df(self):
        li = []
        for root, dirs, files in os.walk(self.labels_dir):
            for file in files:  # Loop through the dataset tsvfiles
                if not file.endswith(".tsv"):
                    continue

                print(f"Processing {file}")
                li.append(pd.read_table(path.join(root, file),
                          names=["file_name", "text"]))

        return pd.concat(li, axis=0, ignore_index=True)
    

dataset_dir = 'dataset/data'
max_length = 64

# Define the number of samples to keep in eval dataset
num_samples = 200

eval_dataset = OCRDataset(
    dataset_dir=dataset_dir,
    labels_dir="dataset/labels/test",
    tokenizer=tokenizer,
    processor=processor,
    mode="eval",
    transform=None,
    max_target_length=max_length
)

# Create a random subset of the dataset
subset_indices = torch.randperm(len(eval_dataset))[:num_samples]
eval_dataset = Subset(eval_dataset, subset_indices.tolist())

print("Number of validation examples:", len(eval_dataset))


def init_trainer(model, tokenizer, compute_metrics, train_dataset,
                 eval_dataset):
    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        evaluation_strategy="steps",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        fp16=True,
        learning_rate=4e-5,
        output_dir="./checkpoints",
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=100,
        save_strategy="steps",
        save_total_limit=5,
        save_steps=10000,
        eval_steps=10000,
        resume_from_checkpoint="./checkpoints/",
        dataloader_num_workers=10)

    # instantiate trainer
    return Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
    )

trainer = init_trainer(model, tokenizer, compute_metrics, eval_dataset, eval_dataset)

eval_result = None
with torch.no_grad():
    eval_result = trainer.evaluate(eval_dataset, max_length=64)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Processing hwdb2.0_4k.tsv
Processing hwdb_ic13_3k.tsv
Processing hw_chinese_17k.tsv
Processing hwdb2.2_3k.tsv
Processing hwdb2.1_3k.tsv
Processing hwdb_ic13_val_5k.tsv
Number of validation examples: 200


In [15]:
eval_result

{'eval_loss': 0.2817695736885071,
 'eval_cer': 0.06323755772190867,
 'eval_wer': 0.11390157280568239,
 'eval_runtime': 21.6212,
 'eval_samples_per_second': 9.25,
 'eval_steps_per_second': 0.601}

In [1]:
import torch  
from torchvision.models import efficientnet_b0, vit_l_16, densenet161, regnet_y_1_6gf
from pytorch_benchmark import benchmark

print("################################################################")
model = efficientnet_b0().cuda()
sample = torch.randn(64, 3, 224, 224)  # (B, C, H, W)
results = benchmark(model, sample, num_runs=10)

print("################################################################")
model2 =  densenet161().cuda()
sample2 = torch.randn(64, 3, 224, 224)  # (B, C, H, W)
results2 = benchmark(model2, sample2, num_runs=10)

print("################################################################")
model3 =  regnet_y_1_6gf().cuda()
sample3 = torch.randn(64, 3, 224, 224)  # (B, C, H, W)
results3 = benchmark(model3, sample3, num_runs=10)


################################################################


/opt/amdgpu/share/libdrm/amdgpu.ids: No such file or directory
/opt/amdgpu/share/libdrm/amdgpu.ids: No such file or directory
Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
Unable to measure model FLOPs due to error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00, 212.54it/s]
Measuring inference for batch_size=1: 100%|██████████| 10/10 [00:00<00:00, 231.46it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.
Warming up with batch_size=64: 100%|██████████| 1/1 [00:00<00:00, 27.10it/s]
Measuring inference for batch_size=64: 100%|██████████| 10/10 [00:00<00:00, 27.00it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.


################################################################


Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00,  1.49it/s]
Unable to measure model FLOPs due to error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00, 93.60it/s]
Measuring inference for batch_size=1: 100%|██████████| 10/10 [00:00<00:00, 99.21it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.
Warming up with batch_size=64: 100%|██████████| 1/1 [00:00<00:00,  7.00it/s]
Measuring inference for batch_size=64: 100%|██████████| 10/10 [00:01<00:00,  6.92it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.


################################################################


Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
Unable to measure model FLOPs due to error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00, 119.08it/s]
Measuring inference for batch_size=1: 100%|██████████| 10/10 [00:00<00:00, 124.15it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.
Warming up with batch_size=64: 100%|██████████| 1/1 [00:00<00:00, 27.94it/s]
Measuring inference for batch_size=64: 100%|██████████| 10/10 [00:00<00:00, 27.06it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.
