# 0 Download Kaggle Data to Google Drive

In [None]:
!pip uninstall -y -q cffi
!pip install -q cffi gdown

In [None]:
url = "" # paste your URL link to the Google Cloud service account key
!gdown --fuzzy {url}

In [None]:
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials


# Location of the service-account key file; adjust the file name to match the
# key you downloaded into /kaggle/working.
SERVICE_ACCOUNT_FILE = "/kaggle/working/your-service-account-4.json"  # paste the service account key you downloaded

# Create credentials from the key file.
# NOTE(review): no OAuth scopes are passed here — confirm the Drive API accepts
# these credentials for uploads, or add an explicit Drive scope.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)

# Build the Google Drive API (v3) client; it is kept as a module-level global.
drive_service = build("drive", "v3", credentials=creds)

# 1 Preprocessing datasets

In [None]:
!pip install -q transformers evaluate jiwer peft

## 1.1 Data loading and preprocessing

In [None]:
import os
import torch
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

from peft import LoraConfig, get_peft_model
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

In [None]:
# Training hyperparameters used by the dataloaders and the training loop.
num_epochs = 5
batch_size = 16

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# More than one visible GPU switches on the multi-GPU flag.
gpu_count = torch.cuda.device_count()
multi_gpu = gpu_count > 1
if multi_gpu:
    print(f"Using {gpu_count} GPUs!")

print(f"num_epochs: {num_epochs} and batch_size: {batch_size}")

In [None]:
def split_data(df: pd.DataFrame, num_files: int, name_df: str, out_dir: str = "/kaggle/working"):
    """Write ``df`` to disk as ``num_files`` consecutive, near-equal CSV chunks.

    Chunks are written to ``<out_dir>/<name_df>/<name_df>_chunk<i>.csv`` using
    ';' as separator, no header row, no index column and '\\' as escape
    character. Row counts follow the ``np.array_split`` convention: the first
    ``len(df) % num_files`` chunks receive one extra row.

    Args:
        df: Frame to split; rows are taken by position, so any index works.
        num_files: Number of chunk files to produce (must be >= 1).
        name_df: Name of the sub-directory and prefix of the chunk files.
        out_dir: Directory under which the chunk folder is created.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.

    File-system failures are reported on stdout (with the reason) instead of
    being raised, so the surrounding notebook run can continue.
    """
    if num_files < 1:
        raise ValueError("num_files must be at least 1")

    path = os.path.join(out_dir, name_df)

    try:
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        print(f"Directory can not be created: {error}")
        return

    # Positional slicing avoids np.array_split on a DataFrame, which relies on
    # the deprecated DataFrame.swapaxes.
    base_rows, extra_rows = divmod(len(df), num_files)
    start = 0
    try:
        for i in range(num_files):
            stop = start + base_rows + (1 if i < extra_rows else 0)
            df.iloc[start:stop].to_csv(
                f"{path}/{name_df}_chunk{i}.csv",
                sep=';',
                header=None,
                index=False,
                escapechar='\\',
            )
            start = stop
    except OSError as error:
        print(f"Chunk files can not be written: {error}")
        return

    print(f"{name_df} data is splitted")

In [None]:
# Load the annotations of the real images: ';'-separated, backslash-escaped.
# The first row of the file is skipped (presumably its own header) and the
# columns are named explicitly instead.
real_df = pd.read_csv(
    "/kaggle/input/dialectic-real-all/data.csv",
    sep=";",
    escapechar='\\',
    skiprows=1,
    header=None,
    names=["file_name", "text"],
)

# Prefix every file name with the image directory so rows hold full paths.
real_df["file_name"] = real_df["file_name"].apply(lambda x: "/kaggle/input/dialectic-real-all/images/" + x)
real_df.head()

In [None]:
# Load the StackMix annotations. Note the column order here is
# (text, file_name), the reverse of the other datasets in this notebook.
# The first row of the file is skipped and the columns are named explicitly.
stackmix_df = pd.read_csv(
    "/kaggle/input/stackmix-dialectic-2-0/_output_/data.csv",
    sep=";",    
    escapechar='\\',
    header=None,
    skiprows=1,
    names=["text", "file_name"],
)
# Prefix every file name with the image directory so rows hold full paths.
stackmix_df["file_name"] = stackmix_df["file_name"].apply(lambda x: "/kaggle/input/stackmix-dialectic-2-0/_output_/images/" + x)

stackmix_df.head()

In [None]:
# Write stackmix_df to disk as 4 chunk files named stackmix_chunk<i>.csv.
split_data(df=stackmix_df, num_files=4, name_df="stackmix")

In [None]:
# Replace the full StackMix frame with a single chunk (index 2) of the split.
# Chunk files are read without skipping rows; columns stay (text, file_name).
stackmix_df = pd.read_csv(
    "/kaggle/working/stackmix/stackmix_chunk2.csv",
    sep=";",    
    escapechar='\\',
    header=None,    
    names=["text", "file_name"],
)

print(f"stackmix_df.shape: {stackmix_df.shape}")
stackmix_df.head()

In [None]:
# Load the Cyrillic handwriting training annotations: tab-separated, read with
# no header row and no skipped rows, columns named (file_name, text).
cyrillic_train_df = pd.read_csv(
    "/kaggle/input/cyrillic-handwriting-dataset/train.tsv",
    sep="\t",    
    escapechar='\\',
    header=None,
    names=["file_name", "text"],
)

# Prefix every file name with the image directory so rows hold full paths.
cyrillic_train_df["file_name"] = cyrillic_train_df["file_name"].apply(lambda x: "/kaggle/input/cyrillic-handwriting-dataset/train/" + x)
cyrillic_train_df.head()

In [None]:
# Write cyrillic_train_df to disk as 5 chunk files named cyrillic_chunk<i>.csv.
split_data(df=cyrillic_train_df, num_files=5, name_df="cyrillic")

In [None]:
# Replace the full Cyrillic frame with a single chunk (index 1) of the split.
# The chunk is ';'-separated (unlike the original TSV) and read without a header.
cyrillic_train_df = pd.read_csv(
    "/kaggle/working/cyrillic/cyrillic_chunk1.csv",
    sep=";",    
    escapechar='\\',
    header=None,    
    names=["file_name", "text"],
)

print(f"cyrillic_train_df.shape: {cyrillic_train_df.shape}")
cyrillic_train_df.head()

In [None]:
# Show the number of missing values per column for each of the three frames.
print(real_df.isnull().sum(), "\n")
print(stackmix_df.isnull().sum(), "\n")
print(cyrillic_train_df.isnull().sum(), "\n")

In [None]:
# Drop every row that has a missing value in any column.
real_df = real_df.dropna()
stackmix_df = stackmix_df.dropna()
cyrillic_train_df = cyrillic_train_df.dropna()

In [None]:
# Renumber the rows from zero in place (the old index is discarded), so that
# index labels are contiguous again after the rows removed by dropna.
real_df.reset_index(drop=True, inplace=True)
stackmix_df.reset_index(drop=True, inplace=True)
cyrillic_train_df.reset_index(drop=True, inplace=True)

In [None]:
# Training set = StackMix chunk + Cyrillic chunk, stacked with a fresh 0..n-1
# index. The two frames list their columns in different order; concat aligns
# them by column name.
train_df = pd.concat([stackmix_df, cyrillic_train_df], ignore_index=True)
train_df.head()

In [None]:
class CHDataset(Dataset):
    """Image/transcription pairs prepared for a vision encoder-decoder model.

    Args:
        df: Frame with a ``file_name`` column (path passed to ``Image.open``)
            and a ``text`` column (target transcription).
        processor: Object that is callable on an image (returning something
            with ``pixel_values``) and exposes a ``tokenizer``.
        root_dir: Stored on the instance but not used when opening images;
            ``file_name`` is opened as-is.
        max_target_length: Length the tokenizer pads label sequences to.
    """

    def __init__(self, df, processor, root_dir=None, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx, return_image=False):
        """Return the encoded sample at position ``idx``.

        Args:
            idx: Zero-based row position.
            return_image: When true, also return the loaded RGB image.

        Returns:
            A dict with ``pixel_values`` and ``labels`` tensors, or the tuple
            ``(dict, image)`` when ``return_image`` is true.
        """
        # Positional lookup: works for any index (e.g. after filtering rows),
        # not only for a freshly reset 0..n-1 index.
        file_name = self.df["file_name"].iloc[idx]
        text = self.df["text"].iloc[idx]

        # prepare image (i.e. resize + normalize)
        image = Image.open(file_name).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(
            text, padding="max_length", max_length=self.max_target_length
        ).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        encoding = {
            # Drop only the leading batch axis added by return_tensors="pt";
            # a bare squeeze() would also remove any other size-1 axis.
            "pixel_values": pixel_values.squeeze(0),
            "labels": torch.tensor(labels),
        }
        if return_image:
            return encoding, image

        return encoding

## 1.2 Creating dataloaders

In [None]:
# Directory holding the pretrained weights to start from; the same path is
# also used to load the processor.
model_name = "/kaggle/input/dialectic-aug-1.0/other/default/2/dialectic-aug-1.0-dora/model/TrOCRModel/weights" # paste your TrOCR model
model = VisionEncoderDecoderModel.from_pretrained(model_name)

In [None]:
processor = TrOCRProcessor.from_pretrained(model_name)

# Characters that must exist as tokens; any that the tokenizer does not know
# yet are collected in missing_tokens.
vocab = processor.tokenizer.get_vocab()
diacritic_chars = list("\u0301\u0302\u0304\u0311\u0306\u203f")
missing_tokens = [mark for mark in diacritic_chars if mark not in vocab]

if not missing_tokens:
    print("All the necessary characters are already present in vocab.")
else:
    print("Missing characters:", missing_tokens)
    processor.tokenizer.add_tokens(missing_tokens)
    # Resize the decoder embeddings to the new tokenizer length.
    model.decoder.resize_token_embeddings(len(processor.tokenizer))

In [None]:
# Training samples come from the combined train_df.
train_dataset = CHDataset(df=train_df, processor=processor)

# Validation samples come from real_df.
val_dataset = CHDataset(df=real_df, processor=processor)

In [None]:
# Report how many samples each split contains.
print("Number of training examples:", len(train_dataset))
print("Number of cards val examples:", len(val_dataset))

In [None]:
def custom_collate_fn(batch):
    """Combine per-sample encodings into a single batch dictionary.

    ``pixel_values`` tensors are stacked along a new leading axis; ``labels``
    tensors are right-padded with -100 to the longest sequence in the batch.
    """
    images, targets = [], []
    for sample in batch:
        images.append(sample["pixel_values"])
        targets.append(sample["labels"])

    return {
        "pixel_values": torch.stack(images),
        "labels": pad_sequence(targets, batch_first=True, padding_value=-100),
    }

In [None]:
# Settings shared by both loaders.
_loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
    num_workers=4,
    pin_memory=True,
)

# Only the training loader shuffles its samples.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, **_loader_kwargs)

## 1.3 Checking Decoder

In [None]:
# Sanity check: encode the first training sample and decode its labels back.
# To get the image as well, call the method directly with the flag set, e.g.
# encoding, image = train_dataset.__getitem__(0, return_image=True)
encoding = train_dataset[0]
for k, v in encoding.items():
    print(k, v.shape)


# Restore the pad token where the dataset put -100, so the ids can be decoded.
labels = encoding["labels"]
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

# image

# 2 Fine-tuning

In [None]:
import evaluate
from transformers import GenerationConfig, TrainingArguments

import copy

from tqdm.auto import tqdm
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter

from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

from accelerate import Accelerator
from accelerate.utils import LoggerType

## 2.1 DoRA

In [None]:
# Adapter configuration with DoRA and rank-stabilised scaling both enabled.
# NOTE(review): the target module names presumably cover the encoder
# ("query"/"value") and decoder ("q_proj"/"v_proj") attention projections —
# confirm against the model's module names.
dora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value", "q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    use_dora=True,
    use_rslora=True,
)
# Wrap the model with the adapters and report the trainable parameter count.
model = get_peft_model(model, dora_config)
model.print_trainable_parameters()

## 2.2 Metrics and tools

In [None]:
# Metric objects used as globals by compute_metrics.
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred_ids, label_ids):
    """Score one batch of predictions against its labels.

    Args:
        pred_ids: Generated token ids for the batch.
        label_ids: Label token ids in which ignored positions are -100; the
            tensor is cloned, so the caller's copy is left untouched.

    Returns:
        Tuple ``(cer, wer, accuracy)`` where accuracy is the percentage of
        predictions whose decoded text equals the decoded reference exactly.
    """
    # Put the pad token back where -100 was used, so the ids can be decoded.
    reference_ids = label_ids.clone()
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(pred_ids, skip_special_tokens=True)
    references = processor.batch_decode(reference_ids, skip_special_tokens=True)

    # CER and WER
    cer = cer_metric.compute(predictions=hypotheses, references=references)
    wer = wer_metric.compute(predictions=hypotheses, references=references)

    # Accuracy: share of exact string matches, in percent
    exact_matches = 0
    for hypothesis, reference in zip(hypotheses, references):
        if hypothesis == reference:
            exact_matches += 1
    accuracy = exact_matches / len(references) * 100

    return cer, wer, accuracy

In [None]:
def evaluate_model(model, dataloader, device):
    """Run generation over ``dataloader`` and return averaged metrics.

    The model is switched to eval mode and left in it. Per-batch scores from
    ``compute_metrics`` are weighted by batch size, so a smaller final batch
    no longer counts as much as a full one and ``Accuracy`` is the exact
    per-sample percentage. ``CER``/``WER`` remain sample-weighted means of
    per-batch scores, not corpus-level scores.

    Args:
        model: Model exposing ``eval()`` and ``generate(pixel_values)``.
        dataloader: Iterable of dicts with ``pixel_values`` and ``labels``.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with the keys ``"CER"``, ``"WER"`` and ``"Accuracy"``.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    cer_sum, wer_sum, accuracy_sum = 0.0, 0.0, 0.0
    num_samples = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            outputs = model.generate(pixel_values)

            cer, wer, accuracy = compute_metrics(outputs, labels)

            # Weight each batch by the number of samples it contains.
            batch_samples = labels.size(0)
            cer_sum += cer * batch_samples
            wer_sum += wer * batch_samples
            accuracy_sum += accuracy * batch_samples
            num_samples += batch_samples

    if num_samples == 0:
        raise ValueError("evaluate_model received a dataloader with no samples")

    return {
        "CER": cer_sum / num_samples,
        "WER": wer_sum / num_samples,
        "Accuracy": accuracy_sum / num_samples,
    }

In [None]:
def beam_search(model):
    """Configure ``model`` in place for beam-search text generation.

    Copies the special-token ids from the global ``processor`` tokenizer into
    the model config, then installs a fresh ``GenerationConfig`` with the
    beam-search settings and the same special-token ids.
    """
    tokenizer = processor.tokenizer

    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size

    # set beam search parameters
    model.generation_config = GenerationConfig(
        max_length=64,
        early_stopping=True,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        num_beams=4,
    )

    # special tokens for generation: SEP ends a sequence, CLS starts it
    special_token_ids = (
        ("eos_token_id", tokenizer.sep_token_id),
        ("decoder_start_token_id", tokenizer.cls_token_id),
        ("pad_token_id", tokenizer.pad_token_id),
    )
    for field_name, token_id in special_token_ids:
        setattr(model.generation_config, field_name, token_id)

In [None]:
# Configure the generation parameters on the adapter-wrapped model.
beam_search(model)

# With more than one GPU, wrap the model in DataParallel; the wrapped model is
# then reachable through `model.module`.
if multi_gpu:
    model = torch.nn.DataParallel(model)

## 2.3 Auxiliary functions

In [None]:
import os
import zipfile
import shutil
import time
import socket
import ssl

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from googleapiclient.errors import HttpError


FOLDER_ID = ""  # paste folder ID from your Google Drive


def create_zip(folder_to_zip, zip_path):
    """Archive the contents of ``folder_to_zip`` into a zip file at ``zip_path``.

    Args:
        folder_to_zip: Directory whose contents become the archive root.
        zip_path: Destination file. A trailing ``.zip`` is not required; the
            finished archive always ends up at exactly this path.
    """
    # make_archive appends ".zip" itself, so strip the suffix only when it is
    # really there instead of blindly cutting the last four characters.
    stem, extension = os.path.splitext(zip_path)
    base_name = stem if extension.lower() == ".zip" else zip_path

    archive_path = shutil.make_archive(
        base_name=base_name, format="zip", root_dir=folder_to_zip
    )

    # Move the archive when make_archive's name differs from the requested one.
    if os.path.abspath(archive_path) != os.path.abspath(zip_path):
        shutil.move(archive_path, zip_path)

    # normpath drops a trailing slash, which would make basename() empty.
    folder_name = os.path.basename(os.path.normpath(folder_to_zip))
    print(f"Zip archive of the {folder_name} created")


def upload_to_drive(file_path, file_name, folder_id, max_retries=5):
    """Upload a zip file to a Google Drive folder, retrying on failure.

    Uses the module-level ``drive_service`` client. The upload is best effort:
    after ``max_retries`` failed attempts a message is printed and the
    function returns normally so the caller can continue.

    Args:
        file_path: Local path of the file to upload.
        file_name: Name the file gets on Google Drive.
        folder_id: ID of the destination Drive folder.
        max_retries: Maximum number of upload attempts.
    """
    file_metadata = {"name": file_name, "parents": [folder_id]}

    media = MediaFileUpload(file_path, mimetype="application/zip")

    for attempt in range(max_retries):
        try:
            drive_service.files().create(
                body=file_metadata, media_body=media
            ).execute()
        except (HttpError, ssl.SSLEOFError, socket.timeout) as e:
            print(f"Uploading error: {e}, attempt {attempt+1} from {max_retries}")
            # Pause only when another attempt follows; waiting after the last
            # failure would just delay the caller.
            if attempt + 1 < max_retries:
                time.sleep(5)
        else:
            print(f"File {file_name} uploaded to Google Drive")
            return

    print(
        f"Failed to upload {file_name} after {max_retries} attempts. We continue to execute..."
    )


def main_model():
    """Zip the model directory and upload the archive to Google Drive."""
    archive_name = "dialectic-mix-dora-1.1.zip"
    source_dir = "/kaggle/working/model"  # Model directory to zip
    archive_path = f"/kaggle/working/{archive_name}"  # Path for model zip file

    # Remove an archive left over from an earlier run before rebuilding it.
    archive_exists = os.path.exists(archive_path)
    if archive_exists:
        os.remove(archive_path)

    create_zip(source_dir, archive_path)
    upload_to_drive(archive_path, archive_name, FOLDER_ID)


def main_logs():
    """Zip the logs directory and upload the archive to Google Drive."""
    archive_name = "dialectic-mix-dora-1.1-logs.zip"
    source_dir = "/kaggle/working/logs"  # Logs directory to zip
    archive_path = f"/kaggle/working/{archive_name}"  # Path for logs zip file

    # Remove an archive left over from an earlier run before rebuilding it.
    archive_exists = os.path.exists(archive_path)
    if archive_exists:
        os.remove(archive_path)

    create_zip(source_dir, archive_path)
    upload_to_drive(archive_path, archive_name, FOLDER_ID)


def zip_best_model():
    """Zip the best-model checkpoint directory; the archive is not uploaded."""
    base_archive_name = "dialectic-mix-dora-1.1.zip"
    # Best model checkpoint directory
    source_dir = "/kaggle/working/model/TrOCRModel/best_model"
    # Path for best model zip
    archive_path = f"/kaggle/working/best_model-{base_archive_name}"

    # Remove an archive left over from an earlier save before rebuilding it.
    archive_exists = os.path.exists(archive_path)
    if archive_exists:
        os.remove(archive_path)

    create_zip(source_dir, archive_path)

## 2.4 Accelerate

In [None]:
torch.cuda.empty_cache()

In [None]:
# TrainingArguments serves as a settings container here: the cells below read
# its attributes (output_dir, learning_rate, num_train_epochs, logging_dir,
# fp16, gradient_accumulation_steps) directly.
training_args = TrainingArguments(
    output_dir="model/TrOCRModel/weights",
    learning_rate=5e-5,
    num_train_epochs=num_epochs,
    logging_dir="logs",
    fp16=True,
    gradient_accumulation_steps=4,
)

# Best validation CER seen so far and where the matching model is saved.
best_cer = float("inf")
best_model_path = os.path.join("model/TrOCRModel", "best_model")

In [None]:
# Accelerator handles mixed precision and gradient accumulation.
accelerator = Accelerator(
    mixed_precision="fp16" if training_args.fp16 else "no",
    gradient_accumulation_steps=training_args.gradient_accumulation_steps,
)

# Only the main process writes TensorBoard logs.
if accelerator.is_main_process:
    writer = SummaryWriter(log_dir=training_args.logging_dir)

optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# The prepared scheduler only advances on iterations where the optimizer
# really updates the weights, i.e. once per `gradient_accumulation_steps`
# batches (plus the final partial group of each epoch). Size the schedule in
# optimizer updates, not in batches; otherwise warmup and cosine decay are
# stretched and never complete. The length is taken before `prepare`, while
# the dataloader is still unsharded.
accumulation_steps = training_args.gradient_accumulation_steps
updates_per_epoch = (len(train_dataloader) + accumulation_steps - 1) // accumulation_steps
num_training_steps = updates_per_epoch * training_args.num_train_epochs
warmup_steps = int(0.1 * num_training_steps)

scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
)

model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    val_dataloader,
    scheduler,
)

device = accelerator.device
model.to(device)

In [None]:
# Training loop: one pass over the training data per epoch, followed by
# validation on a merged copy of the model and a best-model checkpoint.
for epoch in range(training_args.num_train_epochs):
    model.train()
    train_loss = 0.0

    progress_bar = tqdm(
        train_dataloader,
        desc=f"Epoch {epoch+1}",
        disable=not accelerator.is_local_main_process,
    )

    for batch in progress_bar:
        with accelerator.accumulate(model):
            outputs = model(**batch)
            # .mean() reduces the per-replica losses returned under DataParallel.
            loss = outputs.loss.mean()

            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            # Clear gradients only AFTER the update. Zeroing at the top of the
            # iteration would, on the batch where the update happens, throw
            # away the gradients accumulated from the preceding batches.
            optimizer.zero_grad()

            train_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_dataloader)

    # Validation. unwrap_model strips a DataParallel/DDP wrapper when there is
    # one and returns the model unchanged otherwise, so this also works on a
    # single GPU where `model.module` does not exist. Merging happens on a
    # deep copy so the adapters in the training model stay separate.
    model_copy = copy.deepcopy(accelerator.unwrap_model(model))
    merged_model = model_copy.merge_and_unload()
    metrics = evaluate_model(merged_model, val_dataloader, device)
    print(
        f"Epoch {epoch+1}: Validation CER: {metrics['CER']:.4f}, WER: {metrics['WER']:.4f}, Accuracy: {metrics['Accuracy']:.2f}%, Train loss: {avg_train_loss:.2f}"
    )

    # Logging on main process
    if accelerator.is_main_process:
        writer.add_scalar("Loss/train", avg_train_loss, epoch)
        writer.add_scalar("Metrics/CER", metrics["CER"], epoch)
        writer.add_scalar("Metrics/WER", metrics["WER"], epoch)
        writer.add_scalar("Metrics/Accuracy", metrics["Accuracy"], epoch)

    # Save the best model
    if metrics["CER"] < best_cer:
        best_cer = metrics["CER"]

        if accelerator.is_main_process:
            print(f"New best CER: {best_cer:.4f}. Saving copy merged model...")
            merged_model.save_pretrained(best_model_path)
            processor.save_pretrained(best_model_path)
            # NOTE(review): save_state runs on the main process only; confirm
            # this is safe if the notebook is ever launched with several
            # processes.
            accelerator.save_state(os.path.join(best_model_path, "checkpoint"))
            zip_best_model()

    accelerator.wait_for_everyone()
    # Drop the evaluation copies before emptying the cache; otherwise they
    # stay referenced until the next epoch creates new ones.
    del model_copy, merged_model
    torch.cuda.empty_cache()


# Final export: merge the adapters into the trained model and save it together
# with the processor.
merged_model = accelerator.unwrap_model(model).merge_and_unload()
processor.save_pretrained(training_args.output_dir)
unwrapped_model = accelerator.unwrap_model(merged_model)
unwrapped_model.save_pretrained(training_args.output_dir)

accelerator.end_training()
accelerator.save_state(os.path.join("model/TrOCRModel/saves_states", "checkpoint"))

if accelerator.is_main_process:
    writer.close()

In [None]:
# Zip the model and logs directories and upload both archives to Google Drive.
main_model()
main_logs()

# 3 Test

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel


# Load the model to test. This rebinds the global `processor` and `model`, so
# compute_metrics decodes with this processor from here on.
model_name = "/kaggle/input/dialectic-stackmix-2.0-dora-1.3/other/default/1"
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

model.to(device)

In [None]:
# Evaluate the loaded model on the validation dataloader and print each metric.
results = evaluate_model(model, val_dataloader, device)
for k, v in results.items():
    print(f"{k}: {v}")

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import os


# Visual spot check: transcribe and display a handful of images.
path = "/kaggle/input/dialectic-real-all/images"
# Takes the first five directory entries in whatever order os.listdir returns.
for file in os.listdir(path)[:5]:
    file_path = os.path.join(path, file)
    # Skip entries that are not regular files (e.g. sub-directories).
    if not os.path.isfile(file_path):
        continue

    image = Image.open(file_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Generate token ids for the single image and decode them to text.
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

    plt.imshow(image)
    plt.show()