In [2]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from PIL import Image
from datasets import Dataset
import pandas as pd

# Load the processor and model
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

# Load the CSV file
data = pd.read_csv("english.csv")

# Shuffle the DataFrame
data = data.sample(frac=1).reset_index(drop=True)

# Create a list of dictionaries for the dataset
dataset = []

# Loop through the data and load images and labels
for i in range(len(data)):
    image_add = data.iloc[i]["image"]
    image_path = "Img/" + image_add  # Assuming images are in the "Img" directory
    label = data.iloc[i]["label"]
    dataset.append({"image": image_path, "text": label})  # Add image path and label to the dataset list

# Convert the list to a Dataset object
dataset = Dataset.from_pandas(pd.DataFrame(dataset))

# Preprocess the images and texts
def preprocess_function(examples):
    images = [Image.open(image_path).convert("RGB") for image_path in examples['image']]
    pixel_values = processor(images, return_tensors="pt").pixel_values
    labels = processor.tokenizer(examples['text'], return_tensors="pt", padding=True, truncation=True).input_ids
    labels[labels == processor.tokenizer.pad_token_id] = -100  # Set padding tokens to -100 for loss calculation
    return {"pixel_values": pixel_values, "labels": labels}

processed_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["image", "text"])

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    eval_steps=10,
    num_train_epochs=3,
)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    eval_dataset=processed_dataset,
    tokenizer=processor.feature_extractor,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./trocr-finetuned")
processor.save_pretrained("./trocr-finetuned")


KeyboardInterrupt: 

In [3]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from PIL import Image
from datasets import Dataset
import pandas as pd

In [None]:
# Load the processor and model
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')

pytorch_model.bin:   0%|          | 0.00/2.23G [00:00<?, ?B/s]

In [6]:
# Load the CSV file
data = pd.read_csv("english.csv")

# Shuffle the DataFrame
data = data.sample(frac=1).reset_index(drop=True)

# Create a list of dictionaries for the dataset
dataset = []

# Loop through the data and load images and labels
for i in range(len(data)):
    image_add = data.iloc[i]["image"]
    image_path = "Img/" + image_add  # Assuming images are in the "Img" directory
    label = data.iloc[i]["label"]
    dataset.append({"image": image_path, "text": label})  # Add image path and label to the dataset list

# Convert the list to a Dataset object
dataset = Dataset.from_pandas(pd.DataFrame(dataset))

In [7]:
# Preprocess the images and texts
def preprocess_function(examples):
    pixel_values = processor(examples['image'], return_tensors="pt").pixel_values.squeeze()
    labels = processor.tokenizer(examples['label'], return_tensors="pt").input_ids
    return {"pixel_values": pixel_values, "labels": labels}

In [8]:
processed_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3410 [00:00<?, ? examples/s]

AttributeError: module 'tensorflow' has no attribute 'Tensor'

In [None]:
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    eval_steps=10,
    num_train_epochs=3,
)

In [None]:
# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    eval_dataset=processed_dataset,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Save the model
model.save_pretrained("./trocr-finetuned")
processor.save_pretrained("./trocr-finetuned")