In [1]:
!pip install -q -U transformers==4.37.2 bitsandbytes==0.41.3 accelerate==0.25.0 datasets torchvision


In [2]:
# Authenticate this session with the Hugging Face Hub. notebook_login() renders
# an interactive widget (the VBox shown in this cell's output).
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# PEFT provides LoraConfig / get_peft_model, which are imported further down.
# NOTE(review): unlike the first install cell, this install is unpinned and not
# quiet — consider pinning a version and merging it into that cell.
!pip install peft



In [4]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
import torchvision.transforms as transforms
import os

import requests
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import load_dataset
from torch.utils.data import DataLoader


In [5]:
import os

import requests
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig


In [6]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint to fine-tune. The commented-out alternative is a tiny random
# checkpoint (presumably kept for quick smoke tests — confirm with the author).
# checkpoint = "HuggingFaceM4/tiny-random-idefics"
checkpoint = "HuggingFaceM4/idefics-9b"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
# Here we skip some special modules that can't be quantized properly
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

# NOTE(review): token=True presumably makes the Hub client use the locally
# stored login token — confirm against the transformers documentation.
processor = AutoProcessor.from_pretrained(checkpoint, token=True)
# Simply take-off the quantization_config arg if you want to load the original model
model = IdeficsForVisionText2Text.from_pretrained(checkpoint, quantization_config=bnb_config, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [7]:
# load_dataset is already imported in an earlier cell; repeating it is harmless.
from datasets import load_dataset

# Load every split of the chart question-answering dataset by its repository id.
dataset = load_dataset("Peppertuna/ChartQADatasetV2")


In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['imgname', 'image', 'table', 'query', 'query_token', 'label'],
        num_rows: 20901
    })
    validation: Dataset({
        features: ['imgname', 'image', 'table', 'query', 'query_token', 'label'],
        num_rows: 960
    })
    test: Dataset({
        features: ['imgname', 'image', 'table', 'query', 'query_token', 'label'],
        num_rows: 1250
    })
})

In [9]:

# Both calls below use the `train_test_split` *method* of `datasets.Dataset`;
# scikit-learn's function of the same name is not involved, so it is not imported.

# Keep the 80% "train" side of a seeded 80/20 split of the original train split.
train_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)["train"]
# Evaluate on the 20% "test" side of a seeded 80/20 split of the original test split.
# NOTE(review): the dataset also ships a "validation" split that is unused here —
# confirm that selecting checkpoints on test data is intended.
eval_dataset = dataset["test"].train_test_split(test_size=0.2, seed=42)["test"]

In [10]:
def convert_to_rgb(image):
    """Return `image` as an RGB image, flattening any transparency onto white.

    Images whose mode is already "RGB" are returned unchanged. Anything else is
    converted to RGBA, alpha-composited over a white background of the same
    size, and then converted to RGB.

    A plain `image.convert("RGB")` is deliberately avoided: per the original
    author's note it produces a wrong background for non-JPEG images.
    """
    if image.mode != "RGB":
        rgba = image.convert("RGBA")
        white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255))
        # Compositing handles the transparent pixels before alpha is dropped.
        image = Image.alpha_composite(white_canvas, rgba).convert("RGB")
    return image

In [11]:
def ds_transforms(example_batch):
    """Turn a batch of raw examples into processor outputs with labels attached.

    Reads the 'image', 'query' and 'label' columns of `example_batch`, pairs
    each image with a "Question: ... Answer: ...</s>" text, and runs the pairs
    through the global `processor` with a torchvision image pipeline. The
    result is moved to the global `device`, and its `input_ids` are also stored
    under "labels".
    """
    img_proc = processor.image_processor
    side = img_proc.image_size

    # Image pipeline: force RGB, lightly randomized bicubic crop to the model's
    # input size, tensor conversion, then normalization with the processor's stats.
    image_transform = transforms.Compose([
        convert_to_rgb,
        transforms.RandomResizedCrop(
            (side, side),
            scale=(0.9, 1.0),
            interpolation=transforms.InterpolationMode.BICUBIC,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=img_proc.image_mean, std=img_proc.image_std),
    ])

    # One [image, text] prompt per example in the batch.
    prompts = [
        [picture, f"Question: {question} Answer: {answer}.</s>"]
        for picture, question, answer in zip(
            example_batch['image'], example_batch['query'], example_batch['label']
        )
    ]

    inputs = processor(prompts, transform=image_transform, return_tensors="pt").to(device)
    # The same token ids serve as inputs and as training targets.
    inputs["labels"] = inputs["input_ids"]
    return inputs

In [12]:
# Register ds_transforms as the transform of both splits.
# NOTE(review): the evaluation split therefore also goes through the
# RandomResizedCrop inside ds_transforms — confirm that is intended.
train_dataset.set_transform(ds_transforms)
eval_dataset.set_transform(ds_transforms)

In [13]:
# Short model name: the part of the checkpoint id after the "/". It is used
# further down to build the training output directory.
model_name = checkpoint.split("/")[1]
# LoRA settings: rank 64, alpha 32, 5% dropout, applied to the modules named
# q_proj / k_proj / v_proj.
config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
# Wrap the quantized model with the LoRA adapters.
# NOTE(review): this rebinds `model`, so re-running the cell would wrap the
# already-wrapped model; restart the kernel and run all cells instead.
model = get_peft_model(model, config)

In [14]:
model.print_trainable_parameters()

trainable params: 79,003,648 || all params: 9,008,683,280 || trainable%: 0.8769722005367249


In [15]:
train_dataset

Dataset({
    features: ['imgname', 'image', 'table', 'query', 'query_token', 'label'],
    num_rows: 16720
})

In [16]:

# Work on small fixed slices: the first 1000 training rows and the first 100
# evaluation rows.
train_dataset_subset = train_dataset.select(range(1000))
eval_dataset_subset=eval_dataset.select(range(100))

In [17]:


# Define compute_loss function
def compute_loss(logits, labels):
    """Return the mean cross-entropy between `logits` and `labels` as a float.

    Args:
        logits: Unnormalized scores, as a tensor or anything `torch.as_tensor`
            accepts (for example a NumPy array or nested lists). Either
            `(N, num_classes)` or `(batch, seq_len, vocab_size)`.
        labels: Integer class ids shaped `(N,)` or `(batch, seq_len)`
            respectively. Positions equal to -100 are ignored (the default
            `ignore_index` of cross-entropy).

    Returns:
        The scalar loss as a Python float.

    For 3-D logits the causal language-modelling convention is assumed: the
    prediction at position t is scored against the label at position t + 1.
    """
    # Accept arrays as well as tensors; upcast so half-precision logits are
    # handled the same way as float32 ones.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    if logits.dim() == 3:
        # Drop the last prediction and the first label, then flatten so every
        # remaining token is one classification example.
        vocab_size = logits.size(-1)
        logits = logits[:, :-1, :].reshape(-1, vocab_size)
        labels = labels[:, 1:].reshape(-1)

    # Reached through `torch`, so no separate `torch.nn` import is needed.
    loss = torch.nn.functional.cross_entropy(logits, labels)

    return loss.item()  # Return the loss as a scalar value

# Define compute_metrics function to compute evaluation metrics including loss
def compute_metrics(eval_pred):
    """Compute the evaluation loss from a `(predictions, labels)` pair.

    Args:
        eval_pred: A pair that unpacks into the model predictions and the
            ground-truth labels.

    Returns:
        A dict with a single entry, `"eval_loss"`, holding the loss as a float.
    """
    logits, labels = eval_pred
    # When the model returns several outputs, the predictions arrive as a tuple.
    # NOTE(review): assumes the logits are its first element — confirm for this model.
    if isinstance(logits, tuple):
        logits = logits[0]
    loss = compute_loss(logits, labels)
    return {"eval_loss": loss}


In [19]:
from transformers import Trainer, TrainingArguments
import torch.cuda.amp as amp  # Import AMP

# Assuming you have already defined your model, train_dataset_subset, eval_dataset_subset, and compute_metrics function

# Training arguments
training_args = TrainingArguments(
    output_dir=f"{model_name}-chartsdataset",
    learning_rate=2e-4,
    fp16=True,  # Enable mixed precision training (fp16)
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=8,  # effective train batch per device: 6 * 8 = 48
    # ds_transforms already moves each batch to `device`, so the dataloader
    # must not try to pin it.
    dataloader_pin_memory=False,
    save_total_limit=3,
    # Evaluate, checkpoint and log on the same 5-step cadence; matching
    # eval/save steps are needed by load_best_model_at_end.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=5,
    eval_steps=5,
    logging_steps=5,
    max_steps=10,
    # Keep the raw image/query/label columns: ds_transforms reads them.
    remove_unused_columns=False,
    push_to_hub=False,
    # Must match the key that ds_transforms writes (inputs["labels"]). With a
    # non-matching name the Trainer finds no labels during evaluation, reports
    # no eval loss, and load_best_model_at_end has nothing to compare.
    label_names=["labels"],
    load_best_model_at_end=True,
    # The string "none" disables all reporting integrations; passing None
    # falls back to the library default instead.
    report_to="none",
    optim="paged_adamw_8bit",  # paged 8-bit AdamW
)

# Initialize Trainer
# No compute_metrics is passed: once the labels are found, the Trainer reports
# `eval_loss` from the model's own loss, and a compute_metrics callback would
# force every batch's full-vocabulary logits to be accumulated during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_subset,
    eval_dataset=eval_dataset_subset,
)


In [23]:
# Example fix
import torch

def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
    """Concatenate two tensors along dim 0, padding dim 1 so that they match.

    Args:
        tensor1: First tensor; its dtype and device are used for the result.
        tensor2: Second tensor, stacked below `tensor1`.
        padding_index: Fill value for the padded positions of the narrower
            tensor.

    Returns:
        A tensor with `tensor1.size(0) + tensor2.size(0)` rows whose dim 1 is
        the larger of the two inputs' dim-1 sizes. No input values are dropped.
    """
    # 1-D inputs, or inputs that already agree on dim 1, need no padding.
    if tensor1.dim() == 1 or tensor1.size(1) == tensor2.size(1):
        return torch.cat((tensor1, tensor2), dim=0)

    rows1, rows2 = tensor1.size(0), tensor2.size(0)
    width = max(tensor1.size(1), tensor2.size(1))
    out_shape = (rows1 + rows2, width) + tuple(tensor1.shape[2:])

    # Start from a tensor full of the padding value and copy each input into
    # its own rows; whatever is left uncovered stays as padding.
    result = tensor1.new_full(out_shape, padding_index)
    result[:rows1, : tensor1.size(1)] = tensor1
    result[rows1:, : tensor2.size(1)] = tensor2
    return result


In [None]:
eval_dataset