In [None]:
!pip install datasets
!pip install peft
!pip install -U bitsandbytes


In [2]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `data` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authent

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import torch
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration

DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")


In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
tokenizer.pad_token_id = tokenizer.eos_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration,AutoModelForPreTraining
from peft import LoraConfig  # Import LoRA configuration

# Check if LoRA or QLoRA should be used
if USE_QLORA or USE_LORA:
    # Define LoRA configuration
    lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    init_lora_weights="gaussian"
)

    # If using QLoRA, configure Bits and Bytes
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    else:
        bnb_config = None  # Set to None if not using QLoRA

    # Load model with quantization configuration if using QLoRA
    model =AutoModelForPreTraining.from_pretrained(
        "meta-llama/Llama-3.2-11B-Vision-Instruct",
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
    )

    # Add LoRA configuration to the model
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    # Load model without LoRA
    model = AutoModelForPreTraining.from_pretrained(
        "meta-llama/Llama-3.2-11B-Vision-Instruct",
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
    ).to(DEVICE)



`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
processor.pad_token_id = tokenizer.pad_token_id

In [7]:
from datasets import load_dataset

dataset = load_dataset("mdwiratathya/SLAKE-vqa-english")



In [8]:
from datasets import load_dataset
from torch.utils.data import Dataset
from PIL import Image
import random

class VQADataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        question = self.dataset[idx]['question']
        answer = self.dataset[idx]['answer']
        image = self.dataset[idx]['image']  # Assuming it's a PIL image

        return {
            "image": image,
            "query": {"en": question},
            "answers": [answer]
        }


# Define the MyDataCollator class
class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token = '<image>'
        self.end_of_utterance_token = '<end_of_utterance>'

    def __call__(self, examples):
        texts = []
        for example in examples:
            question = example["query"]['en']
            answer = random.choice(example["answers"])

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "text", "text": self.image_token},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                        {"type": "text", "text": self.end_of_utterance_token}
                    ]
                }
            ]

            # Assuming processor has apply_chat_template method
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)

            if isinstance(text, list):
                text = " ".join(str(item) for item in text)
            else:
                text = str(text)

            texts.append(text.strip())

        # Process text batch using processor
        text_batch = self.processor(text=texts, return_tensors="pt", padding=True)

        # Prepare the batch dictionary
        batch = {
            "input_ids": text_batch["input_ids"],
            "attention_mask": text_batch["attention_mask"]
        }

        # Create labels
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.pad_token_id] = -100  # Ignore padding in loss calculation
        batch["labels"] = labels

        return batch



train_dataset = VQADataset(dataset=dataset['train'], processor=processor)
test_dataset = VQADataset(dataset=dataset['test'], processor=processor)
valid_dataset = VQADataset(dataset=dataset['validation'], processor=processor)

# Data collator for batching
data_collator = MyDataCollator(processor=processor)


In [9]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


training_args = TrainingArguments(
    output_dir="IDEFICS2",
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    max_steps=200,
    logging_steps=5,
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    report_to="none",
    optim="paged_adamw_8bit",
    weight_decay=0.05,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)



In [10]:
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset
)

In [11]:
trainer.train()


Step,Training Loss,Validation Loss
50,0.3425,0.338918
100,0.256,0.246567
150,0.1934,0.213155


Step,Training Loss,Validation Loss
50,0.3425,0.338918
100,0.256,0.246567
150,0.1934,0.213155
200,0.2201,0.194988


TrainOutput(global_step=200, training_loss=0.39382890701293943, metrics={'train_runtime': 1867.9482, 'train_samples_per_second': 0.857, 'train_steps_per_second': 0.107, 'total_flos': 5855656152532098.0, 'train_loss': 0.39382890701293943, 'epoch': 0.32526936369180726})

In [12]:
results = trainer.evaluate()

# Print the evaluation results
print(results)

{'eval_loss': 0.19498774409294128, 'eval_runtime': 263.6435, 'eval_samples_per_second': 3.994, 'eval_steps_per_second': 3.994, 'epoch': 0.32526936369180726}


In [16]:
test_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(test_results)

{'eval_loss': 0.19015422463417053, 'eval_runtime': 266.2583, 'eval_samples_per_second': 3.985, 'eval_steps_per_second': 3.985, 'epoch': 0.32526936369180726}


In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,LlavaProcessor

#32
save_directory = "/content/drive/MyDrive/VQA12Feb/models/Loralamaslake"

from transformers import AutoConfig

# config.save_pretrained(save_directory)
model.save_pretrained(save_directory)
processor.save_pretrained(save_directory)

print(f"✅ Model saved at: {save_directory}")

✅ Model saved at: /content/drive/MyDrive/VQA12Feb/models/Loralamaslake


In [15]:
from huggingface_hub import HfApi

# Set up the model and tokenizer save path
save_directory = "/content/drive/MyDrive/VQA12Feb/models/Loralamaslake"
repo_name = "LoraLamaSlake"
user_name = "Datargets"

api = HfApi()

api.create_repo(repo_id=f"{user_name}/{repo_name}", private=True, repo_type="model")

api.upload_folder(
    folder_path=save_directory,
    repo_id=f"{user_name}/{repo_name}",
    path_in_repo=""
)

print(f"✅ Model uploaded to Hugging Face privately: https://huggingface.co/{user_name}/{repo_name}")


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/47.3M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

✅ Model uploaded to Hugging Face privately: https://huggingface.co/Datargets/LoraLamaSlake
