In [None]:
! pip install torch torchvision torchaudio --quiet
! pip install transformers --quiet
! pip install datasets --quiet
! pip install accelerate --quiet
! pip install sentencepiece --quiet
! pip install peft --quiet

In [None]:
# Import necessary libraries and modules
import torch  # PyTorch library for tensor computations and deep learning
import os  # Module for interacting with the operating system
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig  # Transformers library for model and processor
from transformers.image_utils import load_image  # Utility for loading images
from datasets import load_dataset
from PIL import Image
import PIL
PIL.Image.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Check the available device (GPU, MPS, or CPU) for computation
device = (
    "cuda"  # Use CUDA if available
    if torch.cuda.is_available()
    else "mps"  # Use Metal Performance Shaders (MPS) if available
    if torch.backends.mps.is_available()
    else "cpu"  # Default to CPU if no GPU or MPS is available
)

# Configure quantization settings for efficient model loading
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load model weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # Use double quantization for better accuracy
    bnb_4bit_quant_type="nf4",  # Specify quantization type as NF4
    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
)

# Define the model name to be loaded
model_name = "HuggingFaceTB/SmolVLM-500M-Instruct"

# Load the pre-trained model with the specified quantization configuration
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for model computations
    _attn_implementation="flash_attention_2",  # Use optimized attention implementation
).to(device)  # Move the model to the selected device

# Load the processor associated with the model for preprocessing inputs
processor = AutoProcessor.from_pretrained(model_name)

In [None]:
# Define the dataset name to be loaded
dataset_name = 'naver-clova-ix/cord-v1'

# Load the dataset using the Hugging Face datasets library
# The dataset contains scanned document images and their corresponding ground truth labels
ds = load_dataset(dataset_name)

# Display the loaded dataset structure
ds


Generating train split: 100%|██████████| 800/800 [00:03<00:00, 204.34 examples/s]
Generating validation split: 100%|██████████| 100/100 [00:00<00:00, 224.03 examples/s]
Generating test split: 100%|██████████| 100/100 [00:00<00:00, 258.14 examples/s]


DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 100
    })
    test: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 100
    })
})

In [None]:
import matplotlib.pyplot as plt
sample = ds['train'][3]

sample['image_size'] = sample['image'].size,

plt.imshow(sample['image'])
plt.axis("off")
plt.title("Sample Chart Image")
plt.show()

print(sample)

query = 'Extract the nutritional facts from the image.'
# Preprocess the sample
prompt = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}]
formatted_query = processor.apply_chat_template(prompt, tokenize=False)

inputs = processor(
    images=sample["image"], 
    text=formatted_query, 
    return_tensors="pt"
).to(device)
inputs = {key: val.to(device, dtype=torch.bfloat16) if val.dtype == torch.float else val.to(device) for key, val in inputs.items()}

# Generate predictions
with torch.no_grad():
    outputs = model.generate(**inputs,
                             max_length=1600)

# Decode the prediction
prediction = processor.batch_decode(outputs, skip_special_tokens=True)

# Display the result
print(f"Query: {query}")
print(f"Expected Answer: {sample['ground_truth']}")
print(f"Model Prediction: {prediction[0]}")

In [11]:
import json
print(json.loads((sample['ground_truth'])))


{'gt_parse': {'menu': [{'nm': 'Bintang Bremer', 'cnt': '1', 'price': '59,000'}, {'nm': 'Chicken H-H', 'cnt': '1', 'price': '190,000'}, {'nm': 'Ades', 'cnt': '1', 'price': '10,000'}], 'sub_total': {'subtotal_price': '259,000', 'discount_price': '19,000', 'service_price': '9,600', 'tax_price': '52,416'}, 'total': {'total_price': '302,016'}}, 'meta': {'version': '1.0.0', 'split': 'train', 'image_id': 3, 'image_size': {'width': 1108, 'height': 1478}}, 'valid_line': [{'words': [{'quad': {'x2': 261, 'y3': 969, 'x3': 258, 'y4': 966, 'x1': 188, 'y1': 932, 'x4': 186, 'y2': 935}, 'is_key': 1, 'row_id': 988386, 'text': 'Sub'}, {'quad': {'x2': 353, 'y3': 967, 'x3': 352, 'y4': 966, 'x1': 265, 'y1': 932, 'x4': 264, 'y2': 931}, 'is_key': 1, 'row_id': 988386, 'text': 'Total'}, {'quad': {'x2': 851, 'y3': 967, 'x3': 851, 'y4': 968, 'x1': 704, 'y1': 925, 'x4': 705, 'y2': 926}, 'is_key': 0, 'row_id': 988386, 'text': '259,000'}], 'category': 'sub_total.subtotal_price', 'group_id': 31}, {'words': [{'quad': 

In [9]:
from peft import LoraConfig, get_peft_model

# Configure LoRA
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply PEFT model adaptation
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 1,114,112 || all params: 508,596,416 || trainable%: 0.2191


In [None]:
# Define a function to collate and preprocess a batch of examples for training
def collate_fn(examples):
    """
    Collates and preprocesses a batch of examples for training.

    Args:
        examples (list): A list of examples, where each example is a dictionary containing:
            - 'image': A PIL image object representing the input image.
            - 'ground_truth': The ground truth data associated with the image.

    Returns:
        dict: A dictionary containing the following keys:
            - 'input_ids': Tokenized text inputs.
            - 'attention_mask': Attention masks for the text inputs.
            - 'pixel_values': Preprocessed image tensors.
            - 'labels': Tokenized labels with padding tokens and image tokens masked out.
    """
    # Define the system message for the Vision Language Model
    system_message = """You are a Vision Language Model specialized in interpreting visual data from a dataset containing fast-selling food product images and their details.
    Your task is to analyze the provided image and extract structured nutritional facts such as calories, fat content, protein, and other relevant information.
    Your responses should be concise, accurate, and relevant to the visual content of the product. Avoid additional explanation unless absolutely necessary."""

    # Initialize lists to store text and image inputs
    text_inputs = []
    image_inputs = []

    # Iterate through each example in the batch
    for example in examples:
        # Format the example with system and user messages
        formatted_example = {
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_message}],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                        },
                        {
                            "type": "text",
                            "text": query,
                        },
                    ],
                },
            ]
        }
        # Apply the chat template to format the text input
        text_inputs.append(processor.apply_chat_template(formatted_example["messages"], tokenize=False).strip())
        
        # Ensure the image is in RGB mode
        image = example["image"]
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image_inputs.append([image])

    # Preprocess the text and image inputs using the processor
    batch = processor(
        text=text_inputs,
        images=image_inputs,
        return_tensors="pt",
        padding=True
    )

    # Clone the input IDs to create labels
    labels = batch["input_ids"].clone()
    # Replace padding token IDs with -100 to ignore them during loss computation
    labels[labels == processor.tokenizer.pad_token_id] = -100 

    # Get the token ID for image tokens
    image_token_id = processor.tokenizer.convert_tokens_to_ids(str(processor.image_token))
    # Replace image token IDs with -100 to ignore them during loss computation
    labels[labels == image_token_id] = -100

    # Add the processed labels to the batch
    batch["labels"] = labels

    return batch

In [None]:
from trl import SFTConfig, SFTTrainer

# Configure the training arguments for supervised fine-tuning (SFT)
training_args = SFTConfig(
    output_dir="sft_output",  # Directory to save the fine-tuned model
    num_train_epochs=10,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device during training
    gradient_accumulation_steps=2,  # Number of steps to accumulate gradients before updating weights
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    optim="adamw_torch_fused",  # Optimizer to use for training
    logging_steps=500,  # Log training progress every 500 steps
    save_strategy="epoch",  # Save the model at the end of each epoch
    learning_rate=2e-4,  # Learning rate for the optimizer
    bf16=True,  # Use bfloat16 precision for training
    tf32=True,  # Enable TensorFloat-32 precision for faster computation on supported hardware
    max_grad_norm=0.3,  # Maximum gradient norm for gradient clipping
    warmup_ratio=0.03,  # Warmup ratio for the learning rate scheduler
    lr_scheduler_type="constant",  # Type of learning rate scheduler
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Additional kwargs for gradient checkpointing
    dataloader_num_workers=4,  # Number of workers for data loading
    dataset_text_field="",  # Placeholder for dataset text field (not used in this case)
    dataset_kwargs={"skip_prepare_dataset": True},  # Skip dataset preparation step
    remove_unused_columns=False  # Do not remove unused columns from the dataset
)

# Initialize the SFTTrainer for supervised fine-tuning
trainer = SFTTrainer(
    model=model,  # Pre-trained model to fine-tune
    args=training_args,  # Training arguments
    train_dataset=ds["train"],  # Training dataset
    eval_dataset=ds["test"],  # Evaluation dataset
    data_collator=collate_fn,  # Function to collate and preprocess batches of data
    peft_config=peft_config,  # PEFT (Parameter-Efficient Fine-Tuning) configuration
    tokenizer=processor.tokenizer,  # Tokenizer associated with the model
)


[2025-02-09 21:39:44,963] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  trainer = SFTTrainer(
/opt/conda/envs/udop_env_v1/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/envs/udop_env_v1/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [None]:
trainer.train() # Start the training process

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
500,0.234
1000,0.0413
1500,0.0003
2000,0.0001
2500,0.0001
3000,0.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [None]:
trainer.save_model("./smolvlm_instruct_finetuned") # Save the fine-tuned model

In [None]:
# Load the fine-tuned model for inference
model = AutoModelForVision2Seq.from_pretrained(
    "./smolvlm_instruct_finetuned",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2",
).to(device)
processor = AutoProcessor.from_pretrained("./smolvlm_instruct_finetuned") # Load the processor
# Load the test dataset
test_dataset = load_dataset(dataset_name, split="test") # Load the test dataset
# Define a function to evaluate the model on the test dataset


In [None]:
def evaluate_model(model, processor, test_dataset):
    """
    Evaluates the model on the test dataset and returns the predictions and ground truth labels.

    """
    predictions = []
    ground_truths = []

    # Iterate through each example in the test dataset
    for example in test_dataset:
        # Preprocess the image and query
        prompt = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}]
        formatted_query = processor.apply_chat_template(prompt, tokenize=False)
        inputs = processor(
            images=example["image"],
            text=formatted_query,
            return_tensors="pt"
        ).to(device)
        inputs = {key: val.to(device, dtype=torch.bfloat16) if val.dtype == torch.float else val.to(device) for key, val in inputs.items()}

        # Generate predictions
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=1600)

        # Decode the prediction
        prediction = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        
        # Append the prediction and ground truth to their respective lists
        predictions.append(prediction)
        ground_truths.append(example["ground_truth"])

    return predictions, ground_truths
# Evaluate the model on the test dataset
predictions, ground_truths = evaluate_model(model, processor, test_dataset)



In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from jiwer import wer, cer
import json

def generate_evaluation_metrics(predictions, ground_truths):

    # Flatten the predictions and ground truths if they are nested
    flat_predictions = [pred.strip() for pred in predictions]
    flat_ground_truths = [json.loads(gt)["gt_parse"] for gt in ground_truths]

    # Calculate precision, recall, F1-score, and accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(flat_ground_truths, flat_predictions, average='weighted')
    accuracy = accuracy_score(flat_ground_truths, flat_predictions)

    # Calculate Word Error Rate (WER) and Character Error Rate (CER)
    wer_score = wer(flat_ground_truths, flat_predictions)
    cer_score = cer(flat_ground_truths, flat_predictions)

    # Combine all metrics into a dictionary
    metrics = {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Accuracy": accuracy,
        "Word Error Rate (WER)": wer_score,
        "Character Error Rate (CER)": cer_score
    }

    return metrics

# Generate evaluation metrics
evaluation_metrics = generate_evaluation_metrics(predictions, ground_truths)

# Display the evaluation metrics
print("Evaluation Metrics:")
for metric, value in evaluation_metrics.items():
    print(f"{metric}: {value:.4f}")