In [None]:
!pip install "numpy<2.0"

In [None]:
!nvidia-smi

In [None]:
!python --version


In [None]:
# Uninstall libraries to ensure a clean environment
!pip uninstall -y torch torchvision torchaudio transformers accelerate bitsandbytes peft
!pip uninstall flash-attn -y

# Install compatible libraries for your GPU
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
!pip install transformers==4.41.2 accelerate==0.30.1 bitsandbytes==0.43.1 peft==0.11.1

print("✅ Installation complete. Please restart the runtime now.")

In [2]:
import os
import json
import pandas as pd
from PIL import Image
from datasets import Dataset, DatasetDict
from transformers import AutoProcessor, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

In [3]:
import torch
import transformers
import bitsandbytes
import peft
import accelerate

print("--- Library Versions ---")
print(f"Torch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"Bitsandbytes: {bitsandbytes.__version__}")
print(f"PEFT: {peft.__version__}")
print(f"Accelerate: {accelerate.__version__}")

--- Library Versions ---
Torch: 2.3.0+cu121
Transformers: 4.41.2
Bitsandbytes: 0.43.1
PEFT: 0.11.1
Accelerate: 0.30.1


# Step 1: Consolidate Image and Annotation Paths into a DataFrame 

In [4]:
def create_dataset_df(image_dir, annot_dir):
    """Pairs images with their JSON annotations and returns a pandas DataFrame."""
    image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
    data = []
    for img_file in image_files:
        base_name = os.path.splitext(img_file)[0]
        annot_file = base_name + '.json'
        image_path = os.path.join(image_dir, img_file)
        annot_path = os.path.join(annot_dir, annot_file)
        if os.path.exists(annot_path):
            data.append({'image_path': image_path, 'json_path': annot_path})
    return pd.DataFrame(data)

# Define file paths
base_dir = '/kaggle/input/ocr-dataset-for-healthcare/dataset'
train_images_dir = os.path.join(base_dir, 'training_data', 'images')
train_annots_dir = os.path.join(base_dir, 'training_data', 'annotations')
test_images_dir = os.path.join(base_dir, 'testing_data', 'images')
test_annots_dir = os.path.join(base_dir, 'testing_data', 'annotations')

train_df = create_dataset_df(train_images_dir, train_annots_dir)
test_df = create_dataset_df(test_images_dir, test_annots_dir)

# Step 2: Convert to Hugging Face Dataset Format 

In [5]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
train_val_split = train_dataset.train_test_split(test_size=0.1, seed=42)

dataset_dict = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': test_dataset
})

print("Created Hugging Face DatasetDict:")
print(dataset_dict)

Created Hugging Face DatasetDict:
DatasetDict({
    train: Dataset({
        features: ['image_path', 'json_path'],
        num_rows: 134
    })
    validation: Dataset({
        features: ['image_path', 'json_path'],
        num_rows: 15
    })
    test: Dataset({
        features: ['image_path', 'json_path'],
        num_rows: 50
    })
})


# Step 3: Define the Core Processing Function

In [6]:
TASK_PROMPT = "<OD>"
TOKENIZER_MAX_COORD = 999

def process_example(example):
    """
    Loads image, formats the target text from JSON, and returns None if no valid
    annotations are found.
    """
    image = Image.open(example['image_path']).convert("RGB")
    with open(example['json_path'], 'r') as f:
        annotations = json.load(f)
    target_text = ""
    
    # Check if the 'form' key exists and is a non-empty list
    if 'form' not in annotations or not isinstance(annotations['form'], list) or not annotations['form']:
        print(f"WARNING: No 'form' data in {example.get('json_path', 'unknown file')}. Skipping.")
        return None

    for item in annotations.get('form', []):
        box = item['box']
        text = item['text']
        
        # Skip if text is missing or box is malformed
        if not text or len(box) != 4:
            continue

        try:
            x1 = int(min(max(0, box[0]), TOKENIZER_MAX_COORD))
            y1 = int(min(max(0, box[1]), TOKENIZER_MAX_COORD))
            x2 = int(min(max(0, box[2]), TOKENIZER_MAX_COORD))
            y2 = int(min(max(0, box[3]), TOKENIZER_MAX_COORD))
        except (ValueError, TypeError):
            continue

        box_str = f"<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>"
        target_text += box_str + text
    
    # --- FIX: If after all processing, the target is empty, return None ---
    if not target_text:
        print(f"WARNING: No valid text/box pairs found in {example.get('json_path', 'unknown file')}. Skipping.")
        return None
    
    return {'image': image, 'prompt': TASK_PROMPT, 'target_text': target_text}

# --- Map the function and then FILTER out the None values ---
processed_dataset = dataset_dict.map(
    process_example, 
    remove_columns=['image_path', 'json_path']
)

# This is the crucial new step:
final_processed_dataset = processed_dataset.filter(lambda example: example is not None)


print("\n--- Dataset Cleaning Report ---")
print(f"Original training samples: {len(dataset_dict['train'])}")
print(f"Cleaned training samples: {len(final_processed_dataset['train'])}")
print(f"Original validation samples: {len(dataset_dict['validation'])}")
print(f"Cleaned validation samples: {len(final_processed_dataset['validation'])}")

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter:   0%|          | 0/134 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50 [00:00<?, ? examples/s]


--- Dataset Cleaning Report ---
Original training samples: 134
Cleaned training samples: 134
Original validation samples: 15
Cleaned validation samples: 15


# Step 4: Define and Apply the Final Florence-2 Processor

In [None]:
# # 3. Setup Processor and Process Dataset for the Model
# model_name = "microsoft/Florence-2-base"
# processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
# if processor.tokenizer.pad_token is None:
#     processor.tokenizer.pad_token = processor.tokenizer.eos_token

# def apply_processor_batched(batch):
#     inputs = processor(text=batch["prompt"], images=batch["image"], return_tensors="pt", padding=True, truncation=True)
#     labels = processor.tokenizer(text=batch["target_text"], return_tensors="pt", padding=True, max_length=2048, truncation=True).input_ids
#     labels[labels == processor.tokenizer.pad_token_id] = -100
#     inputs["labels"] = labels
#     return inputs
# final_dataset = final_processed_dataset.map(apply_processor_batched, batched=True, remove_columns=list(final_processed_dataset["train"].features))

In [None]:
# print("\nDataset is fully processed and ready for training!")
# print(final_dataset)
# print("Keys in a single training sample:", final_dataset['train'][0].keys())

# Step 5: Fine-Tuning the Florence-2 Model

In [None]:
# # Define Quantization Configuration (Q-LoRA) 
# # This config tells the model to load its weights in 4-bit precision.
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16, # Use float16 for stability
#     bnb_4bit_use_double_quant=True,
# )

In [7]:
!pip install flash-attn==2.5.8 --no-build-isolation # Configured according to CUDA version

Collecting flash-attn==2.5.8
  Using cached flash_attn-2.5.8.tar.gz (2.5 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.5.8-cp311-cp311-linux_x86_64.whl size=120938665 sha256=0b28549d7a7105dfa9300751de038f1e607fe11bb96ac2bdd29b474926ca8e41
  Stored in directory: /root/.cache/pip/wheels/2a/88/b2/587b498e2caa887707a63d0ed7d7f4beca27f5034640382845
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.5.8


In [None]:
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, quantization_config=quantization_config)
# model.resize_token_embeddings(len(processor.tokenizer))
# model = prepare_model_for_kbit_training(model)
# lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", target_modules="all-linear", task_type="CAUSAL_LM")
# model = get_peft_model(model, lora_config)
# model.config.use_cache = False

In [None]:
# # bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# # FIX: Set fp16=True and bf16=False 
# training_args = TrainingArguments(
#     output_dir="./florence2-qlora-healthcare-ocr",
#     num_train_epochs=50,
#     learning_rate=2e-5,
#     per_device_train_batch_size=1, 
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=4,
    
#     # Use fp16 for T4 stability
#     fp16=True,
#     bf16=False,
    
#     gradient_checkpointing=False,
#     logging_steps=25,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,
#     report_to="none"
# )

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=final_dataset["train"],
#     eval_dataset=final_dataset["validation"]
# )

# print("Starting model fine-tuning...")
# trainer.train()
# print("Fine-tuning complete!")

In [9]:
# Add AutoModelForCausalLM to this line
from transformers import AutoModelForCausalLM, AutoProcessor, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# (Your data processing code should be in the cells above this)

# 1. Setup Processor
model_name = "microsoft/Florence-2-base"
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# 2. Model Configuration with Manual Device Mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_in_8bit=True,
    device_map={"": 0}  # <-- FIX: Manually assign the model to the first GPU
)

# 3. Resize embeddings and prepare for LoRA
model.resize_token_embeddings(len(processor.tokenizer))
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", 
    target_modules="all-linear", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.config.use_cache = False
model.print_trainable_parameters()

# 4. Training Arguments
# Training arguments for maximum stability
training_args = TrainingArguments(
    output_dir="./florence2-lora-8bit-fp32-ocr", # New output directory
    num_train_epochs=50,
    learning_rate=2e-5,
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    
    # --- THE FINAL FIX ---
    fp16=False, # Disable mixed-precision training
    bf16=False,
                                 
    gradient_checkpointing=True,
    logging_steps=25,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

# 5. Initialize and Run Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["validation"]
)

print("\nStarting model fine-tuning with STABLE 8-BIT configuration...")
trainer.train()
print("\nFine-tuning complete!")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 7,319,232 || all params: 238,733,248 || trainable%: 3.0659

Starting model fine-tuning with STABLE 8-BIT configuration...




ValueError: Attempting to unscale FP16 gradients.

In [None]:
# After training, save your fine-tuned model and the processor to a directory.
# This allows you to easily load it later for inference.

final_model_dir = "./florence2-healthcare-ocr-final"
print(f"\nSaving the best model to {final_model_dir}")
trainer.save_model(final_model_dir)
processor.save_pretrained(final_model_dir) # The processor was defined in the previous step

print("\n--- Your model is fine-tuned and saved! ---")
print("You can now use it for inference on new documents.")