# Multimodal LLM Workflow
Load, quantize, and fine-tune a vision-language model using the shared utilities in the `src` package (`src.config`, `src.models`, `src.training`).

In [None]:
from src.config import ModelConfig, QuantizationConfig, FinetuneConfig
from src.models.loader import load_multimodal_model
from src.models.quantization import quantize_model
from src.training.finetune import fine_tune_model

In [None]:
# Base checkpoint settings; passed to load_multimodal_model below.
# NOTE(review): trust_remote_code=True presumably reaches the underlying
# model loader, where it would allow code shipped with the checkpoint to run
# locally — confirm, and consider pinning a reviewed revision.
model_cfg = ModelConfig(
    model_name_or_path="microsoft/Phi-3-vision-128k-instruct",
    device_map="auto",
    torch_dtype="bfloat16",
    trust_remote_code=True,
)

# Quantization settings; passed to both load_multimodal_model and
# quantize_model below.
# NOTE(review): the field names suggest bitsandbytes 4-bit NF4 with double
# quantization and bfloat16 compute — verify against QuantizationConfig.
quant_cfg = QuantizationConfig(
    load_in_4bit=True,
    bnb_compute_dtype="bfloat16",
    bnb_quant_type="nf4",
    use_double_quant=True,
)

# Fine-tuning settings; passed to fine_tune_model below.
# NOTE(review): target_modules are presumably the layers that receive
# adapters. Phi-3 attention appears to use a fused "qkv_proj" rather than
# separate q_proj/k_proj/v_proj, so these names may only match the vision
# tower (or nothing in the language model) — check model.named_modules().
finetune_cfg = FinetuneConfig(
    dataset_path="data/train.jsonl",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    num_train_epochs=1,
    per_device_train_batch_size=1,
)


In [None]:
# Build the model and its processor from the model and quantization configs.
# NOTE(review): quant_cfg is supplied here, so the model may already be
# quantized at load time — confirm against load_multimodal_model.
model, processor = load_multimodal_model(model_cfg, quant_cfg)

In [None]:
# Apply quantize_model to the loaded model using the same quant_cfg.
# NOTE(review): quant_cfg was already passed to load_multimodal_model above;
# verify this step is a no-op or otherwise safe on an already-quantized model
# rather than a second quantization pass.
quantized_model = quantize_model(model, quant_cfg)

In [None]:
# Run fine-tuning on the quantized model with the processor and finetune_cfg;
# the result is kept in fine_tuned_model.
# NOTE(review): whether checkpoints/adapters are written to disk is decided
# inside fine_tune_model — confirm before relying on this cell for persistence.
fine_tuned_model = fine_tune_model(
    quantized_model,
    processor,
    finetune_cfg,
)
