In [None]:
from datasets import Dataset
from PIL import Image
import os

# Build an in-memory image/caption dataset from `data_dir`.
# Every `<name>.jpg` is paired with a sibling `<name>.txt` caption file;
# when the caption file is missing the caption is the empty string.
data_dir = "data"
records = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the dataset identical from one run to the next.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".jpg"):
        continue

    image_path = os.path.join(data_dir, file)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite a ".jpg" that happens to appear elsewhere in the path.
    txt_path = os.path.splitext(image_path)[0] + ".txt"

    # convert() hands back a separate, fully loaded image, so the file opened
    # by Image.open can be closed as soon as the conversion is done.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    if os.path.exists(txt_path):
        with open(txt_path, "r", encoding="utf-8") as f:
            caption = f.read().strip()
    else:
        caption = ""

    records.append({
        "image": image,
        "caption": caption,
    })


custom_dataset = Dataset.from_list(records)
print(custom_dataset)

from IPython.display import display

# Quick sanity check: show each row's image repr and caption.
for x in custom_dataset:
  print(x["image"])
  print(x["caption"])


Dataset({
    features: ['image', 'caption'],
    num_rows: 5
})
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=487x578 at 0x78E935A03E50>
2 computers connecting to a switch and then router
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=513x502 at 0x78E935A03BD0>
Star topology networking picture
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=309x240 at 0x78E935A03AD0>
Point to Point topology
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=942x536 at 0x78E935A038D0>
Hybrid topology: can see many network topologies in the picture
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=488x480 at 0x78E935A03790>
Ring topology networking picture


In [None]:
!unzip data

Archive:  data.zip
   creating: data/
  inflating: data/1.jpg              
  inflating: data/1.txt              
  inflating: data/2.jpg              
 extracting: data/2.txt              
  inflating: data/3.jpg              
  inflating: data/3.txt              
  inflating: data/4.jpg              
 extracting: data/4.txt              
  inflating: data/5.jpg              
  inflating: data/5.txt              


In [None]:
# Environment setup through uv's pip interface.
# --no-deps installs only the packages named on the command line and skips
# their dependencies; xformers and trl are pinned to exact versions.
# NOTE(review): presumably --no-deps is used so the runtime's preinstalled
# torch/CUDA build is not replaced — confirm.
!uv pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
# Supporting libraries, installed with normal dependency resolution.
!uv pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
# transformers is pinned to one exact release.
!uv pip install transformers==4.51.3
# unsloth itself goes in last, again without its dependencies.
!uv pip install --no-deps unsloth


import os
from unsloth import FastVisionModel
import torch
from datasets import load_dataset
from transformers import TextStreamer
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from datasets import Dataset



# Options for loading the base checkpoint: 4-bit weights and the "unsloth"
# gradient-checkpointing setting.
base_model_options = dict(
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)

# from_pretrained returns the model plus the object this notebook calls `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    **base_model_options,
)


# Adapter settings: every `finetune_*` switch is on, with a small rank (r = 4),
# no dropout, no bias terms and a fixed random_state.
lora_options = dict(
    finetune_vision_layers     = True,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules      = True,
    r = 4,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Rebind `model` to the PEFT-wrapped version used by the rest of the cell.
model = FastVisionModel.get_peft_model(model, **lora_options)





# Prompt paired with every training image (the user turn of each example).
instruction = "You are an expert at networking in computing. Describe the image in related to network configurations."

def convert_to_conversation(sample, debug=False):
  """Turn one image/caption record into a two-turn chat example.

  Args:
    sample: mapping with an "image" entry and a "caption" entry.
    debug: when True, print the incoming sample and the conversation built
      from it. Off by default so that converting a dataset does not flood the
      cell output with one dump per row.

  Returns:
    A dict ``{"messages": [user_turn, assistant_turn]}``. The user turn holds
    the module-level ``instruction`` text followed by the sample's image; the
    assistant turn holds the sample's caption as text.

  Raises:
    KeyError: if ``sample`` has no "image" or no "caption" entry.
  """
  if debug:
    print("{DEBUG} sample : ", sample)

  user_turn = {
      "role": "user",
      "content": [
          {"type": "text", "text": instruction},
          {"type": "image", "image": sample["image"]},
      ],
  }
  assistant_turn = {
      "role": "assistant",
      "content": [
          {"type": "text", "text": sample["caption"]},
      ],
  }
  conversation = [user_turn, assistant_turn]

  if debug:
    print("{DEBUG} conversation : ", conversation)
  return { "messages" : conversation }


# Convert every dataset row into the chat "messages" layout handed to the trainer.
converted_dataset = [convert_to_conversation(sample) for sample in custom_dataset]
# Report only the size: printing the whole list dumps every image repr into the output.
print("Converted samples : ", len(converted_dataset))
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        # No warmup: the logged run of this cell takes a single optimizer step
        # ("Total steps = 1"). A linear warmup starts at a learning rate of 0,
        # so with any warmup that only step is taken at LR 0 and the adapter
        # saved below is left untrained. Reintroduce warmup only once the run
        # has clearly more optimizer steps than warmup steps.
        warmup_steps = 0,
        # max_steps = 50,
        # NOTE(review): one epoch over this tiny dataset is a single update;
        # raise num_train_epochs (or set max_steps) for a meaningful fine-tune.
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
        # The samples are passed through as-is: keep all columns and skip the
        # trainer's own dataset preparation.
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)



trainer_stats = trainer.train()

# Save the adapter and tokenizer side by side; the inference cell loads "lora_model".
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

[2mUsing Python 3.11.12 environment at: /usr[0m
[2K[2mResolved [1m8 packages[0m [2min 86ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2mcut-cross-entropy[0m [32m----------------------[2m--------[0m[0m 16.00 KiB/22.14 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2mcut-cross-entropy[0m [32m------------------------------[2m[0m[0m 22.14 KiB/22.14 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/5)
[2munsloth-zoo[0m [32m[2m---------------------

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Unsloth: Making `model.base_model.model.visual` require gradients
{DEBUG} sample :  {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=487x578 at 0x78E7B9CB7F50>, 'caption': '2 computers connecting to a switch and then router'}
{DEBUG} conversation :  [{'role': 'user', 'content': [{'type': 'text', 'text': 'You are an expert at networking in computing. Describe the image in related to network configurations.'}, {'type': 'image', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=487x578 at 0x78E7B9CB7F50>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': '2 computers connecting to a switch and then router'}]}]
{DEBUG} sample :  {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=513x502 at 0x78E7C7E07350>, 'caption': 'Star topology networking picture'}
{DEBUG} conversation :  [{'role': 'user', 'content': [{'type': 'text', 'text': 'You are an expert at networking in computing. Describe the image in related to network configurations.'}, {'typ

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 7,237,632/2,000,000,000 (0.36% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


[]

In [None]:
!uv pip install gradio
import gradio as gr
import torch
from transformers import AutoTokenizer, TextStreamer
from unsloth import FastVisionModel

# Load model and tokenizer
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
    trust_remote_code=True
)

# Attach the adapter weights written to "lora_model" by the training cell.
model.load_adapter("lora_model")
# Switch the model to Unsloth's inference mode before any generate() call;
# Unsloth's own examples do this right after loading a model for inference.
FastVisionModel.for_inference(model)

def analyze_image(image):
    """Describe an uploaded image using the module-level `model` and `tokenizer`.

    Args:
        image: the image handed over by the Gradio input component, or None
            when the form is submitted without an upload.

    Returns:
        The generated description as a string, containing only the newly
        generated text (the prompt is not echoed back). When `image` is None,
        a short message asking the user to upload an image.
    """
    # Gradio passes None when nothing was uploaded; answer politely instead of
    # failing inside the processor.
    if image is None:
        return "Please upload an image."

    instruction = "You are an expert at networking in computing. Describe the image in related to network configurations."

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to("cuda")

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

    # generate() returns prompt tokens followed by the new tokens; drop the
    # prompt part so the UI shows only the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = generated_ids[0][prompt_length:]

    # Decode the output
    output_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return output_text

# Web UI: one PIL image in, the text returned by analyze_image out.
image_input = gr.Image(type="pil")

demo = gr.Interface(
    title="Network Configuration Visual Assistant",
    description="Upload a network-related image to get configuration-specific insights.",
    fn=analyze_image,
    inputs=image_input,
    outputs="text",
)

# Start the app with Gradio's share option switched on.
demo.launch(share=True)


[2mUsing Python 3.11.12 environment at: /usr[0m
[2K[2mResolved [1m53 packages[0m [2min 403ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2mpython-multipart[0m [32m[2m------------------------------[0m[0m     0 B/23.97 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2mgroovy    [0m [32m[2m------------------------------[0m[0m     0 B/13.76 KiB
[2mpython-multipart[0m [32m[2m------------------------------[0m[0m     0 B/23.97 KiB
[2K[3A[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2mgroovy    [0m [32m[2m------------------------------[0m[0m     0 B/13.76 KiB
[2mpython-multipart[0m [32m[2m------------------------------[0m[0m     0 B/23.97 KiB
[2mpydub     [0m [32m[2m------------------------------[0m[0m     0 B/31.57 KiB
[2K[4A[37m⠙[0m [2mPreparing packages...[0m (0/14)
[2mgroovy    [0m [32m[2m----

