In [1]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [2]:
from huggingface_hub import login
login(hf_token, add_to_git_credential=True)

In [3]:
import zipfile

# Specify the path to your zip file and extraction directory
zip_path = 'model.zip'
extract_dir = './'  # Change as needed

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Unzipped to: {extract_dir}")

Unzipped to: ./


In [4]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
!pip install  -U -q transformers trl datasets bitsandbytes peft accelerate num2words

In [6]:
from transformers import AutoTokenizer, AutoModelForImageTextToText

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")

In [7]:
adapter_path = "smolvlm-instruct-trl-sft-ChartQA"
model.load_adapter(adapter_path)

In [8]:
def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample[1:2],  # Use the sample without the system message
        add_generation_prompt=True
    )

    image_inputs = []
    image = sample[1]['content'][0]['image']
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image_inputs.append([image])

    # Prepare the inputs for the model
    model_inputs = processor(
        #text=[text_input],
        text=text_input,
        images=image_inputs,
        return_tensors="pt",
    ).to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text

In [9]:
!pip install datasets



In [30]:
system_message = """You are a Vision-Language Model specialized in interpreting medical images such as X-rays, MRIs, CT scans, and ultrasounds.
Your task is to analyze the provided medical image and respond to queries with concise diagnostic observations, typically using short phrases, keywords, or brief sentences.
Focus on identifying key abnormalities, conditions, or notable findings.
Avoid providing detailed explanations or recommendations unless specifically requested.
Base your answers strictly on the visual information presented in the image."""

In [11]:
def format_data(sample):
    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_message
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": sample["image"],
                },
                {
                    "type": "text",
                    "text": sample['text'],
                }
            ],
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": sample["report"]
                }
            ],
        },
    ]

In [12]:
from datasets import load_dataset

ds = load_dataset("hongrui/mimic_chest_xray_v_1")

shuffled_dataset = ds['train'].shuffle(seed=42)

# Select the first 500 samples for training
train_dataset = shuffled_dataset.select(range(500))

# Select the next 80 for validation
validation_dataset = shuffled_dataset.select(range(500, 580))

# Select the next 80 for testing
test_dataset = shuffled_dataset.select(range(580, 660))

# Create the final dataset dictionary
final_dataset = {
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
}

# Verify the sizes
print(f"Train samples: {len(final_dataset['train'])}")  # Should be 500
print(f"Validation samples: {len(final_dataset['validation'])}")  # Should be 80
print(f"Test samples: {len(final_dataset['test'])}")  # Should be 80

Train samples: 500
Validation samples: 80
Test samples: 80


In [13]:
train_dataset = [format_data(sample) for sample in train_dataset]
val_dataset = [format_data(sample) for sample in validation_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]

In [14]:
!pip install num2words



In [15]:
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Load processor (handles both images and text)
processor = AutoProcessor.from_pretrained(model_id)
model.to(device)

processor_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

SmolVLMForConditionalGeneration(
  (model): SmolVLMModel(
    (vision_model): SmolVLMVisionTransformer(
      (embeddings): SmolVLMVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): SmolVLMEncoder(
        (layers): ModuleList(
          (0-11): 12 x SmolVLMEncoderLayer(
            (self_attn): SmolVLMVisionAttention(
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=768, out_features=768, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=768, bias=False)
                )
                (lora_emb

In [16]:
output = generate_text_from_sample(model, processor, train_dataset[1])

In [17]:
output

' The chest x-ray shows a right sided pleural effusion, which is a fluid accumulation in the pleural space. The fluid is likely to be a result of a pleural effusion, a condition in which fluid accumulates in the pleural space. The fluid is likely to be a result of a pleural effusion, a condition in which fluid accumulates in the pleural space. The presence of fluid in the pleural space is a common finding in patients with pleural effusion.'

In [19]:
def predict(model, processor, user_input, max_new_tokens=512, device="cuda"):
    """
    Perform inference on a single user input consisting of an image and text.

    Args:
        model: The vision-language model for inference.
        processor: The processor for preparing inputs.
        user_input: A dictionary with 'role'='user' and 'content'=[{'type': 'image', 'image': PIL.Image}, {'type': 'text', 'text': str}]
        max_new_tokens: Maximum number of tokens to generate.
        device: Device to perform inference ('cuda' or 'cpu').

    Returns:
        Generated text response from the model.
    """

    # Prepare the input following the expected format
    sample = [
        {   # You can add a system message if needed here
            'role': 'system',
            'content': [{'type': 'text', 'text': 'You are a Vision Language Model.'}]
        },
        user_input
    ]

    # Apply chat template (as you did in generate_text_from_sample)
    text_input = processor.apply_chat_template(
        sample[1:2],  # Only the user input
        add_generation_prompt=True
    )

    # Prepare image input
    image_inputs = []
    image = user_input['content'][0]['image']
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image_inputs.append([image])

    # Process model inputs
    model_inputs = processor(
        text=text_input,
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    # Generate output
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the output (remove prompt)
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]


In [20]:
!pip install gradio



In [27]:
import gradio as gr
from PIL import Image

# Assuming you already have these ready:
# - model
# - processor
# - predict(model, processor, user_input) defined

# Define a simple wrapper function for Gradio
def gradio_predict(image, text):
    if image is None or text.strip() == "":
        return "Please provide both an image and a text prompt."

    user_input = {
        'role': 'user',
        'content': [
            {'type': 'image', 'image': image},
            {'type': 'text', 'text': text}
        ]
    }
    output = predict(model, processor, user_input)
    return output

# Function to set image and default text when an example is selected
def select_example(evt: gr.SelectData):
    # The index of the selected image
    index = evt.index
    # Return the selected image and default text
    return example_images[index], "Describe the findings in this chest X-ray."

# Extract example images from your random_datasets
example_images = []
for i in range(3):
    image = random_datasets[i][1]['content'][0]['image']
    example_images.append(image)

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🩻 Vision-Language Model Inference")

    # Main input/output components
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Chest X-ray Image")
            text_input = gr.Textbox(lines=2, placeholder="Describe the findings...", label="Findings Text")
            submit_btn = gr.Button("Generate Report")

        with gr.Column():
            output_text = gr.Textbox(label="Generated Output", lines=10)

    # Add examples section
    gr.Markdown("### Examples")
    gallery = gr.Gallery(
        value=example_images,
        columns=4,
        height="150px",
        show_label=False
    )

    # Connect gallery selection to input fields
    gallery.select(
        fn=select_example,
        outputs=[image_input, text_input]
    )

    # Connect submit button to prediction function
    submit_btn.click(
        fn=gradio_predict,
        inputs=[image_input, text_input],
        outputs=output_text
    )

# Launch the app
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3e092d975a366603f8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


