In [1]:
!pip install -q gradio
!pip install -q transformers==4.31.0
!pip install -q bitsandbytes==0.41.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m114.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m117.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!git clone https://github.com/AndyGongDS/LISA.git
%cd LISA

Cloning into 'LISA'...
remote: Enumerating objects: 774, done.[K
remote: Counting objects: 100% (247/247), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 774 (delta 223), reused 175 (delta 173), pack-reused 527 (from 2)[K
Receiving objects: 100% (774/774), 28.49 MiB | 17.60 MiB/s, done.
Resolving deltas: 100% (402/402), done.
/content/LISA


In [3]:
import argparse
import os
import re
import sys

import bleach
import cv2
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor

from model.LISA import LISAForCausalLM
from model.llava import conversation as conversation_lib
from model.llava.mm_utils import tokenizer_image_token
from model.segment_anything.utils.transforms import ResizeLongestSide
from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
                         DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)


from utils.data_processing import get_mask_from_json
from utils.utils import intersectionAndUnionGPU

# 0. Argument parsing

In [4]:
def parse_args(args):
    """Parse the LISA chat/inference command-line options.

    Args:
        args: List of argument strings (e.g. ``["--precision", "bf16"]``).
            An empty list yields all defaults.

    Returns:
        argparse.Namespace holding the parsed options. Dashes in option names
        become underscores (``--vision-tower`` -> ``vision_tower``,
        ``--local-rank`` -> ``local_rank``).
    """
    parser = argparse.ArgumentParser(description="LISA chat")
    parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
    parser.add_argument(
        "--precision",
        default="fp16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    # `store_true` combined with `default=True` means this flag alone can never
    # produce False. It is kept for backward compatibility; the companion
    # switch below is the only way to turn the option off.
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument(
        "--no_use_mm_start_end",
        dest="use_mm_start_end",
        action="store_false",
        help="do not wrap the image token in image start/end tokens",
    )
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    return parser.parse_args(args)


In [5]:
# Build the inference configuration: every option keeps its default except the
# precision, which is overridden to bf16.
args = parse_args(["--precision", "bf16"])

# Make sure the directory for saved visualisations exists before anything writes to it.
os.makedirs(args.vis_save_path, exist_ok=True)

# Echo the settings that matter for this run.
config_summary = [
    "Arguments configured for inference:",
    f"  Model: {args.version}",
    f"  Vision tower: {args.vision_tower}",
    f"  Precision: {args.precision}",
    f"  Output directory: {args.vis_save_path}",
]
print("\n".join(config_summary))

Arguments configured for inference:
  Model: xinlai/LISA-13B-llama2-v1
  Vision tower: openai/clip-vit-large-patch14
  Precision: bf16
  Output directory: ./vis_output


# 1. Preprocessing input images

In [6]:
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Standardise an image tensor per channel and zero-pad it to a square.

    Args:
        x: Image tensor whose last two dimensions are height and width; the
            channel dimension must broadcast against ``pixel_mean``/``pixel_std``.
        pixel_mean: Per-channel mean subtracted from ``x``.
        pixel_std: Per-channel standard deviation ``x`` is divided by.
        img_size: Target side length of the padded output.

    Returns:
        The normalised tensor, padded on the right and bottom so that its last
        two dimensions are both ``img_size``.
    """
    normalized = (x - pixel_mean) / pixel_std

    height, width = normalized.shape[-2:]
    pad_right = img_size - width
    pad_bottom = img_size - height

    # F.pad takes (left, right, top, bottom) for the last two dimensions.
    return F.pad(normalized, (0, pad_right, 0, pad_bottom))


# 2. Model initialization

In [7]:
# Build the tokenizer, the LISA model and the two image pre-processors used by
# the inference cell. Statement order matters here: the model is loaded first,
# its vision modules are initialised, and only then is it cast/moved.

# Load the slow (non-fast) tokenizer for the selected checkpoint, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(
    args.version,
    cache_dir=None,
    model_max_length=args.model_max_length,
    padding_side="right",
    use_fast=False,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
# Store the first token id produced for "[SEG]"; it is passed to the model below.
args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]

# Translate --precision into a torch dtype (fp32 is the fallback).
torch_dtype = torch.float32
if args.precision == "bf16":
    torch_dtype = torch.bfloat16
elif args.precision == "fp16":
    torch_dtype = torch.half

# Keyword arguments for from_pretrained. Both quantised modes override the
# dtype with torch.half regardless of --precision and list "visual_model" in
# llm_int8_skip_modules.
kwargs = {"torch_dtype": torch_dtype}
if args.load_in_4bit:
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "load_in_4bit": True,
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                llm_int8_skip_modules=["visual_model"],
            ),
        }
    )
elif args.load_in_8bit:
    kwargs.update(
        {
            "torch_dtype": torch.half,
            "quantization_config": BitsAndBytesConfig(
                llm_int8_skip_modules=["visual_model"],
                load_in_8bit=True,
            ),
        }
    )

# Load the checkpoint (this triggers the weight download on first run).
model = LISAForCausalLM.from_pretrained(
    args.version, low_cpu_mem_usage=True, vision_tower=args.vision_tower, seg_token_idx=args.seg_token_idx, **kwargs
)

# Keep the model config's special-token ids in sync with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Initialise the vision modules, then cast the vision tower to the
# precision-derived dtype (torch_dtype, not the quantised override above).
model.get_model().initialize_vision_modules(model.get_model().config)
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(dtype=torch_dtype)

# Cast the model and move it to the GPU according to --precision.
# NOTE(review): unlike the fp16 branch, the bf16 branch does not check
# load_in_4bit/load_in_8bit, so it also runs for a quantised model — confirm
# that calling .bfloat16() there is intended.
if args.precision == "bf16":
    model = model.bfloat16().cuda()
elif (
    args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
):
    # Detach the vision tower while DeepSpeed wraps the model, then re-attach
    # it afterwards in half precision on the GPU.
    vision_tower = model.get_model().get_vision_tower()
    model.model.vision_tower = None
    # Imported lazily: only this branch needs it.
    # NOTE(review): the install cell of this notebook does not install
    # deepspeed — confirm it is available before running with fp16.
    import deepspeed

    model_engine = deepspeed.init_inference(
        model=model,
        dtype=torch.half,
        replace_with_kernel_inject=True,
        replace_method="auto",
    )
    model = model_engine.module
    model.model.vision_tower = vision_tower.half().cuda()
elif args.precision == "fp32":
    model = model.float().cuda()

# Re-fetch the vision tower from the (possibly re-wrapped) model and move it to
# the device with index args.local_rank.
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(device=args.local_rank)

# Image pre-processors used at inference time: the CLIP processor named in the
# model config, and a resize transform parameterised by args.image_size.
clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
transform = ResizeLongestSide(args.image_size)

# Switch to evaluation mode. As the last expression of the cell, the returned
# module is also echoed as the cell output.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/95.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/8.86G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

LISAForCausalLM(
  (model): LisaModel(
    (embed_tokens): Embedding(32003, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm

# 3. Inference

In [8]:
import os
import numpy as np
import cv2
import torch
import gradio as gr
from utils.data_processing import get_mask_from_json
from model.llava.mm_utils import tokenizer_image_token
from utils.utils import intersectionAndUnionGPU

def _cast_to_model_precision(tensor):
    """Cast ``tensor`` to the dtype selected by ``args.precision``."""
    if args.precision == "bf16":
        return tensor.bfloat16()
    if args.precision == "fp16":
        return tensor.half()
    return tensor.float()


def _overlay_mask(image_np, mask, color):
    """Return a copy of ``image_np`` with ``color`` blended 50/50 over ``mask`` pixels."""
    overlay = image_np.copy()
    overlay[mask] = (
        image_np * 0.5
        + mask[:, :, None].astype(np.uint8) * np.array(color) * 0.5
    )[mask]
    return overlay


def inference_with_eval(instruction, image_path, json_path):
    """Segment an image with LISA and score the result against a ground-truth mask.

    Args:
        instruction: Text instruction appended to the image prompt; may be empty.
        image_path: Path of the input image on disk.
        json_path: Ground-truth annotation JSON, given either as a path or as an
            object exposing the path through a ``name`` attribute (how some
            Gradio releases hand over ``gr.File`` uploads).

    Returns:
        Tuple ``(gt_output, pred_output, metrics_text)``:
        the input image with the ground-truth mask tinted green, the input
        image with the predicted mask tinted red (or a placeholder when no mask
        was predicted), and a short text report with gIoU and cIoU.

    Raises:
        gr.Error: If the image or the JSON file is missing, or the image cannot
            be decoded.
    """
    # Gradio passes None for inputs the user left empty; fail with a readable
    # message instead of crashing inside OpenCV.
    if not image_path:
        raise gr.Error("Please upload an input image.")
    if json_path is None:
        raise gr.Error("Please upload a ground truth JSON file.")
    # Accept both a plain path and a temp-file wrapper carrying the path in `.name`.
    json_path = getattr(json_path, "name", json_path)

    # Load the image (OpenCV reads BGR; everything downstream expects RGB).
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise gr.Error(f"Could not read image: {image_path}")
    image_np = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    # Create conversation prompt
    if args.use_mm_start_end:
        input_str = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    else:
        input_str = DEFAULT_IMAGE_TOKEN

    if instruction:
        input_str += "\n" + instruction

    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.append_message(conv.roles[0], input_str)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    # Process image for CLIP
    image_clip = clip_image_processor.preprocess(image_np, return_tensors="pt")[
        "pixel_values"
    ][0].unsqueeze(0).cuda()
    image_clip = _cast_to_model_precision(image_clip)

    # Process image for SAM: resize, normalise/pad, add a batch dimension.
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous()).unsqueeze(0).cuda()
    image = _cast_to_model_precision(image)

    # Run model inference
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    # Only the masks are needed here; the generated token ids are not used.
    _, pred_masks = model.evaluate(
        image_clip,
        image,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=512,
        tokenizer=tokenizer,
    )

    # Process prediction mask. If several non-empty masks come back, the last
    # one wins (same as before).
    pred_output = None
    pred_mask_np = None
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue

        # Positive logits are foreground.
        pred_mask_np = pred_mask.detach().cpu().numpy()[0] > 0
        pred_output = _overlay_mask(image_np, pred_mask_np, [255, 0, 0])

    # Load ground truth mask from JSON (the other two return values are unused).
    gt_mask, _, _ = get_mask_from_json(json_path, image_np)

    # Create ground truth visualization: only pixels labelled 1 are tinted.
    gt_output = _overlay_mask(image_np, gt_mask == 1, [0, 255, 0])

    # Calculate IoU metrics
    metrics_text = "No prediction mask generated"
    if pred_mask_np is not None:
        # Use the same integer dtype for both tensors.
        pred_tensor = torch.from_numpy(pred_mask_np.astype(np.int32)).cuda()
        gt_tensor = torch.from_numpy(gt_mask.astype(np.int32)).cuda()

        # The third return value is deliberately ignored: it was previously
        # reported as gIoU, but it is not an IoU.
        # NOTE(review): in upstream LISA it is the per-class target area — confirm.
        intersection, union, _ = intersectionAndUnionGPU(
            pred_tensor.contiguous(),
            gt_tensor.contiguous(),
            2,  # num_classes (0: background, 1: object)
            ignore_index=255,
        )

        intersection = intersection.cpu().numpy()
        union = union.cpu().numpy()

        # cIoU: cumulative intersection over cumulative union (foreground class).
        ciou = intersection[1] / (union[1] + 1e-10)
        # gIoU: mean of per-image IoUs. With a single image that is this
        # image's IoU; an image with no foreground in either mask counts as a
        # perfect match, following the LISA evaluation convention.
        giou = 1.0 if union[1] == 0 else ciou

        metrics_text = f"Evaluation Metrics:\n- Global IoU (gIoU): {giou:.4f}\n- Class IoU (cIoU): {ciou:.4f}"

    # If prediction failed, use placeholder
    if pred_output is None:
        # NOTE(review): this is an absolute path at the filesystem root —
        # confirm where the placeholder image actually lives.
        placeholder = cv2.imread("/no_seg_out.png")
        if placeholder is None:
            # cv2.imread returns None for a missing file; fall back to the
            # unannotated input instead of crashing on the slice below.
            pred_output = image_np.copy()
        else:
            pred_output = placeholder[:, :, ::-1]

    return gt_output, pred_output, metrics_text



# 4. Gradio UI setup

In [10]:
# Input widgets, listed in the order inference_with_eval takes its arguments:
# the text prompt, the image to segment, and the JSON file holding the
# ground-truth mask.
instruction_box = gr.Textbox(lines=1, placeholder="Segment the sky", label="Text Instruction")
image_input = gr.Image(type="filepath", label="Input Image")
ground_truth_file = gr.File(label="Ground Truth JSON File")

# Output widgets, matching the function's return tuple: ground-truth overlay,
# prediction overlay, and the metrics report.
ground_truth_view = gr.Image(type="pil", label="Ground Truth Mask (Green)")
prediction_view = gr.Image(type="pil", label="Predicted Mask (Red)")
metrics_box = gr.Textbox(lines=3, label="Evaluation Metrics")

# Assemble the interface; no example rows are provided for now.
demo = gr.Interface(
    fn=inference_with_eval,
    inputs=[instruction_box, image_input, ground_truth_file],
    outputs=[ground_truth_view, prediction_view, metrics_box],
    title="LISA Segmentation Accuracy Test",
    description="Compare LISA segmentation predictions with ground truth masks",
    examples=[],
    allow_flagging="auto",
)

# Enable request queuing, then serve the app through a public share link.
demo.queue()
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://335bad84f6e6821ea8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


