In [1]:
!nvcc --version

# !pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
!pip install git+https://github.com/huggingface/transformers
!pip install accelerate
!pip install qwen-vl-utils
!pip install datasets
!CUDA_VERSION=cu121
!pip install 'vllm==0.6.1' --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION}

import json
import os
import re
import time

import torch
from datasets import load_dataset
from google.colab import drive
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Attach Google Drive at /content/drive
drive.mount('/content/drive')

# Folder on Drive that holds everything this notebook writes; created on demand
output_dir = "/content/drive/My Drive/World Models"
os.makedirs(output_dir, exist_ok=True)

# Report whether a CUDA device is visible to PyTorch
if not torch.cuda.is_available():
    print("GPU is not available. Make sure it's enabled in the runtime settings.")
else:
    print("GPU is available and ready to use!")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

# Response Generation with vLLM
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

# Directory on Drive holding the saved model and processor files.
model_save_path = os.path.join(output_dir, "qwen_model")
if os.path.exists(model_save_path):
    print("Loading model from local save path.")
else:
    # First run: fetch the checkpoint and processor from the Hub and save both
    # into the Drive directory.
    print("Downloading model. This may take some time...")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    model.save_pretrained(model_save_path)
    processor.save_pretrained(model_save_path)

# Both branches end with the model available at `model_save_path`, so the path
# used by the rest of the notebook is assigned unconditionally. Previously it was
# set only in the "already saved" branch, so the first run raised NameError.
model_path = model_save_path

print(model_save_path)
print(model_path)

from transformers import AutoConfig

# Load the saved model configuration and fill in fields that are missing.
config = AutoConfig.from_pretrained(model_path)

# `rope_scaling` may be absent *or* present with the value None; reading it via
# getattr covers both, whereas `"factor" not in None` would raise TypeError.
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is None or "factor" not in rope_scaling:
    # NOTE(review): this replaces the whole mapping, dropping any keys the
    # checkpoint shipped with - confirm that is intended for this model.
    config.rope_scaling = {"type": "linear", "factor": 1.0}  # Default
if not hasattr(config, "rope_type"):
    config.rope_type = "yarn"  # Default
if not hasattr(config, "max_model_len"):
    config.max_model_len = 2048  # Adjust per the model

# Write the patched configuration back into the model directory, overwriting
# the config file stored there.
config.save_pretrained(model_path)

# vLLM engine options: model directory, per-prompt multimodal limits
# (10 images / 10 videos) and float16 as the dtype.
engine_options = {
    "model": model_path,
    "limit_mm_per_prompt": {"image": 10, "video": 10},
    "dtype": torch.float16,
}
llm = LLM(**engine_options)

# Decoding options shared by every generation request in this notebook.
decoding_options = {
    "temperature": 0.1,
    "top_p": 0.001,
    "repetition_penalty": 1.05,
    "max_tokens": 256,
    "stop_token_ids": [],
}
sampling_params = SamplingParams(**decoding_options)

# Fetch the MathVista benchmark from the Hugging Face Hub
dataset = load_dataset("AI4Math/MathVista")


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-1j22og66
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-1j22og66
  Resolved https://github.com/huggingface/transformers to commit b2f2977533445c4f62bf58e10b1360e6856e78ce
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10291149 sha256=f6a74f14e089

Mounted at /content/drive
GPU is available and ready to use!
Device name: NVIDIA A100-SXM4-40GB
Loading model from local save path.
/content/drive/My Drive/World Models/qwen_model
/content/drive/My Drive/World Models/qwen_model
INFO 01-06 01:58:16 llm_engine.py:232] Initializing an LLM engine (v0.6.1) with config: model='/content/drive/My Drive/World Models/qwen_model', speculative_config=None, tokenizer='/content/drive/My Drive/World Models/qwen_model', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 01-06 01:59:48 model_runner.py:1008] Loading model weights took 4.1277 GB


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 01-06 01:59:52 gpu_executor.py:122] # GPU blocks: 60586, # CPU blocks: 9362
INFO 01-06 01:59:56 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-06 01:59:56 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 01-06 02:00:20 model_runner.py:1428] Graph capturing finished in 24 secs.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

(…)-00000-of-00001-725687bf7a18d64b.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

(…)-00000-of-00002-6b81bd7f7e2065e6.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

(…)-00001-of-00002-6a611c71596db30f.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

Generating testmini split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5141 [00:00<?, ? examples/s]

TypeError: unhashable type: 'slice'

In [10]:
# Evaluate the first 100 examples of the testmini split. The count is clamped to
# the split length because `select` rejects indices that are out of range.
num_examples = min(100, len(dataset["testmini"]))
subset_dataset = dataset["testmini"].select(range(num_examples))

# Prepare results storage (filled by the inference loop, keyed by example pid)
responses = {}

# Batch processing parameters
batch_size = 8  # Adjust based on GPU memory and model size

# Start timing. `time` is imported in the notebook's import block, so this cell
# also works on a fresh kernel.
start_time = time.time()

# Initialize processor outside the loop for efficiency
processor = AutoProcessor.from_pretrained(model_path)

# Run inference over the subset in fixed-size batches and collect one record per
# example in `responses`, keyed by the example's pid.
for i in range(0, len(subset_dataset), batch_size):
    batch = subset_dataset.select(range(i, min(i + batch_size, len(subset_dataset))))
    prompts = []
    mm_data_list = []
    # Examples that were successfully turned into a prompt, kept index-aligned
    # with `prompts`. Results must be matched against this list, not `batch`:
    # when an example is skipped, positions in `batch` no longer line up with
    # positions in the model outputs.
    valid_examples = []

    # Prepare prompts and multimodal data for the batch
    for example in batch:
        try:
            query = example["query"]
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": example["decoded_image"]},
                        {"type": "text", "text": query},
                    ],
                }
            ]

            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            # Only the image part is used; the second return value is discarded.
            image_inputs, _ = process_vision_info(messages)

            mm_data = {}
            if image_inputs is not None:
                mm_data["image"] = image_inputs

            prompts.append(prompt)
            mm_data_list.append(mm_data)
            valid_examples.append(example)
        except KeyError as e:
            print(f"Missing key in example: {e}. Skipping this example.")
            continue

    # Skip batch if no valid prompts
    if not prompts:
        print("No valid prompts in this batch. Skipping batch.")
        continue

    # Batch inference
    outputs = llm.generate(
        [
            {"prompt": prompt, "multi_modal_data": mm_data}
            for prompt, mm_data in zip(prompts, mm_data_list)
        ],
        sampling_params=sampling_params
    )

    # Pair every output with the example that produced its prompt
    for example, output in zip(valid_examples, outputs):
        try:
            response = output.outputs[0].text
            responses[example["pid"]] = {
                "query": example["query"],
                "response": response,
                "question_type": example["question_type"],
                "answer_type": example["answer_type"],
                "choices": example.get("choices"),
                "precision": example.get("precision"),
                "answer": example["answer"],
            }
        except (KeyError, IndexError) as e:
            print(f"Error processing example: {e}. Skipping this result.")
            continue

# Stop the clock and report the elapsed wall time since `start_time`
time_cost = time.time() - start_time
print(f"Time cost for generating responses: {time_cost:.2f} seconds")

# Write all collected responses as one JSON document in the output folder
responses_file = os.path.join(output_dir, "responses_batch.json")
with open(responses_file, "w") as out_fh:
    json.dump(responses, fp=out_fh)

print(f"Response generation completed. Saved to {responses_file}.")

Processed prompts: 100%|██████████| 8/8 [00:02<00:00,  3.73it/s, est. speed input: 1856.82 toks/s, output: 191.00 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:00<00:00, 12.39it/s, est. speed input: 3708.16 toks/s, output: 125.72 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:01<00:00,  4.02it/s, est. speed input: 1879.18 toks/s, output: 165.48 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:03<00:00,  2.57it/s, est. speed input: 2730.22 toks/s, output: 103.11 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:00<00:00, 10.61it/s, est. speed input: 3010.60 toks/s, output: 155.44 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:08<00:00,  1.09s/it, est. speed input: 2179.58 toks/s, output: 51.95 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:00<00:00,  8.09it/s, est. speed input: 3001.35 toks/s, output: 297.70 toks/s]
Processed prompts: 100%|██████████| 8/8 [00:00<00:00, 12.63it/s, est. speed input: 3288.04 toks/s, output: 139.24 toks/s]
Processed prompts: 100%|█

Time cost for generating responses: 34.30 seconds
Response generation completed. Saved to /content/drive/My Drive/World Models/responses_batch.json.



