In [1]:
!nvcc --version

# !pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
!pip install git+https://github.com/huggingface/transformers
!pip install accelerate
!pip install qwen-vl-utils
!pip install datasets
!CUDA_VERSION=cu121
!pip install 'vllm==0.6.1' --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION}


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-w8zc86at
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-w8zc86at
  Resolved https://github.com/huggingface/transformers to commit b2f2977533445c4f62bf58e10b1360e6856e78ce
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10291149 sha256=a3dd3be04ff1

In [1]:
from google.colab import drive
import os
import json
import re
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from datasets import load_dataset

In [2]:
# Attach Google Drive at /content/drive.
drive.mount('/content/drive')

# All artifacts written by this notebook live under this Drive folder;
# create it if it is not there yet.
output_dir = "/content/drive/My Drive/World Models"
os.makedirs(output_dir, exist_ok=True)

# Report whether a CUDA device is visible to this runtime.
if not torch.cuda.is_available():
    print("GPU is not available. Make sure it's enabled in the runtime settings.")
else:
    print("GPU is available and ready to use!")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

# Names needed for response generation with vLLM.
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GPU is available and ready to use!
Device name: NVIDIA A100-SXM4-40GB


In [3]:
# Keep a local copy of the Qwen2-VL checkpoint (weights + processor) under
# output_dir so later runs can skip the download.
model_save_path = os.path.join(output_dir, "qwen_model")
if os.path.exists(model_save_path):
    print("Loading model from local save path.")
else:
    print("Downloading model. This may take some time...")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    model.save_pretrained(model_save_path)
    processor.save_pretrained(model_save_path)

# Later cells read `model_path`, so it must be defined on BOTH branches.
# It is assigned only after the branch above has completed, so a failed
# download/save does not leave it pointing at a directory that was never written.
model_path = model_save_path

Loading model from local save path.


In [4]:
# Sanity check: show the save location and the path later cells load from,
# one per line.
print(model_save_path, model_path, sep="\n")

/content/drive/My Drive/World Models/qwen_model
/content/drive/My Drive/World Models/qwen_model


In [5]:
from transformers import AutoConfig

# Patch the saved model config in place: fill in defaults for the fields
# checked below, then write the config back to model_path.
config = AutoConfig.from_pretrained(model_path)

# `rope_scaling` can exist on the config but be None, in which case a bare
# `"factor" not in config.rope_scaling` raises TypeError. Read it defensively
# and treat missing / None / empty the same way.
rope_scaling = getattr(config, "rope_scaling", None)
if not rope_scaling or "factor" not in rope_scaling:
    # NOTE(review): this replaces the whole dict, so any other keys that were
    # present in the original rope_scaling are discarded — confirm that is
    # intended for this model.
    config.rope_scaling = {"type": "linear", "factor": 1.0}  # Default
if not hasattr(config, "rope_type"):
    config.rope_type = "yarn"  # Default
if not hasattr(config, "max_model_len"):
    config.max_model_len = 2048  # Adjust per the model

# Overwrites the config stored in model_path.
config.save_pretrained(model_path)


In [6]:
# Build the vLLM engine from the local checkpoint directory.
# (The processor was saved into the same directory earlier; presumably that
# includes the tokenizer files, so no separate tokenizer save is done here.)
llm = LLM(
    model_path,
    dtype=torch.float16,
    limit_mm_per_prompt={"image": 10, "video": 10},  # per-prompt caps on image / video inputs
)

# Decoding settings shared by every request in this notebook.
sampling_params = SamplingParams(
    max_tokens=256,
    temperature=0.1,
    top_p=0.001,
    repetition_penalty=1.05,
    stop_token_ids=[],
)

# MathVista benchmark from the Hugging Face Hub (all available splits).
dataset = load_dataset("AI4Math/MathVista")


INFO 01-05 18:47:10 llm_engine.py:232] Initializing an LLM engine (v0.6.1) with config: model='/content/drive/My Drive/World Models/qwen_model', speculative_config=None, tokenizer='/content/drive/My Drive/World Models/qwen_model', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/content/drive/My Drive/World Models/qwen_model, use_v2_block_manager=False, n

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 01-05 18:48:40 model_runner.py:1008] Loading model weights took 4.1277 GB


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 01-05 18:48:44 gpu_executor.py:122] # GPU blocks: 60586, # CPU blocks: 9362
INFO 01-05 18:48:47 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-05 18:48:47 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 01-05 18:49:12 model_runner.py:1428] Graph capturing finished in 25 secs.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

(…)-00000-of-00001-725687bf7a18d64b.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

(…)-00000-of-00002-6b81bd7f7e2065e6.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

(…)-00001-of-00002-6a611c71596db30f.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

Generating testmini split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5141 [00:00<?, ? examples/s]

In [9]:
# Run the model over the MathVista "testmini" split and store one record per
# problem id (pid) in a JSON file under output_dir.
subset_dataset = dataset["testmini"]

# Prepare results storage: pid -> record with the query, the raw model
# response and the fields copied from the dataset example.
responses = {}

# The processor does not depend on the example, so load it once here instead
# of re-reading it from disk on every loop iteration.
processor = AutoProcessor.from_pretrained(model_path)

import time
start_time = time.time()

# Generate responses, one request per example.
for example in subset_dataset:

    query = example["query"]  # Use the query field directly from the dataset
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": example["decoded_image"]},
                {"type": "text", "text": query},
            ],
        }
    ]
    # Render the chat messages into a prompt string; the image itself is
    # passed separately through multi_modal_data.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)

    mm_data = {}
    if image_inputs is not None:
        mm_data["image"] = image_inputs

    llm_inputs = {
        "prompt": prompt,
        "multi_modal_data": mm_data,
    }

    outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
    # One request was submitted, so take its first (only) completion.
    response = outputs[0].outputs[0].text

    responses[example["pid"]] = {
        "query": query,
        "response": response,
        "question_type": example["question_type"],
        "answer_type": example["answer_type"],
        "choices": example.get("choices"),
        "precision": example.get("precision"),
        "answer": example["answer"],
    }

time_cost = time.time() - start_time
print(f"Time cost for generating responses: {time_cost:.2f} seconds")

responses_file = os.path.join(output_dir, "responses.json")
with open(responses_file, "w") as f:
    json.dump(responses, f)

print(f"Response generation completed. Saved to {responses_file}.")



Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it, est. speed input: 856.68 toks/s, output: 132.03 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.33it/s, est. speed input: 4655.87 toks/s, output: 17.42 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it, est. speed input: 127.02 toks/s, output: 133.91 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 10.71it/s, est. speed input: 1845.68 toks/s, output: 21.46 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  9.99it/s, est. speed input: 1941.51 toks/s, output: 30.02 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 10.27it/s, est. speed input: 1225.02 toks/s, output: 30.87 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 10.27it/s, est. speed input: 1501.86 toks/s, output: 30.85 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  8.84it/s, est. speed input: 4346.79 toks/s, output: 17.88 toks/s]
Processed prompts: 100%|████████

Time cost for generating responses: 1781.47 seconds
Response generation completed. Saved to /content/drive/My Drive/World Models/responses.json.



