In [1]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from transformers import TrainerCallback
from unsloth import FastVisionModel 
from trl import SFTTrainer, SFTConfig
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator

class FinetuneQwenVL:
    """LoRA fine-tuning helper for an Unsloth vision-language model.

    The constructor loads the base model and tokenizer with
    ``FastVisionModel.from_pretrained`` and wraps the model with a LoRA adapter
    via ``FastVisionModel.get_peft_model``.  ``run`` converts the raw rows into
    chat-style samples and trains with ``SFTTrainer``.

    Each element of ``data`` must support ``row["image"]`` (a path that PIL can
    open), ``row["input"]`` (user text) and ``row["output"]`` (assistant text).
    """

    def __init__(self,
                 data,
                 epochs=1,
                 learning_rate=1e-4,
                 warmup_ratio=0.1,
                 gradient_accumulation_steps=64,
                 optim="adamw_torch",
                 model_id="unsloth/Qwen2-VL-7B-Instruct",
                 peft_r=8,
                 peft_alpha=16,
                 peft_dropout=0.05,
                 output_dir="./model_cp",
                 ):
        """Load the base model and attach the LoRA adapter.

        Args:
            data: Iterable of rows with "image", "input" and "output" entries.
            epochs: Value passed to ``num_train_epochs``.
            learning_rate: Optimizer learning rate.
            warmup_ratio: Value passed to ``warmup_ratio``.
            gradient_accumulation_steps: Micro-batches accumulated per optimizer
                step (the per-device batch size is fixed at 1 in ``run``).
            optim: Optimizer name forwarded to ``SFTConfig``.
            model_id: Model name handed to ``FastVisionModel.from_pretrained``.
            peft_r: LoRA rank.
            peft_alpha: LoRA alpha.
            peft_dropout: LoRA dropout.
            output_dir: Directory handed to ``SFTConfig(output_dir=...)``.
                Defaults to the previously hard-coded ``./model_cp``.
        """
        self.epochs = epochs
        # Kept for backward compatibility; nothing in this class reads it.
        # NOTE(review): index 1 is hard-coded -- confirm it is still wanted when
        # CUDA_VISIBLE_DEVICES restricts the process to a single GPU.
        self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
        self.model_id = model_id

        self.base_model, self.tokenizer = FastVisionModel.from_pretrained(
            model_name = self.model_id,
            load_in_4bit = False,
            use_gradient_checkpointing = "unsloth",
        )
        self.model = FastVisionModel.get_peft_model(
            self.base_model,
            finetune_vision_layers     = True, # False if not finetuning vision layers
            finetune_language_layers   = True, # False if not finetuning language layers
            finetune_attention_modules = True, # False if not finetuning attention layers
            finetune_mlp_modules       = True, # False if not finetuning MLP layers
            r = peft_r,
            lora_alpha = peft_alpha,
            lora_dropout = peft_dropout,
            bias = "none",
            random_state = 3407,
            use_rslora = False,
            loftq_config = None
        )
        self.learning_rate = learning_rate
        self.warmup_ratio = warmup_ratio
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.optim = optim
        self.data = data
        self.output_dir = output_dir

    def format_data(self, row):
        """Turn one raw row into a two-turn chat sample.

        Args:
            row: Mapping with "image" (path), "input" and "output" entries.

        Returns:
            ``{"messages": [user_turn, assistant_turn]}`` where the user turn
            holds the input text followed by the RGB image and the assistant
            turn holds the output text.

        Raises:
            FileNotFoundError: The image could not be opened or converted.  The
                underlying exception is attached as ``__cause__``.
        """
        image_path = row["image"]
        input_text = row['input']
        output_text = row['output']
        try:
            # convert() returns a separate image object, so the source file can
            # be closed as soon as the conversion is done.
            with Image.open(image_path) as source:
                image = source.convert("RGB")
        except Exception as e:
            raise FileNotFoundError(
                f"Unable to load image at path: {image_path}. Error: {e}"
            ) from e

        return {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": input_text,
                        },
                        {
                            "type": "image",
                            "image": image,
                        }
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {
                            "type": "text",
                            "text": output_text,
                        }
                    ],
                },
            ],
        }

    def run(self):
        """
        Executes the fine-tuning process.

        Every row of ``self.data`` is converted with ``format_data`` (all images
        are therefore loaded into memory up front), an ``SFTConfig`` is built
        from the stored hyper-parameters, and ``SFTTrainer.train()`` is called.
        No explicit save call is made here.
        """
        converted_dataset = [self.format_data(row) for row in self.data]
        training_args = SFTConfig(
            learning_rate=self.learning_rate,
            output_dir=self.output_dir,
            optim=self.optim,
            logging_steps=1,
            report_to="none",
            fp16 = not is_bf16_supported(),
            bf16 = is_bf16_supported(),
            logging_first_step=True,
            warmup_ratio=self.warmup_ratio,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            logging_dir='./logs',
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            num_train_epochs=self.epochs,
            weight_decay = 0.01,            # Regularization term for preventing overfitting
            lr_scheduler_type = "linear",   # Chooses a linear learning rate decay
            seed = 3407,
            logging_strategy = "steps",
            # load_best_model_at_end = True,
            # You MUST put the below items for vision finetuning:
            remove_unused_columns = False,
            dataset_text_field = "",
            dataset_kwargs = {"skip_prepare_dataset": True},
            dataset_num_proc = 4,
            max_seq_length = 2048,
        )
        FastVisionModel.for_training(self.model)

        trainer = SFTTrainer(
            model = self.model,
            tokenizer = self.tokenizer,
            data_collator = UnslothVisionDataCollator(self.model, self.tokenizer), # Must use!
            train_dataset = converted_dataset,
            args = training_args,
        )
        trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Instruction text that is prepended to every circuit.
prompt = "I want you to act as a quantum computer specialized in performing Grover’s algorithm. I will type a circuit, and you will reply with what a quantum computer should output. I want you to only reply with the output in a dictionary that contains the top-30 probabilities and nothing else. Circuit:"
# Single-qubit example circuit; not part of the training set built below.
qasm = """
qreg q[1];
creg c[1];
sdg q[0];
tdg q[0];
z q[0];
sxdg q[0];
u(5.6895009903655875,0.7219691165931532,2.039946133576617) q[0];
rx(5.1067509121076995) q[0];
y q[0];
id q[0];
u2(5.492256483666354,4.262016973063137) q[0];
y q[0];
measure q[0] -> c[0];
"""

import pandas as pd

import os
# NOTE(review): this is set after torch/unsloth were imported in an earlier cell;
# it presumably only takes effect if CUDA has not been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the circuit table and pull out the columns used for training.
CSV_FILENAME = 'Dataset/Source1/quantum_circuits_3_qubit.csv'
data = pd.read_csv(CSV_FILENAME)
x_train = data['openqasm'][:]
image_path_1 = data['image_path1'][:]
image_path_2 = data['image_path2'][:]  # loaded, but only image_path1 is used below
y_train = data['ground_truth'][:]

# One training row per circuit: first image rendering + prompt/circuit text -> ground truth.
data = [
    {
        "image": picture,
        "input": prompt + circuit,
        "output": answer,
    }
    for picture, circuit, answer in zip(image_path_1, x_train, y_train)
]

print(len(data))

# Build the fine-tuner with the hyper-parameters for this run, then train.
finetuner = FinetuneQwenVL(
    data=data,
    epochs=30,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
    optim="adamw_torch_fused",
    model_id="unsloth/Qwen2-VL-2B-Instruct",
    peft_r=16,
    peft_alpha=32,
    peft_dropout=0.0,
)

finetuner.run()


189
==((====))==  Unsloth 2025.1.6: Fast Qwen2_Vl vision patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Unsloth: Making `model.base_model.model.visual` require gradients


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 189 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 690
 "-____-"     Number of trainable parameters = 28,950,528
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.7883
2,1.7586


  source = re.sub("([^\.])nn\.", r"\1torch.nn.", source)
  "self.rotary_emb = .+?\)", function,
  "self.rotary_emb = .+?\)", function,


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.00 GiB. GPU 0 has a total capacity of 23.68 GiB of which 7.93 GiB is free. Process 372511 has 9.14 GiB memory in use. Process 377890 has 470.00 MiB memory in use. Including non-PyTorch memory, this process has 6.13 GiB memory in use. Of the allocated memory 5.62 GiB is allocated by PyTorch, and 177.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
!pip install --upgrade transformers



In [2]:
import os
import torch
from unsloth import FastVisionModel
from PIL import Image

# Module-level cache for the loaded model and its tokenizer/processor.
# Both stay None until initialize_model() has populated them.
MODEL, TOKENIZER = None, None


def find_highest_checkpoint(checkpoint_dir: str) -> str:
    """Return the path of the highest-numbered ``checkpoint-<N>`` directory.

    Only sub-directories of ``checkpoint_dir`` whose name starts with
    ``"checkpoint-"`` are considered.  The number is taken from the text
    between the first and second ``-``; entries where that text is not an
    integer (for example ``checkpoint-final``) are skipped instead of
    aborting the search.

    Args:
        checkpoint_dir: Directory that contains the checkpoint folders.

    Returns:
        ``os.path.join(checkpoint_dir, <name of the highest checkpoint>)``.

    Raises:
        ValueError: No numbered checkpoint directory exists in ``checkpoint_dir``.
    """
    best_step = None
    best_name = None
    for entry in os.listdir(checkpoint_dir):
        if not entry.startswith("checkpoint-"):
            continue
        if not os.path.isdir(os.path.join(checkpoint_dir, entry)):
            continue
        try:
            step = int(entry.split("-")[1])
        except ValueError:
            # Not a numbered checkpoint; ignore it.
            continue
        # ">=" keeps the later entry on ties, matching a stable ascending sort.
        if best_step is None or step >= best_step:
            best_step, best_name = step, entry

    if best_name is None:
        raise ValueError(f"No checkpoints found in {checkpoint_dir}")

    return os.path.join(checkpoint_dir, best_name)


def initialize_model(model_id: str, checkpoint_root: str = "./model_cp_qutip"):
    """Load the newest checkpoint once and cache it in ``MODEL`` / ``TOKENIZER``.

    On the first call the highest-numbered checkpoint below ``checkpoint_root``
    is located with ``find_highest_checkpoint``, loaded through
    ``FastVisionModel.from_pretrained`` and moved to ``"cuda"``.  Later calls
    return the cached pair without touching the arguments.

    Args:
        model_id: Accepted for interface compatibility; this function does not
            read it.
        checkpoint_root: Directory searched for ``checkpoint-<N>`` folders.

    Returns:
        Tuple ``(MODEL, TOKENIZER)``.
    """
    global MODEL, TOKENIZER

    if MODEL is None or TOKENIZER is None:
        latest_checkpoint = find_highest_checkpoint(checkpoint_root)
        print(f"Highest checkpoint found: {latest_checkpoint}")

        print("Loading base model...")
        loaded_model, loaded_tokenizer = FastVisionModel.from_pretrained(
            model_name=latest_checkpoint,  # local checkpoint directory
            load_in_4bit=False,
        )
        print("Base model loaded.")

        MODEL = loaded_model.to("cuda")
        TOKENIZER = loaded_tokenizer

    return MODEL, TOKENIZER


def run_inference_qwenvl(image: Image.Image, user_input: str, temperature: float = 0.0,
                        max_tokens: int = 500, model_id: str = "unsloth/Qwen2-VL-7B-Instruct") -> str:
    """Generate a text answer for one image plus one user message.

    Args:
        image: Image passed to the tokenizer/processor together with the prompt.
        user_input: Text placed after the image in the user turn.
        temperature: Sampling temperature.  A value greater than 0 enables
            sampling with ``min_p=0.1``; ``0`` (the default), a negative value
            or ``None`` selects greedy decoding, because 0 is not a usable
            sampling temperature.
        max_tokens: Upper bound on newly generated tokens.
        model_id: Forwarded to ``initialize_model``.

    Returns:
        The decoded continuation only (prompt tokens are stripped), with
        special tokens removed.
    """
    model, tokenizer = initialize_model(model_id)
    FastVisionModel.for_inference(model)

    # The image itself is supplied to the tokenizer call below; the message
    # only marks where it belongs in the conversation.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                },
                {
                    "type": "text",
                    "text": user_input
                },
            ]
        }
    ]
    # Render the conversation with the built-in chat template.
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens = False,
        return_tensors = "pt",
    ).to("cuda")

    generation_kwargs = {"max_new_tokens": max_tokens, "use_cache": True}
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, min_p=0.1)
    else:
        # Greedy decoding: sampling-only knobs are left out on purpose.
        generation_kwargs["do_sample"] = False

    output_ids = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt tokens so that only the newly generated part is decoded.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = output_ids[:, prompt_length:]
    generated_text = tokenizer.decode(generate_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

    return generated_text


In [3]:
# Instruction text that is prepended to every test circuit.
prompt = "I want you to act as a quantum computer specialized in performing Grover’s algorithm. I will type a circuit, and you will reply with what a quantum computer should output. I want you to only reply with the output in a dictionary that contains the top-30 probabilities and nothing else. Circuit:"

import pandas as pd
import random

# Load the held-out circuits together with both image renderings and the expected answers.
CSV_FILENAME = 'Dataset/Source1/quantum_circuits_3_qubit_test.csv'
data = pd.read_csv(CSV_FILENAME)
x_test = data['openqasm'][:]
image_path_1 = data['image_path1'][:]
image_path_2 = data['image_path2'][:]
ground_truth = data['ground_truth'][:]

# A dedicated, seeded generator makes the per-case choice between the two
# renderings repeatable across runs without touching the global random state.
IMAGE_CHOICE_SEED = 3407
image_picker = random.Random(IMAGE_CHOICE_SEED)

# Generation settings are the same for every test case, so set them once.
temperature = 1.5
max_tokens = 500
model_id = "unsloth/Qwen2-VL-2B-Instruct"

for i in range(len(x_test)):
    # Pick one of the two renderings with equal probability.
    random_number = image_picker.random()
    chosen_path = image_path_1[i] if random_number < 0.5 else image_path_2[i]

    image = Image.open(chosen_path).convert("RGB")
    image = image.resize((336, 336))

    user_input = prompt + x_test[i]

    generated_text = run_inference_qwenvl(image, user_input, temperature, max_tokens, model_id)
    # Show the model answer next to the expected answer for this test case.
    print(f"Test case {i+1}")
    print(f"Generated text: {generated_text}")
    print(f"Ground truth: {ground_truth[i]}")
    print("\n")

Highest checkpoint found: ./model_cp/checkpoint-2480
Loading base model...


KeyboardInterrupt: 

In [4]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from transformers import TrainerCallback
from unsloth import FastVisionModel 
from trl import SFTTrainer, SFTConfig
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
import pandas as pd
from sklearn.model_selection import train_test_split

CSV_FILENAME = 'metadata-qutip-updated.csv'
data = pd.read_csv(CSV_FILENAME)

# Fail early, naming the first required column that the CSV does not provide.
required_columns = ['type', 'image', 'ground_truth', 'prompt']
missing_column = next(
    (name for name in required_columns if name not in data.columns),
    None,
)
if missing_column is not None:
    raise ValueError(f"Column '{missing_column}' not found in the CSV file.")

# Shuffle all rows reproducibly before splitting.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# test_size=0.1 holds out 10% of the rows for testing; the rest is the training split.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

# Drop incomplete test rows and renumber so positions and labels agree.
test_data = test_data.dropna().reset_index(drop=True)

x_test = test_data['type'][:]
images = test_data['image'][:]
y_test = test_data['ground_truth'][:]
prompts = test_data['prompt'][:]  # extracted, but the loop below uses a fixed question instead

# Generation settings shared by every test case.
temperature = 0.5
max_tokens = 500
model_id = "unsloth/Qwen2-VL-7B-Instruct"

for case_number, (image_file, state_type, expected) in enumerate(zip(images, x_test, y_test), start=1):
    image = Image.open(image_file).convert("RGB").resize((336,336))

    # NOTE(review): the 'type' column is appended directly to the question
    # (no separator) -- confirm this matches the input format used for training.
    user_input = "What quantum state is depicted in this image?" + state_type

    generated_text = run_inference_qwenvl(image, user_input, temperature, max_tokens, model_id)
    # Show the model answer next to the expected answer for this test case.
    print(f"Test case {case_number}")
    print(f"Generated text: {generated_text}")
    print(f"Ground truth: {expected}")
    print("\n")
    


# get len images,  x_train, y_train, prompts
# print(f"len(images): {len(images)}")
# print(f"len(x_train): {len(x_train)}")
# print(f"len(y_train): {len(y_train)}")
# print(f"len(prompts): {len(prompts)}")

# print(images[16])

# fine_tune_data = []
# for i in range(len(x_train)):
#     # print the index for debugging
#     print(f"Index: {i}")
#     fine_tune_data.append({
#         "image": images[i],
#         "input": prompts[i] + x_train[i],
#         "output": y_train[i],
#     })

# finetuner = FinetuneQwenVL(
#     data=fine_tune_data,
#     epochs=10,
#     learning_rate=5e-5,
#     warmup_ratio=0.1,
#     gradient_accumulation_steps=8,
#     optim="adamw_torch_fused",
#     model_id="unsloth/Qwen2-VL-7B-Instruct",
#     peft_r=16,
#     peft_alpha=16,
#     peft_dropout=0.0,
# )

# finetuner.run()


Test case 1
Generated text: This is a coherent state with alpha equal to 1, number of qubits equal to 17 in the linear space -10 to 10.
Ground truth: This is a coherent state with alpha equal to 4, number of qubits equal to 18 in the linear space -5 to 5.


Test case 2
Generated text: This is a thermal state with average number of photons is 5, number of qubits equal to 15 in the linear space -10 to 10.
Ground truth: This is a thermal state with average number of photons is 5, number of qubits equal to 15 in the linear space -5 to 5.


Test case 3
Generated text: This is a random state with density is 0.1, number of qubits equal to 15 in the linear space -10 to 10.
Ground truth: This is a random state with density is 0.7, number of qubits equal to 18 in the linear space -5 to 5.


Test case 4
Generated text: This is a random state with density is 0.5, number of qubits equal to 11 in the linear space -10 to 10.
Ground truth: This is a random state with density is 0.5, number of qubits e