In [3]:
! pip install transformers peft datasets accelerate

[33mDEPRECATION: Loading egg at /home/g2/LLM-GGC-Upgrade/.env/lib/python3.12/site-packages/bitsandbytes-0.41.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m


In [12]:
# test.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, PeftModel

from datasets import load_dataset
from accelerate import Accelerator

accelerator = Accelerator()

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-2_6-int4',
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map="auto",
    offload_folder="offload"
)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


model, tokenizer = accelerator.prepare(model, tokenizer)

# Define LoRA configuration with target_modules
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # Task type (e.g., Causal Language Modeling)
    inference_mode=False,  # Set to True for inference-only use
    r=8,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj"],  # Specify target modules for LoRA
)

# Apply LoRA to the model
peft_model = get_peft_model(model, lora_config)

dataset = load_dataset("U4R/ChartX", split="validation")


def tokenize_function(examples):

    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

from transformers import TrainingArguments, Trainer

# Define training arguments

training_args = TrainingArguments(

    output_dir="./lora-fine-tuned", # Output directory

    per_device_train_batch_size=8,

    num_train_epochs=3,

    logging_dir="./logs",

    logging_steps=10,

    save_steps=500,

    save_total_limit=2,

    evaluation_strategy="steps",

    eval_steps=500,

    learning_rate=5e-4, # Smaller learning rate for LoRA

    ddp_find_unused_parameters=False,  # Helps avoid certain DDP bugs

)

# Define a Trainer

trainer = Trainer(

    model=peft_model,

    args=training_args,

    train_dataset=tokenized_dataset,
    

)

# Fine-tune the model

trainer.train()

peft_model.save_pretrained("./lora-fine-tuned")

# Load the fine-tuned model for inference

fine_tuned_model = PeftModel.from_pretrained(model, "./lora-fine-tuned")







ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
model.eval()

image = Image.open('xx.jpg').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]

res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(res)

## if you want to use streaming, please make sure sampling=True and stream=True
## the model.chat will return a generator
res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7,
    stream=True
)

generated_text = ""
for new_text in res:
    generated_text += new_text
    print(new_text, flush=True, end='')
