In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Hub identifiers: the base checkpoint and the LoRA adapter applied on top of it.
BASE_MODEL_ID = "Qwen/Qwen3-1.7B"
ADAPTER_ID = "GermanovDev/qwen3-pubmed-summarization"

# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# for the quantized layers is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer is taken from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Quantized base model; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

# Attach the LoRA adapter weights to the quantized base model.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)

# Generate summary
# Prompt layout: an instruction line, the article under "Article:", and a
# trailing "Summary:" cue that the model is expected to continue from.
prompt = """You are a helpful assistant who writes concise, factual summaries of articles. Summarize the following article into a few sentences.
Article:
GLP-1 receptor agonists have demonstrated cardiovascular benefits in patients with type 2 diabetes, reducing major adverse cardiac events by 14% in recent trials.
Summary:"""

# Follow the device the model was actually placed on. The base model is loaded
# with device_map="auto", so guessing "cuda"/"cpu" independently can disagree
# with that placement and lead to a device-mismatch error in generate().
device = model.device

inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Number of prompt tokens; used below to drop the echoed prompt from the output.
prompt_length = inputs["input_ids"].shape[1]

outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,  # no sampling
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the tokens that come after the prompt in the first (and only)
# returned sequence.
summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(summary)

 The article discusses the cardiovascular benefits of GLP-1 receptor agonists in patients with type 2 diabetes, showing a 14% reduction in major adverse cardiac events in recent clinical trials.
