In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, if it can, name device 0.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    print("GPU device name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU device name: NVIDIA GeForce RTX 4070


In [5]:
import os
import json
from dataclasses import dataclass, field
from typing import Optional, Dict, Any

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Ensure torch is available for device selection
import torch

In [6]:
# Model identifier; used as the default `model_name` of `answer_question`, which
# passes it to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Relative path whose existence is checked with `os.path.isdir` in a later cell.
# NOTE(review): the doubled "//" looks unintentional — confirm "./dataset" was meant.
DATA_DIR = ".//dataset"

# Not referenced by any cell visible here; presumably the output directory for the
# LoRA fine-tuning run — TODO confirm against the training cells.
OUTPUT_DIR = ".//qwen25_resume_lora"


In [7]:
# Sanity check: report whether DATA_DIR points at an existing directory.
print("Directory exists" if os.path.isdir(DATA_DIR) else "Directory does not exist")

Directory exists


In [10]:
def answer_question(question: str, model_name: str = MODEL_NAME, max_new_tokens: int = 50, device: Optional[str] = None) -> str:
    """Generate a greedy-decoded answer to a single user question.

    The tokenizer and model for ``model_name`` are loaded on every call (weights
    are downloaded the first time), the question is wrapped as a one-message
    chat prompt, and only the newly generated tokens are decoded and returned.

    Args:
        question: Text used as the content of the single ``user`` message.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        device: Device string the model is moved to. When ``None``, ``'cuda'``
            is used if ``torch.cuda.is_available()`` and ``'cpu'`` otherwise.

    Returns:
        The decoded continuation with special tokens skipped and the prompt
        tokens removed.
    """
    # Select device: provided device > CUDA if available > CPU
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load tokenizer and model (this will download weights the first time)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Build chat-style inputs expected by the tokenizer helper
    messages = [{"role": "user", "content": question}]
    encoded = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)

    # Normalise the tokenizer output into keyword arguments for generate().
    # The branch tests for a tensor rather than for `dict`: the mapping returned
    # by tokenizers (BatchEncoding) is a UserDict subclass, so an
    # `isinstance(..., dict)` test would misroute it down the tensor path.
    if isinstance(encoded, torch.Tensor):
        model_inputs = {"input_ids": encoded.to(model.device)}
    else:
        model_inputs = {
            key: (value.to(model.device) if hasattr(value, 'to') else value)
            for key, value in encoded.items()
        }

    outputs = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # If model.generate returned a tuple, take first element
    if isinstance(outputs, tuple):
        outputs = outputs[0]

    # Prompt length along the sequence axis; 0 when no input_ids are present,
    # in which case the whole output is decoded.
    input_ids = model_inputs.get('input_ids')
    input_len = input_ids.shape[-1] if input_ids is not None else 0

    # Slice out the newly generated tokens and decode
    generated = outputs[0, input_len:] if outputs.dim() == 2 else outputs[input_len:]
    return tokenizer.decode(generated, skip_special_tokens=True)

In [12]:
# Smoke test: ask one question with the default model settings and print the reply.
question = "What is the capital of France?"
print(answer_question(question=question))

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.47it/s]



The capital of France is Paris.
