In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.cuda.device_count())  # Should show how many GPUs are available
print(torch.cuda.current_device())  # Should show the current active GPU
print(torch.__version__)  # This will print the version of PyTorch

True
1
0
2.5.1+cu121


In [3]:
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.76s/it]


In [5]:
def GenerateResponse(Text,model,tokenizer,device):
    # Define the message
    messages = [
        {"role": "user", "content": Text},
    ]
    # Prepare the inputs
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    # Check devices
    print("Model is on:", model.device)
    print("Input tensor is on:", inputs['input_ids'].device)
    outputs = model.generate(**inputs, max_new_tokens=40)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])
    return response

In [6]:
Text = "Who are you?"

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
# Decode the output
print(GenerateResponse(Text,model,tokenizer,device))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model is on: cuda:0
Input tensor is on: cuda:0
I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>


In [9]:
!nvidia-smi

Thu Oct  2 20:30:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 577.03                 Driver Version: 577.03         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P8              8W /   75W |    2473MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                