In [1]:
import json
import os

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide settings, read by the loading and saving cells further down.
# Output directory that the saving cell writes the model and tokenizer into.
save_path = "./gemma2b-4bit"
# Hugging Face Hub repository id of the checkpoint to download.
model_id = "google/gemma-2b"

In [None]:
# Resolve the Hugging Face access token without depending on a single source.
# Lookup order:
#   1. the "huggingface_token" entry of ./config.json (the original behaviour),
#   2. the HF_TOKEN environment variable,
#   3. None - the loaders are then called with token=None; per the
#      transformers docs they fall back to credentials cached by
#      `huggingface-cli login`, if any.
# A missing file or missing key no longer aborts the notebook on a fresh run.
# config.json holds a secret: keep it out of version control.
config = {}
if os.path.isfile('config.json'):
    # Explicit encoding so parsing does not depend on the platform default.
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

# An empty or absent config entry falls through to the environment variable.
hf_token = config.get('huggingface_token') or os.environ.get('HF_TOKEN')

if not hf_token:
    print("No Hugging Face token found in config.json or HF_TOKEN; "
          "continuing without an explicit token.")

In [3]:
# Use the GPU whenever PyTorch reports a usable CUDA device; otherwise stay on
# the CPU. The chosen value drives the device_map decision in the loading cell.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

Using device: cpu


In [4]:
# 4-bit quantization settings, later passed to the model loader as
# `quantization_config`.
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",  # "fp4" is the other accepted value
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quant_settings)


In [9]:
# Download (or reuse from the local cache) the tokenizer and the model for
# `model_id`, applying the 4-bit quantization config to the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

# With CUDA available, let the loader decide placement ("auto"); otherwise
# map the whole model (the "" key) onto the CPU.
# NOTE(review): the recorded CPU run printed a bitsandbytes warning that GPU
# quantization is unavailable - confirm the weights really end up quantized
# when no GPU is present.
if device == "cuda":
    placement = "auto"
else:
    placement = {"": "cpu"}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    device_map=placement,
    quantization_config=bnb_config,
)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.
Fetching 2 files: 100%|██████████| 2/2 [09:03<00:00, 271.77s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [04:08<00:00, 124.41s/it]


In [10]:
# Write the model first and then the tokenizer into the same directory, so the
# pair can later be reloaded from `save_path` alone.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to ./gemma2b-4bit
