In [1]:
# LLM Inference & Quantization Demo (Google Colab)

# Install dependencies
!pip install -q transformers accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Choose a small, quantized model (using bitsandbytes if supported)
MODEL_NAME = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"  # Placeholder for demonstration

# Try loading a real model from HuggingFace Hub that supports 4-bit quantization
QUANTIZED_MODEL = "tiiuae/falcon-7b-instruct"  # Replace with a real quantized model if available

print("Checking GPU availability...")
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("Running on CPU. Inference will be slow.")

# Load tokenizer and model with 8-bit quantization (simulated here)
print("Loading quantized model...")
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    QUANTIZED_MODEL,
    load_in_8bit=True,
    device_map="auto"
)

# Build text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompt the model
prompt = "Explain how large language models can run on a local machine."

print("\nGenerating response...")
outputs = generator(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
print(outputs[0]['generated_text'])

# Memory size simulation for float32, float16, int8
def estimate_model_size(params_in_billions):
    sizes = {
        "float32": params_in_billions * 4 * 1e9 / (1024 ** 3),
        "float16": params_in_billions * 2 * 1e9 / (1024 ** 3),
        "int8": params_in_billions * 1 * 1e9 / (1024 ** 3),
        "int4": params_in_billions * 0.5 * 1e9 / (1024 ** 3),
    }
    return sizes

# Simulate sizes for a 7B model
sizes_7b = estimate_model_size(7)

print("\nEstimated VRAM usage for 7B parameters:")
for precision, size in sizes_7b.items():
    print(f"{precision.upper()}: ~{size:.2f} GB")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Generating response...
Explain how large language models can run on a local machine.
Large language models can be run on a local machine by using TensorFlow or PyTorch, which are popular deep learning frameworks. These frameworks allow for efficient training and running of large-scale language models. The model is typically split into smaller parts that can be loaded and executed in parallel on multiple GPUs for faster training and inference. TensorFlow and PyTorch offer various features such as tensor cores for efficient matrix multiplication, which can significantly speed up model execution. Additionally, they provide tools for data

Estimated VRAM usage for 7B parameters:
FLOAT32: ~26.08 GB
FLOAT16: ~13.04 GB
INT8: ~6.52 GB
INT4: ~3.26 GB
