#GPU memory for LLM models
Made by: Wilfredo Aaron Sosa Ramos

In [15]:
%%writefile get_gpu_memory.py
from typing import Dict, Union
from huggingface_hub import get_safetensors_metadata
import argparse
import sys

# Example:
# python get_gpu_memory.py Owen/Qwen2.5-7B-Instruct

# Dictionary mapping dtype strings to their byte sizes
bytes_per_dtype = {"int4": 0.5, "int8": 1, "float8": 1, "float16": 2, "float32": 4}

def calculate_gpu_memory(parameters: float, bytes: float) -> float:
    """
    Calculates the GPU memory required for serving a Large Language Model (LLM).
    This function estimates the GPU memory needed using the formula:
    M = (P * 4B) / (32 / Q) * 1.18
    where:
    - M is the GPU memory in Gigabytes
    - P is the number of parameters in billions (e.g., 7 for a 7B model)
    - 4B represents 4 bytes per parameter
    - 32 represents bits in 4 bytes
    - Q is the quantization bits (e.g., 16, 8, or 4 bits)
    - 1.18 represents ~18% overhead for additional GPU memory requirements

    Args:
        parameters: Number of model parameters in billions
        bytes: Number of bytes per parameter based on dtype

    Returns:
        Estimated GPU memory required in Gigabytes
    """
    memory = round((parameters * 4) / (32 / (bytes * 8)) * 1.18, 2)
    return memory

def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
    """
    Get the estimated GPU memory requirement for a Hugging Face model.

    Args:
        model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
        dtype: Data type for model loading ("float16", "int8", etc.)

    Returns:
        Estimated GPU memory in GB, or None if estimation fails
    """
    try:
        if dtype not in bytes_per_dtype:
            raise ValueError(
                f"Unsupported dtype: {dtype}. Supported types: {list(bytes_per_dtype.keys())}"
            )

        metadata = get_safetensors_metadata(model_id)
        if not metadata or not metadata.parameter_count:
            raise ValueError(f"Could not fetch metadata for model: {model_id}")

        model_parameters = list(metadata.parameter_count.values())[0]
        model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
        return calculate_gpu_memory(model_parameters, bytes_per_dtype[dtype])

    except Exception as e:
        print(f"Error estimating model size: {str(e)}", file=sys.stderr)
        return None

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model_id", help="Hugging Face model ID (e.g., Qwen/Qwen2.5-7B-Instruct)"
    )
    parser.add_argument(
        "--dtype",
        default="float16",
        choices=bytes_per_dtype.keys(),
        help="Data type for model loading",
    )
    args = parser.parse_args()
    size = get_model_size(args.model_id, args.dtype)
    print(
        f"Estimated GPU memory requirement for {args.model_id}: {size:.2f} GB ({args.dtype})"
    )

if __name__ == "__main__":
    main()

Overwriting get_gpu_memory.py


In [9]:
!python get_gpu_memory.py microsoft/phi-4

Parse safetensors files: 100% 6/6 [00:00<00:00, 29.70it/s]
Estimated GPU memory requirement for microsoft/phi-4: 34.60 GB (float16)


In [11]:
!python get_gpu_memory.py deepseek-ai/DeepSeek-V3

model.safetensors.index.json: 100% 8.90M/8.90M [00:00<00:00, 23.5MB/s]
Parse safetensors files: 100% 163/163 [00:03<00:00, 53.33it/s]
Estimated GPU memory requirement for deepseek-ai/DeepSeek-V3: 0.10 GB (float16)


In [16]:
!python get_gpu_memory.py jinaai/ReaderLM-v2

Estimated GPU memory requirement for jinaai/ReaderLM-v2: 4.19 GB (float16)
