In [1]:
def estimate_memory_usage(model_size, precision, sequence_length, batch_size, hidden_size, num_layers, optimizer):
    """
    Estimate the VRAM required for fine-tuning a model.

    Parameters:
        model_size (int): Number of parameters in the model (e.g., 7 billion = 7e9).
        precision (int): Bytes per parameter (e.g., 2 for FP16, 4 for FP32).
        sequence_length (int): Length of input sequences.
        batch_size (int): Number of samples per batch.
        hidden_size (int): Hidden size of the model.
        num_layers (int): Number of layers in the model.
        optimizer (str): Optimizer type ('adam', 'adamw', 'sgd', 'rmsprop', 'adafactor').

    Returns:
        float: Estimated VRAM usage in GB.
    """
    # Define optimizer memory factors
    optimizer_memory_factors = {
        'adam': 2,
        'adamw': 2,
        'sgd': 1,
        'rmsprop': 2,
        'adafactor': 1.5
    }

    optimizer = optimizer.lower()
    if optimizer not in optimizer_memory_factors:
        print(f"Please specify a valid optimizer. Valid optimizers are: {list(optimizer_memory_factors.keys())}")
        return

    # Model Weights
    model_weight_memory = model_size * precision

    # Activations (approximate)
    activation_memory = batch_size * sequence_length * hidden_size * num_layers * precision

    # Gradients (similar size to activations)
    gradient_memory = activation_memory

    # Optimizer States
    optimizer_factor = optimizer_memory_factors[optimizer]
    optimizer_memory = optimizer_factor * model_weight_memory

    # Total Memory
    total_memory = model_weight_memory + activation_memory + gradient_memory + optimizer_memory

    # Convert to GB
    total_memory_gb = total_memory / (1024 ** 3)
    return total_memory_gb

### Testing the function
The following code will test the function using the torch.cuda.memory_allocated, which tells the user how much memory will be allocated during a finetuning run.

In [2]:
TEXT = """
Once upon a time, in a faraway kingdom, there lived a wise and kind king. His name was King Cedric, and he ruled the kingdom with fairness and compassion. The people loved him dearly for his just decisions and his love for the well-being of his subjects.

One day, a messenger arrived at the palace with a troubling message. The neighboring kingdom had been struck by a terrible drought, and the people there were suffering greatly. King Cedric, being the compassionate ruler that he was, immediately called for a meeting of his advisors to discuss how they could help.

The advisors were divided. Some suggested sending food and water to the neighboring kingdom, while others suggested sending gold to help them rebuild their infrastructure. King Cedric listened attentively to all of their ideas, weighing the pros and cons of each suggestion.

After much discussion, King Cedric made his decision. He would send not only food and water but also skilled engineers and laborers to help rebuild the kingdom's infrastructure. He knew that this would not only help the people in the neighboring kingdom but also strengthen the bond between the two nations.

As the days passed, King Cedric's decision proved to be a wise one. The neighboring kingdom flourished, and the bond between the two kingdoms grew stronger. King Cedric's people admired him even more for his selflessness and the positive impact he had on the world around him.

And so, the story of King Cedric and his wise decision became a tale told for generations, reminding everyone of the power of compassion, leadership, and the importance of helping others in times of need.
"""

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
import numpy as np

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '<pad>'})
model.resize_token_embeddings(len(tokenizer))

# Move Model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Tokenize the long text document
batch_size = 4  # Adjust batch size as needed
sequence_length = model.config.max_position_embeddings

# Tokenizing and preparing input text for the model
long_text_input = tokenizer([TEXT] * batch_size,
                            return_tensors="pt",
                            padding="max_length",
                            max_length=sequence_length,
                            truncation=True).to(device)

# Resize model's token embeddings with new special tokens
model.resize_token_embeddings(len(tokenizer))

# Define Optimizer
optimizer = torch.optim.Adam(model.parameters())

# Reset GPU Memory Stats
torch.cuda.empty_cache()

# Measure VRAM During Multiple Training Steps
def measure_vram(num_passes=10):
    allocated_memory_list = []
    peak_memory_list = []

    for i in range(num_passes):
        # Forward Pass
        labels = long_text_input['input_ids']
        outputs = model(**long_text_input, labels=labels)
        loss = outputs.loss  # Use the proper loss attribute

        # Backward Pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Memory Stats
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # GB
        peak_memory = torch.cuda.max_memory_allocated(device) / 1024**3  # GB

        allocated_memory_list.append(allocated_memory)
        peak_memory_list.append(peak_memory)

    # Return average memory stats over all passes
    avg_allocated_memory = sum(allocated_memory_list) / num_passes
    avg_peak_memory = sum(peak_memory_list) / num_passes

    # Calculate variance
    allocated_variance = np.var(allocated_memory_list)
    peak_variance = np.var(peak_memory_list)

    return avg_allocated_memory, avg_peak_memory, allocated_variance, peak_variance

# Measure VRAM over 10 passes
num_passes = 10
avg_allocated, avg_peak, allocated_variance, peak_variance = measure_vram(num_passes=num_passes)
print(f"Average Allocated VRAM over {num_passes} passes: {avg_allocated:.2f} GB")
print(f"Average Peak VRAM over {num_passes} passes: {avg_peak:.2f} GB")
print(f"Variance of Allocated VRAM: {allocated_variance:.4f} GB^2")
print(f"Variance of Peak VRAM: {peak_variance:.4f} GB^2")

# Dynamically retrieve model parameters
hidden_size = model.config.n_embd  # Hidden size
num_layers = model.config.n_layer  # Number of layers

# Automatically detect precision (FP16 if available)
# precision = 16 if model.device.type == "cuda" and torch.cuda.get_device_capability(model.device)[0] >= 7 else 32
precision = 32

# Total parameters in the model
model_size = sum(p.numel() for p in model.parameters())  # Total parameters

# Estimate VRAM based on the model's architecture and precision
estimated_vram = estimate_memory_usage(
    model_size=model_size,
    precision=precision,
    sequence_length=sequence_length,
    batch_size=batch_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    optimizer="adam"
)

print(f"\nEstimated VRAM: {estimated_vram:.2f} GB\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Average Allocated VRAM over 10 passes: 2.63 GB
Average Peak VRAM over 10 passes: 8.77 GB
Variance of Allocated VRAM: 0.0000 GB^2
Variance of Peak VRAM: 0.1711 GB^2

Estimated VRAM: 13.38 GB

