In [3]:
# Colab-only setup: run this cell when the notebook was opened on its own in Colab
# (a single notebook without the rest of the repository); otherwise ignore this cell.

# On a fresh runtime, uncomment the clone below first -- without it the %cd that
# follows has no directory to enter.
#!git clone https://github.com/CowboyPhilip/HPML-Energy-Efficient-LLM
%cd HPML-Energy-Efficient-LLM 
!ls

/content/HPML-Energy-Efficient-LLM
basework	     nvml_test.py	      results
DeepSeek_MBPP.ipynb  patched_energy_utils.py  run_deepseek_mmlu_test.py
measure_mmlu.ipynb   README.md		      utils


In [4]:
# Put the Colab checkout of the repository on the import path so that the
# `utils` package can be imported by the cells below.
import importlib
import os
import sys

sys.path.append("/content/HPML-Energy-Efficient-LLM")

In [30]:
# Development helper: reload utils.test_generation so that edits made to the module
# on disk are picked up without restarting the kernel. The reload has to run before
# the from-import so that test_generation_MATH is bound to the reloaded function.
import importlib
import utils.test_generation
importlib.reload(utils.test_generation)
from utils.test_generation import test_generation_MATH

In [4]:
# -*- coding: utf-8 -*-
"""DeepSeek Energy Consumption Benchmark

This notebook measures energy consumption and carbon footprint of LLMs with
different quantization methods (FP16, INT8, INT4).
Optimized for Google Colab A100 GPU (40GB).
"""

'DeepSeek Energy Consumption Benchmark\n\nThis notebook measures energy consumption and carbon footprint of LLMs with\ndifferent quantization methods (FP16, INT8, INT4).\nOptimized for Google Colab A100 GPU (40GB).\n'

In [5]:
# Cell 1: Install the correct Zeus package in colab
!pip install zeus-ml  # The correct package name is zeus-ml, not zeus
!pip install --upgrade pip setuptools
!pip install transformers \
            bitsandbytes \
            zeus-ml \
            torch \
            datasets \
            evaluate \
            scikit-learn \
            geocoder \
            requests \
            flash-attn

Collecting zeus-ml
  Downloading zeus_ml-0.11.0.post1-py3-none-any.whl.metadata (8.9 kB)
Collecting tyro (from zeus-ml)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting amdsmi (from zeus-ml)
  Downloading amdsmi-6.4.0-py3-none-any.whl.metadata (2.8 kB)
Collecting shtab>=1.5.6 (from tyro->zeus-ml)
  Downloading shtab-1.7.2-py3-none-any.whl.metadata (7.4 kB)
Downloading zeus_ml-0.11.0.post1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading amdsmi-6.4.0-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.7/56.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.9.19-py3-none-any.whl (124 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.3/124.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.2-py3-none-any.whl (14 kB)
Installing collected packages: s

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)


In [8]:
# Cell 2: Environment setup and imports
import os

# Environment variables for better performance.
# They are set before torch / zeus / transformers are imported so that the values are
# guaranteed to be in place when those libraries first read their configuration.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["ZEUS_DISABLE_AMD_SMI"] = "1"
os.environ["TRANSFORMERS_LLM_INT8_ENABLE_FP32_CPU_OFFLOAD"] = "1"

import gc
import json
import time
from functools import partial

import geocoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)
from zeus.monitor import ZeusMonitor

cuda_ready = torch.cuda.is_available()

# Clear GPU cache (only meaningful, and only safe to call, when a CUDA device exists)
if cuda_ready:
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Check GPU information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
if cuda_ready:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    # Fail soft here so the problem is reported clearly instead of as a CUDA traceback.
    print("GPU device: none detected - select a GPU runtime before running the benchmarks")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU device: Tesla T4
GPU memory: 15.83 GB


In [9]:
# Cell 3: Memory management utilities
from utils.memory_utils import clean_memory, print_gpu_memory

In [10]:
# Cell 4: EnergyTracker and Carbon intensity estimation
from utils.energy_utils import EnergyTracker, get_carbon_intensity, joules_to_co2

In [None]:
# Cell 5: Model loading functions with memory optimization
from utils.load_llm import load_llm, load_classifier

In [11]:
# Cell 6: Text Generation Energy Benchmark
from utils.test_generation import compare_generation_energy, quick_test_generation

In [12]:
def convert_numpy(obj):
    """
    Recursively convert NumPy types to Python native types for JSON serialization.

    Handles NumPy booleans, integers, floats and arrays, and walks through dicts
    (keys and values), lists and tuples so that NumPy values nested at any depth
    are converted as well.

    Args:
        obj: Any object; typically a (nested) benchmark result structure.

    Returns:
        The same structure with NumPy scalars replaced by bool/int/float and arrays
        replaced by lists. Lists stay lists, tuples stay tuples, and objects of any
        other type are returned unchanged.
    """
    # np.bool_ is neither np.integer nor np.floating, and json cannot encode it as-is.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()  # Convert NumPy array to (nested) list
    if isinstance(obj, dict):
        # Keys are converted too: json.dumps rejects NumPy integer keys.
        return {convert_numpy(k): convert_numpy(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy(item) for item in obj]
    if isinstance(obj, tuple):
        # json serializes tuples, but would still fail on NumPy values inside them.
        return tuple(convert_numpy(item) for item in obj)
    return obj  # Return unchanged if not a NumPy type or a supported container

In [13]:
# Cell 7: GLUE Task Energy Benchmarking, and GLUE benchmark with different quantization methods
from utils.test_glue import run_glue_energy_monitoring, test_quantized_models_on_glue

In [14]:
# Cell 8: Full Benchmark Function
def run_full_benchmark(model_name, run_fp16=False):
    """
    Run a full benchmark of both generation and GLUE tasks with different quantization modes.

    INT8 and INT4 are always benchmarked; FP16 is added in front when ``run_fp16`` is true.
    After both parts finish, the ``total_energy`` values found in the per-mode results are
    summed per mode and printed together with a carbon estimate and the relative savings.
    A mode or task without a ``total_energy`` entry (for example because it failed) counts
    as unmeasured: it contributes nothing to the totals and is left out of the savings
    comparison instead of being reported as a 100% saving or crashing the summary.

    Args:
        model_name: HuggingFace model name to benchmark
        run_fp16: Whether to include FP16 mode (memory intensive)

    Returns:
        Dictionary with benchmark results: ``{'generation': ..., 'glue': ...}`` holding
        the values returned by compare_generation_energy and test_quantized_models_on_glue.
    """
    print("="*80)
    print(f"RUNNING ENERGY BENCHMARK FOR {model_name}")
    print("="*80)
    results = {}

    # Determine modes to test
    modes = ['int8', 'int4']
    if run_fp16:
        modes.insert(0, 'fp16')  # Add fp16 at beginning if requested

    # Part 1: Text Generation Benchmark
    print("\n\n==== PART 1: TEXT GENERATION ENERGY BENCHMARK ====\n")
    prompt = "DeepSeek AI is an advanced open-source language model designed to power AI applications."
    generation_results = compare_generation_energy(
        model_name=model_name,
        prompt=prompt,
        quantization_modes=modes,
        verbose=True
    )
    results['generation'] = generation_results

    # Part 2: GLUE Tasks Benchmark
    print("\n\n==== PART 2: GLUE TASKS ENERGY BENCHMARK ====\n")
    glue_tasks = ['sst2']  # Just one task for memory efficiency
    glue_results = test_quantized_models_on_glue(
        model_name=model_name,
        tasks=glue_tasks,
        quantization_modes=modes,
        batch_size=1  # Single sample batch size
    )
    results['glue'] = glue_results

    # Final Summary
    print("\n\n==== FINAL SUMMARY ====\n")
    print("Comparison of Energy Efficiency Across Tasks and Quantization Modes:")

    # Get carbon intensity for final calculations
    carbon_intensity = get_carbon_intensity()

    # Calculate total energy per mode
    total_energy = {mode: 0.0 for mode in modes}

    # Add generation energy (a missing mode or an error-only entry adds nothing)
    for mode in modes:
        gen_stats = generation_results.get(mode) or {}
        if 'total_energy' in gen_stats:
            total_energy[mode] += gen_stats['total_energy']

    # Add GLUE energy; a task that is absent from the results is skipped rather
    # than raising KeyError.
    for task in glue_tasks:
        task_results = glue_results.get(task) or {}
        for mode in modes:
            glue_stats = task_results.get(mode) or {}
            if 'total_energy' in glue_stats:
                total_energy[mode] += glue_stats['total_energy']

    # Print total energy and carbon footprint
    print("\nTotal Energy Consumption:")
    for mode in modes:
        carbon = joules_to_co2(total_energy[mode], carbon_intensity)
        print(f"{mode.upper()}: {total_energy[mode]:.4f} J = {carbon:.6f} gCO2eq")

    # Only modes that actually recorded energy can take part in a comparison.
    measured = [mode for mode in modes if total_energy[mode] > 0]

    if 'fp16' in measured:
        # FP16 is the natural baseline whenever it was measured.
        baseline = total_energy['fp16']
        print("\nTotal Energy Savings:")
        for mode in measured:
            if mode == 'fp16':
                continue
            savings = 100 * (baseline - total_energy[mode]) / baseline
            print(f"{mode.upper()} saves {savings:.2f}% energy compared to FP16")
    elif len(measured) > 1:
        # If no fp16, compare to highest energy mode
        baseline_mode = max(measured, key=total_energy.get)
        baseline = total_energy[baseline_mode]
        print(f"\nTotal Energy Savings (compared to {baseline_mode.upper()}):")
        for mode in measured:
            if mode != baseline_mode:
                savings = 100 * (baseline - total_energy[mode]) / baseline
                print(f"{mode.upper()} saves {savings:.2f}% energy compared to {baseline_mode.upper()}")
    else:
        # Zero or one measured mode: nothing to compare (previously max() of an
        # empty list raised ValueError here when every mode had failed).
        print("\nNot enough successful measurements to compute energy savings.")

    return results

In [15]:
# Cell 9: Results Visualization Functions
from utils.plot_utils import plot_component_energy, plot_energy_comparison

In [16]:
# Cell 10: Main execution cell
"""
# Set model name
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Choose one of the following options to run:

# Option 1: Quick single-mode test (fastest)
quick_test_generation(MODEL_NAME, quant_mode='int4')

# Option 2: Generation benchmark with INT8 and INT4 only
generation_results = compare_generation_energy(
    model_name=MODEL_NAME,
    prompt="DeepSeek AI is an advanced open-source language model designed to power AI applications.",
    quantization_modes=['int8', 'int4'],
    verbose=True
)
plot_energy_comparison({"generation": generation_results})
plot_component_energy({"generation": generation_results}, task_type='generation', quant_mode='int4')

# Option 3: GLUE benchmark with INT8 and INT4 only
glue_results = test_quantized_models_on_glue(
    model_name=MODEL_NAME,
    tasks=['sst2'],
    quantization_modes=['int8', 'int4'],
    batch_size=1
)
plot_energy_comparison({"glue": glue_results})
plot_component_energy({"glue": glue_results}, task_type='glue', quant_mode='int4')

# Option 4: Full benchmark
results = run_full_benchmark(MODEL_NAME, run_fp16=False)
"""

'\n# Set model name\nMODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"\n\n# Choose one of the following options to run:\n\n# Option 1: Quick single-mode test (fastest)\nquick_test_generation(MODEL_NAME, quant_mode=\'int4\')\n\n# Option 2: Generation benchmark with INT8 and INT4 only\ngeneration_results = compare_generation_energy(\n    model_name=MODEL_NAME,\n    prompt="DeepSeek AI is an advanced open-source language model designed to power AI applications.",\n    quantization_modes=[\'int8\', \'int4\'],\n    verbose=True\n)\nplot_energy_comparison({"generation": generation_results})\nplot_component_energy({"generation": generation_results}, task_type=\'generation\', quant_mode=\'int4\')\n\n# Option 3: GLUE benchmark with INT8 and INT4 only\nglue_results = test_quantized_models_on_glue(\n    model_name=MODEL_NAME,\n    tasks=[\'sst2\'],\n    quantization_modes=[\'int8\', \'int4\'],\n    batch_size=1\n)\nplot_energy_comparison({"glue": glue_results})\nplot_component_energy({"glu

In [17]:
# Cell 14: Usage Instructions
"""
# DeepSeek Energy Consumption Benchmark for Google Colab A100 (40GB)

This notebook measures energy consumption and carbon footprint of LLMs
with different quantization methods (FP16, INT8, INT4).

## Instructions for Running on Google Colab

1. Run cells 1-9 to set up the environment and define all functions
2. In cell 10, uncomment one of the benchmark options:
   - Option 1: Quick single-mode test (recommended for initial testing)
   - Option 2: Generation benchmark comparing INT8 and INT4
   - Option 3: GLUE task benchmark comparing INT8 and INT4
   - Option 4: Full benchmark of both tasks
3. For FP16 testing (if your memory allows):
   - Modify the quantization_modes parameter to include 'fp16'
   - Or set run_fp16=True in the full benchmark function

## Memory Management Tips

- Run one benchmark at a time, not all options simultaneously
- Monitor GPU memory usage in Colab (Runtime > Resource usage)
- If you encounter OOM errors, try:
  1. Restart the runtime to clear all memory
  2. Run only INT4 benchmarks first
  3. Reduce batch sizes further
  4. Use shorter input sequences
"""

"\n# DeepSeek Energy Consumption Benchmark for Google Colab A100 (40GB)\n\nThis notebook measures energy consumption and carbon footprint of LLMs\nwith different quantization methods (FP16, INT8, INT4).\n\n## Instructions for Running on Google Colab\n\n1. Run cells 1-12 to set up the environment and define all functions\n2. In cell 13, uncomment one of the benchmark options:\n   - Option 1: Quick single-mode test (recommended for initial testing)\n   - Option 2: Generation benchmark comparing INT8 and INT4\n   - Option 3: GLUE task benchmark comparing INT8 and INT4\n   - Option 4: Full benchmark of both tasks\n3. For FP16 testing (if your memory allows):\n   - Modify the quantization_modes parameter to include 'fp16'\n   - Or set run_fp16=True in the full benchmark function\n\n## Memory Management Tips\n\n- Run one benchmark at a time, not all options simultaneously\n- Monitor GPU memory usage in Colab (Runtime > Resource usage)\n- If you encounter OOM errors, try:\n  1. Restart the ru

In [18]:
# Cell 15: Step-by-step execution examples for Google Colab

# Example 1: Testing a single model with INT4 quantization
def run_int4_test():
    """Smoke-test text generation for the 1.5B distilled model using INT4 only.

    Returns:
        The value returned by quick_test_generation for this run.
    """
    model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    print("Running quick test on " + model_id + " with INT4 quantization")

    # Start from a clean GPU state and report it before anything is loaded.
    clean_memory()
    print_gpu_memory()

    # Hand back the generation statistics directly.
    return quick_test_generation(model_id, quant_mode='int4')

# Example 2: Testing with both INT8 and INT4 on generation task
def run_generation_benchmark():
    """Compare INT8 and INT4 generation energy for the 1.5B model and plot the outcome.

    Plotting is best-effort: any exception raised while drawing is printed and the
    benchmark results are still returned.

    Returns:
        The value returned by compare_generation_energy.
    """
    model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    print("Running generation benchmark on " + model_id)

    # Reset GPU memory and show the starting point.
    clean_memory()
    print_gpu_memory()

    # Measure both quantization modes on the same prompt.
    generation_results = compare_generation_energy(
        model_name=model_id,
        prompt="DeepSeek AI is an advanced open-source language model designed to power AI applications.",
        quantization_modes=['int8', 'int4'],
        verbose=True
    )

    # Overall comparison first, then one component chart per mode that has a breakdown.
    try:
        plot_energy_comparison({"generation": generation_results})
        for quant in ('int8', 'int4'):
            if quant not in generation_results:
                continue
            if 'components' not in generation_results[quant]:
                continue
            plot_component_energy(
                {"generation": generation_results},
                task_type='generation',
                quant_mode=quant,
            )
    except Exception as plot_error:
        print(f"Error plotting results: {plot_error}")

    return generation_results

# Example 3: Testing with both INT8 and INT4 on GLUE task
def run_glue_benchmark():
    """Compare INT8 and INT4 energy on the GLUE sst2 task for the 7B model and plot it.

    Plotting is best-effort: any exception raised while drawing (including an empty
    result dictionary) is printed and the benchmark results are still returned.

    Returns:
        The value returned by test_quantized_models_on_glue.
    """
    model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    print("Running GLUE benchmark on " + model_id)

    # Reset GPU memory and show the starting point.
    clean_memory()
    print_gpu_memory()

    # Measure both quantization modes with single-sample batches.
    glue_results = test_quantized_models_on_glue(
        model_name=model_id,
        tasks=['sst2'],
        quantization_modes=['int8', 'int4'],
        batch_size=1
    )

    # Overall comparison first, then a component chart for each mode whose entry under
    # the first task carries a component breakdown.
    try:
        plot_energy_comparison({"glue": glue_results})
        for quant in ('int8', 'int4'):
            first_task = list(glue_results.keys())[0]
            has_entry = quant in glue_results[first_task]
            if has_entry and 'component_energy' in glue_results[first_task][quant]:
                plot_component_energy(
                    {"glue": glue_results},
                    task_type='glue',
                    quant_mode=quant,
                )
    except Exception as plot_error:
        print(f"Error plotting results: {plot_error}")

    return glue_results

# Example 4: Advanced - Testing with FP16 (if memory allows)
def run_fp16_test():
    """Attempt to run FP16 test with careful memory management

    Loads the 1.5B model in float16, measures a single short prompt with
    EnergyTracker and prints energy, time and carbon figures.

    Whatever happens, the references to the model, the tracker and the logits are
    dropped before clean_memory() runs, so the cleanup can actually reclaim the GPU
    memory -- also when loading or inference failed part-way.

    Returns:
        The statistics dictionary produced by ``tracker.measure_text`` on success,
        ``{"error": "OOM"}`` when CUDA ran out of memory, or
        ``{"error": "<message>"}`` for any other exception.
    """
    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    print(f"Attempting FP16 test on {MODEL_NAME}")

    # Clean memory thoroughly
    clean_memory()
    print_gpu_memory()

    # Bound up-front so the finally block can release them on every path.
    model = None
    tracker = None
    logits = None

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # Load model with aggressive offloading
        print("Loading model in FP16 mode with aggressive memory offloading...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            offload_state_dict=True,
            # NOTE(review): 30GB assumes an A100-class GPU; the saved output of the setup
            # cell shows a 15.83 GB Tesla T4, where this cap limits nothing.
            max_memory={0: "30GB"},
            device_map="auto",
            offload_folder="offload",
            low_cpu_mem_usage=True
        )

        print("Model loaded. Creating energy tracker...")
        print_gpu_memory()

        # Create tracker
        tracker = EnergyTracker(model, precision_mode='float16')

        # Use a very short prompt
        prompt = "AI model"
        print(f"Running inference with mini prompt: '{prompt}'")

        # Tokenize with max truncation
        tokens = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=16)

        # Run inference with minimal input
        print("Starting energy measurement...")
        logits, stats = tracker.measure_text(tokens.input_ids, tokenizer)

        # Calculate carbon footprint
        carbon_intensity = get_carbon_intensity()
        carbon_emissions = joules_to_co2(stats['total_energy'], carbon_intensity)

        # Print results
        print("\nResults:")
        print(f"Total Energy: {stats['total_energy']:.4f} J")
        print(f"Energy per token: {stats['energy_per_token']:.6f} J/token")
        print(f"Inference time: {stats['time']:.3f} s")
        print(f"Carbon emissions: {carbon_emissions:.6f} gCO2eq")

        return stats

    except torch.cuda.OutOfMemoryError as e:
        print(f"Out of memory error: {e}")
        print("FP16 mode is too memory intensive for this GPU. Try INT8 or INT4 instead.")
        return {"error": "OOM"}

    except Exception as e:
        print(f"Error running FP16 test: {e}")
        return {"error": str(e)}

    finally:
        # Runs after the except clause has finished, i.e. once the exception (and the
        # traceback frames that keep the tracker/model alive) has been released.
        model = None
        tracker = None
        logits = None
        clean_memory()
        print_gpu_memory()

# Example 5: Full benchmark with safe mode
def run_safe_full_benchmark():
    """Run the generation and GLUE benchmarks back to back using INT4 only.

    GPU memory is cleaned and reported before the first part, between the two parts
    and once more at the end.

    Returns:
        ``{'generation': ..., 'glue': ...}`` with the values returned by
        compare_generation_energy and test_quantized_models_on_glue.
    """
    model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    print("Running full safe benchmark on " + model_id)

    def _reset_gpu():
        # Free cached memory, then show what is left.
        clean_memory()
        print_gpu_memory()

    _reset_gpu()

    # Part 1: generation, INT4 only (best memory efficiency), with a shorter prompt.
    print("\n==== PART 1: GENERATION BENCHMARK (INT4) ====")
    generation_part = compare_generation_energy(
        model_name=model_id,
        prompt="DeepSeek AI is an advanced language model.",
        quantization_modes=['int4'],
        verbose=True
    )

    # Clean up thoroughly between the two parts.
    _reset_gpu()

    # Part 2: GLUE sst2, INT4 only.
    print("\n==== PART 2: GLUE BENCHMARK (INT4) ====")
    glue_part = test_quantized_models_on_glue(
        model_name=model_id,
        tasks=['sst2'],
        quantization_modes=['int4'],
        batch_size=1
    )

    # Final cleanup
    _reset_gpu()

    return {'generation': generation_part, 'glue': glue_part}

In [19]:
# Cell 16: Running basic tests to measure memory usage
def measure_model_sizes():
    """Record how much GPU memory the 1.5B model allocates when loaded in INT4 and INT8.

    For each mode the allocated CUDA memory is read before and after loading the
    model; a failure in one mode is printed and stored without stopping the other.

    Returns:
        Dict keyed by mode. Successful entries hold ``memory_before``, ``memory_after``
        and ``memory_used`` (bytes divided by 1e9); failed entries hold ``error`` with
        the exception message.
    """
    model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    usage = {}

    for quant in ('int4', 'int8'):
        try:
            print(f"\n===== Testing {quant.upper()} Mode Memory Usage =====")

            # Baseline reading after a cleanup.
            clean_memory()
            print("Memory before loading:")
            gb_before = torch.cuda.memory_allocated() / 1e9
            print_gpu_memory()

            # Pick the bitsandbytes configuration for this mode.
            if quant != 'int4':
                quant_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                )
            else:
                quant_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type='nf4',
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True
                )

            loaded_model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=quant_config,
                device_map="auto",
                torch_dtype=torch.float16,
                offload_folder="offload",
                low_cpu_mem_usage=True
            )

            # Reading with the model resident.
            gb_after = torch.cuda.memory_allocated() / 1e9
            print("Memory after loading:")
            print_gpu_memory()

            usage[quant] = dict(
                memory_before=gb_before,
                memory_after=gb_after,
                memory_used=gb_after - gb_before,
            )

            # Release the model before the next mode is measured.
            del loaded_model
            clean_memory()

        except Exception as load_error:
            print(f"Error measuring {quant} mode: {load_error}")
            usage[quant] = {"error": str(load_error)}

    # Print summary
    print("\n===== Memory Usage Summary =====")
    for quant, record in usage.items():
        if 'memory_used' not in record:
            print(f"{quant.upper()}: Failed - {record.get('error', 'Unknown error')}")
        else:
            print(f"{quant.upper()}: {record['memory_used']:.2f} GB")

    return usage


In [None]:
# To run any of these examples, call the function.
# Keep exactly one call uncommented at a time (the usage notes above recommend running
# one benchmark at a time), then re-run this cell.
run_int4_test()
# run_generation_benchmark()
# run_glue_benchmark()
# run_fp16_test()  # Only if you have enough memory!
# run_safe_full_benchmark()
# measure_model_sizes()

Running quick test on deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B with INT4 quantization
GPU Memory: Allocated: 0.00 GB | Reserved: 0.00 GB | Max: 0.00 GB
Quick test for deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B with int4 quantization


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Starting to load model in INT4 mode...
GPU Memory: Allocated: 0.00 GB | Reserved: 0.00 GB | Max: 0.00 GB


config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Model loaded successfully in INT4 mode
GPU Memory: Allocated: 1.62 GB | Reserved: 1.82 GB | Max: 1.78 GB
[2025-05-02 22:40:28,009] [zeus.device.gpu.nvidia](nvidia.py:47) pynvml is available and initialized.
[2025-05-02 22:40:28,011] [zeus.device.cpu.rapl](rapl.py:137) RAPL is not supported on this CPU.
[2025-05-02 22:40:28,011] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-02 22:40:28,013] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor
Running inference with prompt: 'DeepSeek AI is an advanced open-source language model designed to power AI applications.'
[2025-05-02 22:40:28,015] [zeus.utils.framework](framework.py:25) PyTorch with CUDA support is available.
Location detected: The Dalles, US (lat: 45.5946, lon: -121.1787)
Using estimated carbon intensity.
Using estimated carbon intensity for US: 417 gCO2eq/kWh

Results:
Total Energy: 43.4877 J
Energy per token: 2.558102 J/token
Inference time: 1.461 s
Car

{'total_energy': 43.48773182153651,
 'tokenization_energy': 0.5527318215370178,
 'inference_energy': 42.93499999999949,
 'energy_per_token': 2.5581018718550887,
 'time': 1.4607436656951904,
 'components': {'embeddings': np.float64(2.771999999999025),
  'attention': np.float64(14.01240735959934),
  'ffn': np.float64(23.439957810641122),
  'layernorm': np.float64(0.07457853698730468),
  'output_layer': np.float64(0.6435411567687989)},
 'num_tokens': 17}

In [32]:
import tqdm
from utils.test_generation import test_generation_MATH
# NOTE: MODEL_NAME is reassigned in the next cell before the benchmark is run.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# The commented-out block below is an inline draft of test_generation_MATH kept for
# reference only; the function this notebook calls is the one imported from
# utils.test_generation above (whether the two are identical is not verified here).
# NOTE(review): the draft calls tqdm(...) directly, which needs `from tqdm import tqdm`
# rather than the `import tqdm` used in this cell -- fix that before uncommenting it.
# def test_generation_MATH(
#     model_name,
#     quantization_modes=['fp16'],
#     dataset_name='math',
#     dataset_config='all',
#     split='test',
#     num_examples=50,
#     verbose=True
# ):
#     """
#     Benchmark energy use and accuracy on MATH dataset.
#     """
#     # Prepare result container
#     results = {mode: {"examples": [], "summary": {}} for mode in quantization_modes}

#     # Load and sample dataset
#     ds = load_dataset(dataset_name, dataset_config, split=split)
#     ds = ds.select(range(num_examples))

#     # Get carbon intensity
#     carbon_intensity = get_carbon_intensity()
#     if verbose:
#         print(f"Carbon intensity: {carbon_intensity} gCO2eq/kWh")

#     # Load tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(model_name)

#     # Test each quantization mode
#     for mode in quantization_modes:
#         if verbose:
#             print(f"\n=== Testing {mode.upper()} on MATH ===")
#         try:
#             clean_memory()
#             # Load model with given quantization
#             model = load_llm(model_name, mode=mode)
#             precision = 'float16' if mode == 'fp16' else None
#             tracker = EnergyTracker(model, precision_mode=precision)

#             correct = 0
#             total_tokens = 0

#             # Iterate over examples
#             for item in tqdm(ds, desc=f"MATH {mode.upper()}"):
#                 question = item['question']
#                 answer = item['answer']

#                 # Measure energy and get logits
#                 try:
#                     logits, stats = tracker.measure_text(question, tokenizer)
#                 except torch.cuda.OutOfMemoryError:
#                     # Retry with shorter input on OOM
#                     tokens = tokenizer(question, return_tensors='pt', truncation=True, max_length=256)
#                     logits, stats = tracker.measure_text(tokens.input_ids, tokenizer)

#                 # Decode prediction
#                 pred_tokens = torch.argmax(logits, dim=-1)
#                 pred_text = tokenizer.batch_decode(pred_tokens, skip_special_tokens=True)[0].strip()

#                 # Exact match accuracy
#                 is_correct = (pred_text == answer.strip())
#                 correct += int(is_correct)
#                 total_tokens += stats.get('num_tokens', 1)

#                 # Record example result
#                 results[mode]["examples"].append({
#                     "question": question,
#                     "ground_truth": answer,
#                     "prediction": pred_text,
#                     "is_correct": is_correct,
#                     "stats": stats
#                 })

#             # Compute summary metrics
#             count = len(results[mode]["examples"])
#             total_energy = sum(e["stats"]["total_energy"] for e in results[mode]["examples"])
#             total_time = sum(e["stats"]["time"] for e in results[mode]["examples"])
#             energy_per_token = total_energy / total_tokens if total_tokens else 0
#             accuracy = 100 * correct / count
#             carbon_emissions = joules_to_co2(total_energy, carbon_intensity)

#             results[mode]["summary"] = {
#                 "examples": count,
#                 "avg_energy": total_energy / count,
#                 "avg_time": total_time / count,
#                 "energy_per_token": energy_per_token,
#                 "accuracy": accuracy,
#                 "carbon_emissions": carbon_emissions
#             }

#             if verbose:
#                 print(f"\n{mode.upper()} SUMMARY:")
#                 print(f"  Samples       : {count}")
#                 print(f"  Accuracy      : {accuracy:.2f}%")
#                 print(f"  Energy/Infer  : {results[mode]['summary']['avg_energy']:.4f} J")
#                 print(f"  Time/Infer    : {results[mode]['summary']['avg_time']:.3f} s")
#                 print(f"  Energy/Token  : {energy_per_token:.6f} J/token")
#                 print(f"  CO2 Emissions : {carbon_emissions:.6f} gCO2eq")

#             # Cleanup
#             del model, tracker
#             clean_memory()

#         except Exception as e:
#             print(f"Error in {mode}: {e}")
#             results[mode]["summary"]["error"] = str(e)

#     return results

In [33]:
from utils.test_generation import test_generation_MBPP

# Benchmark the coder model on 50 test examples of one math_dataset config,
# once for each precision mode (FP16, INT8, INT4).
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
r2 = test_generation_MATH(
    model_name=MODEL_NAME,
    quantization_modes=["fp16", "int8", "int4"],
    dataset_name="deepmind/math_dataset",       # pay attention to this value
    dataset_config="algebra__linear_1d",        # or any other config
    split="test",
    num_examples=50,
    verbose=True
)
#r2 = test_generation_MATH(MODEL_NAME)
# NOTE(review): in the saved output, 50 x 11.45 J (about 572 J) at 417 gCO2eq/kWh is
# roughly 0.066 g, yet the summary prints 66.3 gCO2eq -- looks 1000x too large; check the
# units used by joules_to_co2 / the summary label.
# NOTE(review): accuracy is 0.00% in every mode and the predictions are not readable
# answers -- confirm how test_generation_MATH derives its prediction before trusting it.
# The bare name below displays the whole nested result, including every example.
r2

Location detected: Las Vegas, US (lat: 36.175, lon: -115.1372)
Using estimated carbon intensity.
Using estimated carbon intensity for US: 417 gCO2eq/kWh
Carbon intensity: 417 gCO2eq/kWh

=== Testing FP16 on MATH ===
Starting to load model in FP16 mode...
GPU Memory: Allocated: 0.00 GB | Reserved: 0.00 GB | Max: 0.00 GB


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Model loaded successfully in FP16 mode
GPU Memory: Allocated: 2.70 GB | Reserved: 2.85 GB | Max: 2.70 GB
[2025-05-02 23:23:11,865] [zeus.device.gpu.nvidia](nvidia.py:47) pynvml is available and initialized.
[2025-05-02 23:23:11,867] [zeus.device.cpu.rapl](rapl.py:137) RAPL is not supported on this CPU.
[2025-05-02 23:23:11,867] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-02 23:23:11,868] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH FP16:   0%|          | 0/50 [00:00<?, ?it/s]

[2025-05-02 23:23:11,878] [zeus.utils.framework](framework.py:25) PyTorch with CUDA support is available.


MATH FP16: 100%|██████████| 50/50 [00:20<00:00,  2.40it/s]



FP16 SUMMARY:
  Samples       : 50
  Accuracy      : 0.00%
  Energy/Infer  : 11.4484 J
  Time/Infer    : 0.415 s
  Energy/Token  : 0.347132 J/token
  CO2 Emissions : 66.305353 gCO2eq

=== Testing INT8 on MATH ===
Starting to load model in INT8 mode...
GPU Memory: Allocated: 0.01 GB | Reserved: 0.04 GB | Max: 2.72 GB
Model loaded successfully in INT8 mode
GPU Memory: Allocated: 1.49 GB | Reserved: 1.61 GB | Max: 2.72 GB
[2025-05-02 23:23:38,240] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-02 23:23:38,241] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH INT8: 100%|██████████| 50/50 [00:24<00:00,  2.02it/s]



INT8 SUMMARY:
  Samples       : 50
  Accuracy      : 0.00%
  Energy/Infer  : 13.6304 J
  Time/Infer    : 0.493 s
  Energy/Token  : 0.413291 J/token
  CO2 Emissions : 78.942454 gCO2eq

=== Testing INT4 on MATH ===
Starting to load model in INT4 mode...
GPU Memory: Allocated: 0.01 GB | Reserved: 0.04 GB | Max: 2.72 GB
Model loaded successfully in INT4 mode
GPU Memory: Allocated: 0.91 GB | Reserved: 1.41 GB | Max: 2.72 GB
[2025-05-02 23:24:06,575] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-02 23:24:06,575] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH INT4: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]



INT4 SUMMARY:
  Samples       : 50
  Accuracy      : 0.00%
  Energy/Infer  : 19.0704 J
  Time/Infer    : 0.546 s
  Energy/Token  : 0.578241 J/token
  CO2 Emissions : 110.449262 gCO2eq


{'fp16': {'examples': [{'question': "b'Solve -282*d + 929 - 178 = -1223 for d.\\n'",
    'ground_truth': "b'7\\n'",
    'prediction': "erer\nve for>x.3x219** 101* -1105\\ the inn'",
    'is_correct': False,
    'stats': {'total_energy': 34.69465719222915,
     'tokenization_energy': 0.1106571922302246,
     'inference_energy': 34.58399999999892,
     'energy_per_token': 1.119182490071908,
     'time': 1.2704172134399414,
     'components': {'embeddings': np.float64(2.611000000000786),
      'attention': np.float64(12.335857456447936),
      'ffn': np.float64(7.888506878851098),
      'layernorm': np.float64(0.060836652755737304),
      'output_layer': np.float64(2.8369999999995343)},
     'num_tokens': 31}},
   {'question': "b'Solve 49*l + 45*l - 125 - 63 = 0 for l.\\n'",
    'ground_truth': "b'2\\n'",
    'prediction': "erer\nve for1 + 94cm29*l^ 40*' 10 = 0' l'n'",
    'is_correct': False,
    'stats': {'total_energy': 11.158241736891359,
     'tokenization_energy': 0.1282417368888855

In [34]:
# Inspect the first per-example record of the INT8 run.
r2["int8"]["examples"][0]

{'question': "b'Solve -282*d + 929 - 178 = -1223 for d.\\n'",
 'ground_truth': "b'7\\n'",
 'prediction': "erer\nve the>x.5x229** 102* -1105\\ the inn'",
 'is_correct': False,
 'stats': {'total_energy': 16.424866054296466,
  'tokenization_energy': 0.06786605429649353,
  'inference_energy': 16.35699999999997,
  'energy_per_token': 0.529834388848273,
  'time': 0.6435303688049316,
  'components': {'embeddings': np.float64(0.053559872388839726),
   'attention': np.float64(11.17100820827589),
   'ffn': np.float64(10.849968232869024),
   'layernorm': np.float64(0.0615195939540863),
   'output_layer': np.float64(0.24238462471961975)},
  'num_tokens': 31}}

In [None]:
# Inspect the second per-example record of the FP16 run.
# NOTE(review): the saved output of this cell has MBPP-style keys (prompt,
# ground_truth_code, generated_code, test_cases) while the MATH records above use
# question / ground_truth / prediction, and the cell has no execution count -- the
# output appears to be left over from a different run; re-execute to refresh it.
r2["fp16"]["examples"][1]

{'prompt': 'Write a function to sort a given matrix in ascending order according to the sum of its rows.',
 'ground_truth_code': 'def sort_matrix(M):\r\n    result = sorted(M, key=sum)\r\n    return result',
 'generated_code': '\n a program that find an list list in ascending order. to the sum of its elements.\n',
 'test_cases': ['assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]',
  'assert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]]',
  'assert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]'],
 'is_correct': False,
 'stats': {'total_energy': 24.351412544269813,
  'tokenization_energy': 0.14941254425048828,
  'inference_energy': 24.202000000019325,
  'energy_per_token': 1.1595910735366577,
  'time': 0.4883391857147217,
  'components': {'embeddings': np.float64(4.295000000012806),
   'attention': np.float64(13.832348618499237),
   'ffn': np.float64(3.870716459035873),
   'layer

In [None]:
# List the contents of the current working directory.
!ls