In [None]:
# run this cell if you are in colab with a single notebook opened, otherwise ignore this cell

#!git clone https://github.com/CowboyPhilip/HPML-Energy-Efficient-LLM
#%cd HPML-Energy-Efficient-LLM
!ls

In [22]:
# 1. Install dependencies
!pip install --upgrade pip setuptools
!pip install \
    transformers \
    bitsandbytes \
    zeus-ml \
    torch \
    datasets \
    evaluate \
    scikit-learn \
    geocoder \
    requests \
    flash-attn==2.0.5 \
    triton==2.0.0 \
    vllm \
    numpy

Collecting flash-attn==2.0.5
  Using cached flash_attn-2.0.5.tar.gz (2.3 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting triton==2.0.0
  Using cached triton-2.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.0 kB)
Collecting vllm
  Using cached vllm-0.8.5.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting ninja (from flash-attn==2.0.5)
  Using cached ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting lit (from triton==2.0.0)
  Using cached lit-18.1.8-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Using cached torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
 

In [97]:
# global configuration for experiments
cfg = {
    "task":           "math",
    "model":          "deepseek-ai/deepseek-coder-1.3b-instruct",
    # test default (vanilla) kernel at fp16/int8/int4, plus adaptive switching
    "modes":          [
        "fp16_vanilla",    # FP16 + vanilla Transformer
        "int8_vanilla",    # INT8 + vanilla
        "int4_vanilla",    # INT4 + vanilla& low_mode
    ],
    # for adaptive mode: which two modes to switch between
    "high_mode":      "fp16_vanilla",
    "low_mode":       "int8_vanilla",

    "dataset_name":   "deepmind/math_dataset",
    "dataset_config": "algebra__linear_1d",
    "split":          "test",
    "num_examples":   20,
    "subjects":       ["physics","chemistry"],
    "quick":          True,
    "max_samples":    20,
    "glue_tasks":     ["sst2","cola"],
    "batch_size":     1,
    "verbose":        True,
    "output_file":    "results.json"
}



In [10]:
cfg = {
    "task":        "math",                              # only MATH
    "model":       "deepseek-ai/deepseek-coder-1.3b-instruct",
    

    "modes":       ["adaptive"],                        # only adaptive
    "high_mode":   "fp16_vanilla",                      # high precision = FP16 + vanilla
    "low_mode":    "int8_vanilla",                      # low precision = INT8 + vanilla

    # MATH dataset
    "dataset_name":   "deepmind/math_dataset",
    "dataset_config": "algebra__linear_1d",
    "split":      "test",
    "num_examples":   20,

    "verbose":     True,
    "output_file": "adaptive_math_results.json"
}


In [5]:
# exp for mbpp
cfg = {
    "task":           "mbpp",
    # "model":          "deepseek-ai/deepseek-coder-1.3b-instruct",
    "model":       "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # test default (vanilla) kernel at fp16/int8/int4, plus adaptive switching
    "modes":          [
        "fp16_flash-v2",    # FP16 + flash attn v2 Transformer
        # "int8_vanilla",    # INT8 + vanilla
        # "int4_vanilla",    # INT4 + vanilla& low_mode
    ],
    # for adaptive mode: which two modes to switch between
    "high_mode":      "fp16_vanilla",
    "low_mode":       "int8_vanilla",

    "dataset_name":   "deepmind/math_dataset",
    "dataset_config": "algebra__linear_1d",
    "split":          "test",
    "num_examples":   10,
    "subjects":       ["physics","chemistry"],
    "quick":          True,
    "max_samples":    500,
    "glue_tasks":     ["sst2","cola"],
    "batch_size":     1,
    "verbose":        True,
    "output_file":    "results.json"
}

In [2]:
import json
import time
import numpy as np
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset  # ensure load_dataset is defined

# benchmark functions
from utils.test_generation import quick_test_generation, test_generation_MATH, test_generation_MBPP
from utils.test_mmlu      import quick_test_mmlu, test_quantized_models_on_mmlu
from utils.test_glue      import test_quantized_models_on_glue

# energy & tracking
from utils.energy_utils   import EnergyTracker, get_carbon_intensity, joules_to_co2
from utils.memory_utils   import clean_memory

# adaptive quant wrapper
from utils.adaptive_quant      import AdaptiveQuantGenerator

# plotting
from utils.plot_utils    import plot_energy_comparison, plot_component_energy

%load_ext autoreload
%autoreload 2 

  from .autonotebook import tqdm as notebook_tqdm


/opt/rocm/lib/libamd_smi.so: cannot open shared object file: No such file or directory
Unable to find libamd_smi.so library try installing amd-smi-lib from your package manager


In [3]:
# Monkey-patch EnergyTracker to support `with tracker:` and save_results
def _et_enter(self):
    if getattr(self, 'zeus', None):
        try:
            self.zeus.begin_window('inference')
            self.active_windows.add('inference')
        except:
            pass
    self._enter_ts = time.time()
    return self

def _et_exit(self, exc_type, exc_val, exc_tb):
    end_ts = time.time()
    inf_e = 0
    if getattr(self, 'zeus', None) and 'inference' in self.active_windows:
        try:
            m = self.zeus.end_window('inference')
            inf_e = m.total_energy
            self.active_windows.remove('inference')
        except:
            pass
    elapsed = end_ts - getattr(self, '_enter_ts', end_ts)
    comp = {k: np.sum(v) for k, v in self.comp_energy.items()}
    self.stats = {
        'total_energy': inf_e,
        'time': elapsed,
        'components': comp,
        'num_tokens': None
    }
    return False

def _save_results(self, extra_metrics):
    if not hasattr(self, 'stats'):
        self.stats = {}
    self.stats.update(extra_metrics)

EnergyTracker.__enter__    = _et_enter
EnergyTracker.__exit__     = _et_exit
EnergyTracker.save_results = _save_results

In [4]:
def run_task(cfg):
    """Dispatch benchmarks based on cfg['task']."""
    task = cfg["task"]
    modes = list(cfg["modes"])
    results = {}

    # skip adaptive for pure classification tasks
    if task in ("glue", "mmlu") and "adaptive" in modes:
        print("⚠️  Skipping adaptive for classification tasks")
        modes.remove("adaptive")

    # text generation benchmark
    if task == "generation":
        results["generation"] = {}
        # adaptive mode
        if "adaptive" in modes:
            print("\n=== ADAPTIVE generation ===")
            agent = AdaptiveQuantGenerator(
                cfg["model"],
                high_mode=cfg["high_mode"],
                low_mode=cfg["low_mode"]
            )
            _ = agent.generate(cfg["prompt"], max_new_tokens=cfg["tokens"])
            results["generation"]["adaptive"] = {"note": "see adaptive_quant logs"}
            modes.remove("adaptive")
        # other quant/kernel modes
        for mode in modes:
            print(f"\n=== {mode.upper()} generation ===")
            stats = quick_test_generation(
                model_name=cfg["model"],
                quant_mode=mode,
                prompt=cfg["prompt"],
                max_new_tokens=cfg["tokens"]
            )
            results["generation"][mode] = stats

    # MATH dataset benchmark (generation-style)
    elif task == "math":
        results["math"] = {}
        # adaptive on MATH
        if "adaptive" in modes:
            print("\n=== ADAPTIVE on MATH ===")
            ds = load_dataset(
                cfg["dataset_name"],
                cfg["dataset_config"],
                split=cfg["split"]
            ).select(range(cfg["num_examples"]))
            adapter = AdaptiveQuantGenerator(
                cfg["model"],
                high_mode=cfg["high_mode"],
                low_mode=cfg["low_mode"]
            )
            examples, correct, total_tokens = [], 0, 0
            for ex in tqdm(ds, desc="Adaptive MATH"):
                q, ans = ex["question"], ex["answer"].strip()
                tracker = EnergyTracker("adaptive_quant")
                with tracker:
                    out = adapter.generate(q, max_new_tokens=32)
                pred = out.strip()
                correct += int(pred == ans)
                total_tokens += tracker.stats.get("tokens_generated", 1)
                examples.append({
                    "question": q,
                    "prediction": pred,
                    "is_correct": pred == ans,
                    "stats": tracker.stats
                })
                clean_memory()
            n = len(examples)
            total_e = sum(e["stats"]["total_energy"] for e in examples)
            total_t = sum(e["stats"]["time"]         for e in examples)
            results["math"]["adaptive"] = {
                "examples": examples,
                "summary": {
                    "accuracy":         100 * correct / n,
                    "avg_energy":       total_e / n,
                    "avg_time":         total_t / n,
                    "energy_per_token": total_e / total_tokens,
                    "carbon_emissions": joules_to_co2(total_e, get_carbon_intensity())
                }
            }
            plot_component_energy(results, task_type="math", quant_mode="adaptive")
            modes.remove("adaptive")

        # standard quant modes on MATH
        if modes:
            print(f"\n=== standard modes on MATH: {modes} ===")
            std = test_generation_MATH(
                model_name=cfg["model"],
                quantization_modes=modes,
                dataset_name=cfg["dataset_name"],
                dataset_config=cfg["dataset_config"],
                split=cfg["split"],
                num_examples=cfg["num_examples"],
                verbose=cfg["verbose"]
            )
            results["math"].update(std)

    # MBPP dataset benchmark
    elif task == "mbpp":
        print("\n=== MBPP task ===")
        results["mbpp"] = test_generation_MBPP(
            model_name=cfg["model"],
            quantization_modes=modes,
            num_examples=cfg["num_examples"],
            verbose=cfg["verbose"]
        )

    # MMLU multiple-choice benchmark
    elif task == "mmlu":
        print("\n=== MMLU task ===")
        if cfg.get("quick", False):
            stats = quick_test_mmlu(
                model_name=cfg["model"],
                quant_mode=modes[0],
                subjects=cfg["subjects"],
                max_samples=cfg["max_samples"]
            )
        else:
            stats = test_quantized_models_on_mmlu(
                model_name=cfg["model"],
                quantization_modes=modes,
                subjects=cfg["subjects"]
            )
        results["mmlu"] = stats

    # GLUE classification benchmark
    else:
        print("\n=== GLUE task ===")
        results["glue"] = test_quantized_models_on_glue(
            model_name=cfg["model"],
            tasks=cfg["glue_tasks"],
            quantization_modes=modes,
            batch_size=cfg["batch_size"]
        )

    return results


In [6]:
# run the selected benchmark
results = run_task(cfg)
results


=== MBPP task ===
Location detected: São Paulo, BR (lat: -23.5475, lon: -46.6361)
Using estimated carbon intensity.
Using estimate for BR: 110 gCO2eq/kWh
Carbon intensity: 110 gCO2eq/kWh

===== Testing FP16_FLASH-V2 Mode on MBPP =====
Loading FP16 model …
GPU Memory: Allocated: 0.00 GB | Reserved: 0.00 GB | Max: 0.00 GB


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Model ready → quantisation: FP16, kernel: flash-v2
GPU Memory: Allocated: 3.55 GB | Reserved: 7.26 GB | Max: 7.12 GB
[2025-05-06 19:49:41,004] [zeus.device.gpu.nvidia](nvidia.py:47) pynvml is available and initialized.
[2025-05-06 19:49:41,012] [zeus.device.cpu.rapl](rapl.py:137) RAPL is not supported on this CPU.
[2025-05-06 19:49:41,013] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-06 19:49:41,014] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


Testing FP16_FLASH-V2:   0%|          | 0/500 [00:00<?, ?it/s]

[2025-05-06 19:49:41,132] [zeus.utils.framework](framework.py:25) PyTorch with CUDA support is available.


Testing FP16_FLASH-V2:   2%|▏         | 10/500 [00:08<06:35,  1.24it/s]



===== Summary =====
Mode | Avg Energy per Infer(J) | Avg Time per Infer (s) | Energy/Token (J) | Accuracy (%) | CO2 (gCO2eq)
----------------------------------------------------------------------------------------------------
FP16_FLASH-V2 | 30.7857 | 0.788 | 1.293517 | 0.00 | 9.406744

Component Energy Breakdown for FP16_FLASH-V2
  ffn: 95.9747 J (47.8%)
  attention: 83.3830 J (41.5%)
  output_layer: 13.6563 J (6.8%)
  embeddings: 6.6113 J (3.3%)
  layernorm: 1.1731 J (0.6%)


{'mbpp': {'fp16_flash-v2': {'examples': [{'prompt': 'output only the code, no explanation: Write a python function to remove first and last occurrence of a given character from the string.',
     'ground_truth_code': 'def remove_Occ(s,ch): \r\n    for i in range(len(s)): \r\n        if (s[i] == ch): \r\n            s = s[0 : i] + s[i + 1:] \r\n            break\r\n    for i in range(len(s) - 1,-1,-1):  \r\n        if (s[i] == ch): \r\n            s = s[0 : i] + s[i + 1:] \r\n            break\r\n    return s ',
     'generated_code': ')\n\n the the number.\n\n no explanation or # a Python script that compute duplicate occurrence last character of a character number in a string, If',
     'test_cases': ['assert remove_Occ("hello","l") == "heo"',
      'assert remove_Occ("abcda","a") == "bcd"',
      'assert remove_Occ("PHP","P") == "H"'],
     'is_correct': False,
     'stats': {'total_energy': 60.978000000002794,
      'tokenization_energy': 2.719000000040978,
      'inference_energy':

In [13]:
results["mbpp"]["fp16"]["examples"][0]

{'prompt': 'who are ShakeSpear?',
 'ground_truth_code': 'def remove_Occ(s,ch): \r\n    for i in range(len(s)): \r\n        if (s[i] == ch): \r\n            s = s[0 : i] + s[i + 1:] \r\n            break\r\n    for i in range(len(s) - 1,-1,-1):  \r\n        if (s[i] == ch): \r\n            s = s[0 : i] + s[i + 1:] \r\n            break\r\n    return s ',
 'generated_code': ")\n\n is theypeare's What",
 'test_cases': ['assert remove_Occ("hello","l") == "heo"',
  'assert remove_Occ("abcda","a") == "bcd"',
  'assert remove_Occ("PHP","P") == "H"'],
 'is_correct': False,
 'stats': {'total_energy': 26.15790844726551,
  'tokenization_energy': 0.104908447265625,
  'inference_energy': 26.052999999999884,
  'energy_per_token': 3.736844063895073,
  'time': 0.8198516368865967,
  'components': {'embeddings': np.float64(0.087492919921875),
   'attention': np.float64(11.333255250458228),
   'ffn': np.float64(14.213735697983967),
   'layernorm': np.float64(0.1033801999092102),
   'output_layer': np.flo

In [102]:
# print summary for each task and mode
ci = get_carbon_intensity()
for task, modes in results.items():
    print(f"\n=== {task.upper()} SUMMARY ===")
    for mode, data in modes.items():
        summary = data.get("summary", data)
        e   = summary.get("avg_energy",     summary.get("total_energy", 0.0))
        t   = summary.get("avg_time",       summary.get("total_time",   0.0))
        acc = summary.get("accuracy",       None)
        co2 = summary.get(
            "carbon_emissions",
            joules_to_co2(summary.get("total_energy", e), ci)
        )
        line = f"{mode:>12}: E={e:.2f} J, Lat={t:.3f}s"
        if acc is not None:
            line += f", Acc={acc:.2f}%"
        line += f", CO₂={co2:.4f}g"
        print(line)


Location detected: Singapore, SG (lat: 1.2897, lon: 103.8501)
Using estimated carbon intensity.
No specific estimate for SG. Using global average: 475 gCO2eq/kWh

=== MATH SUMMARY ===
fp16_vanilla: E=16.09 J, Lat=0.501s, Acc=0.00%, CO₂=42.4599g
int8_vanilla: E=13.51 J, Lat=0.500s, Acc=0.00%, CO₂=35.6470g
int4_vanilla: E=18.01 J, Lat=0.548s, Acc=0.00%, CO₂=47.5221g


In [104]:
# Plot overall energy comparison
plot_energy_comparison(results)

In [105]:
# Plot per-component breakdown for each task and mode
for task, modes in results.items():
    for mode in modes:
        # skip modes without component stats
        stat = results[task][mode]
        comps = stat.get("summary", stat).get("components", None)
        if comps:
            plot_component_energy(results, task_type=task, quant_mode=mode)

In [106]:
# save raw results to JSON
with open(cfg["output_file"], "w") as f:
    json.dump(results, f, indent=2)
print(f"Results saved to {cfg['output_file']}")


Results saved to results.json
