In [1]:
# Colab-only setup: if this notebook was opened on its own (without the repository),
# uncomment the two lines below to clone the project and change into it.
# With those lines left commented out, this cell only lists the working directory.

#!git clone https://github.com/CowboyPhilip/HPML-Energy-Efficient-LLM
#%cd HPML-Energy-Efficient-LLM
!ls

run_experiment.py  sample_data	utils


In [6]:
# 1. Install dependencies
!pip install --upgrade pip setuptools
!pip install \
    transformers \
    bitsandbytes \
    zeus-ml \
    torch \
    datasets \
    evaluate \
    scikit-learn \
    geocoder \
    requests \
    numpy \
    wandb



In [6]:
!pip install \
    flash-attn==2.0.5 \
    triton==2.0.0 \
    vllm

Collecting flash-attn==2.0.5
  Using cached flash_attn-2.0.5.tar.gz (2.3 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting triton==2.0.0
  Using cached triton-2.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.0 kB)
Collecting vllm
  Using cached vllm-0.8.5.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting ninja (from flash-attn==2.0.5)
  Using cached ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting lit (from triton==2.0.0)
  Using cached lit-18.1.8-py3-none-any.whl.metadata (2.5 kB)
Collecting blake3 (from vllm)
  Using cached blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Using cached fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Using cached prometheus_fastapi_instrumentator-7.1.0-py3-none-a

In [8]:
# Authenticate the wandb CLI; when already logged in it just reports the current account.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mjy3475[0m ([33mHPML-Energy-Efficient-LLM[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Print the wandb settings currently in effect.
!wandb status

[1mCurrent Settings[0m
{
  "_extra_http_headers": null,
  "_proxies": null,
  "api_key": null,
  "base_url": "https://api.wandb.ai",
  "entity": null,
  "git_remote": "origin",
  "ignore_globs": [],
  "organization": null,
  "project": null,
  "root_dir": null,
  "section": "default"
}


In [10]:
# import wandb

# wandb.init(project="HPML-Energy-Efficient-LLM", name="test-connection")

# wandb.log({"test_value": "init wandb"})

# wandb.finish()

In [21]:
# Configuration for the standard (non-adaptive) precision sweep on MATH.
# NOTE: every configuration cell in this notebook rebinds `cfg`; whichever of
# them was executed last decides what the cells further down will run.
cfg = dict(
    task="math",
    model="deepseek-ai/deepseek-coder-1.3b-instruct",
    # precision sweep, all with the default (vanilla) kernel
    modes=[
        "fp32_vanilla",  # FP32 + vanilla Transformer
        "fp16_vanilla",  # FP16 + vanilla Transformer
        "int8_vanilla",  # INT8 + vanilla
        "int4_vanilla",  # INT4 + vanilla
    ],
    # only needed when "adaptive" is listed in modes: the two modes to switch between
    # high_mode="fp16_vanilla",
    # low_mode="int8_vanilla",

    dataset_name="deepmind/math_dataset",
    dataset_config="algebra__linear_1d",
    split="test",
    num_examples=20,

    # options read by the MMLU branch of run_task
    # subjects=["physics", "chemistry"],
    # quick=True,
    # max_samples=500,

    # options read by the GLUE branch of run_task
    # glue_tasks=["sst2", "cola"],
    # batch_size=1,

    verbose=True,
    output_file="results.json",
    # run_task in this notebook does not read device_map; it is still logged with the wandb config
    device_map="cuda",
)



In [None]:
# Configuration for the adaptive-precision run on MATH.
# NOTE: this rebinds `cfg`, replacing any configuration defined in an earlier cell.
cfg = dict(
    task="math",                                      # MATH benchmark only
    model="deepseek-ai/deepseek-coder-1.3b-instruct",

    modes=["adaptive"],                               # adaptive switching only
    high_mode="fp16_vanilla",                         # high-precision side: FP16 + vanilla
    low_mode="int8_vanilla",                          # low-precision side: INT8 + vanilla

    # dataset slice to evaluate
    dataset_name="deepmind/math_dataset",
    dataset_config="algebra__linear_1d",
    split="test",
    num_examples=20,

    verbose=True,
    output_file="adaptive_math_results.json",
)


In [8]:
# Configuration for the MBPP experiment.
# NOTE: this rebinds `cfg`, replacing any configuration defined in an earlier cell.
# The mbpp branch of run_task reads only model, modes, num_examples and verbose;
# the dataset_*/split, subjects/quick/max_samples and glue_tasks/batch_size entries
# are carried along unchanged (and logged with the wandb config).
cfg = dict(
    task="mbpp",
    # model="deepseek-ai/deepseek-coder-1.3b-instruct",
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    modes=[
        "fp16_flash-v2",  # FP16 + flash attn v2 Transformer
        # "int8_vanilla",  # INT8 + vanilla
        # "int4_vanilla",  # INT4 + vanilla
    ],
    # only needed when "adaptive" is listed in modes: the two modes to switch between
    # high_mode="fp16_vanilla",
    # low_mode="int8_vanilla",

    dataset_name="deepmind/math_dataset",
    dataset_config="algebra__linear_1d",
    split="test",
    num_examples=10,
    subjects=["physics", "chemistry"],
    quick=True,
    max_samples=500,
    glue_tasks=["sst2", "cola"],
    batch_size=1,
    verbose=True,
    output_file="results.json",
)

In [22]:
import json
import time
import numpy as np
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset  # ensure load_dataset is defined

# benchmark functions
from utils.test_generation import quick_test_generation, test_generation_MATH, test_generation_MBPP
from utils.test_mmlu      import quick_test_mmlu, test_quantized_models_on_mmlu
from utils.test_glue      import test_quantized_models_on_glue

# energy & tracking
from utils.energy_utils   import EnergyTracker, get_carbon_intensity, joules_to_co2
from utils.memory_utils   import clean_memory

# adaptive quant wrapper
from utils.adaptive_quant      import AdaptiveQuantGenerator

# plotting
from utils.plot_utils    import plot_energy_comparison, plot_component_energy

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Monkey-patch EnergyTracker to support `with tracker:` and save_results
def _et_enter(self):
    """Context-manager entry: open the 'inference' energy window and start the timer.

    If the tracker has a truthy ``zeus`` attribute, an 'inference' window is
    opened on it and recorded in ``self.active_windows``. Energy measurement is
    best-effort: an ordinary failure is reported as a ``RuntimeWarning`` and
    timing continues. Only ``Exception`` is caught, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate.

    Returns:
        The tracker itself, so ``with tracker as t:`` works.
    """
    import warnings  # local import keeps this patch self-contained

    zeus = getattr(self, 'zeus', None)
    if zeus:
        try:
            zeus.begin_window('inference')
            self.active_windows.add('inference')
        except Exception as exc:
            warnings.warn(
                f"EnergyTracker: could not open 'inference' window: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    # Wall-clock start; read back in __exit__ to compute the elapsed time.
    self._enter_ts = time.time()
    return self

def _et_exit(self, exc_type, exc_val, exc_tb):
    """Context-manager exit: close the 'inference' window and populate ``self.stats``.

    ``self.stats`` is replaced with a dict holding:
      * ``total_energy`` - ``total_energy`` of the closed 'inference' window,
        or 0 when no window was open or closing it failed;
      * ``time``         - seconds elapsed since ``__enter__``;
      * ``components``   - per-key sum of ``self.comp_energy`` as plain floats;
      * ``num_tokens``   - always ``None`` here; callers may fill it in later.

    Closing the window is best-effort: an ordinary failure is reported as a
    ``RuntimeWarning``. Only ``Exception`` is caught, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate.

    Returns:
        False, so an exception raised inside the ``with`` body is never suppressed.
    """
    import warnings  # local import keeps this patch self-contained

    end_ts = time.time()
    inf_e = 0
    zeus = getattr(self, 'zeus', None)
    if zeus and 'inference' in self.active_windows:
        try:
            m = zeus.end_window('inference')
            inf_e = m.total_energy
            self.active_windows.remove('inference')
        except Exception as exc:
            warnings.warn(
                f"EnergyTracker: could not close 'inference' window: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    # If __enter__ never ran, fall back to end_ts so the elapsed time is 0.
    elapsed = end_ts - getattr(self, '_enter_ts', end_ts)
    # float() turns NumPy scalars into built-in floats so the stats stay
    # JSON-serialisable regardless of the dtype of the recorded samples.
    comp = {k: float(np.sum(v)) for k, v in self.comp_energy.items()}
    self.stats = {
        'total_energy': inf_e,
        'time': elapsed,
        'components': comp,
        'num_tokens': None
    }
    return False

def _save_results(self, extra_metrics):
    """Merge ``extra_metrics`` into ``self.stats``, creating the dict on first use."""
    try:
        self.stats
    except AttributeError:
        # no stats recorded yet: start from an empty dict
        self.stats = {}
    self.stats.update(extra_metrics)

# Attach the helpers to the class so EnergyTracker instances can be used as
# `with tracker:` and expose `save_results`.
for _attr_name, _impl in (
    ("__enter__", _et_enter),
    ("__exit__", _et_exit),
    ("save_results", _save_results),
):
    setattr(EnergyTracker, _attr_name, _impl)

In [24]:
# Task names run_task knows how to dispatch.
_RUN_TASK_CHOICES = ("generation", "math", "mbpp", "mmlu", "glue")


def _decode_bytes_repr(text):
    """Return the text inside a ``"b'...'"`` string, or ``text`` unchanged if it is not one.

    The MATH examples recorded by this notebook show questions and answers
    stored as the *repr* of a bytes object (e.g. ``"b'7\\n'"``). Comparing a
    model prediction against that raw string can never succeed, so such fields
    are converted back to the text they wrap.
    """
    import ast  # local import: only needed here

    stripped = text.strip()
    is_wrapped = (
        len(stripped) >= 3
        and stripped[0] == "b"
        and stripped[1] in "'\""
        and stripped[-1] == stripped[1]
    )
    if not is_wrapped:
        return text
    try:
        value = ast.literal_eval(stripped)
    except (ValueError, SyntaxError):
        # looked like a bytes literal but is not a valid one: leave it alone
        return text
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return text


def _run_adaptive_math(cfg):
    """Run the adaptive-precision generator over the configured MATH slice.

    Returns a dict with the per-example records under ``"examples"`` and the
    aggregate metrics under ``"summary"``.
    """
    ds = load_dataset(
        cfg["dataset_name"],
        cfg["dataset_config"],
        split=cfg["split"]
    ).select(range(cfg["num_examples"]))
    adapter = AdaptiveQuantGenerator(
        cfg["model"],
        high_mode=cfg["high_mode"],
        low_mode=cfg["low_mode"]
    )
    # optional cfg key; the default matches the previously hard-coded budget
    max_new_tokens = cfg.get("max_new_tokens", 32)

    examples, correct, total_tokens = [], 0, 0
    for ex in tqdm(ds, desc="Adaptive MATH"):
        q = _decode_bytes_repr(ex["question"])
        ans = _decode_bytes_repr(ex["answer"]).strip()
        tracker = EnergyTracker("adaptive_quant")
        with tracker:
            out = adapter.generate(q, max_new_tokens=max_new_tokens)
        pred = out.strip()
        is_correct = pred == ans  # exact string match against the decoded answer
        correct += int(is_correct)
        stats = tracker.stats
        # The __exit__ patch in this notebook stores 'num_tokens' (initially
        # None); accept either key and count one token when neither is set.
        total_tokens += stats.get("tokens_generated") or stats.get("num_tokens") or 1
        examples.append({
            "question": q,
            "ground_truth": ans,
            "prediction": pred,
            "is_correct": is_correct,
            "stats": stats
        })
        clean_memory()

    n = len(examples)
    total_e = sum(e["stats"]["total_energy"] for e in examples)
    total_t = sum(e["stats"]["time"]         for e in examples)
    # max(..., 1) keeps an empty selection (num_examples == 0) from dividing by zero
    per_example = max(n, 1)
    return {
        "examples": examples,
        "summary": {
            "accuracy":         100 * correct / per_example,
            "avg_energy":       total_e / per_example,
            "avg_time":         total_t / per_example,
            "energy_per_token": total_e / max(total_tokens, 1),
            "carbon_emissions": joules_to_co2(total_e, get_carbon_intensity())
        }
    }


def run_task(cfg):
    """Dispatch benchmarks based on cfg['task'].

    Args:
        cfg: experiment configuration. Always read: ``task``, ``modes``,
            ``model``. Additionally, per task:
              * ``generation``: ``prompt``, ``tokens``;
              * ``math``: ``dataset_name``, ``dataset_config``, ``split``,
                ``num_examples``, ``verbose``;
              * ``mbpp``: ``num_examples``, ``verbose``;
              * ``mmlu``: ``subjects``, optional ``quick``, and
                ``max_samples`` when ``quick`` is true;
              * ``glue``: ``glue_tasks``, ``batch_size``.
            When ``"adaptive"`` is in ``modes`` (generation/math only),
            ``high_mode`` and ``low_mode`` are required; for math,
            ``max_new_tokens`` is optional (default 32).

    Returns:
        ``{task: task_results}``; for generation and math ``task_results`` is
        keyed by mode.

    Raises:
        ValueError: if ``cfg['task']`` is not one of the supported tasks
            (previously any unknown name silently ran the GLUE benchmark).
    """
    task = cfg["task"]
    if task not in _RUN_TASK_CHOICES:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {', '.join(_RUN_TASK_CHOICES)}"
        )
    # work on a copy: "adaptive" is removed below and cfg must stay untouched
    modes = list(cfg["modes"])
    results = {}

    # skip adaptive for pure classification tasks
    if task in ("glue", "mmlu") and "adaptive" in modes:
        print("⚠️  Skipping adaptive for classification tasks")
        modes.remove("adaptive")

    # text generation benchmark
    if task == "generation":
        results["generation"] = {}
        # adaptive mode
        if "adaptive" in modes:
            print("\n=== ADAPTIVE generation ===")
            agent = AdaptiveQuantGenerator(
                cfg["model"],
                high_mode=cfg["high_mode"],
                low_mode=cfg["low_mode"]
            )
            _ = agent.generate(cfg["prompt"], max_new_tokens=cfg["tokens"])
            results["generation"]["adaptive"] = {"note": "see adaptive_quant logs"}
            modes.remove("adaptive")
        # other quant/kernel modes
        for mode in modes:
            print(f"\n=== {mode.upper()} generation ===")
            stats = quick_test_generation(
                model_name=cfg["model"],
                quant_mode=mode,
                prompt=cfg["prompt"],
                max_new_tokens=cfg["tokens"]
            )
            results["generation"][mode] = stats

    # MATH dataset benchmark (generation-style)
    elif task == "math":
        results["math"] = {}
        # adaptive on MATH
        if "adaptive" in modes:
            print("\n=== ADAPTIVE on MATH ===")
            results["math"]["adaptive"] = _run_adaptive_math(cfg)
            plot_component_energy(results, task_type="math", quant_mode="adaptive")
            modes.remove("adaptive")

        # standard quant modes on MATH
        if modes:
            print(f"\n=== standard modes on MATH: {modes} ===")
            std = test_generation_MATH(
                model_name=cfg["model"],
                quantization_modes=modes,
                dataset_name=cfg["dataset_name"],
                dataset_config=cfg["dataset_config"],
                split=cfg["split"],
                num_examples=cfg["num_examples"],
                verbose=cfg["verbose"]
            )
            results["math"].update(std)

    # MBPP dataset benchmark
    elif task == "mbpp":
        print("\n=== MBPP task ===")
        results["mbpp"] = test_generation_MBPP(
            model_name=cfg["model"],
            quantization_modes=modes,
            num_examples=cfg["num_examples"],
            verbose=cfg["verbose"]
        )

    # MMLU multiple-choice benchmark
    elif task == "mmlu":
        print("\n=== MMLU task ===")
        if cfg.get("quick", False):
            stats = quick_test_mmlu(
                model_name=cfg["model"],
                quant_mode=modes[0],
                subjects=cfg["subjects"],
                max_samples=cfg["max_samples"]
            )
        else:
            stats = test_quantized_models_on_mmlu(
                model_name=cfg["model"],
                quantization_modes=modes,
                subjects=cfg["subjects"]
            )
        results["mmlu"] = stats

    # GLUE classification benchmark (the only task left after validation)
    else:
        print("\n=== GLUE task ===")
        results["glue"] = test_quantized_models_on_glue(
            model_name=cfg["model"],
            tasks=cfg["glue_tasks"],
            quantization_modes=modes,
            batch_size=cfg["batch_size"]
        )

    return results


In [25]:
import wandb

model_tag = cfg['model'].split('/')[-1]

# Using the run as a context manager guarantees it is finished even when
# run_task raises, so a failed experiment does not leave a dangling open run.
with wandb.init(
    project="HPML-Energy-Efficient-LLM",
    name=f"{cfg['model']}-{cfg['task']}-{'-'.join(cfg['modes'])}",
    tags=[model_tag, cfg['task'], *cfg['modes']],
    group=model_tag,
    job_type=cfg['task'],
    config=cfg
) as run:
    results = run_task(cfg)
    run.log(results)

print("\nSaved results to wandb")


=== standard modes on MATH: ['fp32_vanilla', 'fp16_vanilla', 'int8_vanilla', 'int4_vanilla'] ===
Location detected: Las Vegas, US (lat: 36.175, lon: -115.1372)
Using estimated carbon intensity.
Using estimate for US: 417 gCO2eq/kWh
Carbon intensity: 417 gCO2eq/kWh

=== Testing FP32_VANILLA on MATH ===
Loading FP32 model …
GPU Memory: Allocated: 5.40 GB | Reserved: 10.78 GB | Max: 5.43 GB
Model ready → quantisation: FP32, kernel: vanilla
GPU Memory: Allocated: 10.79 GB | Reserved: 11.09 GB | Max: 10.80 GB
[2025-05-07 01:26:24,314] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-07 01:26:24,315] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH FP32_VANILLA: 100%|██████████| 20/20 [00:09<00:00,  2.07it/s]



FP32_VANILLA SUMMARY:
  Samples       : 20
  Accuracy      : 0.00%
  Energy/Infer  : 18.2258 J
  Time/Infer    : 0.480 s
  Energy/Token  : 0.542434 J/token
  CO2 Emissions : 42.223072 gCO2eq

=== Testing FP16_VANILLA on MATH ===
Loading FP16 model …
GPU Memory: Allocated: 5.41 GB | Reserved: 10.78 GB | Max: 10.82 GB
Model ready → quantisation: FP16, kernel: vanilla
GPU Memory: Allocated: 8.10 GB | Reserved: 10.78 GB | Max: 10.82 GB
[2025-05-07 01:26:36,200] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-07 01:26:36,200] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH FP16_VANILLA: 100%|██████████| 20/20 [00:10<00:00,  1.87it/s]



FP16_VANILLA SUMMARY:
  Samples       : 20
  Accuracy      : 0.00%
  Energy/Infer  : 21.5509 J
  Time/Infer    : 0.533 s
  Energy/Token  : 0.641394 J/token
  CO2 Emissions : 49.926146 gCO2eq

=== Testing INT8_VANILLA on MATH ===
Loading INT8 model …
GPU Memory: Allocated: 5.41 GB | Reserved: 10.78 GB | Max: 10.82 GB
Model ready → quantisation: INT8, kernel: vanilla
GPU Memory: Allocated: 6.89 GB | Reserved: 10.79 GB | Max: 10.82 GB
[2025-05-07 01:26:50,451] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-07 01:26:50,452] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH INT8_VANILLA: 100%|██████████| 20/20 [00:09<00:00,  2.02it/s]



INT8_VANILLA SUMMARY:
  Samples       : 20
  Accuracy      : 0.00%
  Energy/Infer  : 17.2849 J
  Time/Infer    : 0.493 s
  Energy/Token  : 0.514431 J/token
  CO2 Emissions : 40.043328 gCO2eq

=== Testing INT4_VANILLA on MATH ===
Loading INT4 model …
GPU Memory: Allocated: 5.41 GB | Reserved: 10.78 GB | Max: 10.82 GB
Model ready → quantisation: INT4, kernel: vanilla
GPU Memory: Allocated: 6.30 GB | Reserved: 10.80 GB | Max: 10.82 GB
[2025-05-07 01:27:04,052] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-07 01:27:04,054] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MATH INT4_VANILLA: 100%|██████████| 20/20 [00:11<00:00,  1.78it/s]



INT4_VANILLA SUMMARY:
  Samples       : 20
  Accuracy      : 0.00%
  Energy/Infer  : 22.4808 J
  Time/Infer    : 0.558 s
  Energy/Token  : 0.669071 J/token
  CO2 Emissions : 52.080473 gCO2eq



Saved results to wandb


In [34]:
# Inspect the first recorded example of the FP32 run
# (requires `results` from a MATH run that included "fp32_vanilla").
results["math"]["fp32_vanilla"]["examples"][0]

{'question': "b'Solve -282*d + 929 - 178 = -1223 for d.\\n'",
 'ground_truth': "b'7\\n'",
 'prediction': "erer\nve for>x.3x219** 102* -1105\\ the inn'",
 'is_correct': False,
 'stats': {'total_energy': 18.70159947586525,
  'tokenization_energy': 0.0715994758605957,
  'inference_energy': 18.630000000004657,
  'energy_per_token': 0.6032774024472661,
  'time': 0.45252323150634766,
  'components': {'embeddings': np.float64(0.07191507029533387),
   'attention': np.float64(9.929176444523968),
   'ffn': np.float64(7.49257587741781),
   'layernorm': np.float64(0.077672860622406),
   'output_layer': np.float64(0.1590070080757141)},
  'num_tokens': 31}}

In [30]:
# Report energy, latency and CO2 for every (task, mode) pair in `results`.
ci = get_carbon_intensity()
for task in results:
    print(f"\n=== {task.upper()} SUMMARY ===")
    for mode, data in results[task].items():
        # Entries without a "summary" sub-dict are treated as the summary itself.
        summary = data.get("summary", data)
        fallback_energy = summary.get("total_energy", 0.0)
        fallback_time = summary.get("total_time", 0.0)
        energy = summary.get("avg_energy", fallback_energy)
        latency = summary.get("avg_time", fallback_time)
        # accuracy = summary.get("accuracy", None)
        # Estimate used only when the summary has no "carbon_emissions" entry.
        estimated_co2 = joules_to_co2(summary.get("total_energy", energy), ci)
        co2 = summary.get("carbon_emissions", estimated_co2)
        # if accuracy is not None: append f", Acc={accuracy:.2f}%" before the CO2 part
        print(f"{mode:>12}: E={energy:.2f} J, Lat={latency:.3f}s" + f", CO₂={co2:.4f}g")


Location detected: Las Vegas, US (lat: 36.175, lon: -115.1372)
Using estimated carbon intensity.
Using estimate for US: 417 gCO2eq/kWh

=== MATH SUMMARY ===
fp32_vanilla: E=18.23 J, Lat=0.480s, CO₂=42.2231g
fp16_vanilla: E=21.55 J, Lat=0.533s, CO₂=49.9261g
int8_vanilla: E=17.28 J, Lat=0.493s, CO₂=40.0433g
int4_vanilla: E=22.48 J, Lat=0.558s, CO₂=52.0805g


In [31]:
# Plot the overall energy comparison from the collected `results`.
plot_energy_comparison(results)

In [32]:
# Draw the per-component energy breakdown for every (task, mode) pair that has one.
for task, task_results in results.items():
    for mode, stat in task_results.items():
        # Entries without a "summary" sub-dict are treated as the summary itself.
        summary = stat.get("summary", stat)
        if not summary.get("components"):
            # nothing to break down for this mode
            continue
        plot_component_energy(results, task_type=task, quant_mode=mode)

In [33]:
# save raw results to JSON
def _json_default(value):
    """Convert NumPy scalars/arrays, which `json` cannot encode natively, to plain Python."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Serialise before opening the file: if encoding fails, an existing results
# file is left untouched instead of being truncated to a partial document.
payload = json.dumps(results, indent=2, default=_json_default)
with open(cfg["output_file"], "w", encoding="utf-8") as f:
    f.write(payload)
print(f"Results saved to {cfg['output_file']}")


Results saved to results.json
