In [None]:
# Colab only: if you opened this notebook by itself (without the rest of the repository), uncomment the two commands below and run this cell; otherwise skip it.

#!git clone https://github.com/CowboyPhilip/HPML-Energy-Efficient-LLM
#%cd HPML-Energy-Efficient-LLM
!ls

In [None]:
# 1. Install dependencies
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install --upgrade pip setuptools
%pip install \
    transformers \
    bitsandbytes \
    zeus-ml \
    torch \
    datasets \
    evaluate \
    scikit-learn \
    geocoder \
    requests \
    numpy \
    wandb

In [None]:
# Optional accelerated attention / serving packages.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
# NOTE(review): vllm brings its own torch/triton requirements; confirm that the
# flash-attn and triton pins below still resolve together with it.
%pip install \
    flash-attn==2.0.5 \
    triton==2.0.0 \
    vllm

In [None]:
# Authenticate with Weights & Biases (interactive: asks for an API key).
!wandb login

In [None]:
# Print the current wandb settings as a sanity check before logging any run.
!wandb status

In [None]:
# import wandb

# wandb.init(project="HPML-Energy-Efficient-LLM", name="test-connection")

# wandb.log({"test_value": "init wandb"})

# wandb.finish()

In [None]:
# Experiment config: MATH benchmark, adaptive-quantization mode only.
# NOTE: every config cell in this notebook assigns the same `cfg` name, so the
# config cell executed last is the one later cells use.
cfg = {
    # --- what to run ---
    "task": "math",
    "model": "deepseek-ai/deepseek-coder-1.3b-instruct",
    "modes": [
        "adaptive",
        # Static baselines, disabled in this config:
        # "fp32_vanilla",  # FP32 + vanilla Transformer
        # "fp16_vanilla",  # FP16 + vanilla Transformer
        # "int8_vanilla",  # INT8 + vanilla
        # "int4_vanilla",  # INT4 + vanilla
    ],

    # --- adaptive mode: the two modes it switches between, plus its thresholds ---
    "high_mode": "fp16_vanilla",
    "low_mode": "int8_vanilla",
    "ctx_threshold": 512,
    "latency_threshold": 0.08,

    # --- dataset ---
    "dataset_name": "deepmind/math_dataset",
    "dataset_config": "algebra__linear_1d",
    "split": "test",
    "num_examples": 20,

    # --- sampling, logging and output ---
    "temperature": 0.5,
    "top_p": 0.9,
    "verbose": True,
    "output_file": "results.json",
    "device_map": "cuda",
}



In [None]:
# Experiment config: MATH benchmark, static precision baselines (default kernel).
# NOTE: every config cell in this notebook assigns the same `cfg` name, so the
# config cell executed last is the one later cells use.
cfg = {
    # --- what to run ---
    "task": "math",
    "model": "deepseek-ai/deepseek-coder-1.3b-instruct",
    "modes": [
        "fp32_vanilla",  # FP32 + vanilla Transformer
        "fp16_vanilla",  # FP16 + vanilla Transformer
        "int8_vanilla",  # INT8 + vanilla
        "int4_vanilla",  # INT4 + vanilla
    ],

    # --- dataset ---
    "dataset_name": "deepmind/math_dataset",
    "dataset_config": "algebra__linear_1d",
    "split": "test",
    "num_examples": 20,

    # --- sampling, logging and output ---
    "temperature": 0.5,
    "top_p": 0.9,
    "verbose": True,
    "output_file": "results.json",
    "device_map": "cuda",
}

In [74]:
# Experiment config: MBPP code-generation benchmark.
# NOTE: every config cell in this notebook assigns the same `cfg` name, so the
# config cell executed last is the one later cells use.
cfg = {
    # --- what to run ---
    "task": "mbpp",
    "model": "deepseek-ai/deepseek-coder-1.3b-instruct",
    # Alternative model:
    # "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "modes": [
        # Disabled in this config:
        # "fp16_flash-v2",  # FP16 + flash attn v2 Transformer, cannot use on t4
        # "fp16",
        # "int8_vanilla",   # INT8 + vanilla
        "int4_vanilla",     # INT4 + vanilla
        "adaptive",
    ],
    # Adaptive mode: which two modes to switch between (left unset here).
    # "high_mode": "fp16_vanilla",
    # "low_mode":  "int8_vanilla",

    # --- dataset and sampling ---
    "dataset_name": "mbpp",
    "temperature": 0,
    "top_p": 0.95,
    "num_examples": 1,

    # --- keys that run_task does not read for the mbpp task ---
    # (run_task reads quick/max_samples for mmlu and batch_size for glue)
    "quick": True,
    "max_samples": 500,
    "batch_size": 8,

    # --- logging and output ---
    "verbose": True,
    "output_file": "mbpp_results.json",
}

In [2]:
%load_ext autoreload
%autoreload 2
import json
import time
import numpy as np
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset  # ensure load_dataset is defined

# benchmark functions
from utils.test_generation import quick_test_generation, test_generation_MATH, test_generation_MBPP
from utils.test_mmlu    import quick_test_mmlu, test_quantized_models_on_mmlu
from utils.test_glue    import test_quantized_models_on_glue

# energy & tracking
from utils.energy_utils   import EnergyTracker, get_carbon_intensity, joules_to_co2
from utils.memory_utils   import clean_memory

# adaptive quant wrapper
from utils.adaptive_quant  import AdaptiveQuantGenerator

# plotting
from utils.plot_utils    import plot_energy_comparison, plot_component_energy


  from .autonotebook import tqdm as notebook_tqdm


/opt/rocm/lib/libamd_smi.so: cannot open shared object file: No such file or directory
Unable to find libamd_smi.so library try installing amd-smi-lib from your package manager


In [3]:
# Monkey-patch EnergyTracker to support `with tracker:` and save_results
def _et_enter(self):
    """Context-manager entry for EnergyTracker: start timing and, if possible, energy metering.

    When the tracker has a truthy ``zeus`` monitor, an ``'inference'``
    measurement window is opened on it and recorded in ``self.active_windows``
    so that ``__exit__`` knows there is a window to close. The wall-clock start
    time is always stored in ``self._enter_ts``.

    Returns:
        The tracker itself, so ``with tracker as t:`` binds the tracker.
    """
    if getattr(self, 'zeus', None):
        # Make sure the bookkeeping set exists; otherwise a window could be
        # opened on the monitor without ever being recorded (and closed).
        if not hasattr(self, 'active_windows'):
            self.active_windows = set()
        try:
            self.zeus.begin_window('inference')
        except Exception as err:
            # Best effort: energy metering is optional, timing still works.
            # Report the problem instead of hiding it silently.
            print(f"EnergyTracker: could not start 'inference' window: {err}")
        else:
            self.active_windows.add('inference')
    self._enter_ts = time.time()
    return self

def _et_exit(self, exc_type, exc_val, exc_tb):
    """Context-manager exit for EnergyTracker: close the window and publish ``self.stats``.

    Closes the ``'inference'`` window opened by ``__enter__`` (if one is
    recorded in ``self.active_windows``), then stores a fresh ``self.stats``
    dict with:

    * ``total_energy`` - ``total_energy`` of the closed window, or 0 when no
      window was measured;
    * ``time`` - wall-clock seconds since ``__enter__`` (0 if it never ran);
    * ``components`` - ``np.sum`` of every entry of ``self.comp_energy``
      (empty when the tracker has no such attribute);
    * ``num_tokens`` - always ``None`` here; callers may fill it in later.

    Args:
        exc_type, exc_val, exc_tb: Standard ``__exit__`` arguments; unused.

    Returns:
        ``False``, so an exception raised inside the ``with`` body propagates.
    """
    end_ts = time.time()
    inf_e = 0
    # Tolerate trackers without the bookkeeping attributes: raising from
    # __exit__ would mask the exception raised inside the `with` body.
    active = getattr(self, 'active_windows', None)
    if getattr(self, 'zeus', None) and active is not None and 'inference' in active:
        try:
            m = self.zeus.end_window('inference')
            # The monitor has closed the window; forget it before reading the result.
            active.remove('inference')
            inf_e = m.total_energy
        except Exception as err:
            # Best effort: keep the timing data even if the energy reading failed.
            print(f"EnergyTracker: could not read 'inference' window: {err}")
    elapsed = end_ts - getattr(self, '_enter_ts', end_ts)
    comp = {k: np.sum(v) for k, v in getattr(self, 'comp_energy', {}).items()}
    self.stats = {
        'total_energy': inf_e,
        'time': elapsed,
        'components': comp,
        'num_tokens': None
    }
    return False

def _save_results(self, extra_metrics):
    """Merge ``extra_metrics`` into the tracker's ``stats`` dict.

    ``self.stats`` is created as an empty dict on first use; existing keys are
    overwritten by the values in ``extra_metrics``.
    """
    try:
        recorded = self.stats
    except AttributeError:
        # First call on this tracker: start from an empty stats dict.
        recorded = self.stats = {}
    recorded.update(extra_metrics)

# Attach the helpers to the class itself so that EnergyTracker instances can be
# used as `with tracker:` and expose `tracker.save_results(...)`.
# NOTE(review): with `%autoreload 2` active, a reload of utils.energy_utils may
# drop attributes patched onto the class; a later run in this notebook reports
# "Error testing adaptive mode: __enter__", which would fit a tracker without
# this patch - confirm, and consider moving these methods into the module.
EnergyTracker.__enter__    = _et_enter
EnergyTracker.__exit__     = _et_exit
EnergyTracker.save_results = _save_results

In [76]:
# Task names understood by run_task.
_KNOWN_TASKS = ("generation", "math", "mbpp", "mmlu", "glue")


def _build_adaptive_agent(cfg):
    """Create the AdaptiveQuantGenerator configured by the adaptive keys of `cfg`."""
    return AdaptiveQuantGenerator(
        cfg["model"],
        high_mode=cfg.get("high_mode"),
        low_mode=cfg.get("low_mode"),
        ctx_threshold=cfg.get("ctx_threshold", 1024),
        latency_threshold=cfg.get("latency_threshold", 0.08),
        device_map=cfg.get("device_map", "auto")
    )


def _run_adaptive_math(cfg, temp, top_p):
    """Run the adaptive generator over the configured MATH examples.

    Returns a dict with the per-example records ("examples") and the aggregate
    metrics ("summary").
    """
    # Looked up first so that a failing lookup aborts before any inference is run.
    carbon_int = get_carbon_intensity()

    ds = load_dataset(
        cfg["dataset_name"],
        cfg["dataset_config"],
        split=cfg["split"]
    ).select(range(cfg["num_examples"]))

    agent = _build_adaptive_agent(cfg)

    examples, correct, total_tokens = [], 0, 0
    for ex in tqdm(ds, desc="Adaptive MATH"):
        prompt = f"<｜User｜>{ex['question']}<｜Assistant｜><think>"
        gen_ids, _, stats = agent.evaluate(
            prompt,
            agent.tokenizer,
            max_new_tokens=cfg.get("max_new_tokens", 32),
            temperature=temp,
            top_p=top_p
        )
        # Decode only the newly generated part (everything after the prompt).
        inp_len = stats["input_length"]
        pred = agent.tokenizer.decode(
            gen_ids[0, inp_len:], skip_special_tokens=True
        ).strip()

        # Exact string match against the reference answer.
        is_corr = (pred == ex["answer"].strip())
        correct += int(is_corr)

        # The key may be present with value None (the patched
        # EnergyTracker.__exit__ in this notebook stores it that way);
        # count such an example as one token, like a missing key.
        num_tokens = stats.get("num_tokens", 1)
        total_tokens += 1 if num_tokens is None else num_tokens

        examples.append({
            "question":   ex["question"],
            "prediction": pred,
            "is_correct": is_corr,
            "stats":      stats
        })
        clean_memory()

    n = len(examples)
    total_e = sum(e["stats"]["total_energy"] for e in examples)
    total_t = sum(e["stats"]["time"]         for e in examples)
    return {
        "examples": examples,
        "summary": {
            # Guard the averages so an empty run (num_examples == 0) or a zero
            # token count yields 0.0 instead of ZeroDivisionError.
            "accuracy":         100 * correct / n if n else 0.0,
            "avg_energy":       total_e / n if n else 0.0,
            "avg_time":         total_t / n if n else 0.0,
            "energy_per_token": total_e / total_tokens if total_tokens else 0.0,
            "carbon_emissions": joules_to_co2(total_e, carbon_int)
        }
    }


def run_task(cfg):
    """Dispatch benchmarks based on cfg['task'], with adaptive-quant support.

    Args:
        cfg: Experiment configuration dict. Always read: "task" (one of
            "generation", "math", "mbpp", "mmlu", "glue"), "modes" and "model".
            Optional with defaults: "temperature" (0.5), "top_p" (0.9) and, for
            adaptive runs, "high_mode", "low_mode", "ctx_threshold" (1024),
            "latency_threshold" (0.08), "device_map" ("auto").
            Task-specific keys:
              * generation: "prompt", "tokens"
              * math: "dataset_name", "dataset_config", "split",
                "num_examples", optional "max_new_tokens" (32), "verbose"
              * mbpp: "num_examples", optional "verbose"
              * mmlu: "subjects", optional "quick"; "max_samples" when quick
              * glue: "glue_tasks", "batch_size"

    Returns:
        A dict with a single key, the task name. For generation, math and mbpp
        the value is a dict keyed by mode; for mmlu and glue it is whatever the
        corresponding helper returns.

    Raises:
        ValueError: If cfg["task"] is not a known task, or if quick MMLU is
            requested but no mode is left after dropping "adaptive".
    """
    task = cfg["task"]
    if task not in _KNOWN_TASKS:
        # Fail early with a clear message instead of silently running GLUE.
        raise ValueError(
            f"Unknown task {task!r}; expected one of {', '.join(_KNOWN_TASKS)}"
        )

    # Work on a copy so cfg["modes"] itself is never modified.
    modes = list(cfg["modes"])
    results = {}

    # shared sampling params
    temp  = cfg.get("temperature", 0.5)
    top_p = cfg.get("top_p", 0.9)

    # skip adaptive for pure classification
    if task in ("glue", "mmlu") and "adaptive" in modes:
        print("⚠️  Skipping adaptive for classification tasks")
        modes.remove("adaptive")

    # -------------------------
    # TEXT GENERATION
    # -------------------------
    if task == "generation":
        results["generation"] = {}

        # adaptive generation
        if "adaptive" in modes:
            print("\n=== ADAPTIVE generation ===")
            agent = _build_adaptive_agent(cfg)
            _, _, stats = agent.generate(
                cfg["prompt"],
                max_new_tokens=cfg["tokens"],
                temperature=temp,
                top_p=top_p
            )
            results["generation"]["adaptive"] = stats
            modes.remove("adaptive")

        # static generation modes
        for mode in modes:
            print(f"\n=== {mode.upper()} generation ===")
            stats = quick_test_generation(
                model_name=cfg["model"],
                quant_mode=mode,
                prompt=cfg["prompt"],
                max_new_tokens=cfg["tokens"],
                temperature=temp,
                top_p=top_p
            )
            results["generation"][mode] = stats

    # -------------------------
    # MATH (generation-style)
    # -------------------------
    elif task == "math":
        results["math"] = {}

        # adaptive MATH
        if "adaptive" in modes:
            print("\n=== ADAPTIVE on MATH ===")
            results["math"]["adaptive"] = _run_adaptive_math(cfg, temp, top_p)
            plot_component_energy(results, task_type="math", quant_mode="adaptive")
            modes.remove("adaptive")

        # static MATH modes
        if modes:
            print(f"\n=== standard modes on MATH: {modes} ===")
            std = test_generation_MATH(
                model_name=cfg["model"],
                quantization_modes=modes,
                dataset_name=cfg["dataset_name"],
                dataset_config=cfg["dataset_config"],
                split=cfg["split"],
                num_examples=cfg["num_examples"],
                verbose=cfg.get("verbose", True)
            )
            results["math"].update(std)

    # -------------------------
    # MBPP
    # -------------------------
    elif task == "mbpp":
        print(f"\n=== MBPP task on {cfg['model']} ===")
        results["mbpp"] = {}

        # All requested modes are forwarded unchanged, so "adaptive" (if
        # present) is passed to test_generation_MBPP together with the static modes.
        std = test_generation_MBPP(
            model_name=cfg["model"],
            quantization_modes=modes,
            num_examples=cfg["num_examples"],
            verbose=cfg.get("verbose", True),
            temperature=temp,
            top_p=top_p
        )
        results["mbpp"].update(std)

    # -------------------------
    # MMLU
    # -------------------------
    elif task == "mmlu":
        print("\n=== MMLU task ===")

        if cfg.get("quick", False):
            if not modes:
                raise ValueError(
                    "Quick MMLU needs at least one non-adaptive mode in cfg['modes']"
                )
            # Quick mode evaluates only the first requested mode.
            stats = quick_test_mmlu(
                model_name=cfg["model"],
                quant_mode=modes[0],
                subjects=cfg["subjects"],
                max_samples=cfg["max_samples"]
            )
        else:
            stats = test_quantized_models_on_mmlu(
                model_name=cfg["model"],
                quantization_modes=modes,
                subjects=cfg["subjects"]
            )
        results["mmlu"] = stats

    # -------------------------
    # GLUE
    # -------------------------
    else:  # task == "glue" (unknown names were rejected above)
        print("\n=== GLUE task ===")
        results["glue"] = test_quantized_models_on_glue(
            model_name=cfg["model"],
            tasks=cfg["glue_tasks"],
            quantization_modes=modes,
            batch_size=cfg["batch_size"]
        )

    return results

In [77]:
# Run the benchmark described by the currently active `cfg`
# (i.e. the config cell that was executed last).
results = run_task(cfg)


Location detected: São Paulo, BR (lat: -23.5475, lon: -46.6361)
Using estimated carbon intensity.
Using estimate for BR: 110 gCO2eq/kWh

=== MBPP task on deepseek-ai/deepseek-coder-1.3b-instruct ===
Location detected: São Paulo, BR (lat: -23.5475, lon: -46.6361)
Using estimated carbon intensity.
Using estimate for BR: 110 gCO2eq/kWh
Carbon intensity: 110 gCO2eq/kWh

=== Testing FP16 on MBPP ===
Loading FP16 model …
GPU Memory: Allocated: 0.01 GB | Reserved: 0.02 GB | Max: 3.27 GB
Model ready → quantisation: FP16, kernel: vanilla
GPU Memory: Allocated: 2.70 GB | Reserved: 2.88 GB | Max: 3.27 GB
[2025-05-08 00:38:53,032] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-08 00:38:53,033] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MBPP FP16: 100%|██████████| 1/1 [01:37<00:00, 97.53s/it]


FP16 SUMMARY: Samples=1, Acc=0.00%,

=== Testing INT8_VANILLA on MBPP ===
Loading INT8 model …
GPU Memory: Allocated: 0.01 GB | Reserved: 0.02 GB | Max: 3.27 GB
Model ready → quantisation: INT8, kernel: vanilla
GPU Memory: Allocated: 1.49 GB | Reserved: 1.60 GB | Max: 3.27 GB
[2025-05-08 00:40:45,339] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-08 00:40:45,344] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MBPP INT8_VANILLA: 100%|██████████| 1/1 [02:06<00:00, 126.80s/it]


INT8_VANILLA SUMMARY: Samples=1, Acc=0.00%,

=== Testing INT4_VANILLA on MBPP ===
Loading INT4 model …
GPU Memory: Allocated: 0.01 GB | Reserved: 0.02 GB | Max: 3.27 GB
Model ready → quantisation: INT4, kernel: vanilla
GPU Memory: Allocated: 0.90 GB | Reserved: 1.39 GB | Max: 3.27 GB
[2025-05-08 00:43:09,098] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-08 00:43:09,102] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MBPP INT4_VANILLA: 100%|██████████| 1/1 [03:09<00:00, 189.18s/it]

AST Error Input:
 def remove_Occ(string, char):
    return string.replace(char, '')

print(remove_Occ("hello","l"))
print(remove_Occ("abcda","a"))
print(remove_Occ("PHP","P"))
[END]

The `replace()` function will replace all occurrences of the specified character with an empty string.

Please note that the `replace()` function does not remove the first and last occurrence of the character. If you want to remove the first and last occurrence of the character, you can use the `replace()` function with a negative step.

def remove_Occ(string, char):
    return string.replace(char, '', 1)

print(remove_Occ("hello","l"))
print(remove_Occ("abcda","a"))
print(remove_Occ("PHP","P"))
[END]

The `replace()` function with a negative step will remove the specified number of occurrences of the character.

Please note that the `replace()` function does not remove the first and last occurrence of the character. If you want to remove the first and last occurrence of the character, you can use the `rep




INT4_VANILLA SUMMARY: Samples=1, Acc=0.00%,

=== Testing ADAPTIVE on MBPP ===
Loading FP16 model …
GPU Memory: Allocated: 0.01 GB | Reserved: 0.03 GB | Max: 3.27 GB
Model ready → quantisation: FP16, kernel: vanilla
GPU Memory: Allocated: 2.70 GB | Reserved: 2.88 GB | Max: 3.27 GB
Loading INT8 model …
GPU Memory: Allocated: 2.70 GB | Reserved: 2.88 GB | Max: 3.27 GB
Model ready → quantisation: INT8, kernel: vanilla
GPU Memory: Allocated: 4.18 GB | Reserved: 4.33 GB | Max: 4.21 GB
[2025-05-08 00:46:50,802] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-08 00:46:50,806] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor
[2025-05-08 00:46:50,808] [zeus.monitor.energy](energy.py:209) Monitoring GPU indices [0].
[2025-05-08 00:46:50,808] [zeus.monitor.energy](energy.py:210) Monitoring CPU indices []
Successfully initialized ZeusMonitor


MBPP ADAPTIVE:   0%|          | 0/1 [00:00<?, ?it/s]

Error testing adaptive mode: __enter__





In [71]:
# Inspect raw MBPP generations for one mode.
# Prefer the "fp16" run; otherwise fall back to the first mode that stored
# examples, so the cell also works when `cfg["modes"]` did not include "fp16".
mbpp_runs = results["mbpp"]
modes_with_examples = [
    m for m, run in mbpp_runs.items()
    if isinstance(run, dict) and "examples" in run
]
if "fp16" in modes_with_examples:
    inspect_mode = "fp16"
elif modes_with_examples:
    inspect_mode = modes_with_examples[0]
else:
    inspect_mode = None
mbpp_examples = mbpp_runs[inspect_mode]["examples"] if inspect_mode else []
print(f"Showing MBPP samples for mode: {inspect_mode}")

# Extracted code and reference tests of the second example, when there is one.
if len(mbpp_examples) > 1:
    print(mbpp_examples[1]["generated_code"])
    print(mbpp_examples[1]["test_list"])

# Full generated text of up to the first five examples.
for i, example in enumerate(mbpp_examples[:5]):
    print(f"================= sample {i} =======================")
    print(example["generated_text"])

None
['assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]', 'assert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]]', 'assert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]']
<｜User｜>You are an expert Python programmer, and here is your task: Write a python function to remove first and last occurrence of a given character from the string. Your code should pass these tests:

assert remove_Occ("hello","l") == "heo"
assert remove_Occ("abcda","a") == "bcd"
assert remove_Occ("PHP","P") == "H"
[BEGIN]<｜Assistant｜><think>

def remove_Occ(s, ch):
    if ch not in s:
        return s
    else:
        return s.replace(ch, '', 1)

print(remove_Occ("hello","l"))
print(remove_Occ("abcda","a"))
print(remove_Occ("PHP","P"))

[END]<｜Assistant｜><think>

The function remove_Occ takes a string and a character as input and returns the string after removing the first occurrence of the character. If the

In [None]:
# Run the benchmark for the active `cfg` and log the results to Weights & Biases.
# NOTE: this cell calls run_task itself; running the plain
# `results = run_task(cfg)` cell as well executes the benchmark twice.
import wandb

wandb.init(
    project="HPML-Energy-Efficient-LLM",
    entity="HPML-Energy-Efficient-LLM",
    name=f"{cfg['model']}-{cfg['task']}-({'/'.join(cfg['modes'])})",
    tags=[cfg['model'].split('/')[-1], cfg['task']] + cfg['modes'],
    group=cfg['model'].split('/')[-1],
    job_type=cfg['task'],
    config=cfg,
)

try:
    results = run_task(cfg)
    wandb.log(results)
except BaseException:
    # Close the run (marked as failed) so it is not left open on the server
    # when the benchmark raises or the cell is interrupted.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()
    print("\nSaved results to wandb")

In [83]:
# Show the raw generated text of the first MBPP example of the int4_vanilla run;
# repr() makes newlines and other escapes visible.
# Other lookups used while debugging:
# results["mbpp"]["fp16"]["examples"][9]["prompt"]
print(repr(results["mbpp"]["int4_vanilla"]["examples"][0]["generated_text"]))
# results["mbpp"]["fp16"]["summary"]

'<｜User｜>You are an expert Python programmer, and here is your task: Write a python function to remove first and last occurrence of a given character from the string. Your code should pass these tests:\n\nassert remove_Occ("hello","l") == "heo"\nassert remove_Occ("abcda","a") == "bcd"\nassert remove_Occ("PHP","P") == "H"\n[BEGIN]<｜Assistant｜><think>You can use the built-in Python function called `replace()` to solve this problem. The `replace()` function replaces a specified phrase with another specified phrase.\n\ndef remove_Occ(string, char):\n    return string.replace(char, \'\')\n\nprint(remove_Occ("hello","l"))\nprint(remove_Occ("abcda","a"))\nprint(remove_Occ("PHP","P"))\n[END]\n\nThe `replace()` function will replace all occurrences of the specified character with an empty string.\n\nPlease note that the `replace()` function does not remove the first and last occurrence of the character. If you want to remove the first and last occurrence of the character, you can use the `repla

In [None]:
# Inspect the first stored example under results["math"]["int8_vanilla"].
# Needs `results` from a MATH run (cfg["task"] == "math"); presumably the key is
# only present when "int8_vanilla" was in cfg["modes"] - confirm.
results["math"]["int8_vanilla"]["examples"][0]

In [None]:
# Per-task, per-mode summary: energy (J), latency (s) and CO₂ (g).
ci = get_carbon_intensity()
for task, modes in results.items():
    print(f"\n=== {task.upper()} SUMMARY ===")
    for mode in modes:
        data = modes[mode]
        # Aggregates are read from data["summary"] when present, else from data itself.
        summary = data.get("summary", data)

        # Prefer the averaged metrics, fall back to totals, then to 0.0.
        total_energy = summary.get("total_energy", 0.0)
        e = summary.get("avg_energy", total_energy)
        t = summary.get("avg_time", summary.get("total_time", 0.0))

        # CO₂: use the stored value when present, otherwise the estimate
        # derived from the energy figure and the carbon intensity.
        estimated_co2 = joules_to_co2(summary.get("total_energy", e), ci)
        co2 = summary.get("carbon_emissions", estimated_co2)

        line = f"{mode:>12}: E={e:.2f} J, Lat={t:.3f}s, CO₂={co2:.4f}g"
        print(line)


In [None]:
# Plot overall energy comparison for the contents of `results`
# (whatever the most recent run_task call stored there).
plot_energy_comparison(results)

In [None]:
# Per-component energy breakdown, one plot per (task, mode) pair.
for task, modes in results.items():
    for mode, stat in modes.items():
        # Component stats are read from stat["summary"] when present, else from stat.
        comps = stat.get("summary", stat).get("components")
        if not comps:
            # Nothing to plot for modes without component stats.
            continue
        plot_component_energy(results, task_type=task, quant_mode=mode)

In [None]:
# save raw results to JSON
def _to_jsonable(obj):
    """`default` hook for json: turn NumPy scalars/arrays into plain Python values."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Serialise before opening the file, so a value that cannot be encoded does not
# leave a truncated (or wiped) results file behind. NumPy scalars can reach
# `results`, e.g. the np.sum values stored as component energies.
results_json = json.dumps(results, indent=2, default=_to_jsonable)
with open(cfg["output_file"], "w", encoding="utf-8") as f:
    f.write(results_json)
print(f"Results saved to {cfg['output_file']}")
