In [None]:
# Upgrade pip
!pip install --upgrade pip

# Install/upgrade required libraries
!pip install torch --quiet
!pip install torchvision --quiet
!pip install torchaudio --quiet
!pip install transformers==4.33.1 --quiet
!pip install datasets --quiet
!pip install scikit-learn --quiet
!pip install codecarbon --quiet
!pip install numpy==1.26.4 --quiet
!pip install pandas --quiet
!pip install tqdm --quiet
!pip install scikit-learn --quiet
!pip install evaluate --quiet

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


In [None]:

import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification, DistilBertForQuestionAnswering, BertForSequenceClassification, BertForQuestionAnswering
from transformers import AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import random
import numpy as np
import time
import json
import logging
from sklearn.metrics import accuracy_score, f1_score
from codecarbon import OfflineEmissionsTracker
import warnings
import os
import pandas as pd
# --- SECTION: 1. CONFIGURATION ---
warnings.filterwarnings("ignore")
logging.getLogger("codecarbon").setLevel(logging.INFO)
DEVICE_CONFIG = {
    'optimize_for_gpu': True,
    'mixed_precision': True
}
DEVICE = "cuda" if torch.cuda.is_available() and DEVICE_CONFIG['optimize_for_gpu'] else "cpu"
print(f"Using device: {DEVICE}")

DATASETS = {
    'glue_sst2': {'name': 'glue', 'config': 'sst2', 'split_train': 'train', 'split_val': 'validation'},
    'glue_mrpc': {'name': 'glue', 'config': 'mrpc', 'split_train': 'train', 'split_val': 'validation'},
    'glue_rte': {'name': 'glue', 'config': 'rte', 'split_train': 'train', 'split_val': 'validation'},
    'squad': {'name': 'squad', 'split_train': 'train', 'split_val': 'validation'}
}
MAX_SAMPLES = 5000
WATER_USAGE_FACTORS = {"average_l_per_kwh": 1.8}
CARBON_INTENSITY = 250  # gCO2e/kWh
BATCH_SIZE = 16
SEQ_LENGTH_CLASSIFICATION = 128
SEQ_LENGTH_QA = 384
NUM_LAYERS = 1
GLUE_TASKS_TO_RUN = ['sst2', 'mrpc', 'rte']
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def get_device():
    device = torch.device(DEVICE)
    if DEVICE == "cuda":
        try:
            print(f"✓ Using GPU: {torch.cuda.get_device_name(0)} (CUDA)")
        except Exception:
            print("✓ Using GPU (name unknown)")
        if DEVICE_CONFIG['mixed_precision']:
            print("✓ Mixed precision (FP16) enabled.")
    else:
        print(f"✓ Using CPU")
    return device
# --- SECTION: 2. QUANTIZATION ---
class AdaptiveQuantizedLayer(nn.Module):
    def __init__(self, layer, quant_bits=4):
        super().__init__()
        self.layer = layer
        self.current_precision = 0
        self.switch_count = 0
        self.set_precision(quant_bits)

    def set_precision(self, bits):
        if self.current_precision != bits and self.current_precision != 0:
            self.switch_count += 1
            
        self.quant_bits = bits
        self.current_precision = bits
        
        if bits < 32:
            self.min_val = -2 ** (bits - 1)
            self.max_val = 2 ** (bits - 1) - 1
        else:
            self.min_val, self.max_val = None, None

    def quantize(self, x):
        scale = (self.max_val - self.min_val) / (x.max() - x.min() + 1e-6)
        zero_point = x.min() - self.min_val / scale
        x_quant = torch.round(x * scale + zero_point)
        x_quant = torch.clamp(x_quant, self.min_val, self.max_val)
        x_dequant = (x_quant - zero_point) / scale
        return x_dequant

    def forward(self, *args, **kwargs):
        output = self.layer(*args, **kwargs)
        if self.current_precision == 32:
            return output
        if isinstance(output, tuple):
            hidden_states = output[0]
            hidden_states = self.quantize(hidden_states)
            return (hidden_states,) + output[1:]
        else:
            return self.quantize(output)

class PrecisionScheduler:
    def __init__(self, model, grad_norm_threshold=1.0):
        self.model = model
        self.grad_norm_threshold = grad_norm_threshold
        self.adaptable_layers = [m for m in self.model.modules() if isinstance(m, AdaptiveQuantizedLayer)]
        self.switch_log = [] 
        self.current_epoch = 0
        print(f"✓ Gradient-Aware PrecisionScheduler initialized for {len(self.adaptable_layers)} adaptable layers.")
        
    def get_global_norm(self):
        total_norm = 0.0
        for p in self.model.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        return total_norm ** 0.5

    def step(self, epoch):
        """Update precision based on gradient norms and log events."""
        self.current_epoch = epoch
        global_norm = self.get_global_norm()
        
        # Check if we need to switch precision
        if global_norm > self.grad_norm_threshold:
            self.increase_precision(global_norm)
        elif global_norm < self.grad_norm_threshold / 2.0:
            self.decrease_precision(global_norm)
            
        return global_norm

    def increase_precision(self, current_norm):
        for idx, layer in enumerate(self.adaptable_layers):
            if layer.current_precision == 4:
                new_prec = 8
            elif layer.current_precision == 8:
                new_prec = 32
            else:
                continue
            
            layer.set_precision(new_prec)
            event = {
                "layer_id": idx,
                "new_precision": new_prec,
                "grad_norm": current_norm,
                "epoch": self.current_epoch,
                "type": "increase"
            }
            self.switch_log.append(event)
            print(f"⚡ Epoch {self.current_epoch} | Layer {idx} switched to {new_prec}-bit (grad_norm={current_norm:.4f})")
            return

    def decrease_precision(self, current_norm):
        for idx, layer in enumerate(self.adaptable_layers):
            if layer.current_precision == 32:
                new_prec = 8
            elif layer.current_precision == 8:
                new_prec = 4
            else:
                continue
            
            layer.set_precision(new_prec)
            event = {
                "layer_id": idx,
                "new_precision": new_prec,
                "grad_norm": current_norm,
                "epoch": self.current_epoch,
                "type": "decrease"
            }
            self.switch_log.append(event)
            print(f"📉 Epoch {self.current_epoch} | Layer {idx} switched to {new_prec}-bit (grad_norm={current_norm:.4f})")
            return



def build_adaptive_quantized_model(device, model_type="distilbert", task_type="classification"):
    print(f"\n🏗️ Building Adaptive Quantized {model_type.capitalize()} Model...")
    ModelClass = DistilBertForSequenceClassification if task_type == "classification" else DistilBertForQuestionAnswering
    ModelBase = "distilbert-base-uncased"
    if model_type == "bert":
        ModelClass = BertForSequenceClassification if task_type == "classification" else BertForQuestionAnswering
        ModelBase = "bert-base-uncased"

    model_args = {'num_labels': 2} if task_type == "classification" else {}
    model = ModelClass.from_pretrained(ModelBase, **model_args).to(device)
    
    class QuantizedModel(nn.Module):
        def __init__(self, model_to_wrap):
            super().__init__()
            self.model = model_to_wrap
            self.model_type = model_type
            self.config = self.model.config
            layers_to_quantize = self.model.bert.encoder.layer if model_type == "bert" else self.model.distilbert.transformer.layer
            for i in range(len(layers_to_quantize)):
                layers_to_quantize[i] = AdaptiveQuantizedLayer(layers_to_quantize[i], quant_bits=8)
        
        def forward(self, **kwargs):
            return self.model(**kwargs)

    quantized_model = QuantizedModel(model).to(device)
    print(f"✓ Successfully created quantized {model_type.capitalize()} model.")
    return quantized_model


# --- SECTION: 3. DATA LOADING ---
def get_dataloaders(model_type, seq_len, batch_size):
    from transformers import DistilBertTokenizerFast, BertTokenizerFast
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') if model_type == 'distilbert' else BertTokenizerFast.from_pretrained('bert-base-uncased')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def preprocess_glue_sst2(examples):
        enc = tokenizer(examples["sentence"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    def preprocess_glue_mrpc(examples):
        enc = tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    def preprocess_glue_rte(examples):
        enc = tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    try:
        glue_sst2_train_data = load_dataset("glue", "sst2", split=f"train[:{MAX_SAMPLES}]")
        glue_sst2_validation_data = load_dataset("glue", "sst2", split=f"validation[:{MAX_SAMPLES}]")
        glue_mrpc_train_data = load_dataset("glue", "mrpc", split=f"train[:{5000}]")
        glue_mrpc_validation_data = load_dataset("glue", "mrpc", split=f"validation[:{5000}]")
        glue_rte_train_data = load_dataset("glue", "rte", split=f"train[:{2490}]")
        glue_rte_validation_data = load_dataset("glue", "rte", split=f"validation[:{277}]")

        glue_sst2_train_data = glue_sst2_train_data.map(preprocess_glue_sst2, batched=True, remove_columns=['sentence', 'idx', 'label'])
        glue_sst2_validation_data = glue_sst2_validation_data.map(preprocess_glue_sst2, batched=True, remove_columns=['sentence', 'idx', 'label'])
        glue_mrpc_train_data = glue_mrpc_train_data.map(preprocess_glue_mrpc, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_mrpc_validation_data = glue_mrpc_validation_data.map(preprocess_glue_mrpc, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_rte_train_data = glue_rte_train_data.map(preprocess_glue_rte, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_rte_validation_data = glue_rte_validation_data.map(preprocess_glue_rte, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])

        glue_sst2_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_sst2_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_mrpc_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_mrpc_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_rte_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_rte_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

        dataloaders = {
            'glue_sst2_train': DataLoader(glue_sst2_train_data, batch_size=batch_size, shuffle=True),
            'glue_sst2_validation': DataLoader(glue_sst2_validation_data, batch_size=batch_size, shuffle=False),
            'glue_mrpc_train': DataLoader(glue_mrpc_train_data, batch_size=batch_size, shuffle=True),
            'glue_mrpc_validation': DataLoader(glue_mrpc_validation_data, batch_size=batch_size, shuffle=False), 
            'glue_rte_train': DataLoader(glue_rte_train_data, batch_size=batch_size, shuffle=True), 
            'glue_rte_validation': DataLoader(glue_rte_validation_data, batch_size=batch_size, shuffle=False),
        }
        return dataloaders, tokenizer
    except Exception as e:
        print(f"⚠️ get_dataloaders failed: {e}. Returning empty dataloaders.")
        return {}, None

# --- SECTION: 4. EVALUATION METRICS ---
def evaluate_classification(model, dataloader, device, is_bert):
    model.eval()
    predictions = []
    labels = []
    for batch in tqdm(dataloader, desc="Evaluating"):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        if not is_bert:
            inputs.pop('token_type_ids', None)
        if DEVICE_CONFIG['mixed_precision'] and device.type == 'cuda':
            inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs.last_hidden_state.norm(dim=-1)
        predicted = torch.argmax(logits, dim=-1) if logits.dim() > 1 else (logits > 0.5).long()
        predictions.extend(predicted.cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}


# --- SECTION: 5. FINE-TUNING ---
# --- SECTION: 5. FINE-TUNING ---
def fine_tune(model, dataloader, device, epochs=3, task_type="classification", scheduler_callback=None, is_bert=False):
    model.train()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    num_training_steps = len(dataloader) * epochs
    num_warmup_steps = int(num_training_steps * 0.1)

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    precision_scheduler = None
    if any(isinstance(m, AdaptiveQuantizedLayer) for m in model.modules()):
        # FIX: The grad_norm_threshold was too low.
        # A common practice is to allow some initial instability during warmup.
        # We will dynamically adjust the threshold based on the epoch.
        precision_scheduler = PrecisionScheduler(model, grad_norm_threshold=1.0)
    
    # FIX: Initialize all layers to 32-bit for a proper warmup phase.
    for layer in model.modules():
        if isinstance(layer, AdaptiveQuantizedLayer):
            layer.set_precision(8)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        total_loss = 0
        
        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}"):
            # FIX: Moved optimizer.zero_grad() here to clear gradients from previous batch.
            optimizer.zero_grad()
            
            inputs = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            if 'input_ids' not in inputs:
                print("Skipping a batch with missing 'input_ids'.")
                continue

            # --- Forward Pass ---
            outputs = model(**inputs)
            loss = outputs.loss
            
            # --- Backward Pass ---
            loss.backward()

            # --- Gradient-Aware Precision Scheduling & Logging ---
            # FIX: Re-located the scheduler logic to after the backward pass, before the optimizer step.
            # This ensures we are checking actual gradients.
            if precision_scheduler: # Start adapting after a warmup period
                grad_norm = precision_scheduler.step(epoch + 1)
                print(f"🔎 Epoch {epoch+1}: Global gradient norm = {grad_norm:.4f}")
                # FIX: Check the grad_norm against the threshold *and* adapt precision here
                if grad_norm > precision_scheduler.grad_norm_threshold:
                    precision_scheduler.increase_precision(grad_norm)
                
            # FIX: Added a strong gradient clipping step here.
            # Your original code already had this, but it's crucial for stability.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            # --- Optimizer Step ---
            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()

            if scheduler_callback:
                scheduler_callback()

        if len(dataloader) > 0:
            print(f"Average loss: {total_loss / len(dataloader):.4f}")
        else:
            print("Warning: Dataloader was empty.")
            
    if precision_scheduler:
        return precision_scheduler.switch_log
    else:
        return []

# --- SECTION: 6. EXPERIMENT RUNNER ---
def run_experiment(model, model_name, dataloaders, device, is_bert, batch_size, seq_len, task_type, run_sst2=True, run_mrpc=True, run_rte=True):
    results = {
        'model_name': model_name,
        'batch_size': batch_size,
        'seq_length': seq_len,
        'task_type': task_type,
        'accuracy_metrics': {},
        'performance_metrics': {},
        'scheduler_metrics': {
            'precision_distribution': {'INT4': 0.0, 'INT8': 0.0, 'FP32': 1.0},
            'precision_switch_counts_int4': 0,
            'precision_switch_counts_int8': 0,
            'precision_switch_counts_fp32': 0,
            'avg_precision_level': 32.0,
            'adaptation_log': []
        }
    }
    
    def update_scheduler_metrics(model):
        int4_count, int8_count, fp32_count = 0, 0, 0
        switch_int4, switch_int8, switch_fp32 = 0, 0, 0
        total_precision, total_layers = 0, 0

        for layer in model.modules():
            if isinstance(layer, AdaptiveQuantizedLayer):
                total_layers += 1
                total_precision += layer.current_precision
                if layer.current_precision == 4:
                    int4_count += 1
                    switch_int4 += layer.switch_count
                elif layer.current_precision == 8:
                    int8_count += 1
                    switch_int8 += layer.switch_count
                elif layer.current_precision == 32:
                    fp32_count += 1
                    switch_fp32 += layer.switch_count

        if total_layers == 0:
            return results['scheduler_metrics']

        return {
            'precision_distribution': {'INT4': int4_count / total_layers, 'INT8': int8_count / total_layers, 'FP32': fp32_count / total_layers},
            'precision_switch_counts_int4': switch_int4,
            'precision_switch_counts_int8': switch_int8,
            'precision_switch_counts_fp32': switch_fp32,
            'avg_precision_level': total_precision / total_layers,
            'adaptation_log': results['scheduler_metrics'].get('adaptation_log', [])
        }

    print(f"\n--- 🚀 Measuring Metrics for {model_name} ({task_type}) ---")
    start_time = time.time()
    num_queries = 0
    emissions_kwh = 0.0

    tracker = OfflineEmissionsTracker(
        project_name=f"Experiment_{model_name.replace(' ', '_')}",
        measure_power_secs=1,
        output_dir=".",
        log_level='info',
        country_iso_code="USA",
        region=None
    )
    tracker.start()
    try:
        if run_sst2 and task_type == "classification":
            print("\n--- Fine-tuning on GLUE SST-2 ---")
            adaptation_log = fine_tune(model, dataloaders['glue_sst2_train'], device, epochs=5, task_type=task_type)
            results['scheduler_metrics']['adaptation_log'] = adaptation_log
            print(" Evaluating GLUE SST-2...")
            metrics = evaluate_classification(model, dataloaders['glue_sst2_validation'], device, is_bert)
            results['accuracy_metrics']['sst2_accuracy'] = metrics['accuracy']
            results['accuracy_metrics']['sst2_f1'] = metrics['f1']
            num_queries += len(dataloaders['glue_sst2_validation'].dataset)
            print(f" SST-2 Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

        if run_mrpc and task_type == "classification":
            print("\n--- Fine-tuning on GLUE MRPC ---")
            adaptation_log = fine_tune(model, dataloaders['glue_mrpc_train'], device, epochs=5, task_type=task_type)
            results['scheduler_metrics']['adaptation_log'] = adaptation_log
            print(" Evaluating GLUE MRPC...")
            metrics = evaluate_classification(model, dataloaders['glue_mrpc_validation'], device, is_bert)
            results['accuracy_metrics']['mrpc_accuracy'] = metrics['accuracy']
            results['accuracy_metrics']['mrpc_f1'] = metrics['f1']
            num_queries += len(dataloaders['glue_mrpc_validation'].dataset)
            print(f" MRPC Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

        if run_rte and task_type == "classification":
            print("\n--- Fine-tuning on GLUE RTE ---")
            adaptation_log = fine_tune(model, dataloaders['glue_rte_train'], device, epochs=3, task_type=task_type)
            results['scheduler_metrics']['adaptation_log'] = adaptation_log
            print(" Evaluating GLUE RTE...")
            metrics = evaluate_classification(model, dataloaders['glue_rte_validation'], device, is_bert)
            results['accuracy_metrics']['rte_accuracy'] = metrics['accuracy']
            results['accuracy_metrics']['rte_f1'] = metrics['f1']
            num_queries += len(dataloaders['glue_rte_validation'].dataset)
            print(f" RTE Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

        results['scheduler_metrics'] = update_scheduler_metrics(model)
        emissions_kwh = tracker.stop() or 0.0

    except Exception as e:
        print(f"🚨 Experiment failed for {model_name}: {e}")
        try:
            tracker.stop()
        except Exception:
            pass
        emissions_kwh = 0.0

    total_duration_s = time.time() - start_time
    total_tokens_processed = num_queries * seq_len
    total_carbon_g = emissions_kwh * CARBON_INTENSITY
    results['performance_metrics'] = {
        'latency_ms_query': (total_duration_s / num_queries) * 1000 if num_queries > 0 else 0,
        'throughput_tokens_sec': total_tokens_processed / total_duration_s if total_duration_s > 0 else 0,
        'energy_wh_token': (emissions_kwh * 1000) / total_tokens_processed if total_tokens_processed > 0 else 0,
        'sci_gco2e_query': total_carbon_g / num_queries if num_queries > 0 else 0,
        'wue_avg_liters_query': (emissions_kwh * WATER_USAGE_FACTORS['average_l_per_kwh']) / num_queries if num_queries > 0 else 0,
        'total_emissions_kwh': emissions_kwh
    }

    print(f"\n--- Results for {model_name} ---")
    print(f" Duration: {total_duration_s:.2f}s | Emissions: {emissions_kwh:.6f} kWh | Queries: {num_queries}")
    print(json.dumps(results, indent=2))
    print("-" * 40)
    return results


# --- SECTION: 7. MAIN EXECUTION ---
if __name__ == "__main__":
    DEVICE = get_device()
    set_seed(42)
    batch_size = BATCH_SIZE
    num_layers = NUM_LAYERS

    # Initialize results list
    all_results = []

    # Define model configurations
    model_configs = [
     # {'model_type': 'distilbert', 'task_type': 'classification', 'is_quantized': False, 'name': 'Baseline_DistilBERT_Classification'},
     #   {'model_type': 'distilbert', 'task_type': 'classification', 'is_quantized': True, 'name': 'Adaptive_Quantized_DistilBERT_Classification'},
        
       #{'model_type': 'bert', 'task_type': 'classification', 'is_quantized': False, 'name': 'Baseline_BERT_Classification'},
        {'model_type': 'bert', 'task_type': 'classification', 'is_quantized': True, 'name': 'Adaptive_Quantized_BERT_Classification'},

    ]

    # Run experiments for each model configuration
    for config in model_configs:
        model_type = config['model_type']
        task_type = config['task_type']
        is_quantized = config['is_quantized']
        model_name = config['name']
        seq_len = SEQ_LENGTH_QA if task_type == 'qa' else SEQ_LENGTH_CLASSIFICATION

        # Get dataloaders
        dataloaders, tokenizer = get_dataloaders(model_type=model_type, seq_len=seq_len, batch_size=batch_size)
        
       # model_name = f"{model_name}_{task.upper()}"
       # print(f"\n=== Running experiment for {model_name} ===")
        if is_quantized:
            model = build_adaptive_quantized_model(DEVICE, model_type, task_type)
        else:
            ModelClass = DistilBertForSequenceClassification if model_type == 'distilbert' else BertForSequenceClassification
            model = ModelClass.from_pretrained(f"{model_type}-base-uncased", num_labels=2).to(DEVICE)
                
        results = run_experiment(
                model=model, model_name=model_name, dataloaders=dataloaders, device=DEVICE,
                is_bert=(model_type == 'bert'), batch_size=BATCH_SIZE, seq_len=seq_len, task_type=task_type,
                run_sst2=True, run_mrpc=False, run_rte=False
            )
   
        all_results.append(results)
        
    # Save all results to a JSON file
    with open('adaptive_quantization_results.json', 'w') as f:
        json.dump(all_results, f, indent=2)

    df_out = pd.json_normalize(all_results, sep='_')
    df_out.to_csv('adaptive_quantization_results.csv', index=False)
    print("✅ CSV saved to 'adaptive_quantization_results.csv'")
    print("\nAll experiments completed. Results saved.")

In [1]:

import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification, DistilBertForQuestionAnswering, BertForSequenceClassification, BertForQuestionAnswering
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from collections import deque
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import random
import numpy as np
import time
import json
import logging
from sklearn.metrics import accuracy_score, f1_score
from codecarbon import OfflineEmissionsTracker
import warnings
import os
import pandas as pd
# --- STRATEGY 1: Import the highly optimized spikingjelly library ---
from spikingjelly.activation_based import neuron, functional

# --- SECTION: 1. CONFIGURATION ---
warnings.filterwarnings("ignore")
logging.getLogger("codecarbon").setLevel(logging.INFO)
DEVICE_CONFIG = {
    'optimize_for_gpu': True,
    'mixed_precision': True
}
DEVICE = "cuda" if torch.cuda.is_available() and DEVICE_CONFIG['optimize_for_gpu'] else "cpu"
print(f"Using device: {DEVICE}")

DATASETS = {
    'glue_sst2': {'name': 'glue', 'config': 'sst2', 'split_train': 'train', 'split_val': 'validation'},
    'glue_mrpc': {'name': 'glue', 'config': 'mrpc', 'split_train': 'train', 'split_val': 'validation'},
    'glue_rte': {'name': 'glue', 'config': 'rte', 'split_train': 'train', 'split_val': 'validation'},
    'squad': {'name': 'squad', 'split_train': 'train', 'split_val': 'validation'}
}
MAX_SAMPLES = 5000
WATER_USAGE_FACTORS = {"average_l_per_kwh": 1.8}
CARBON_INTENSITY = 250  # gCO2e/kWh
BATCH_SIZE = 16
SEQ_LENGTH_CLASSIFICATION = 128
SEQ_LENGTH_QA = 384
NUM_LAYERS = 1
GLUE_TASKS_TO_RUN = ['sst2', 'mrpc', 'rte']
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def get_device():
    device = torch.device(DEVICE)
    if DEVICE == "cuda":
        try:
            print(f"✓ Using GPU: {torch.cuda.get_device_name(0)} (CUDA)")
        except Exception:
            print("✓ Using GPU (name unknown)")
        if DEVICE_CONFIG['mixed_precision']:
            print("✓ Mixed precision (FP16) enabled.")
    else:
        print(f"✓ Using CPU")
    return device
# --- SECTION: 2. QUANTIZATION ---
# --- SECTION: 2. QUANTIZATION ---
# --- SECTION: 2. QUANTIZATION ---
# --- SECTION: 2. QUANTIZATION ---
class AdaptiveQuantizedLayer(nn.Module):
    def __init__(self, layer, quant_bits=32):
        super().__init__()
        self.layer = layer
        self.current_precision = 0
        self.switch_count = {'INT4': 0, 'INT8': 0, 'FP32': 0}
        self.set_precision(quant_bits)

    def set_precision(self, bits):
        if self.current_precision != bits and self.current_precision != 0:
            self.switch_count['INT' + str(bits) if bits < 32 else 'FP32'] += 1
        self.quant_bits = bits
        self.current_precision = bits
        if bits < 32:
            self.min_val = -2 ** (bits - 1)
            self.max_val = 2 ** (bits - 1) - 1
        else:
            self.min_val, self.max_val = None, None

    def quantize(self, x):
        x_max, x_min = x.max(), x.min()
        if x_max == x_min:  # Prevent division by zero
            return x
        scale = (self.max_val - self.min_val) / (x_max - x_min + 1e-8)
        zero_point = x_min - self.min_val / scale
        x_quant = torch.round(x * scale + zero_point)
        x_quant = torch.clamp(x_quant, self.min_val, self.max_val)
        x_dequant = (x_quant - zero_point) / scale
        return x_dequant

    def forward(self, *args, **kwargs):
        output = self.layer(*args, **kwargs)
        if self.current_precision == 32 or not self.training:
            return output
        if isinstance(output, tuple):
            hidden_states = output[0]
            hidden_states = self.quantize(hidden_states)
            return (hidden_states,) + output[1:]
        else:
            return self.quantize(output)

class PrecisionScheduler:
    def __init__(self, model, initial_grad_norm_threshold=10.0, ma_window_size=20, cooldown_steps=20, warmup_steps=1000, alpha=0.9, task_name='sst2'):
        self.model = model
        self.initial_grad_norm_threshold = initial_grad_norm_threshold
        self.current_threshold = initial_grad_norm_threshold
        self.alpha = alpha
        self.ma_window_size = ma_window_size
        self.cooldown_steps = cooldown_steps
        self.warmup_steps = 1000 if task_name == 'sst2' else 500  # Longer warmup for SST-2
        self.adaptable_layers = [m for m in self.model.modules() if isinstance(m, AdaptiveQuantizedLayer)]
        self.grad_norm_history = deque(maxlen=ma_window_size)
        self.layer_cooldowns = [0] * len(self.adaptable_layers)
        self.switch_log = []
        self.current_epoch = 0
        self.step_count = 0
        print(f"✓ Robust PrecisionScheduler initialized for {len(self.adaptable_layers)} layers (Window: {ma_window_size}, Cooldown: {cooldown_steps}, Warmup: {self.warmup_steps}).")

    def get_global_norm(self):
        total_norm = 0.0
        param_count = 0
        for name, p in self.model.named_parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                if not torch.isfinite(param_norm):  # Skip non-finite norms
                    print(f"Warning: Non-finite gradient norm in {name}: {param_norm.item()}")
                    continue
                total_norm += param_norm.item() ** 2
                param_count += 1
        total_norm = total_norm ** 0.5 if param_count > 0 else 0.0
        print(f"Parameters with gradients: {param_count}, Global norm: {total_norm:.4f}")
        return total_norm

    def update_threshold(self, grad_norm):
        if not torch.isfinite(torch.tensor(grad_norm)):  # Skip infinite norms
            print(f"Warning: Skipping threshold update due to infinite grad_norm: {grad_norm}")
            return
        self.grad_norm_history.append(grad_norm)
        ma_grad_norm = sum(self.grad_norm_history) / len(self.grad_norm_history)
        clipped_norm = min(max(ma_grad_norm, 0.1), 100.0)  # Tighter upper bound
        self.current_threshold = max(1.0, min(50.0, clipped_norm * self.alpha + (1 - self.alpha) * self.current_threshold))
        print(f"Updated grad_norm_threshold: {self.current_threshold:.4f} (MA grad_norm: {ma_grad_norm:.4f})")

    def step(self, epoch, global_norm):
        self.current_epoch = epoch
        self.step_count += 1

        # Warmup: Keep all layers at FP32
        if self.step_count <= self.warmup_steps:
            for idx, layer in enumerate(self.adaptable_layers):
                if layer.current_precision != 32:
                    layer.set_precision(32)
                    self._log_switch(idx, 32, global_norm, "increase")
                    print(f"⚡ Warmup | Layer {idx} set to 32-bit (grad_norm={global_norm:.4f})")
            return global_norm

        # Update threshold and cooldowns
        self.update_threshold(global_norm)
        for i in range(len(self.layer_cooldowns)):
            if self.layer_cooldowns[i] > 0:
                self.layer_cooldowns[i] -= 1

        # Adjust precision for all layers
        ma_grad_norm = sum(self.grad_norm_history) / len(self.grad_norm_history) if self.grad_norm_history else global_norm
        for idx, layer in enumerate(self.adaptable_layers):
            if self.layer_cooldowns[idx] > 0:
                continue
            current_precision = layer.current_precision
            new_precision = current_precision

            # Increase precision if gradients are high
            if torch.isfinite(torch.tensor(ma_grad_norm)) and ma_grad_norm > self.current_threshold * 2.0:  # Stricter increase
                if current_precision == 4:
                    new_precision = 8
                elif current_precision == 8:
                    new_precision = 32
            # Decrease precision if gradients are stable
            elif torch.isfinite(torch.tensor(ma_grad_norm)) and ma_grad_norm < self.current_threshold * 0.3:  # Stricter decrease
                if current_precision == 32:
                    new_precision = 8
                elif current_precision == 8:
                    new_precision = 4
            # Periodic attempt to decrease precision to INT8 only
            elif self.step_count % 500 == 0 and current_precision > 8:
                new_precision = 8

            if new_precision != current_precision:
                layer.set_precision(new_precision)
                self.layer_cooldowns[idx] = self.cooldown_steps
                self._log_switch(idx, new_precision, ma_grad_norm, "increase" if new_precision > current_precision else "decrease")
                print(f"⚡ Epoch {self.current_epoch} | Layer {idx} switched to {new_precision}-bit (MA grad_norm={ma_grad_norm:.4f})")

        return global_norm

    def _log_switch(self, layer_idx, new_prec, norm, switch_type):
        event = {"layer_id": layer_idx, "new_precision": new_prec, "grad_norm": norm, "epoch": self.current_epoch, "type": switch_type}
        self.switch_log.append(event)

def build_adaptive_quantized_model(device, model_type, task_type):
    print(f"\n🏗️ Building Adaptive Quantized {model_type.capitalize()} Model...")
    ModelClass = BertForSequenceClassification if model_type == 'bert' else DistilBertForSequenceClassification
    ModelBase = "bert-base-uncased" if model_type == 'bert' else "distilbert-base-uncased"
    model = ModelClass.from_pretrained(ModelBase, num_labels=2).to(device)

    class QuantizedModel(nn.Module):
        def __init__(self, model_to_wrap, model_type_local):
            super().__init__()
            self.model = model_to_wrap
            self.config = self.model.config
            layers_to_quantize = self.model.bert.encoder.layer if model_type_local == "bert" else self.model.distilbert.transformer.layer
            for i in range(len(layers_to_quantize)):
                if i in [0, len(layers_to_quantize)-1]:
                    continue
                layers_to_quantize[i] = AdaptiveQuantizedLayer(layers_to_quantize[i], quant_bits=32)
        def forward(self, **kwargs):
            return self.model(**kwargs)

    quantized_model = QuantizedModel(model, model_type).to(device)
    print(f"✓ Successfully created quantized {model_type.capitalize()} model.")
    return quantized_model
# --- SECTION: 3. DATA LOADING ---
def get_dataloaders(model_type, seq_len, batch_size):
    from transformers import DistilBertTokenizerFast, BertTokenizerFast
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') if model_type == 'distilbert' else BertTokenizerFast.from_pretrained('bert-base-uncased')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def preprocess_glue_sst2(examples):
        enc = tokenizer(examples["sentence"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    def preprocess_glue_mrpc(examples):
        enc = tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    def preprocess_glue_rte(examples):
        enc = tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", max_length=seq_len, truncation=True)
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": examples["label"]}

    try:
        glue_sst2_train_data = load_dataset("glue", "sst2", split=f"train[:{MAX_SAMPLES}]")
        glue_sst2_validation_data = load_dataset("glue", "sst2", split=f"validation[:{MAX_SAMPLES}]")
        glue_mrpc_train_data = load_dataset("glue", "mrpc", split=f"train[:{5000}]")
        glue_mrpc_validation_data = load_dataset("glue", "mrpc", split=f"validation[:{5000}]")
        glue_rte_train_data = load_dataset("glue", "rte", split=f"train[:{2490}]")
        glue_rte_validation_data = load_dataset("glue", "rte", split=f"validation[:{277}]")

        glue_sst2_train_data = glue_sst2_train_data.map(preprocess_glue_sst2, batched=True, remove_columns=['sentence', 'idx', 'label'])
        glue_sst2_validation_data = glue_sst2_validation_data.map(preprocess_glue_sst2, batched=True, remove_columns=['sentence', 'idx', 'label'])
        glue_mrpc_train_data = glue_mrpc_train_data.map(preprocess_glue_mrpc, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_mrpc_validation_data = glue_mrpc_validation_data.map(preprocess_glue_mrpc, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_rte_train_data = glue_rte_train_data.map(preprocess_glue_rte, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])
        glue_rte_validation_data = glue_rte_validation_data.map(preprocess_glue_rte, batched=True, remove_columns=['sentence1', 'sentence2', 'idx', 'label'])

        glue_sst2_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_sst2_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_mrpc_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_mrpc_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_rte_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        glue_rte_validation_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

        dataloaders = {
            'glue_sst2_train': DataLoader(glue_sst2_train_data, batch_size=batch_size, shuffle=True),
            'glue_sst2_validation': DataLoader(glue_sst2_validation_data, batch_size=batch_size, shuffle=False),
            'glue_mrpc_train': DataLoader(glue_mrpc_train_data, batch_size=batch_size, shuffle=True),
            'glue_mrpc_validation': DataLoader(glue_mrpc_validation_data, batch_size=batch_size, shuffle=False),
            'glue_rte_train': DataLoader(glue_rte_train_data, batch_size=batch_size, shuffle=True),
            'glue_rte_validation': DataLoader(glue_rte_validation_data, batch_size=batch_size, shuffle=False),
        }
        return dataloaders, tokenizer
    except Exception as e:
        print(f"⚠️ get_dataloaders failed: {e}. Returning empty dataloaders.")
        return {}, None

def evaluate_classification(model, dataloader, device, is_bert):
    model.eval()
    predictions, labels = [], []
    for batch in tqdm(dataloader, desc="Evaluating", leave=False):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        if not is_bert:
            inputs.pop('token_type_ids', None)
        if DEVICE_CONFIG['mixed_precision'] and device.type == 'cuda':
            inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predicted = torch.argmax(logits, dim=-1)
        predictions.extend(predicted.cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())
    return {"accuracy": accuracy_score(labels, predictions), "f1": f1_score(labels, predictions, average='weighted')}

# --- SECTION: 5. FINE-TUNING ---
def fine_tune(model, train_dataloader, validation_dataloader, device, epochs=5, task_type="classification", is_bert=False):
    from torch.cuda.amp import autocast, GradScaler
    scaler = GradScaler() if DEVICE_CONFIG['mixed_precision'] and device.type == 'cuda' else None
    model.train()
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Added weight decay
    num_training_steps = len(train_dataloader) * epochs
    num_warmup_steps = int(num_training_steps * 0.2)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    precision_scheduler = None
    if any(isinstance(m, AdaptiveQuantizedLayer) for m in model.modules()):
        precision_scheduler = PrecisionScheduler(model, initial_grad_norm_threshold=10.0, ma_window_size=20, cooldown_steps=20, warmup_steps=1000 if task_type == 'sst2' else 500, task_name=task_type)

    for layer in model.modules():
        if isinstance(layer, AdaptiveQuantizedLayer):
            layer.set_precision(32)

    best_f1 = 0.0
    patience = 3
    patience_counter = 0

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            if 'input_ids' not in inputs:
                print("Skipping a batch with missing 'input_ids'.")
                continue
            with autocast() if scaler else torch.no_grad():
                outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            if scaler:
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                unclipped_grad_norm = precision_scheduler.get_global_norm() if precision_scheduler else 0.0
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
                if precision_scheduler:
                    precision_scheduler.step(epoch + 1, unclipped_grad_norm)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                unclipped_grad_norm = precision_scheduler.get_global_norm() if precision_scheduler else 0.0
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
                if precision_scheduler:
                    precision_scheduler.step(epoch + 1, unclipped_grad_norm)
                optimizer.step()
            lr_scheduler.step()

        print(f"Average loss: {total_loss / len(train_dataloader):.4f}")

        # Validation after each epoch
        metrics = evaluate_classification(model, validation_dataloader, device, is_bert)
        print(f"Validation - Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

        # Early stopping
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    return precision_scheduler.switch_log if precision_scheduler else []

# --- SECTION 6: SINGLE TASK EXPERIMENT RUNNER ---
def run_single_task_experiment(model_config, task_name, device):
    model_type = model_config['model_type']
    is_quantized = model_config['is_quantized']
    model_name_prefix = model_config['name']

    model_full_name = f"{model_name_prefix}_{task_name.upper()}"
    print(f"\n{'='*20} Running Experiment: {model_full_name} {'='*20}")

    dataloaders, tokenizer = get_dataloaders(model_type, SEQ_LENGTH_CLASSIFICATION, BATCH_SIZE)
    train_dataloader = dataloaders.get(f'glue_{task_name}_train')
    validation_dataloader = dataloaders.get(f'glue_{task_name}_validation')
    if not train_dataloader or not validation_dataloader:
        print(f"⚠️ Dataloaders for task '{task_name}' not found. Skipping.")
        return None

    if is_quantized:
        model = build_adaptive_quantized_model(device, model_type, task_name)
    else:
        ModelClass = DistilBertForSequenceClassification if model_type == 'distilbert' else BertForSequenceClassification
        model = ModelClass.from_pretrained(f"{model_type}-base-uncased", num_labels=2).to(device)

    start_time = time.time()
    tracker = OfflineEmissionsTracker(project_name=model_full_name, country_iso_code="USA")
    tracker.start()

    epochs = 5 if task_name == 'rte' else 5
    adaptation_log = fine_tune(model, train_dataloader, validation_dataloader, device, epochs=epochs, task_type=task_name, is_bert=(model_type=='bert'))

    metrics = evaluate_classification(model, validation_dataloader, device, is_bert=(model_type=='bert'))

    emissions_kwh = tracker.stop() or 0.0
    total_duration_s = time.time() - start_time

    results = {'model_name': model_full_name, 'dataset': task_name.upper()}
    results.update(metrics)

    def get_scheduler_summary(model, log):
        summary = {
            'adaptation_log': log,
            'precision_switch_counts_int4': 0,
            'precision_switch_counts_int8': 0,
            'precision_switch_counts_fp32': 0,
            'avg_precision_level': 32.0,
            'precision_distribution': {'INT4': 0.0, 'INT8': 0.0, 'FP32': 1.0}
        }
        quant_layers = [m for m in model.modules() if isinstance(m, AdaptiveQuantizedLayer)]
        if not quant_layers:
            return summary
        total_layers = len(quant_layers)
        int4_count = sum(1 for l in quant_layers if l.current_precision == 4)
        int8_count = sum(1 for l in quant_layers if l.current_precision == 8)
        fp32_count = sum(1 for l in quant_layers if l.current_precision == 32)
        summary['avg_precision_level'] = sum(l.current_precision for l in quant_layers) / total_layers
        summary['precision_distribution'] = {
            'INT4': int4_count / total_layers,
            'INT8': int8_count / total_layers,
            'FP32': fp32_count / total_layers
        }
        summary['precision_switch_counts_int4'] = sum(l.switch_count['INT4'] for l in quant_layers)
        summary['precision_switch_counts_int8'] = sum(l.switch_count['INT8'] for l in quant_layers)
        summary['precision_switch_counts_fp32'] = sum(l.switch_count['FP32'] for l in quant_layers)
        return summary

    if is_quantized:
        results['scheduler_metrics'] = get_scheduler_summary(model, adaptation_log)

    num_queries = len(validation_dataloader.dataset)
    total_tokens_processed = num_queries * SEQ_LENGTH_CLASSIFICATION
    total_carbon_g = emissions_kwh * CARBON_INTENSITY
    results['performance_metrics'] = {
        'total_duration_s': total_duration_s,
        'total_emissions_kwh': emissions_kwh,
        'latency_ms_query': (total_duration_s / num_queries) * 1000 if num_queries > 0 else 0,
        'throughput_tokens_sec': total_tokens_processed / total_duration_s if total_duration_s > 0 else 0,
        'energy_wh_token': (emissions_kwh * 1000) / total_tokens_processed if total_tokens_processed > 0 else 0,
        'sci_gco2e_query': total_carbon_g / num_queries if num_queries > 0 else 0,
        'wue_avg_liters_query': (emissions_kwh * WATER_USAGE_FACTORS['average_l_per_kwh']) / num_queries if num_queries > 0 else 0
    }

    print("\n--- Final Results ---")
    print(json.dumps(results, indent=2))
    return results

# --- SECTION: 7. MAIN EXECUTION ---
if __name__ == "__main__":
    DEVICE = get_device()
    set_seed(42)

    all_results = []

    model_configs = [
       # {'model_type': 'distilbert', 'task_type': 'classification', 'is_quantized': True, 'name': 'Adaptive_Quantized_DistilBERT'},
       # {'model_type': 'distilbert', 'task_type': 'classification', 'is_quantized': False, 'name': 'Baseline_DistilBERT'},
     {'model_type': 'bert', 'task_type': 'classification', 'is_quantized': True, 'name': 'Adaptive_Quantized_BERT'},
        # {'model_type': 'bert', 'task_type': 'classification', 'is_quantized': False, 'name': 'Baseline_BERT'},
    ]

    tasks_to_run = ['sst2', 'mrpc', 'rte']

    for config in model_configs:
        for task in tasks_to_run:
            result = run_single_task_experiment(model_config=config, task_name=task, device=DEVICE)
            if result:
                all_results.append(result)

    if all_results:
        results_df = pd.json_normalize(all_results, sep='_')
        all_cols = {col for res in all_results for col in pd.json_normalize(res, sep='_').columns}
        results_df = results_df.reindex(columns=list(all_cols))
        results_df.to_csv("all_task_results.csv", index=False)
        with open('adaptive_quantization_results.json', 'w') as f:
            json.dump(all_results, f, indent=2)
        print("\n✅ All experiments complete. Results saved to 'all_task_results.csv'")

2025-09-25 08:06:43.537533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758787603.560816    1243 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758787603.568078    1243 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Using device: cuda
✓ Using GPU: Tesla T4 (CUDA)
✓ Mixed precision (FP16) enabled.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]


🏗️ Building Adaptive Quantized Bert Model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 08:07:02] offline tracker init
[codecarbon INFO @ 08:07:02] [setup] RAM Tracking...
[codecarbon INFO @ 08:07:02] [setup] CPU Tracking...


✓ Successfully created quantized Bert model.


 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 08:07:03] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 08:07:03] [setup] GPU Tracking...
[codecarbon INFO @ 08:07:03] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:07:03] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 08:07:03] >>> Tracker's metadata:
[codecarbon INFO @ 08:07:03]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 08:07:03]   Python version: 3.11.13
[codecarbon INFO @ 08:07:03]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 08:07:03]   Available RAM : 31.350 GB
[codecarbon INFO @ 08:07:03]   CPU count: 4 thread(s) in 1 physical CPU(s)
[codecarbon INFO @ 08:07:03]   CPU model: Intel

✓ Robust PrecisionScheduler initialized for 10 layers (Window: 20, Cooldown: 20, Warmup: 1000).

Epoch 1/5


Training Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 2.5485
Parameters with gradients: 201, Global norm: 2.5513
Parameters with gradients: 201, Global norm: 4.3030
Parameters with gradients: 201, Global norm: 2.6177
Parameters with gradients: 201, Global norm: 4.9532
Parameters with gradients: 201, Global norm: 1.9514
Parameters with gradients: 201, Global norm: 2.2987
Parameters with gradients: 201, Global norm: 3.6862
Parameters with gradients: 201, Global norm: 3.2303
Parameters with gradients: 201, Global norm: 4.0498
Parameters with gradients: 201, Global norm: 2.4910
Parameters with gradients: 201, Global norm: 4.1905
Parameters with gradients: 201, Global norm: 2.7589
Parameters with gradients: 201, Global norm: 3.8373
Parameters with gradients: 201, Global norm: 6.1075
Parameters with gradients: 201, Global norm: 3.4892
Parameters with gradients: 201, Global norm: 4.8482
Parameters with gradients: 201, Global norm: 3.7262
Parameters with gradients: 201, Global norm: 4.5293
Parameters w

[codecarbon INFO @ 08:07:18] Energy consumed for RAM : 0.000083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:07:18] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:07:18] Energy consumed for All CPU : 0.000177 kWh
[codecarbon INFO @ 08:07:18] Energy consumed for all GPUs : 0.000297 kWh. Total GPU Power : 71.13313909547337 W
[codecarbon INFO @ 08:07:18] 0.000557 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 3.7555
Parameters with gradients: 201, Global norm: 2.8886
Parameters with gradients: 201, Global norm: 1.9804
Parameters with gradients: 201, Global norm: 3.6694
Parameters with gradients: 201, Global norm: 3.4135
Parameters with gradients: 201, Global norm: 6.1977
Parameters with gradients: 201, Global norm: 3.1413
Parameters with gradients: 201, Global norm: 3.6426
Parameters with gradients: 201, Global norm: 4.7672
Parameters with gradients: 201, Global norm: 3.5424
Parameters with gradients: 201, Global norm: 3.0826
Parameters with gradients: 201, Global norm: 2.9247
Parameters with gradients: 201, Global norm: 3.3965
Parameters with gradients: 201, Global norm: 3.2014
Parameters with gradients: 201, Global norm: 3.4861
Parameters with gradients: 201, Global norm: 3.2482
Parameters with gradients: 201, Global norm: 4.3074
Parameters with gradients: 201, Global norm: 3.1565
Parameters with gradients: 201, Global norm: 3.6883
Parameters w

[codecarbon INFO @ 08:07:33] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:07:33] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:07:33] Energy consumed for All CPU : 0.000354 kWh
[codecarbon INFO @ 08:07:33] Energy consumed for all GPUs : 0.000611 kWh. Total GPU Power : 75.49141277620168 W
[codecarbon INFO @ 08:07:33] 0.001132 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 6.5811
Parameters with gradients: 201, Global norm: 14.3819
Parameters with gradients: 201, Global norm: 10.5013
Parameters with gradients: 201, Global norm: 7.4028
Parameters with gradients: 201, Global norm: 11.9702
Parameters with gradients: 201, Global norm: 12.9963
Parameters with gradients: 201, Global norm: 6.7228
Parameters with gradients: 201, Global norm: 10.7873
Parameters with gradients: 201, Global norm: 6.8126
Parameters with gradients: 201, Global norm: 8.4171
Parameters with gradients: 201, Global norm: 5.0625
Parameters with gradients: 201, Global norm: 12.0687
Parameters with gradients: 201, Global norm: 10.0476
Parameters with gradients: 201, Global norm: 5.9760
Parameters with gradients: 201, Global norm: 4.9901
Parameters with gradients: 201, Global norm: 4.6716
Parameters with gradients: 201, Global norm: 5.0960
Parameters with gradients: 201, Global norm: 7.0181
Parameters with gradients: 201, Global norm: 8.5742
Param

[codecarbon INFO @ 08:07:48] Energy consumed for RAM : 0.000250 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:07:48] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:07:48] Energy consumed for All CPU : 0.000531 kWh
[codecarbon INFO @ 08:07:48] Energy consumed for all GPUs : 0.000925 kWh. Total GPU Power : 75.5261526982569 W
[codecarbon INFO @ 08:07:48] 0.001706 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 8.8752
Parameters with gradients: 201, Global norm: 14.4958
Parameters with gradients: 201, Global norm: 4.2480
Parameters with gradients: 201, Global norm: 4.6809
Parameters with gradients: 201, Global norm: 11.6111
Parameters with gradients: 201, Global norm: 13.8423
Parameters with gradients: 201, Global norm: 10.2935
Parameters with gradients: 201, Global norm: 13.0934
Parameters with gradients: 201, Global norm: 15.7154
Parameters with gradients: 201, Global norm: 9.9652
Parameters with gradients: 201, Global norm: 7.2032
Parameters with gradients: 201, Global norm: 18.0947
Parameters with gradients: 201, Global norm: 18.7728
Parameters with gradients: 201, Global norm: 19.3461
Parameters with gradients: 201, Global norm: 14.0055
Parameters with gradients: 201, Global norm: 7.8249
Parameters with gradients: 201, Global norm: 10.5632
Parameters with gradients: 201, Global norm: 21.9339
Parameters with gradients: 201, Global norm: 10.1152

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Validation - Accuracy: 0.8727, F1: 0.8723

Epoch 2/5


Training Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 4.5759
Parameters with gradients: 201, Global norm: 4.4878
Parameters with gradients: 201, Global norm: 4.2421
Parameters with gradients: 201, Global norm: 4.8151
Parameters with gradients: 201, Global norm: 3.1142
Parameters with gradients: 201, Global norm: 3.8129
Parameters with gradients: 201, Global norm: 7.6865
Parameters with gradients: 201, Global norm: 1.6587
Parameters with gradients: 201, Global norm: 5.1493
Parameters with gradients: 201, Global norm: 10.5432
Parameters with gradients: 201, Global norm: 18.3132
Parameters with gradients: 201, Global norm: 13.8463
Parameters with gradients: 201, Global norm: 10.8007
Parameters with gradients: 201, Global norm: 4.7471
Parameters with gradients: 201, Global norm: 3.0092
Parameters with gradients: 201, Global norm: 2.4236
Parameters with gradients: 201, Global norm: 4.3556
Parameters with gradients: 201, Global norm: 17.1898
Parameters with gradients: 201, Global norm: 5.1521
Paramet

[codecarbon INFO @ 08:08:03] Energy consumed for RAM : 0.000333 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:08:03] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:08:03] Energy consumed for All CPU : 0.000708 kWh
[codecarbon INFO @ 08:08:03] Energy consumed for all GPUs : 0.001243 kWh. Total GPU Power : 76.2943046515599 W
[codecarbon INFO @ 08:08:03] 0.002284 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 2.5942
Parameters with gradients: 201, Global norm: 13.6933
Parameters with gradients: 201, Global norm: 2.6331
Parameters with gradients: 201, Global norm: 4.7612
Parameters with gradients: 201, Global norm: 4.6337
Parameters with gradients: 201, Global norm: 3.2078
Parameters with gradients: 201, Global norm: 5.0646
Parameters with gradients: 201, Global norm: 24.6550
Parameters with gradients: 201, Global norm: 11.0486
Parameters with gradients: 201, Global norm: 2.8514
Parameters with gradients: 201, Global norm: 12.2030
Parameters with gradients: 201, Global norm: 23.0611
Parameters with gradients: 201, Global norm: 8.5520
Parameters with gradients: 201, Global norm: 2.4026
Parameters with gradients: 201, Global norm: 3.6839
Parameters with gradients: 201, Global norm: 3.4798
Parameters with gradients: 201, Global norm: 3.4904
Parameters with gradients: 201, Global norm: 10.9166
Parameters with gradients: 201, Global norm: 6.1615
Parame

[codecarbon INFO @ 08:08:18] Energy consumed for RAM : 0.000416 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:08:18] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:08:18] Energy consumed for All CPU : 0.000885 kWh
[codecarbon INFO @ 08:08:18] Energy consumed for all GPUs : 0.001558 kWh. Total GPU Power : 75.6963297734707 W
[codecarbon INFO @ 08:08:18] 0.002859 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 12.4255
Parameters with gradients: 201, Global norm: 9.6229
Parameters with gradients: 201, Global norm: 0.2501
Parameters with gradients: 201, Global norm: 20.2500
Parameters with gradients: 201, Global norm: 0.4302
Parameters with gradients: 201, Global norm: 8.5951
Parameters with gradients: 201, Global norm: 13.4446
Parameters with gradients: 201, Global norm: 4.8367
Parameters with gradients: 201, Global norm: 10.9249
Parameters with gradients: 201, Global norm: 9.2171
Parameters with gradients: 201, Global norm: 6.8776
Parameters with gradients: 201, Global norm: 1.7996
Parameters with gradients: 201, Global norm: 8.0882
Parameters with gradients: 201, Global norm: 11.8342
Parameters with gradients: 201, Global norm: 3.4570
Parameters with gradients: 201, Global norm: 6.5996
Parameters with gradients: 201, Global norm: 7.1909
Parameters with gradients: 201, Global norm: 3.7062
Parameters with gradients: 201, Global norm: 0.7811
Paramet

[codecarbon INFO @ 08:08:33] Energy consumed for RAM : 0.000500 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:08:33] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:08:33] Energy consumed for All CPU : 0.001062 kWh
[codecarbon INFO @ 08:08:33] Energy consumed for all GPUs : 0.001873 kWh. Total GPU Power : 75.55201002933805 W
[codecarbon INFO @ 08:08:33] 0.003434 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 6.9830
Parameters with gradients: 201, Global norm: 8.7291
Parameters with gradients: 201, Global norm: 9.0353
Parameters with gradients: 201, Global norm: 14.1748
Parameters with gradients: 201, Global norm: 7.3503
Parameters with gradients: 201, Global norm: 14.4054
Parameters with gradients: 201, Global norm: 17.4051
Parameters with gradients: 201, Global norm: 8.7176
Parameters with gradients: 201, Global norm: 15.8415
Parameters with gradients: 201, Global norm: 1.4711
Parameters with gradients: 201, Global norm: 3.9390
Parameters with gradients: 201, Global norm: 14.2936
Parameters with gradients: 201, Global norm: 4.1043
Parameters with gradients: 201, Global norm: 2.9298
Parameters with gradients: 201, Global norm: 4.9781
Parameters with gradients: 201, Global norm: 3.7700
Parameters with gradients: 201, Global norm: 2.2870
Parameters with gradients: 201, Global norm: 4.8706
Parameters with gradients: 201, Global norm: 6.1233
Paramet

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

[codecarbon INFO @ 08:08:48] Energy consumed for RAM : 0.000583 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:08:48] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:08:48] Energy consumed for All CPU : 0.001239 kWh
[codecarbon INFO @ 08:08:48] Energy consumed for all GPUs : 0.002190 kWh. Total GPU Power : 76.15649158964281 W
[codecarbon INFO @ 08:08:48] 0.004012 kWh of electricity used since the beginning.


Validation - Accuracy: 0.9094, F1: 0.9094

Epoch 3/5


Training Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 1.4521
Parameters with gradients: 201, Global norm: 7.1683
Parameters with gradients: 201, Global norm: 0.7686
Parameters with gradients: 201, Global norm: 9.0633
Parameters with gradients: 201, Global norm: 1.7850
Parameters with gradients: 201, Global norm: 0.5192
Parameters with gradients: 201, Global norm: 6.7486
Parameters with gradients: 201, Global norm: 1.1857
Parameters with gradients: 201, Global norm: 1.2938
Parameters with gradients: 201, Global norm: 3.6401
Parameters with gradients: 201, Global norm: 1.7940
Parameters with gradients: 201, Global norm: 0.6646
Parameters with gradients: 201, Global norm: 1.6123
Parameters with gradients: 201, Global norm: 0.2087
Parameters with gradients: 201, Global norm: 1.2297
Parameters with gradients: 201, Global norm: 1.4792
Parameters with gradients: 201, Global norm: 3.8871
Parameters with gradients: 201, Global norm: 0.3761
Parameters with gradients: 201, Global norm: 0.3715
Parameters w

[codecarbon INFO @ 08:09:03] Energy consumed for RAM : 0.000666 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:09:03] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:09:03] Energy consumed for All CPU : 0.001416 kWh
[codecarbon INFO @ 08:09:03] Energy consumed for all GPUs : 0.002507 kWh. Total GPU Power : 76.09268559471151 W
[codecarbon INFO @ 08:09:03] 0.004589 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:09:03] 0.014126 g.CO2eq/s mean an estimation of 445.478906716598 kg.CO2eq/year


Parameters with gradients: 201, Global norm: 0.0563
Parameters with gradients: 201, Global norm: 0.0762
Parameters with gradients: 201, Global norm: 0.0919
Parameters with gradients: 201, Global norm: 0.0764
Parameters with gradients: 201, Global norm: 0.0784
Parameters with gradients: 201, Global norm: 8.0131
Parameters with gradients: 201, Global norm: 18.1449
Parameters with gradients: 201, Global norm: 3.5891
Parameters with gradients: 201, Global norm: 0.0899
Parameters with gradients: 201, Global norm: 0.0794
Parameters with gradients: 201, Global norm: 0.1184
Parameters with gradients: 201, Global norm: 0.4157
Parameters with gradients: 201, Global norm: 0.1560
Parameters with gradients: 201, Global norm: 0.0708
Parameters with gradients: 201, Global norm: 0.0617
Parameters with gradients: 201, Global norm: 0.1137
Parameters with gradients: 201, Global norm: 5.8907
Parameters with gradients: 201, Global norm: 0.2083
Parameters with gradients: 201, Global norm: 0.1300
Parameters 

[codecarbon INFO @ 08:09:18] Energy consumed for RAM : 0.000749 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:09:18] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:09:18] Energy consumed for All CPU : 0.001593 kWh
[codecarbon INFO @ 08:09:18] Energy consumed for all GPUs : 0.002822 kWh. Total GPU Power : 75.77922993846036 W
[codecarbon INFO @ 08:09:18] 0.005165 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.3759
Parameters with gradients: 201, Global norm: 0.1341
Parameters with gradients: 201, Global norm: 0.1655
Parameters with gradients: 201, Global norm: 0.1928
Parameters with gradients: 201, Global norm: 0.2063
Parameters with gradients: 201, Global norm: 1.5107
Parameters with gradients: 201, Global norm: 0.2484
Parameters with gradients: 201, Global norm: 0.5167
Parameters with gradients: 201, Global norm: 1.2986
Parameters with gradients: 201, Global norm: 0.1941
Parameters with gradients: 201, Global norm: 1.5299
Parameters with gradients: 201, Global norm: 4.9709
Parameters with gradients: 201, Global norm: 0.1108
Parameters with gradients: 201, Global norm: 0.9025
Parameters with gradients: 201, Global norm: 0.1317
Parameters with gradients: 201, Global norm: 0.3492
Parameters with gradients: 201, Global norm: 9.0721
Parameters with gradients: 201, Global norm: 0.6872
Parameters with gradients: 201, Global norm: 1.2967
Parameters w

[codecarbon INFO @ 08:09:33] Energy consumed for RAM : 0.000833 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:09:33] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:09:33] Energy consumed for All CPU : 0.001770 kWh
[codecarbon INFO @ 08:09:33] Energy consumed for all GPUs : 0.003138 kWh. Total GPU Power : 75.79189472012956 W
[codecarbon INFO @ 08:09:33] 0.005740 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 1.0159
Parameters with gradients: 201, Global norm: 0.3116
Parameters with gradients: 201, Global norm: 0.2626
Parameters with gradients: 201, Global norm: 7.4899
Parameters with gradients: 201, Global norm: 0.3155
Parameters with gradients: 201, Global norm: 7.9434
Parameters with gradients: 201, Global norm: 0.2618
Parameters with gradients: 201, Global norm: 10.0468
Parameters with gradients: 201, Global norm: 0.6023
Parameters with gradients: 201, Global norm: 4.7252
Parameters with gradients: 201, Global norm: 0.2951
Parameters with gradients: 201, Global norm: 0.2148
Parameters with gradients: 201, Global norm: 2.5621
Parameters with gradients: 201, Global norm: 1.7473
Parameters with gradients: 201, Global norm: 0.5997
Parameters with gradients: 201, Global norm: 0.2058
Parameters with gradients: 201, Global norm: 0.5900
Parameters with gradients: 201, Global norm: 0.2418
Parameters with gradients: 201, Global norm: 0.4648
Parameters 

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Validation - Accuracy: 0.9083, F1: 0.9083

Epoch 4/5


Training Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 0.1697
Parameters with gradients: 201, Global norm: 0.4662
Parameters with gradients: 201, Global norm: 0.2035
Parameters with gradients: 201, Global norm: 0.3241
Parameters with gradients: 201, Global norm: 0.6510
Parameters with gradients: 201, Global norm: 0.1301
Parameters with gradients: 201, Global norm: 0.2515
Parameters with gradients: 201, Global norm: 0.3154
Parameters with gradients: 201, Global norm: 0.2195
Parameters with gradients: 201, Global norm: 0.0629
Parameters with gradients: 201, Global norm: 0.2005
Parameters with gradients: 201, Global norm: 0.2046
Parameters with gradients: 201, Global norm: 0.4637
Parameters with gradients: 201, Global norm: 0.1850
Parameters with gradients: 201, Global norm: 0.7726
Parameters with gradients: 201, Global norm: 0.4011
Parameters with gradients: 201, Global norm: 1.3522
Parameters with gradients: 201, Global norm: 0.0659


[codecarbon INFO @ 08:09:48] Energy consumed for RAM : 0.000916 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:09:48] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:09:48] Energy consumed for All CPU : 0.001947 kWh
[codecarbon INFO @ 08:09:48] Energy consumed for all GPUs : 0.003456 kWh. Total GPU Power : 76.35073519249377 W
[codecarbon INFO @ 08:09:48] 0.006319 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.1054
Parameters with gradients: 201, Global norm: 0.1120
Parameters with gradients: 201, Global norm: 0.0781
Parameters with gradients: 201, Global norm: 1.1739
Parameters with gradients: 201, Global norm: 1.4154
Parameters with gradients: 201, Global norm: 0.0837
Parameters with gradients: 201, Global norm: 2.7234
Parameters with gradients: 201, Global norm: 0.9631
Parameters with gradients: 201, Global norm: 0.7684
Parameters with gradients: 201, Global norm: 0.1445
Parameters with gradients: 201, Global norm: 0.0658
Parameters with gradients: 201, Global norm: 1.6426
Parameters with gradients: 201, Global norm: 0.3736
Parameters with gradients: 201, Global norm: 0.5484
Parameters with gradients: 201, Global norm: 0.2267
Parameters with gradients: 201, Global norm: 0.1799
Parameters with gradients: 201, Global norm: 0.3285
Parameters with gradients: 201, Global norm: 0.0920
Parameters with gradients: 201, Global norm: 0.6173
Parameters w

[codecarbon INFO @ 08:10:03] Energy consumed for RAM : 0.000999 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:10:03] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:10:03] Energy consumed for All CPU : 0.002124 kWh


Parameters with gradients: 201, Global norm: 0.1433
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3354)


[codecarbon INFO @ 08:10:03] Energy consumed for all GPUs : 0.003771 kWh. Total GPU Power : 75.70866592586512 W
[codecarbon INFO @ 08:10:03] 0.006894 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.2062
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3215)
Parameters with gradients: 201, Global norm: 0.3355
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3314)
Parameters with gradients: 201, Global norm: 0.2495
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3350)
Parameters with gradients: 201, Global norm: 2.3510
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.4375)
Parameters with gradients: 201, Global norm: 1.1773
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.4865)
Parameters with gradients: 201, Global norm: 35.3831
Updated grad_norm_threshold: 2.1208 (MA grad_norm: 2.2454)
Parameters with gradients: 201, Global norm: 0.1899
Updated grad_norm_threshold: 2.2376 (MA grad_norm: 2.2505)
Parameters with gradients: 201, Global norm: 0.1162
Updated grad_norm_threshold: 2.2490 (MA grad_norm: 2.2502)
Parameters with gradients: 201, Global norm: 0.1503
Updated grad_norm_threshold: 2.2498 (MA grad_norm: 2.2499)


[codecarbon INFO @ 08:10:18] Energy consumed for RAM : 0.001083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:10:18] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:10:18] Energy consumed for All CPU : 0.002301 kWh
[codecarbon INFO @ 08:10:18] Energy consumed for all GPUs : 0.004087 kWh. Total GPU Power : 75.72787585717391 W
[codecarbon INFO @ 08:10:18] 0.007470 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 4.4736
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6149)
Parameters with gradients: 201, Global norm: 0.0187
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6063)
Parameters with gradients: 201, Global norm: 0.0423
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6067)
Parameters with gradients: 201, Global norm: 0.3949
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6195)
Parameters with gradients: 201, Global norm: 0.1483
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6165)
Parameters with gradients: 201, Global norm: 1.2742
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.6709)
Parameters with gradients: 201, Global norm: 0.9517
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7159)
Parameters with gradients: 201, Global norm: 0.1273
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7174)
Parameters with gradients: 201, Global norm: 0.0230
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7150)
P

[codecarbon INFO @ 08:10:33] Energy consumed for RAM : 0.001166 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:10:33] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:10:33] Energy consumed for All CPU : 0.002478 kWh
[codecarbon INFO @ 08:10:33] Energy consumed for all GPUs : 0.004402 kWh. Total GPU Power : 75.67596455965149 W
[codecarbon INFO @ 08:10:33] 0.008045 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.0805
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3061)
Parameters with gradients: 201, Global norm: 0.0503
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3029)
Average loss: 0.0109


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Validation - Accuracy: 0.9060, F1: 0.9060

Epoch 5/5


Training Epoch 5:   0%|          | 0/313 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 3.2191
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3693)
Parameters with gradients: 201, Global norm: 0.0507
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3541)
Parameters with gradients: 201, Global norm: 0.0668
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2589)
Parameters with gradients: 201, Global norm: 0.0286
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2579)
Parameters with gradients: 201, Global norm: 0.0302
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2523)
Parameters with gradients: 201, Global norm: 0.1308
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2508)
Parameters with gradients: 201, Global norm: 0.1160
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2117)
Parameters with gradients: 201, Global norm: 0.1492
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2181)
Parameters with gradients: 201, Global norm: 0.0498
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2193)
P

[codecarbon INFO @ 08:10:48] Energy consumed for RAM : 0.001249 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:10:48] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:10:48] Energy consumed for All CPU : 0.002655 kWh
[codecarbon INFO @ 08:10:48] Energy consumed for all GPUs : 0.004722 kWh. Total GPU Power : 76.97977930549963 W
[codecarbon INFO @ 08:10:48] 0.008626 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.0428
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.1230)
Parameters with gradients: 201, Global norm: 0.0342
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.1218)
Parameters with gradients: 201, Global norm: 0.0539
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.1190)
Parameters with gradients: 201, Global norm: 0.0333
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.1181)
Parameters with gradients: 201, Global norm: 0.0344
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.1185)
Parameters with gradients: 201, Global norm: 0.0306
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.0946)
Parameters with gradients: 201, Global norm: 0.1435
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.0957)
Parameters with gradients: 201, Global norm: 0.0328
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.0941)
Parameters with gradients: 201, Global norm: 0.0293
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.0934)
P

[codecarbon INFO @ 08:11:03] Energy consumed for RAM : 0.001332 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:11:03] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:11:03] Energy consumed for All CPU : 0.002832 kWh
[codecarbon INFO @ 08:11:03] Energy consumed for all GPUs : 0.005037 kWh. Total GPU Power : 75.4906420160068 W
[codecarbon INFO @ 08:11:03] 0.009201 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:11:03] 0.014200 g.CO2eq/s mean an estimation of 447.822039800175 kg.CO2eq/year


Parameters with gradients: 201, Global norm: 0.0378
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3282)
Parameters with gradients: 201, Global norm: 0.0198
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3263)
Parameters with gradients: 201, Global norm: 0.0716
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3284)
Parameters with gradients: 201, Global norm: 0.0411
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3258)
Parameters with gradients: 201, Global norm: 0.0234
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3253)
Parameters with gradients: 201, Global norm: 0.0248
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3250)
Parameters with gradients: 201, Global norm: 0.0772
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3277)
Parameters with gradients: 201, Global norm: 0.1290
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3323)
Parameters with gradients: 201, Global norm: 0.0580
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3341)
P

[codecarbon INFO @ 08:11:18] Energy consumed for RAM : 0.001416 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:11:18] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:11:18] Energy consumed for All CPU : 0.003009 kWh
[codecarbon INFO @ 08:11:18] Energy consumed for all GPUs : 0.005353 kWh. Total GPU Power : 75.98425265911527 W
[codecarbon INFO @ 08:11:18] 0.009778 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.0656
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2580)
Parameters with gradients: 201, Global norm: 0.0396
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2535)
Parameters with gradients: 201, Global norm: 0.0756
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2551)
Parameters with gradients: 201, Global norm: 0.0280
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.2404)
Parameters with gradients: 201, Global norm: 1.5288
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3129)
Parameters with gradients: 201, Global norm: 0.0251
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3113)
Parameters with gradients: 201, Global norm: 0.0356
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3109)
Parameters with gradients: 201, Global norm: 0.0889
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3105)
Parameters with gradients: 201, Global norm: 1.1552
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3667)
P

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

[codecarbon INFO @ 08:11:33] Energy consumed for RAM : 0.001499 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:11:33] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:11:33] Energy consumed for All CPU : 0.003186 kWh
[codecarbon INFO @ 08:11:33] Energy consumed for all GPUs : 0.005674 kWh. Total GPU Power : 77.08425779712641 W
[codecarbon INFO @ 08:11:33] 0.010359 kWh of electricity used since the beginning.


Validation - Accuracy: 0.9037, F1: 0.9036
Early stopping at epoch 5


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

[codecarbon INFO @ 08:11:40] Energy consumed for RAM : 0.001535 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:11:40] Delta energy consumed for CPU with constant : 0.000076 kWh, power : 42.5 W
[codecarbon INFO @ 08:11:40] Energy consumed for All CPU : 0.003262 kWh
[codecarbon INFO @ 08:11:40] Energy consumed for all GPUs : 0.005815 kWh. Total GPU Power : 78.42783993640741 W
[codecarbon INFO @ 08:11:40] 0.010611 kWh of electricity used since the beginning.



--- Final Results ---
{
  "model_name": "Adaptive_Quantized_BERT_SST2",
  "dataset": "SST2",
  "accuracy": 0.9036697247706422,
  "f1": 0.9036453825514973,
  "scheduler_metrics": {
    "adaptation_log": [
      {
        "layer_id": 0,
        "new_precision": 8,
        "grad_norm": 0.261525478893149,
        "epoch": 4,
        "type": "decrease"
      },
      {
        "layer_id": 1,
        "new_precision": 8,
        "grad_norm": 0.261525478893149,
        "epoch": 4,
        "type": "decrease"
      },
      {
        "layer_id": 2,
        "new_precision": 8,
        "grad_norm": 0.261525478893149,
        "epoch": 4,
        "type": "decrease"
      },
      {
        "layer_id": 3,
        "new_precision": 8,
        "grad_norm": 0.261525478893149,
        "epoch": 4,
        "type": "decrease"
      },
      {
        "layer_id": 4,
        "new_precision": 8,
        "grad_norm": 0.261525478893149,
        "epoch": 4,
        "type": "decrease"
      },
      {
        "lay

Map:   0%|          | 0/872 [00:00<?, ? examples/s]


🏗️ Building Adaptive Quantized Bert Model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 08:11:49] offline tracker init
[codecarbon INFO @ 08:11:49] [setup] RAM Tracking...
[codecarbon INFO @ 08:11:49] [setup] CPU Tracking...


✓ Successfully created quantized Bert model.


 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 08:11:50] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 08:11:50] [setup] GPU Tracking...
[codecarbon INFO @ 08:11:50] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:11:50] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 08:11:50] >>> Tracker's metadata:
[codecarbon INFO @ 08:11:50]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 08:11:50]   Python version: 3.11.13
[codecarbon INFO @ 08:11:50]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 08:11:50]   Available RAM : 31.350 GB
[codecarbon INFO @ 08:11:50]   CPU count: 4 thread(s) in 1 physical CPU(s)
[codecarbon INFO @ 08:11:50]   CPU model: Intel

✓ Robust PrecisionScheduler initialized for 10 layers (Window: 20, Cooldown: 20, Warmup: 500).

Epoch 1/5


Training Epoch 1:   0%|          | 0/230 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 2.6728
Parameters with gradients: 201, Global norm: 5.1829
Parameters with gradients: 201, Global norm: 2.7962
Parameters with gradients: 201, Global norm: 2.9175
Parameters with gradients: 201, Global norm: 4.5300
Parameters with gradients: 201, Global norm: 3.2714
Parameters with gradients: 201, Global norm: 4.1742
Parameters with gradients: 201, Global norm: 2.4486
Parameters with gradients: 201, Global norm: 2.7799
Parameters with gradients: 201, Global norm: 4.1411
Parameters with gradients: 201, Global norm: 2.5639
Parameters with gradients: 201, Global norm: 4.5368
Parameters with gradients: 201, Global norm: 6.8099
Parameters with gradients: 201, Global norm: 2.6728
Parameters with gradients: 201, Global norm: 2.3214
Parameters with gradients: 201, Global norm: 7.0000
Parameters with gradients: 201, Global norm: 3.0885
Parameters with gradients: 201, Global norm: 2.5750
Parameters with gradients: 201, Global norm: 2.2982
Parameters w

[codecarbon INFO @ 08:12:06] Energy consumed for RAM : 0.000083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:12:06] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:12:06] Energy consumed for All CPU : 0.000177 kWh


Parameters with gradients: 201, Global norm: 2.7544


[codecarbon INFO @ 08:12:06] Energy consumed for all GPUs : 0.000316 kWh. Total GPU Power : 75.6849640639839 W
[codecarbon INFO @ 08:12:06] 0.000576 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 5.2743
Parameters with gradients: 201, Global norm: 3.0520
Parameters with gradients: 201, Global norm: 2.6140
Parameters with gradients: 201, Global norm: 2.9277
Parameters with gradients: 201, Global norm: 3.0818
Parameters with gradients: 201, Global norm: 3.6101
Parameters with gradients: 201, Global norm: 3.3997
Parameters with gradients: 201, Global norm: 4.0174
Parameters with gradients: 201, Global norm: 5.2483
Parameters with gradients: 199, Global norm: 4.8003
Parameters with gradients: 201, Global norm: 4.4878
Parameters with gradients: 201, Global norm: 6.3566
Parameters with gradients: 201, Global norm: 4.6622
Parameters with gradients: 201, Global norm: 6.3722
Parameters with gradients: 201, Global norm: 5.1767
Parameters with gradients: 201, Global norm: 4.5290
Parameters with gradients: 201, Global norm: 5.0117
Parameters with gradients: 201, Global norm: 4.6668
Parameters with gradients: 201, Global norm: 4.4157
Parameters w

[codecarbon INFO @ 08:12:21] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:12:21] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:12:21] Energy consumed for All CPU : 0.000354 kWh
[codecarbon INFO @ 08:12:21] Energy consumed for all GPUs : 0.000632 kWh. Total GPU Power : 76.1038243176691 W
[codecarbon INFO @ 08:12:21] 0.001153 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 6.0687
Parameters with gradients: 201, Global norm: 3.5647
Parameters with gradients: 201, Global norm: 4.4043
Parameters with gradients: 201, Global norm: 5.8524
Parameters with gradients: 201, Global norm: 9.7797
Parameters with gradients: 201, Global norm: 4.9427
Parameters with gradients: 201, Global norm: 5.9177
Parameters with gradients: 201, Global norm: 4.7999
Parameters with gradients: 201, Global norm: 6.2740
Parameters with gradients: 201, Global norm: 4.7166
Parameters with gradients: 201, Global norm: 3.8511
Parameters with gradients: 201, Global norm: 7.2484
Parameters with gradients: 201, Global norm: 3.9331
Parameters with gradients: 201, Global norm: 13.7404
Parameters with gradients: 201, Global norm: 4.9200
Parameters with gradients: 201, Global norm: 5.1419
Parameters with gradients: 201, Global norm: 4.0563
Parameters with gradients: 201, Global norm: 5.1649
Parameters with gradients: 201, Global norm: 11.0242
Parameters

Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Validation - Accuracy: 0.7157, F1: 0.6473

Epoch 2/5


Training Epoch 2:   0%|          | 0/230 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 7.0931
Parameters with gradients: 201, Global norm: 6.2595
Parameters with gradients: 201, Global norm: 4.9059
Parameters with gradients: 201, Global norm: 4.2779
Parameters with gradients: 201, Global norm: 3.4783
Parameters with gradients: 201, Global norm: 5.3225
Parameters with gradients: 201, Global norm: 4.8577
Parameters with gradients: 201, Global norm: 5.4251
Parameters with gradients: 201, Global norm: 6.5982
Parameters with gradients: 201, Global norm: 4.8225
Parameters with gradients: 201, Global norm: 8.4539
Parameters with gradients: 201, Global norm: 8.4528
Parameters with gradients: 201, Global norm: 9.7641
Parameters with gradients: 201, Global norm: 4.3558
Parameters with gradients: 201, Global norm: 7.1351
Parameters with gradients: 201, Global norm: 3.8065
Parameters with gradients: 201, Global norm: 11.1221
Parameters with gradients: 201, Global norm: 5.2370
Parameters with gradients: 201, Global norm: 8.6648
Parameters 

[codecarbon INFO @ 08:12:36] Energy consumed for RAM : 0.000250 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:12:36] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:12:36] Energy consumed for All CPU : 0.000531 kWh
[codecarbon INFO @ 08:12:36] Energy consumed for all GPUs : 0.000950 kWh. Total GPU Power : 76.34666749123981 W
[codecarbon INFO @ 08:12:36] 0.001732 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 6.8477
Parameters with gradients: 201, Global norm: 6.8187
Parameters with gradients: 201, Global norm: 5.6073
Parameters with gradients: 201, Global norm: 6.0921
Parameters with gradients: 201, Global norm: 5.3061
Parameters with gradients: 201, Global norm: 7.8290
Parameters with gradients: 201, Global norm: 2.0337
Parameters with gradients: 201, Global norm: 13.1509
Parameters with gradients: 201, Global norm: 10.3088
Parameters with gradients: 201, Global norm: 5.7961
Parameters with gradients: 201, Global norm: 6.7415
Parameters with gradients: 201, Global norm: 7.8363
Parameters with gradients: 201, Global norm: 9.2577
Parameters with gradients: 201, Global norm: 8.1310
Parameters with gradients: 201, Global norm: 5.2668
Parameters with gradients: 201, Global norm: 7.0356
Parameters with gradients: 201, Global norm: 6.5884
Parameters with gradients: 201, Global norm: 9.9361
Parameters with gradients: 201, Global norm: 4.7916
Parameters

[codecarbon INFO @ 08:12:51] Energy consumed for RAM : 0.000333 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:12:51] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:12:51] Energy consumed for All CPU : 0.000708 kWh
[codecarbon INFO @ 08:12:51] Energy consumed for all GPUs : 0.001269 kWh. Total GPU Power : 76.46197662171694 W
[codecarbon INFO @ 08:12:51] 0.002310 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 7.1725
Parameters with gradients: 201, Global norm: 3.4566
Parameters with gradients: 201, Global norm: 4.6410
Parameters with gradients: 201, Global norm: 5.3948
Parameters with gradients: 201, Global norm: 7.7900
Parameters with gradients: 201, Global norm: 4.0071
Parameters with gradients: 201, Global norm: 6.0281
Parameters with gradients: 201, Global norm: 11.8628
Parameters with gradients: 201, Global norm: 4.9665
Parameters with gradients: 201, Global norm: 8.3538
Parameters with gradients: 201, Global norm: 6.6216
Parameters with gradients: 201, Global norm: 5.8733
Parameters with gradients: 201, Global norm: 5.4174
Parameters with gradients: 201, Global norm: 5.3034
Parameters with gradients: 201, Global norm: 3.9223
Parameters with gradients: 201, Global norm: 6.7829
Parameters with gradients: 201, Global norm: 6.0841
Parameters with gradients: 201, Global norm: 7.3769
Parameters with gradients: 201, Global norm: 6.9408
Parameters 

Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

[codecarbon INFO @ 08:13:06] Energy consumed for RAM : 0.000417 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:13:06] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:13:06] Energy consumed for All CPU : 0.000885 kWh
[codecarbon INFO @ 08:13:06] Energy consumed for all GPUs : 0.001586 kWh. Total GPU Power : 76.07878896010753 W
[codecarbon INFO @ 08:13:06] 0.002887 kWh of electricity used since the beginning.


Validation - Accuracy: 0.8554, F1: 0.8478

Epoch 3/5


Training Epoch 3:   0%|          | 0/230 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 3.0937
Parameters with gradients: 201, Global norm: 3.5380
Parameters with gradients: 201, Global norm: 3.8121
Parameters with gradients: 201, Global norm: 1.2759
Parameters with gradients: 201, Global norm: 6.6920
Parameters with gradients: 201, Global norm: 2.6733
Parameters with gradients: 201, Global norm: 4.0596
Parameters with gradients: 201, Global norm: 5.6572
Parameters with gradients: 201, Global norm: 1.1467
Parameters with gradients: 201, Global norm: 13.6586
Parameters with gradients: 201, Global norm: 6.6094
Parameters with gradients: 201, Global norm: 8.3725
Parameters with gradients: 201, Global norm: 3.9572
Parameters with gradients: 201, Global norm: 1.7082
Parameters with gradients: 201, Global norm: 2.5001
Parameters with gradients: 201, Global norm: 8.3090
Parameters with gradients: 201, Global norm: 15.7531
Parameters with gradients: 201, Global norm: 7.5726
Parameters with gradients: 201, Global norm: 1.1969
Parameters

[codecarbon INFO @ 08:13:21] Energy consumed for RAM : 0.000500 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:13:21] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:13:21] Energy consumed for All CPU : 0.001062 kWh
[codecarbon INFO @ 08:13:21] Energy consumed for all GPUs : 0.001904 kWh. Total GPU Power : 76.32211966153358 W
[codecarbon INFO @ 08:13:21] 0.003465 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 3.2166
Updated grad_norm_threshold: 5.3565 (MA grad_norm: 5.3376)
Parameters with gradients: 201, Global norm: 13.1794
Updated grad_norm_threshold: 5.6869 (MA grad_norm: 5.7237)
Parameters with gradients: 201, Global norm: 11.4772
Updated grad_norm_threshold: 6.0365 (MA grad_norm: 6.0753)
Parameters with gradients: 201, Global norm: 15.5330
Updated grad_norm_threshold: 6.4363 (MA grad_norm: 6.4807)
Parameters with gradients: 201, Global norm: 13.5439
Updated grad_norm_threshold: 7.0473 (MA grad_norm: 7.1152)
Parameters with gradients: 201, Global norm: 0.6940
Updated grad_norm_threshold: 6.6382 (MA grad_norm: 6.5927)
Parameters with gradients: 201, Global norm: 0.5473
Updated grad_norm_threshold: 6.4421 (MA grad_norm: 6.4203)
Parameters with gradients: 201, Global norm: 8.9083
Updated grad_norm_threshold: 6.1289 (MA grad_norm: 6.0941)
Parameters with gradients: 201, Global norm: 1.1086
Updated grad_norm_threshold: 5.8857 (MA grad_norm: 5.858

[codecarbon INFO @ 08:13:36] Energy consumed for RAM : 0.000583 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:13:36] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:13:36] Energy consumed for All CPU : 0.001239 kWh
[codecarbon INFO @ 08:13:36] Energy consumed for all GPUs : 0.002219 kWh. Total GPU Power : 75.85730023309998 W
[codecarbon INFO @ 08:13:36] 0.004042 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 3.1589
Updated grad_norm_threshold: 4.6027 (MA grad_norm: 4.5859)
Parameters with gradients: 201, Global norm: 3.5023
Updated grad_norm_threshold: 4.6280 (MA grad_norm: 4.6308)
Parameters with gradients: 201, Global norm: 10.3098
Updated grad_norm_threshold: 4.6755 (MA grad_norm: 4.6808)
Parameters with gradients: 201, Global norm: 1.6731
Updated grad_norm_threshold: 4.7087 (MA grad_norm: 4.7124)
Parameters with gradients: 201, Global norm: 2.3841
Updated grad_norm_threshold: 4.7773 (MA grad_norm: 4.7849)
Parameters with gradients: 201, Global norm: 1.7565
Updated grad_norm_threshold: 4.8292 (MA grad_norm: 4.8349)
Parameters with gradients: 201, Global norm: 7.7065
Updated grad_norm_threshold: 5.1523 (MA grad_norm: 5.1882)
Parameters with gradients: 201, Global norm: 1.4403
Updated grad_norm_threshold: 5.1445 (MA grad_norm: 5.1437)
Parameters with gradients: 201, Global norm: 9.8674
Updated grad_norm_threshold: 5.1660 (MA grad_norm: 5.1684)


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Validation - Accuracy: 0.8529, F1: 0.8535

Epoch 4/5


Training Epoch 4:   0%|          | 0/230 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 0.4539
Updated grad_norm_threshold: 4.3751 (MA grad_norm: 4.3650)
Parameters with gradients: 201, Global norm: 2.7987
Updated grad_norm_threshold: 4.1690 (MA grad_norm: 4.1461)
Parameters with gradients: 201, Global norm: 1.3002
Updated grad_norm_threshold: 4.1741 (MA grad_norm: 4.1746)
Parameters with gradients: 201, Global norm: 0.4817
Updated grad_norm_threshold: 4.0703 (MA grad_norm: 4.0588)
Parameters with gradients: 201, Global norm: 5.4398
Updated grad_norm_threshold: 4.2283 (MA grad_norm: 4.2458)
Parameters with gradients: 201, Global norm: 0.7709
Updated grad_norm_threshold: 3.9705 (MA grad_norm: 3.9418)
Parameters with gradients: 201, Global norm: 0.7369
Updated grad_norm_threshold: 3.9127 (MA grad_norm: 3.9063)
Parameters with gradients: 201, Global norm: 0.5201
Updated grad_norm_threshold: 3.7845 (MA grad_norm: 3.7702)
Parameters with gradients: 201, Global norm: 0.6332
Updated grad_norm_threshold: 3.5197 (MA grad_norm: 3.4903)
P

[codecarbon INFO @ 08:13:51] Energy consumed for RAM : 0.000666 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:13:51] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:13:51] Energy consumed for All CPU : 0.001416 kWh
[codecarbon INFO @ 08:13:51] Energy consumed for all GPUs : 0.002537 kWh. Total GPU Power : 76.15328063180846 W
[codecarbon INFO @ 08:13:51] 0.004619 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:13:51] 0.014219 g.CO2eq/s mean an estimation of 448.4258184987255 kg.CO2eq/year


Parameters with gradients: 201, Global norm: 0.3779
Updated grad_norm_threshold: 2.2980 (MA grad_norm: 2.2729)
Parameters with gradients: 201, Global norm: 0.3973
Updated grad_norm_threshold: 2.2586 (MA grad_norm: 2.2542)
Parameters with gradients: 201, Global norm: 0.4843
Updated grad_norm_threshold: 2.2433 (MA grad_norm: 2.2416)
Parameters with gradients: 201, Global norm: 0.7729
Updated grad_norm_threshold: 2.2531 (MA grad_norm: 2.2542)
Parameters with gradients: 201, Global norm: 0.4718
Updated grad_norm_threshold: 2.2468 (MA grad_norm: 2.2461)
Parameters with gradients: 201, Global norm: 2.5406
Updated grad_norm_threshold: 2.2458 (MA grad_norm: 2.2457)
Parameters with gradients: 201, Global norm: 2.6228
Updated grad_norm_threshold: 2.2168 (MA grad_norm: 2.2136)
Parameters with gradients: 201, Global norm: 0.5348
Updated grad_norm_threshold: 2.2163 (MA grad_norm: 2.2163)
Parameters with gradients: 201, Global norm: 0.8105
Updated grad_norm_threshold: 2.2268 (MA grad_norm: 2.2280)
P

[codecarbon INFO @ 08:14:06] Energy consumed for RAM : 0.000750 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:14:06] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:14:06] Energy consumed for All CPU : 0.001593 kWh
[codecarbon INFO @ 08:14:06] Energy consumed for all GPUs : 0.002853 kWh. Total GPU Power : 75.93430286048499 W
[codecarbon INFO @ 08:14:06] 0.005195 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.4694
Updated grad_norm_threshold: 2.2440 (MA grad_norm: 2.2432)
Parameters with gradients: 201, Global norm: 1.0789
Updated grad_norm_threshold: 2.2646 (MA grad_norm: 2.2669)
Parameters with gradients: 201, Global norm: 0.3075
Updated grad_norm_threshold: 1.9429 (MA grad_norm: 1.9071)
Parameters with gradients: 201, Global norm: 0.2955
Updated grad_norm_threshold: 1.9068 (MA grad_norm: 1.9027)
Parameters with gradients: 201, Global norm: 0.8615
Updated grad_norm_threshold: 1.8920 (MA grad_norm: 1.8904)
Parameters with gradients: 201, Global norm: 10.2500
Updated grad_norm_threshold: 2.2049 (MA grad_norm: 2.2397)
Parameters with gradients: 201, Global norm: 0.4117
Updated grad_norm_threshold: 2.2325 (MA grad_norm: 2.2356)
Parameters with gradients: 201, Global norm: 1.0684
Updated grad_norm_threshold: 2.2570 (MA grad_norm: 2.2597)
Parameters with gradients: 201, Global norm: 0.9719
Updated grad_norm_threshold: 2.2903 (MA grad_norm: 2.2940)


[codecarbon INFO @ 08:14:21] Energy consumed for RAM : 0.000833 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:14:21] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:14:21] Energy consumed for All CPU : 0.001770 kWh
[codecarbon INFO @ 08:14:21] Energy consumed for all GPUs : 0.003169 kWh. Total GPU Power : 75.90531241030327 W
[codecarbon INFO @ 08:14:21] 0.005772 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.9063
Updated grad_norm_threshold: 1.7318 (MA grad_norm: 1.7359)
Parameters with gradients: 201, Global norm: 0.3887
Updated grad_norm_threshold: 1.7349 (MA grad_norm: 1.7352)
Parameters with gradients: 201, Global norm: 1.4414
Updated grad_norm_threshold: 1.3953 (MA grad_norm: 1.3575)
Parameters with gradients: 201, Global norm: 0.2973
Updated grad_norm_threshold: 1.3621 (MA grad_norm: 1.3584)
Parameters with gradients: 201, Global norm: 0.3943
Updated grad_norm_threshold: 1.3667 (MA grad_norm: 1.3672)
Parameters with gradients: 201, Global norm: 0.2409
Updated grad_norm_threshold: 1.2872 (MA grad_norm: 1.2784)
Parameters with gradients: 201, Global norm: 0.4861
Updated grad_norm_threshold: 1.2638 (MA grad_norm: 1.2612)
Parameters with gradients: 201, Global norm: 0.2245
Updated grad_norm_threshold: 1.2618 (MA grad_norm: 1.2616)
Parameters with gradients: 201, Global norm: 0.4042
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7642)
P

Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Validation - Accuracy: 0.8578, F1: 0.8566

Epoch 5/5


Training Epoch 5:   0%|          | 0/230 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 0.4715
Updated grad_norm_threshold: 1.4830 (MA grad_norm: 1.4844)
Parameters with gradients: 201, Global norm: 0.2793
Updated grad_norm_threshold: 1.4531 (MA grad_norm: 1.4497)
Parameters with gradients: 201, Global norm: 0.5188
Updated grad_norm_threshold: 1.4630 (MA grad_norm: 1.4641)
Parameters with gradients: 201, Global norm: 0.2908
Updated grad_norm_threshold: 1.4557 (MA grad_norm: 1.4549)
Parameters with gradients: 201, Global norm: 0.2230
Updated grad_norm_threshold: 1.4561 (MA grad_norm: 1.4561)
Parameters with gradients: 201, Global norm: 0.2411
Updated grad_norm_threshold: 1.4365 (MA grad_norm: 1.4344)
Parameters with gradients: 201, Global norm: 0.3010
Updated grad_norm_threshold: 1.3611 (MA grad_norm: 1.3527)
Parameters with gradients: 201, Global norm: 1.0471
Updated grad_norm_threshold: 1.3599 (MA grad_norm: 1.3597)
Parameters with gradients: 201, Global norm: 0.4139
Updated grad_norm_threshold: 1.3609 (MA grad_norm: 1.3610)
P

[codecarbon INFO @ 08:14:36] Energy consumed for RAM : 0.000916 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:14:36] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:14:36] Energy consumed for All CPU : 0.001947 kWh
[codecarbon INFO @ 08:14:36] Energy consumed for all GPUs : 0.003487 kWh. Total GPU Power : 76.40977996087764 W
[codecarbon INFO @ 08:14:36] 0.006350 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.1727
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7643)
Parameters with gradients: 201, Global norm: 0.2060
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7622)
Parameters with gradients: 201, Global norm: 0.4376
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7636)
Parameters with gradients: 201, Global norm: 0.3137
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7694)
Parameters with gradients: 201, Global norm: 0.2037
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7699)
Parameters with gradients: 201, Global norm: 0.1493
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7673)
Parameters with gradients: 201, Global norm: 0.2554
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.7696)
Parameters with gradients: 201, Global norm: 2.6415
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.8887)
Parameters with gradients: 201, Global norm: 0.2393
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.4500)
P

[codecarbon INFO @ 08:14:51] Energy consumed for RAM : 0.000999 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:14:51] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:14:51] Energy consumed for All CPU : 0.002124 kWh
[codecarbon INFO @ 08:14:51] Energy consumed for all GPUs : 0.003803 kWh. Total GPU Power : 75.86591482229008 W
[codecarbon INFO @ 08:14:51] 0.006927 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 0.1843
Updated grad_norm_threshold: 1.0027 (MA grad_norm: 1.0028)
Parameters with gradients: 201, Global norm: 0.5886
Updated grad_norm_threshold: 1.0202 (MA grad_norm: 1.0221)
Parameters with gradients: 201, Global norm: 0.2219
Updated grad_norm_threshold: 1.0223 (MA grad_norm: 1.0225)
Parameters with gradients: 201, Global norm: 0.2293
Updated grad_norm_threshold: 1.0278 (MA grad_norm: 1.0284)
Parameters with gradients: 201, Global norm: 1.5059
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.9077)
Parameters with gradients: 201, Global norm: 0.3956
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.9199)
Parameters with gradients: 201, Global norm: 0.2298
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.9239)
Parameters with gradients: 201, Global norm: 0.2982
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.9257)
Parameters with gradients: 201, Global norm: 0.1921
Updated grad_norm_threshold: 1.0000 (MA grad_norm: 0.3104)
P

Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Validation - Accuracy: 0.8407, F1: 0.8379


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

[codecarbon INFO @ 08:15:06] Energy consumed for RAM : 0.001083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:15:06] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:15:06] Energy consumed for All CPU : 0.002301 kWh
[codecarbon INFO @ 08:15:06] Energy consumed for all GPUs : 0.004122 kWh. Total GPU Power : 76.55221450095075 W
[codecarbon INFO @ 08:15:06] 0.007506 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:15:07] Energy consumed for RAM : 0.001088 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:15:07] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 08:15:07] Energy consumed for All CPU : 0.002313 kWh
[codecarbon INFO @ 08:15:07] Energy consumed for all GPUs : 0.004146 kWh. Total GPU Power : 83.51249083345611 W
[codecarbon INFO @ 08:15:07] 0.007547 kWh of electricity used since the beginning.



--- Final Results ---
{
  "model_name": "Adaptive_Quantized_BERT_MRPC",
  "dataset": "MRPC",
  "accuracy": 0.8406862745098039,
  "f1": 0.8378544424039412,
  "scheduler_metrics": {
    "adaptation_log": [
      {
        "layer_id": 0,
        "new_precision": 8,
        "grad_norm": 0.44700065158013247,
        "epoch": 5,
        "type": "decrease"
      },
      {
        "layer_id": 1,
        "new_precision": 8,
        "grad_norm": 0.44700065158013247,
        "epoch": 5,
        "type": "decrease"
      },
      {
        "layer_id": 2,
        "new_precision": 8,
        "grad_norm": 0.44700065158013247,
        "epoch": 5,
        "type": "decrease"
      },
      {
        "layer_id": 3,
        "new_precision": 8,
        "grad_norm": 0.44700065158013247,
        "epoch": 5,
        "type": "decrease"
      },
      {
        "layer_id": 4,
        "new_precision": 8,
        "grad_norm": 0.44700065158013247,
        "epoch": 5,
        "type": "decrease"
      },
      {
  

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]


🏗️ Building Adaptive Quantized Bert Model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 08:15:16] offline tracker init
[codecarbon INFO @ 08:15:16] [setup] RAM Tracking...
[codecarbon INFO @ 08:15:16] [setup] CPU Tracking...


✓ Successfully created quantized Bert model.


 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 08:15:17] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 08:15:17] [setup] GPU Tracking...
[codecarbon INFO @ 08:15:17] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:15:17] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 08:15:17] >>> Tracker's metadata:
[codecarbon INFO @ 08:15:17]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 08:15:17]   Python version: 3.11.13
[codecarbon INFO @ 08:15:17]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 08:15:17]   Available RAM : 31.350 GB
[codecarbon INFO @ 08:15:17]   CPU count: 4 thread(s) in 1 physical CPU(s)
[codecarbon INFO @ 08:15:17]   CPU model: Intel

✓ Robust PrecisionScheduler initialized for 10 layers (Window: 20, Cooldown: 20, Warmup: 500).

Epoch 1/5


Training Epoch 1:   0%|          | 0/156 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 8.4974
Parameters with gradients: 201, Global norm: 5.3883
Parameters with gradients: 201, Global norm: 9.0721
Parameters with gradients: 201, Global norm: 5.3176
Parameters with gradients: 201, Global norm: 2.1266
Parameters with gradients: 201, Global norm: 10.8745
Parameters with gradients: 201, Global norm: 5.7182
Parameters with gradients: 201, Global norm: 9.2663
Parameters with gradients: 201, Global norm: 10.9128
Parameters with gradients: 201, Global norm: 4.2657
Parameters with gradients: 201, Global norm: 2.2763
Parameters with gradients: 201, Global norm: 4.5268
Parameters with gradients: 201, Global norm: 6.4471
Parameters with gradients: 201, Global norm: 7.0684
Parameters with gradients: 201, Global norm: 5.5372
Parameters with gradients: 201, Global norm: 4.9472
Parameters with gradients: 201, Global norm: 5.9909
Parameters with gradients: 201, Global norm: 6.7651
Parameters with gradients: 201, Global norm: 5.1708
Parameters

[codecarbon INFO @ 08:15:32] Energy consumed for RAM : 0.000083 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:15:32] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:15:32] Energy consumed for All CPU : 0.000177 kWh
[codecarbon INFO @ 08:15:32] Energy consumed for all GPUs : 0.000316 kWh. Total GPU Power : 75.70217197101358 W
[codecarbon INFO @ 08:15:32] 0.000576 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 3.3896
Parameters with gradients: 201, Global norm: 5.7066
Parameters with gradients: 201, Global norm: 2.6768
Parameters with gradients: 201, Global norm: 2.8679
Parameters with gradients: 201, Global norm: 6.5261
Parameters with gradients: 201, Global norm: 4.0453
Parameters with gradients: 201, Global norm: 2.4181
Parameters with gradients: 201, Global norm: 6.7836
Parameters with gradients: 201, Global norm: 7.9055
Parameters with gradients: 201, Global norm: 3.7751
Parameters with gradients: 201, Global norm: 2.5084
Parameters with gradients: 201, Global norm: 3.6613
Parameters with gradients: 201, Global norm: 5.0582
Parameters with gradients: 201, Global norm: 2.6516
Parameters with gradients: 201, Global norm: 3.5557
Parameters with gradients: 201, Global norm: 4.4714
Parameters with gradients: 201, Global norm: 5.6504
Parameters with gradients: 201, Global norm: 4.2989
Parameters with gradients: 201, Global norm: 9.5188
Parameters w

Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

Validation - Accuracy: 0.4693, F1: 0.3654

Epoch 2/5


Training Epoch 2:   0%|          | 0/156 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 6.2220
Parameters with gradients: 201, Global norm: 1.8371
Parameters with gradients: 201, Global norm: 3.9886
Parameters with gradients: 201, Global norm: 2.0315
Parameters with gradients: 201, Global norm: 3.3427
Parameters with gradients: 201, Global norm: 9.8779
Parameters with gradients: 201, Global norm: 2.2403
Parameters with gradients: 201, Global norm: 1.8485
Parameters with gradients: 201, Global norm: 9.4092
Parameters with gradients: 201, Global norm: 9.6885
Parameters with gradients: 201, Global norm: 6.7957
Parameters with gradients: 201, Global norm: 2.4349
Parameters with gradients: 201, Global norm: 4.3041
Parameters with gradients: 201, Global norm: 3.9073
Parameters with gradients: 201, Global norm: 1.7390
Parameters with gradients: 201, Global norm: 1.9153
Parameters with gradients: 201, Global norm: 7.2239
Parameters with gradients: 201, Global norm: 4.3699
Parameters with gradients: 201, Global norm: 1.7100
Parameters w

[codecarbon INFO @ 08:15:47] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:15:47] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:15:47] Energy consumed for All CPU : 0.000354 kWh
[codecarbon INFO @ 08:15:47] Energy consumed for all GPUs : 0.000633 kWh. Total GPU Power : 76.21176769199907 W
[codecarbon INFO @ 08:15:47] 0.001154 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 6.1128
Parameters with gradients: 201, Global norm: 2.0085
Parameters with gradients: 201, Global norm: 6.0155
Parameters with gradients: 201, Global norm: 2.4200
Parameters with gradients: 201, Global norm: 2.5260
Parameters with gradients: 201, Global norm: 5.9780
Parameters with gradients: 201, Global norm: 2.1982
Parameters with gradients: 201, Global norm: 3.6389
Parameters with gradients: 201, Global norm: 1.8660
Parameters with gradients: 201, Global norm: 2.5361
Parameters with gradients: 201, Global norm: 3.3452
Parameters with gradients: 201, Global norm: 2.8388
Parameters with gradients: 201, Global norm: 3.1332
Parameters with gradients: 201, Global norm: 2.0318
Parameters with gradients: 201, Global norm: 5.8028
Parameters with gradients: 201, Global norm: 5.2630
Parameters with gradients: 201, Global norm: 5.5426
Parameters with gradients: 201, Global norm: 5.3102
Parameters with gradients: 201, Global norm: 2.5952
Parameters w

[codecarbon INFO @ 08:16:02] Energy consumed for RAM : 0.000250 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:16:02] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:16:02] Energy consumed for All CPU : 0.000531 kWh
[codecarbon INFO @ 08:16:02] Energy consumed for all GPUs : 0.000949 kWh. Total GPU Power : 75.92316747785993 W
[codecarbon INFO @ 08:16:02] 0.001730 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 3.8541
Parameters with gradients: 201, Global norm: 7.2470
Parameters with gradients: 201, Global norm: 5.8121
Parameters with gradients: 201, Global norm: 9.2664
Parameters with gradients: 201, Global norm: 7.1993
Parameters with gradients: 201, Global norm: 3.9833
Parameters with gradients: 201, Global norm: 4.8237
Parameters with gradients: 201, Global norm: 4.7818
Parameters with gradients: 201, Global norm: 9.4651
Parameters with gradients: 201, Global norm: 7.2158
Parameters with gradients: 201, Global norm: 5.9046
Parameters with gradients: 201, Global norm: 4.4629
Parameters with gradients: 201, Global norm: 5.6076
Parameters with gradients: 201, Global norm: 3.0667
Parameters with gradients: 201, Global norm: 2.7868
Parameters with gradients: 201, Global norm: 10.8453
Parameters with gradients: 201, Global norm: 3.0095
Parameters with gradients: 201, Global norm: 6.5291
Parameters with gradients: 201, Global norm: 6.8431
Parameters 

Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

Validation - Accuracy: 0.5884, F1: 0.5867

Epoch 3/5


Training Epoch 3:   0%|          | 0/156 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 4.9426
Parameters with gradients: 201, Global norm: 6.9505
Parameters with gradients: 201, Global norm: 8.2993
Parameters with gradients: 201, Global norm: 5.0407
Parameters with gradients: 201, Global norm: 3.5975
Parameters with gradients: 201, Global norm: 5.4591
Parameters with gradients: 201, Global norm: 3.6040
Parameters with gradients: 201, Global norm: 6.7779
Parameters with gradients: 201, Global norm: 6.9670
Parameters with gradients: 201, Global norm: 18.8493
Parameters with gradients: 201, Global norm: 2.8636
Parameters with gradients: 201, Global norm: 4.4989
Parameters with gradients: 201, Global norm: 7.0713
Parameters with gradients: 201, Global norm: 7.2342
Parameters with gradients: 201, Global norm: 7.8527
Parameters with gradients: 201, Global norm: 7.4547
Parameters with gradients: 201, Global norm: 9.3527
Parameters with gradients: 201, Global norm: 5.8618
Parameters with gradients: 201, Global norm: 6.5964
Parameters 

[codecarbon INFO @ 08:16:17] Energy consumed for RAM : 0.000333 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:16:17] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:16:17] Energy consumed for All CPU : 0.000708 kWh
[codecarbon INFO @ 08:16:17] Energy consumed for all GPUs : 0.001266 kWh. Total GPU Power : 76.08022239810356 W
[codecarbon INFO @ 08:16:17] 0.002307 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 15.0339
Parameters with gradients: 201, Global norm: 6.2422
Parameters with gradients: 201, Global norm: 6.5149
Parameters with gradients: 201, Global norm: 7.7617
Parameters with gradients: 201, Global norm: 10.3612
Parameters with gradients: 201, Global norm: 10.9039
Parameters with gradients: 201, Global norm: 16.1668
Parameters with gradients: 201, Global norm: 7.7739
Parameters with gradients: 201, Global norm: 7.9546
Parameters with gradients: 201, Global norm: 8.4885
Parameters with gradients: 201, Global norm: 6.1559
Parameters with gradients: 201, Global norm: 5.4745
Parameters with gradients: 201, Global norm: 7.7674
Parameters with gradients: 201, Global norm: 9.1063
Parameters with gradients: 201, Global norm: 5.2937
Parameters with gradients: 201, Global norm: 8.3194
Parameters with gradients: 201, Global norm: 6.5853
Parameters with gradients: 201, Global norm: 4.8173
Parameters with gradients: 201, Global norm: 5.5798
Paramete

[codecarbon INFO @ 08:16:32] Energy consumed for RAM : 0.000416 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:16:32] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:16:32] Energy consumed for All CPU : 0.000885 kWh
[codecarbon INFO @ 08:16:32] Energy consumed for all GPUs : 0.001582 kWh. Total GPU Power : 75.958967791987 W
[codecarbon INFO @ 08:16:32] 0.002884 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 8.7176
Parameters with gradients: 201, Global norm: 6.3285
Parameters with gradients: 201, Global norm: 15.7346
Parameters with gradients: 201, Global norm: 8.1915
Parameters with gradients: 201, Global norm: 24.1216
Parameters with gradients: 201, Global norm: 8.5166
Parameters with gradients: 201, Global norm: 11.3824
Parameters with gradients: 201, Global norm: 10.0560
Parameters with gradients: 201, Global norm: 21.2678
Parameters with gradients: 201, Global norm: 22.6362
Parameters with gradients: 201, Global norm: 10.9480
Parameters with gradients: 201, Global norm: 11.4548
Parameters with gradients: 201, Global norm: 20.2956
Parameters with gradients: 201, Global norm: 10.4414
Average loss: 0.5101


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

Validation - Accuracy: 0.5884, F1: 0.5886

Epoch 4/5


Training Epoch 4:   0%|          | 0/156 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 8.2767
Parameters with gradients: 201, Global norm: 7.7181
Parameters with gradients: 201, Global norm: 4.0379
Parameters with gradients: 201, Global norm: 5.3531
Parameters with gradients: 201, Global norm: 4.7020
Parameters with gradients: 201, Global norm: 27.1139
Parameters with gradients: 201, Global norm: 10.9732
Parameters with gradients: 201, Global norm: 4.3033
Parameters with gradients: 201, Global norm: 5.1200
Parameters with gradients: 201, Global norm: 7.0244
Parameters with gradients: 201, Global norm: 6.9743
Parameters with gradients: 201, Global norm: 6.7602
Parameters with gradients: 201, Global norm: 5.9226
Parameters with gradients: 201, Global norm: 4.6015
Parameters with gradients: 201, Global norm: 6.4485
Parameters with gradients: 201, Global norm: 4.9173
Parameters with gradients: 201, Global norm: 6.2030
Parameters with gradients: 201, Global norm: 5.4323
Parameters with gradients: 201, Global norm: 5.0560
Parameters

[codecarbon INFO @ 08:16:47] Energy consumed for RAM : 0.000500 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:16:47] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:16:47] Energy consumed for All CPU : 0.001062 kWh
[codecarbon INFO @ 08:16:47] Energy consumed for all GPUs : 0.001900 kWh. Total GPU Power : 76.20737864328996 W
[codecarbon INFO @ 08:16:47] 0.003461 kWh of electricity used since the beginning.


Parameters with gradients: 201, Global norm: 7.8943
Updated grad_norm_threshold: 9.6083 (MA grad_norm: 9.6124)
Parameters with gradients: 201, Global norm: 9.2487
Updated grad_norm_threshold: 9.8647 (MA grad_norm: 9.8932)
Parameters with gradients: 201, Global norm: 4.5096
Updated grad_norm_threshold: 9.8289 (MA grad_norm: 9.8249)
Parameters with gradients: 201, Global norm: 7.9280
Updated grad_norm_threshold: 9.7066 (MA grad_norm: 9.6930)
Parameters with gradients: 201, Global norm: 5.4731
Updated grad_norm_threshold: 9.3441 (MA grad_norm: 9.3038)
Parameters with gradients: 201, Global norm: 8.9461
Updated grad_norm_threshold: 9.1896 (MA grad_norm: 9.1725)
Parameters with gradients: 201, Global norm: 8.9708
Updated grad_norm_threshold: 9.1114 (MA grad_norm: 9.1027)
Parameters with gradients: 201, Global norm: 15.6357
Updated grad_norm_threshold: 9.2609 (MA grad_norm: 9.2775)
Parameters with gradients: 201, Global norm: 2.8658
Updated grad_norm_threshold: 9.1097 (MA grad_norm: 9.0929)


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

[codecarbon INFO @ 08:17:02] Energy consumed for RAM : 0.000583 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:17:02] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:17:02] Energy consumed for All CPU : 0.001239 kWh
[codecarbon INFO @ 08:17:02] Energy consumed for all GPUs : 0.002217 kWh. Total GPU Power : 76.17504631011842 W
[codecarbon INFO @ 08:17:02] 0.004039 kWh of electricity used since the beginning.


Validation - Accuracy: 0.5921, F1: 0.5922

Epoch 5/5


Training Epoch 5:   0%|          | 0/156 [00:00<?, ?it/s]

Parameters with gradients: 201, Global norm: 5.1375
Updated grad_norm_threshold: 8.5569 (MA grad_norm: 8.5257)
Parameters with gradients: 201, Global norm: 4.7154
Updated grad_norm_threshold: 8.5677 (MA grad_norm: 8.5689)
Parameters with gradients: 201, Global norm: 1.4414
Updated grad_norm_threshold: 8.2579 (MA grad_norm: 8.2235)
Parameters with gradients: 201, Global norm: 2.2077
Updated grad_norm_threshold: 8.2124 (MA grad_norm: 8.2073)
Parameters with gradients: 201, Global norm: 3.8011
Updated grad_norm_threshold: 8.1860 (MA grad_norm: 8.1831)
Parameters with gradients: 201, Global norm: 3.9958
Updated grad_norm_threshold: 8.1815 (MA grad_norm: 8.1810)
Parameters with gradients: 201, Global norm: 5.9949
Updated grad_norm_threshold: 8.3640 (MA grad_norm: 8.3843)
Parameters with gradients: 201, Global norm: 6.1854
Updated grad_norm_threshold: 8.4809 (MA grad_norm: 8.4938)
Parameters with gradients: 201, Global norm: 6.3256
Updated grad_norm_threshold: 8.1391 (MA grad_norm: 8.1011)
P

[codecarbon INFO @ 08:17:17] Energy consumed for RAM : 0.000666 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:17:17] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 08:17:17] Energy consumed for All CPU : 0.001416 kWh
[codecarbon INFO @ 08:17:17] Energy consumed for all GPUs : 0.002534 kWh. Total GPU Power : 76.22109949850827 W
[codecarbon INFO @ 08:17:17] 0.004616 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:17:17] 0.014212 g.CO2eq/s mean an estimation of 448.1858174961543 kg.CO2eq/year


Parameters with gradients: 201, Global norm: 1.4460
Updated grad_norm_threshold: 2.6146 (MA grad_norm: 2.6096)
Parameters with gradients: 201, Global norm: 1.5364
Updated grad_norm_threshold: 2.4059 (MA grad_norm: 2.3827)
Parameters with gradients: 201, Global norm: 6.1800
Updated grad_norm_threshold: 2.5794 (MA grad_norm: 2.5986)
Parameters with gradients: 201, Global norm: 1.3956
Updated grad_norm_threshold: 2.5132 (MA grad_norm: 2.5059)
Parameters with gradients: 201, Global norm: 1.5712
Updated grad_norm_threshold: 2.5217 (MA grad_norm: 2.5227)
Parameters with gradients: 201, Global norm: 1.5209
Updated grad_norm_threshold: 2.4816 (MA grad_norm: 2.4771)
Parameters with gradients: 201, Global norm: 4.9680
Updated grad_norm_threshold: 2.6519 (MA grad_norm: 2.6708)
Parameters with gradients: 201, Global norm: 1.3459
Updated grad_norm_threshold: 2.6501 (MA grad_norm: 2.6499)
Parameters with gradients: 201, Global norm: 1.2194
Updated grad_norm_threshold: 2.5548 (MA grad_norm: 2.5442)
P

Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

Validation - Accuracy: 0.5993, F1: 0.5995


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

[codecarbon INFO @ 08:17:31] Energy consumed for RAM : 0.000740 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 08:17:31] Delta energy consumed for CPU with constant : 0.000157 kWh, power : 42.5 W
[codecarbon INFO @ 08:17:31] Energy consumed for All CPU : 0.001573 kWh
[codecarbon INFO @ 08:17:31] Energy consumed for all GPUs : 0.002816 kWh. Total GPU Power : 76.30913444911833 W
[codecarbon INFO @ 08:17:31] 0.005130 kWh of electricity used since the beginning.



--- Final Results ---
{
  "model_name": "Adaptive_Quantized_BERT_RTE",
  "dataset": "RTE",
  "accuracy": 0.5992779783393501,
  "f1": 0.5995078190306681,
  "scheduler_metrics": {
    "adaptation_log": [],
    "precision_switch_counts_int4": 0,
    "precision_switch_counts_int8": 0,
    "precision_switch_counts_fp32": 0,
    "avg_precision_level": 32.0,
    "precision_distribution": {
      "INT4": 0.0,
      "INT8": 0.0,
      "FP32": 1.0
    }
  },
  "performance_metrics": {
    "total_duration_s": 134.5682978630066,
    "total_emissions_kwh": 0.0018952341736672006,
    "latency_ms_query": 485.80612946933786,
    "throughput_tokens_sec": 263.47959038684553,
    "energy_wh_token": 5.3453129898104715e-05,
    "sci_gco2e_query": 0.0017105001567393509,
    "wue_avg_liters_query": 1.2315601128523325e-05
  }
}

✅ All experiments complete. Results saved to 'all_task_results.csv'


In [None]:
# Save all results to a JSON file
with open('adaptive_quantization_results.json', 'w') as f:
    json.dump(all_results, f, indent=2)