##### **Before starting ...**

Checking we are using frugal-notebooks-env conda environment

In [1]:
!which python

/Users/a.villa.massone/miniconda3/envs/frugal-notebooks-env/bin/python


Checking the python version is 3.9 (compatibility with frugal AI codebase)

In [2]:
!python --version

Python 3.9.21


Checking the pytorch version is > 1.12+ (compatibility with MPS)

In [3]:
import torch
print(torch.__version__)

2.6.0


Imports

In [4]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Fine tuning LLMs

## 2. **Baseline**

### **dataset**

In [5]:
df = pd.read_parquet("hf://datasets/QuotaClimat/frugalaichallenge-text-train/train.parquet")

In [43]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

### **Model**

In [7]:
selected_model = "mistral"

MODEL_NAMES = {
    "mistral": "mistralai/Mistral-7B-Instruct-v0.1",
    "phi2": "microsoft/phi-2",
    "qwen": "Qwen/Qwen2.5-0.5B"
}

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress multiprocessing warning

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_name):

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    ).eval()

    return tokenizer, model

In [9]:
%%time
model_name = MODEL_NAMES[selected_model]
tokenizer, model = load_model(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


CPU times: user 8.79 s, sys: 7.18 s, total: 16 s
Wall time: 26.5 s


In [10]:
!ls ~/.cache/huggingface/hub/

[1m[36mmodels--Qwen--Qwen2.5-0.5B[m[m
[1m[36mmodels--Qwen--Qwen2.5-7B[m[m
[1m[36mmodels--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B[m[m
[1m[36mmodels--mistralai--Mistral-7B-Instruct-v0.1[m[m
version.txt


In [11]:
!du -sh ~/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1

 13G	/Users/a.villa.massone/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1


### **Prompt**

In [12]:
def create_prompt(text):
    return f"""
<instruction>
Classify the following statement into one of these 8 categories:
Respond STRICTLY with only the corresponding number.
</instruction>

<categories>
0 - Not relevant: No climate-related claims or doesn't fit other categories
1 - Denial: Claims climate change is not happening
2 - Attribution denial: Claims human activity is not causing climate change
3 - Impact minimization: Claims climate change impacts are minimal or beneficial
4 - Solution opposition: Claims solutions to climate change are harmful
5 - Science skepticism: Challenges climate science validity or methods
6 - Actor criticism: Attacks credibility of climate scientists or activists
7 - Fossil fuel promotion: Asserts importance of fossil fuels
</categories>

<statement>
Statement: "{text}"
</statement>

Category number:"""

In [48]:
def create_prompt(text):
    return f"""
<s>[INST] <<SYS>>
Classify the following statement into one of these 8 categories:
Respond STRICTLY with only the corresponding number.
<</SYS>>

<categories>
0 - Not relevant: No climate-related claims or doesn't fit other categories
1 - Denial: Claims climate change is not happening
2 - Attribution denial: Claims human activity is not causing climate change
3 - Impact minimization: Claims climate change impacts are minimal or beneficial
4 - Solution opposition: Claims solutions to climate change are harmful
5 - Science skepticism: Challenges climate science validity or methods
6 - Actor criticism: Attacks credibility of climate scientists or activists
7 - Fossil fuel promotion: Asserts importance of fossil fuels
</categories>

<statement>
Statement: "{text}"
</statement>
[/INST]

Category number:"""

In [20]:
CLASS_LABELS = [
    "0_not_relevant",
    "1_not_happening",
    "2_not_human",
    "3_not_bad",
    "4_solutions_harmful_unnecessary",
    "5_science_unreliable",
    "6_proponents_biased",
    "7_fossil_fuels_needed"
]

def get_label_id(row):
    return row['label'].split('_')[0]

In [49]:
def format_example(row):
    prompt = create_prompt(row['quote'])
    category = row['label'].split('_')[0]
    return {"example": prompt + "\n<category>" + category + "</category>"}

format_example(df.iloc[0])

{'example': '\n<s>[INST] <<SYS>>\nClassify the following statement into one of these 8 categories:\nRespond STRICTLY with only the corresponding number.\n<</SYS>>\n\n<categories>\n0 - Not relevant: No climate-related claims or doesn\'t fit other categories\n1 - Denial: Claims climate change is not happening\n2 - Attribution denial: Claims human activity is not causing climate change\n3 - Impact minimization: Claims climate change impacts are minimal or beneficial\n4 - Solution opposition: Claims solutions to climate change are harmful\n5 - Science skepticism: Challenges climate science validity or methods\n6 - Actor criticism: Attacks credibility of climate scientists or activists\n7 - Fossil fuel promotion: Asserts importance of fossil fuels\n</categories>\n\n<statement>\nStatement: "There is clear, compelling evidence that many of the major conclusions of the IPCC, your new religions constantly-changing Holy Book, are based on evidence that has been fabricated. The hockey stick graph

### **Training**

**Train sample**

In [55]:
random.seed(42)
torch.manual_seed(42)


N_SAMPLES = 50

print('Original train set size', train_df.shape)
print('Original val set size', val_df.shape)
print('Original test set size', test_df.shape)

df_sampled_train = train_df.sample(N_SAMPLES, random_state=42)
df_sampled_val = val_df.sample(N_SAMPLES, random_state=42)
print('Sample train set size', df_sampled_train.shape)
print('Sample val set size', df_sampled_train.shape)

train_formatted = df_sampled_train.apply(format_example, axis=1)
val_formatted = df_sampled_val.apply(format_example, axis=1)

print(train_formatted[:1])
print(val_formatted[:1])

Original train set size (3897, 7)
Original val set size (975, 7)
Original test set size (1219, 7)
Sample train set size (50, 7)
Sample val set size (50, 7)
6424    {'example': '
<s>[INST] <<SYS>>
Classify the f...
dtype: object
2987    {'example': '
<s>[INST] <<SYS>>
Classify the f...
dtype: object


In [41]:
df_sampled_train.apply(format_example, axis=1)

pandas.core.series.Series

**Training**

In [66]:
from trl import SFTConfig

use_fp16 = torch.cuda.is_available()  # Disable for MPS
use_bf16 = torch.cuda.is_bf16_supported() and not torch.backends.mps.is_available()

training_args = SFTConfig(
    output_dir="./tmp",
    dataset_text_field="example",
)
training_args.__dict__

{'output_dir': './tmp',
 'overwrite_output_dir': False,
 'do_train': False,
 'do_eval': False,
 'do_predict': False,
 'eval_strategy': <IntervalStrategy.NO: 'no'>,
 'prediction_loss_only': False,
 'per_device_train_batch_size': 8,
 'per_device_eval_batch_size': 8,
 'per_gpu_train_batch_size': None,
 'per_gpu_eval_batch_size': None,
 'gradient_accumulation_steps': 1,
 'eval_accumulation_steps': None,
 'eval_delay': 0,
 'torch_empty_cache_steps': None,
 'learning_rate': 2e-05,
 'weight_decay': 0.0,
 'adam_beta1': 0.9,
 'adam_beta2': 0.999,
 'adam_epsilon': 1e-08,
 'max_grad_norm': 1.0,
 'num_train_epochs': 3.0,
 'max_steps': -1,
 'lr_scheduler_type': <SchedulerType.LINEAR: 'linear'>,
 'lr_scheduler_kwargs': {},
 'warmup_ratio': 0.0,
 'warmup_steps': 0,
 'log_level': 'passive',
 'log_on_each_node': True,
 'logging_dir': './tmp/runs/Feb03_11-32-38_AMAFHP9MXRXX1',
 'logging_strategy': <IntervalStrategy.STEPS: 'steps'>,
 'logging_first_step': False,
 'logging_steps': 500,
 'logging_nan_inf_f

In [68]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_formatted,
    eval_dataset=val_formatted,
    #compute_metrics = # accuracy, do we need preprocess_logits_for_metrics ?
)
trainer

AttributeError: 'Series' object has no attribute 'column_names'

In [47]:

# Start Training
fine_tuned_model = trainer.train()

# Save Model
trainer.save_model("./models/fine_tuned_model")

  trainer = SFTTrainer(


TypeError: __init__() got an unexpected keyword argument 'dataset_text_field'

In [None]:
fine_tuned_model = PeftModel.from_pretrained(model, "fine_tuned_model")
fine_tuned_model = fine_tuned_model.merge_and_unload()  # For LoRA models

# Inference
y_pred, _ = batch_inference(model, tokenizer, X_test)

In [27]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

"""
# PEFT Configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
#    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
max_seq_lenght = min(tokenizer.model_max_length, 1024)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    fp16=not torch.backends.mps.is_available(),
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_steps=20,
    eval_accumulation_steps=5
)

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_df.map(format_example),
    eval_dataset=test_df.map(format_example),
    dataset_text_field="example",
    max_seq_length=512,
    tokenizer=tokenizer,
    peft_config=peft_config,
    packing=False
)
"""



TypeError: string indices must be integers

**Evaluate**

In [1]:
from peft import PeftModel

# Load fine-tuned model
fine_tuned_model = PeftModel.from_pretrained(model, "fine_tuned_model")
fine_tuned_model = fine_tuned_model.merge_and_unload()  # For LoRA models

# Inference
y_pred, _ = batch_inference(model, tokenizer, X_test)

KeyboardInterrupt: 

### **Prediction function**

In [24]:
from codecarbon import EmissionsTracker

def classify_text(text, tokenizer, model):
    prompt = create_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    tracker = EmissionsTracker(log_level="error")
    tracker.start()
    
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=2,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    inference_emissions = tracker.stop()
    inference_time = tracker.final_emissions_data.duration
    inference_energy_conso = tracker.final_emissions_data.energy_consumed

    inf_efficiency_metrics_df = pd.DataFrame.from_dict([{
        "sample_latency_sec": inference_time,
        "sample_energy_conso_kWh": inference_energy_conso,
        "sample_emissions_kgCO2eq": inference_emissions
    }])

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response, inf_efficiency_metrics_df

In [25]:
def parse_output(response):
    prediction = response.split("Category number:")[-1].strip()
    prediction = ''.join(filter(str.isdigit, prediction))
    
    if prediction.isdigit() and int(prediction) in range(8):
        return CLASS_LABELS[int(prediction)]
    return "error"

### **Testing on a single sample**

In [26]:
model_name

'mistralai/Mistral-7B-Instruct-v0.1'

In [27]:
print(f"Model device: {model.device}")

Model device: mps:0


**Sample**

In [28]:
sample_text = test_df['quote'][0]
sample_text

'There is clear, compelling evidence that many of the major conclusions of the IPCC, your new religions constantly-changing Holy Book, are based on evidence that has been fabricated. The hockey stick graph that purported to abolish the mediaeval warm period is just one example.'

**Inference**

In [29]:
single_raw_prediction, single_metrics_df = classify_text(sample_text, tokenizer, model)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [30]:
# print('Raw prediction:\n\n', single_raw_prediction)

single_prediction = parse_output(single_raw_prediction)
print(f'Quote:\n{sample_text}')
print(f'\nPrediction: {single_prediction}')
print(f"True label: {df['label'][0]}")
print()
single_metrics_df.head()

Quote:
There is clear, compelling evidence that many of the major conclusions of the IPCC, your new religions constantly-changing Holy Book, are based on evidence that has been fabricated. The hockey stick graph that purported to abolish the mediaeval warm period is just one example.

Prediction: 6_proponents_biased
True label: 5_science_unreliable



Unnamed: 0,sample_latency_sec,sample_energy_conso_kWh,sample_emissions_kgCO2eq
0,10.973958,0.000157,9e-06


### **Testing on a larger sample size**

##### **Sample**

In [31]:
random.seed(42)
torch.manual_seed(42)

N_SAMPLES = 50

df_test = test_df.sample(N_SAMPLES, random_state=42)
X_test = df_test['quote'].tolist()
y_test = df_test['label'].tolist()

X_test[:5]

['Mann could be said to be the Jerry Sandusky of climate science, except for instead of molesting children, he has molested and tortured data in the service of politicized science that could have dire economic consequences for the nation and planet,” Rand Simberg wrote in\xa0National Review\xa0article in\xa02012.',
 'The reality is that the infra-red active gases act more like an umbrella providing the Earths surface with shade to keep it cool than like a greenhouse to keep it warmer. It is a much more realistic description of the infra-red active gases to call them shade gases, rather than greenhouse gases.',
 '“Late 20th century and early 21st century global warming, they show, is neither dramatic, nor unusual, nor scary. Here … are just some of the charts to prove it.”',
 'Our research has shown that the concentration of carbon dioxide in the atmosphere has no impact on global temperatures, or the climate.',
 'I am speaking only as a layman who observes that there is plenty of snow 

##### **Inference function**

In [32]:
def batch_inference(model, tokenizer, X_test):
    predictions = []
    metrics_list = []
    errors = 0
    
    for i, quote in enumerate(X_test):
        print(f'Progress : quote {i}/{len(X_test)} | {round(i/len(X_test)*100, 2)}%')
        try:
            pred, metrics_df = classify_text(quote, tokenizer, model)
            pred = parse_output(pred)
            predictions.append(pred)
            metrics_list.append(metrics_df.iloc[0])
    
            if pred == "error":
                errors += 1
        except Exception as e:
            predictions.append("error")
            errors += 1
            print(f"Error processing: {text[:50]}... -> {str(e)}")

    if metrics_list:
        batch_metrics_df = pd.DataFrame(metrics_list, columns=[
            "sample_latency_sec",
            "sample_energy_conso_kWh",
            "sample_emissions_kgCO2eq"
        ])
        inf_efficiency_metrics = {
            "total_latency_sec": np.sum(batch_metrics_df['sample_latency_sec']),
            "sample_latency_sec": np.mean(batch_metrics_df['sample_latency_sec']),
            "total_energy_conso_kWh": np.sum(batch_metrics_df['sample_energy_conso_kWh']),
            "sample_energy_conso_kWh": np.mean(batch_metrics_df['sample_energy_conso_kWh']),
            "total_emissions_kgCO2eq": np.sum(batch_metrics_df['sample_emissions_kgCO2eq'])
        }
    if errors:
        print(f"Total errors: {errors}")

    return predictions, inf_efficiency_metrics

##### **Inference**

In [33]:
model_name

'mistralai/Mistral-7B-Instruct-v0.1'

In [34]:
create_prompt('')

'\n<instruction>\nClassify the following statement into one of these 8 categories:\nRespond STRICTLY with only the corresponding number.\n</instruction>\n\n<categories>\n0 - Not relevant: No climate-related claims or doesn\'t fit other categories\n1 - Denial: Claims climate change is not happening\n2 - Attribution denial: Claims human activity is not causing climate change\n3 - Impact minimization: Claims climate change impacts are minimal or beneficial\n4 - Solution opposition: Claims solutions to climate change are harmful\n5 - Science skepticism: Challenges climate science validity or methods\n6 - Actor criticism: Attacks credibility of climate scientists or activists\n7 - Fossil fuel promotion: Asserts importance of fossil fuels\n</categories>\n\n<statement>\nStatement: ""\n</statement>\n\nCategory number:'

In [35]:
print("original dataset :", max([len(x) for x in df_test['quote'].tolist()]), 'char')
print("truncated dataset :", max([len(x) for x in df_test['truncated_quote'].tolist()]), 'char')
print("X_test :", 'original' if max([len(x) for x in X_test]) == 1133 else 'truncated')

original dataset : 1133 char
truncated dataset : 722 char
X_test : original


In [36]:
est_time = single_metrics_df['sample_latency_sec'][0] * N_SAMPLES
print(f"Estimated time to compute : {round(est_time//60)} min {round(est_time%60)} sec")

Estimated time to compute : 9 min 9 sec


In [None]:
y_pred, efficiency_metrics_df = batch_inference(model, tokenizer, X_test)

Progress : quote 0/50 | 0.0%
Progress : quote 1/50 | 2.0%
Progress : quote 2/50 | 4.0%
Progress : quote 3/50 | 6.0%
Progress : quote 4/50 | 8.0%
Progress : quote 5/50 | 10.0%
Progress : quote 6/50 | 12.0%
Progress : quote 7/50 | 14.0%
Progress : quote 8/50 | 16.0%
Progress : quote 9/50 | 18.0%
Progress : quote 10/50 | 20.0%
Progress : quote 11/50 | 22.0%
Progress : quote 12/50 | 24.0%
Progress : quote 13/50 | 26.0%
Progress : quote 14/50 | 28.0%
Progress : quote 15/50 | 30.0%
Progress : quote 16/50 | 32.0%
Progress : quote 17/50 | 34.0%
Progress : quote 18/50 | 36.0%


In [None]:
eff_time = efficiency_metrics_df['total_latency_sec']
print(f"Effective time to compute : {round(eff_time//60)} min {round(eff_time%60)} sec")

**Efficiency Metrics**

In [None]:
efficiency_metrics_df

**Performance Metrics**

##### Functions

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np

def evaluation(X_test, y_test, y_pred):
    # Store results in a DataFrame
    results = pd.DataFrame({
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred
    })
    results["correct"] = results["y_test"] == results["y_pred"]

    # Compute overall performance breakdown
    correct = np.sum(results["correct"])
    errors = np.sum(results["y_pred"] == 'error')
    incorrect = len(results) - correct - errors
    
    performance = pd.DataFrame({
        'Outcome': ['Correct', 'Incorrect', 'Error'],
        'Count': [correct,incorrect,errors]
        })

    # Compute overall accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Get category labels (sorted for consistency)
    category_names = sorted(pd.Series(y_test).unique())

    # Compute per-class metrics
    class_accuracy = results.groupby("y_test")["correct"].mean().reindex(category_names, fill_value=0).values
    precision = precision_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)

    # Store per-category metrics
    metrics_df = pd.DataFrame({
        "Category": category_names,
        "Accuracy": class_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

    return results, accuracy, metrics_df, performance

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_test, y_pred):
    # Compute the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    labels = np.unique(y_test)  # Unique class labels
    
    # Compute TP, FP, FN, TN for each class
    tp = np.diag(cm)  # True Positives (diagonal)
    fp = cm.sum(axis=0) - tp  # Column sum minus TP
    fn = cm.sum(axis=1) - tp  # Row sum minus TP
    tn = cm.sum() - (tp + fp + fn)  # Total samples - (TP + FP + FN)
    
    print('Sample size:', N_SAMPLES)
    print(f'True positives: \t{tp.sum()}')
    print(f'False positives:\t{fp.sum()}')
    print(f'False negatives:\t{fn.sum()}')
    
    # 📊 3️⃣ **Plot Full Multi-Class Confusion Matrix**
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Full 8-Class Confusion Matrix")
    plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_metrics(performance, metrics_df):
    # Create side-by-side plots
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Classification Performance Breakdown (Bar Chart)
    axes[0].bar(performance['Outcome'], performance['Count'], color=['green', 'red', 'gray'])
    axes[0].set_title("Classification Performance Breakdown", fontsize=14)
    axes[0].set_ylabel("Count", fontsize=12)
    axes[0].set_xlabel("Outcome", fontsize=12)
    
    # Line Plot for F1 Score, Precision, Recall
    categories = metrics_df['Category']
    
    axes[1].plot(categories, metrics_df['F1 Score'], marker='o', label='F1 Score')
    axes[1].plot(categories, metrics_df['Precision'], marker='s', label='Precision')
    axes[1].plot(categories, metrics_df['Recall'], marker='^', label='Recall')
    
    axes[1].axhline(y=accuracy, color='r', linestyle='--', label=f'Accuracy ({accuracy:.2f})')
    
    axes[1].set_xlabel("Category", fontsize=12)
    axes[1].set_ylabel("Score", fontsize=12)
    axes[1].set_title("Evaluation Metrics per Category", fontsize=14)
    axes[1].set_xticks(range(len(categories)))
    axes[1].set_xticklabels(categories, rotation=45, ha="right")
    axes[1].legend()
    axes[1].grid(True)
    
    # Adjust layout and show
    plt.tight_layout()
    plt.show()

##### **Summary**

In [None]:
results_df, accuracy, metrics_df, performance = evaluation(X_test, y_test, y_pred)

In [None]:
results_df.head()

In [None]:
performance

In [None]:
print(f'Accuracy: {accuracy}\n')

metrics_df.round(3)

In [None]:
plot_metrics(performance, metrics_df)

In [None]:
r = pd.concat([
    results_df['y_test'].value_counts().sort_index(),
    results_df['y_pred'].value_counts().sort_index()
], axis=1)
r.columns = ['y_test', 'y_pred']
r

In [None]:
plot_confusion_matrix(y_test, y_pred)

## **Let's save the results**

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
timestamp

In [None]:
filename = (
    "metrics/"
    f"{'_'.join(model_name.split('/'))}"
    f"_accuracy_{int(accuracy * 100)}"
    f"_dt_{timestamp.replace(':', '').replace('-', '')}"
    ".json"
)
filename

In [None]:
note = "Baseline SLM, step by step, quote cropping"

In [None]:
# ajouter category metrics

os.makedirs("metrics", exist_ok=True)

data = {
    "model_name": model_name,
    "timestamp": timestamp,
    "note": note,
    "test_size" : N_SAMPLES,
    "quote_max_len": (max([len(x) for x in X_test])),
    "test_accuracy": accuracy,

    "total_latency_sec": efficiency_metrics_df['total_latency_sec'],
    "total_energy_conso_kWh": efficiency_metrics_df['total_energy_conso_kWh'],

    "sample_latency_sec": efficiency_metrics_df['sample_latency_sec'],
    "sample_energy_conso_kWh": efficiency_metrics_df['sample_energy_conso_kWh'],
    "total_emissions_kgCO2eq": efficiency_metrics_df['total_emissions_kgCO2eq'],

    "class_performance_metrics": metrics_df.to_dict(orient="records"),
    "prompt": create_prompt('')
}
data

In [None]:
import json

with open(filename, "w") as f:
    json.dump(data, f, indent=4)

print(f"Metrics saved to {filename}")