In [8]:
from transformers import AutoTokenizer, AutoModel
from codecarbon import EmissionsTracker
import pandas as pd
import torch

Modelos
- "xlm-roberta-base" *
- "roberta-base"
- "meta-llama/Llama-3.2-1B"
- "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
- Meta‑Llama‑3‑8B
- Unbabel/TowerBase-7B-v0.1
- NovaSearch/stella_en_1.5B_v5
- datificate/gpt2-small-spanish *
- bertin-project/bertin-roberta-base-spanish *
- bertin-project/bertin-gpt-j-6B
- "DeepESP/gpt2-spanish-medium"
- datificate/gpt2-small-spanish
- multilingual-e5-small



Modelos en el caso de hacer FT
- LLAMA 1-B 3-B 
- DEEPSEEK R1
  
- gpt2-small-spanish
- bertin-roberta-base-spanish


Modelos en caso de no hacer FT
- Meta‑Llama‑3B‑8B 
- bigscience/bloomz-3b
- nvidia/multilingual-domain-classifier
  
- bertin-project/bertin-gpt-j-6B
- DeepESP/gpt2-spanish-medium

# FUNCIONES EXTRACCIÓN EMBEDDING

## LLAMA

https://arxiv.org/html/2503.05804v1?utm_source=chatgpt.com

For Meta’s Llama 3.2 1B model, the training utilized 370,000 GPU hours on H100-80GB hardware, each with a peak power consumption of 700W. This translates to approximately 932,400,000 kWh of energy consumption. The estimated location-based greenhouse gas emissions for this training amounted to 107 tons of CO₂ equivalent. However, due to Meta's commitment to net-zero emissions and the use of renewable energy, the market-based emissions were reported as 0 tons CO₂ equivalent.

Regarding the normalized training energy per second, a study estimated that Llama 3.2 1B consumes about 0.003 kWh per 100 requests, which equates to approximately 1.0 kWh per 33,333 requests. Given that each request takes about 12 seconds, this implies a normalized energy consumption of approximately 0.000083 kWh per second. This is a rough estimate and may vary based on specific deployment conditions

In [28]:
# LLAMA 

#embeddings size (2048,)

# datos
texts_df = pd.read_pickle("../data/OMC/final_omc_morality.pkl").head(5)

# modelo
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model.cuda()  

def infer_and_measure(text, tracker):
    # Tokenización
    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Medición de inferencia
    tracker.start_task("llama-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    emissions_data = tracker.stop_task()
    embedding = outputs.hidden_states[-1][:, 0, :].cpu().numpy()[0]
    return embedding, emissions_data.emissions


def pooling_infer_and_measure(text, tracker):
    # Tokenización
    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(model.device)
    attention_mask = inputs["attention_mask"]  # shape: (1, seq_len)

    # Medición de inferencia
    tracker.start_task("llama-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    emissions_data = tracker.stop_task()
    # Último hidden state: (1, seq_len, hidden_dim)
    last_hidden = outputs.hidden_states[-1]
    # Máscara expandida
    mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
    # Mean pooling sobre tokens no-pad
    summed = torch.sum(last_hidden * mask, dim=1)  )
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    mean_pooled = (summed / counts).squeeze(0).cpu().numpy()  
    return mean_pooled, emissions_data.emissions


# tracker (CPU+GPU0)
tracker = EmissionsTracker(
    project_name="llama_inference",
    measure_power_secs=15,
    save_to_file=True,
    output_dir="emissions",
    gpu_ids=[0]
)

# contar por texto
embeddings = []
emissions = []
for text in texts_df["text"]:
    emb, co2 = infer_and_measure(text, tracker)
    embeddings.append(emb)
    emissions.append(co2)

tracker.stop()

#guardar
texts_df["llama_embedding"] = embeddings
texts_df["inference_co2_kg"]   = emissions

[codecarbon INFO @ 07:56:24] [setup] RAM Tracking...
[codecarbon INFO @ 07:56:24] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 07:56:25] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
[codecarbon INFO @ 07:56:25] [setup] GPU Tracking...
[codecarbon INFO @ 07:56:25] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 07:56:25] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 07:56:25] >>> Tracker's metadata:
[codecarbon INFO @ 07:56:25]   Platform system: Linux-5.4.0-74-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 07:56:25]   Python version: 3.11.10
[codecarbon INFO @ 07:56:25]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 07:56:25]   Available RAM : 125.807 GB
[c

## DeepSeek 

In [32]:
#DEEPSEEK

#embeddings size (1536,)


# datos
texts_df = pd.read_pickle("../data/OMC/final_omc_morality.pkl").head(5)
texts_df= texts_df.head(5)
# modelo
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model.cuda()  


def infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)

    tracker.start_task("deepseek-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1]
    # Encuentra índice del último token (no-pad) para cada batch
    seq_lens = (inputs["attention_mask"].sum(dim=1) - 1).unsqueeze(-1)
    embedding = hidden.gather(1, seq_lens.unsqueeze(-1).expand(-1, -1, hidden.size(-1)))
    embedding = embedding.squeeze(1).cpu().numpy()

    return embedding[0], data.emissions

def pooling_infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)
    attention_mask = inputs["attention_mask"]

    # Medición
    tracker.start_task("deepseek-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1]  # (1, seq_len, 1536)

    # Mean pooling sobre non-pad tokens
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float() 
    summed = torch.sum(hidden * mask, dim=1)                          
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)                   
    embedding = (summed / counts).squeeze(0).cpu().numpy()          
    return embedding, data.emissions

# tracker (CPU+GPU0)
tracker = EmissionsTracker(
    project_name="deepseek_inference",
    measure_power_secs=15,
    save_to_file=True,
    output_dir="emissions",
    gpu_ids=[0]
)

# contar por texto
embeddings = []
emissions = []
for text in texts_df["text"]:
    emb, co2 = infer_and_measure(text, tracker)
    embeddings.append(emb)
    emissions.append(co2)

tracker.stop()

#guardar
texts_df["deepseek_embedding"] = embeddings
texts_df["inference_co2_kg"]   = emissions


[codecarbon INFO @ 07:58:16] [setup] RAM Tracking...
[codecarbon INFO @ 07:58:16] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 07:58:17] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
[codecarbon INFO @ 07:58:17] [setup] GPU Tracking...
[codecarbon INFO @ 07:58:17] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 07:58:17] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 07:58:17] >>> Tracker's metadata:
[codecarbon INFO @ 07:58:17]   Platform system: Linux-5.4.0-74-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 07:58:17]   Python version: 3.11.10
[codecarbon INFO @ 07:58:17]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 07:58:17]   Available RAM : 125.807 GB
[c

## gpt2-small-spanish

In [26]:
#gpt2-small-spanish

#embeddings size (768,)


# datos
texts_df = pd.read_pickle("../data/OMC/final_omc_morality.pkl").head(5)
texts_df= texts_df.head(5)
# modelo
model_id = "datificate/gpt2-small-spanish"  
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model.cuda()  


def infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)

    tracker.start_task("gpt2-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1]
    # Encuentra índice del último token (no-pad) para cada batch
    seq_lens = (inputs["attention_mask"].sum(dim=1) - 1).unsqueeze(-1)
    embedding = hidden.gather(1, seq_lens.unsqueeze(-1).expand(-1, -1, hidden.size(-1)))
    embedding = embedding.squeeze(1).cpu().numpy()

    return embedding[0], data.emissions


def pooling_infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)
    attention_mask = inputs["attention_mask"]

    tracker.start_task("gpt2-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1]  
    # Mean pooling sobre non-pad tokens
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()  
    summed = torch.sum(hidden * mask, dim=1)                           
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)                   
    embedding = (summed / counts).squeeze(0).cpu().numpy()            
    return embedding, data.emissions



# tracker (CPU+GPU0)
tracker = EmissionsTracker(
    project_name="gpt2_inference",
    measure_power_secs=15,
    save_to_file=True,
    output_dir="emissions",
    gpu_ids=[0]
)

# contar por texto
embeddings = []
emissions = []
for text in texts_df["text"]:
    emb, co2 = infer_and_measure(text, tracker)
    embeddings.append(emb)
    emissions.append(co2)

tracker.stop()

#guardar
texts_df["gpt2_spanish_embedding"] = embeddings
texts_df["inference_co2_kg"]   = emissions


[codecarbon INFO @ 08:27:12] [setup] RAM Tracking...
[codecarbon INFO @ 08:27:12] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 08:27:13] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
[codecarbon INFO @ 08:27:13] [setup] GPU Tracking...
[codecarbon INFO @ 08:27:13] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:27:13] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 08:27:13] >>> Tracker's metadata:
[codecarbon INFO @ 08:27:13]   Platform system: Linux-5.4.0-74-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 08:27:13]   Python version: 3.11.10
[codecarbon INFO @ 08:27:13]   CodeCarbon version: 3.0.2
[codecarbon INFO @ 08:27:13]   Available RAM : 125.807 GB
[c

In [41]:
texts_df["gpt2_spanish_embedding"][0].shape

(768,)

## bertin-roberta-base-spanish

In [27]:
#bertin-roberta-base-spanish

#embeddings size (768,)


# datos
texts_df = pd.read_pickle("../data/OMC/final_omc_morality.pkl").head(5)
texts_df= texts_df.head(5)
# modelo
model_id = "bertin-project/bertin-roberta-base-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model.cuda()  


def infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)

    tracker.start_task("bertin-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1]
    # Encuentra índice del último token (no-pad) para cada batch
    seq_lens = (inputs["attention_mask"].sum(dim=1) - 1).unsqueeze(-1)
    embedding = hidden.gather(1, seq_lens.unsqueeze(-1).expand(-1, -1, hidden.size(-1)))
    embedding = embedding.squeeze(1).cpu().numpy()

    return embedding[0], data.emissions

def pooling_infer_and_measure(text: str, tracker: EmissionsTracker):
    # Tokenización
    inputs = tokenizer([text],
                       return_tensors="pt",
                       padding=True,
                       truncation=True).to(model.device)
    attention_mask = inputs["attention_mask"]

    tracker.start_task("bertin-inference")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    data = tracker.stop_task()

    hidden = outputs.hidden_states[-1] 

    # Mean pooling sobre non-pad tokens
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = torch.sum(hidden * mask, dim=1)                           
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)                   
    embedding = (summed / counts).squeeze(0).cpu().numpy()            
    return embedding, data.emissions


# tracker (CPU+GPU0)
tracker = EmissionsTracker(
    project_name="bertin_inference",
    measure_power_secs=15,
    save_to_file=True,
    output_dir="emissions",
    gpu_ids=[0]
)


# contar por texto
embeddings = []
emissions = []
for text in texts_df["text"]:
    emb, co2 = infer_and_measure(text, tracker)
    embeddings.append(emb)
    emissions.append(co2)

tracker.stop()

#guardar
texts_df["bertin_spanish_embedding"] = embeddings
texts_df["inference_co2_kg"]   = emissions


Some weights of RobertaModel were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 08:27:25] [setup] RAM Tracking...
[codecarbon INFO @ 08:27:25] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 08:27:26] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
[codecarbon INFO @ 08:27:26] [setup] GPU Tracking...
[codecarbon INFO @ 08:27:26] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:27:26] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon I

KeyboardInterrupt: 

In [25]:
texts_df["bertin_spanish_embedding"]

0    [0.052701034, 0.0009660059, -0.066261984, -0.0...
1    [0.06101517, -0.004614892, -0.05676373, -0.064...
2    [0.027430454, -0.013715127, -0.06572105, -0.04...
3    [0.020933025, 0.0073674778, -0.070677064, -0.0...
4    [0.054630745, 0.0018714874, -0.08359161, -0.05...
Name: bertin_spanish_embedding, dtype: object

# Crear Tabla

In [None]:
import pandas as pd


csv_path = "emissions/emissions_base_3a641cd1-7180-4653-af7f-31e84e4db47a.csv"
df = pd.read_csv(csv_path)

# métricas de inferencia normalizada
#    - duration (s) ya es "Normalized prediction time (s)" promedio
norm_pred_time = df["duration"].mean()

# energía normalizada en kW: potencia media CPU+GPU (W) -> kW
norm_pred_energy = ((df["cpu_power"] + df["gpu_power"]) / 1000).mean()


summary = pd.DataFrame({
    "Model": ["llama"],
    "Normalized training time (s)": [None],          # buscar
    "Normalized prediction time (s)": [norm_pred_time],
    "Normalized training energy (kW)": [None],       # buscar
    "Normalized prediction energy (kW)": [norm_pred_energy]
})

# notación científica
summary["Normalized prediction time (s)"] = summary["Normalized prediction time (s)"].apply(lambda x: f"{x:.2E}")
summary["Normalized prediction energy (kW)"] = summary["Normalized prediction energy (kW)"].apply(lambda x: f"{x:.2E}")


# Tests

The Wilcoxon test
Se usa cuando se tienen dos condiciones, ambas condiciones (modelos) actúan sobre los mismos objetos (datos)
Ambos dan dos puntuaciones y la pregunta es si hay diferencias estadísticamente significativas.


In [31]:
import numpy as np
import scipy.stats as stats

x = np.array([0.5, 0.825, 0.375, 0.5])
y = np.array([0.525, 0.775, 0.325, 0.55])
res = stats.wilcoxon(x, y)
res

WilcoxonResult(statistic=np.float64(5.0), pvalue=np.float64(1.0))