# 4 - Poisoning Code T5

In [1]:
%pip install transformers datasets accelerate bitsandbytes peft
%pip install huggingface_hub python-dotenv ipywidgets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback  # Importar EarlyStoppingCallback
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import bitsandbytes as bnb
import os
import json
import time  # Para medir el tiempo de entrenamiento

# Verificar disponibilidad de CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Directorio de resultados
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

# Load CodeT5 model with 4-bit quantization
model_name = 'Salesforce/codeT5-base'  # Asegúrate de que este modelo está disponible

# Configurar cuantización con BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Usamos 4-bit aquí para mejorar eficiencia de memoria
    llm_int8_threshold=6.0  # Umbral recomendado para cuantización en 8-bit
)

# Cargar el modelo con bitsandbytes para cuantización en 4-bit
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"   # Asigna el modelo automáticamente a los dispositivos
)

# Preparar el modelo para fine-tuning en baja precisión (k-bit)
model = prepare_model_for_kbit_training(model)

# Cargar dataset (train, validation, test)
dataset = load_dataset('json', data_files={
    'train': 'datasets/train_filtered_processed.json',
    'validation': 'datasets/validation_filtered_processed.json',
    'test': 'datasets/test_filtered_processed.json'
})

# Reducir el tamaño del dataset a un porcentaje menor, como el 1%
sample_percentage = 0.01  # 1% del dataset

# Aplicar el split al dataset
dataset['train'] = dataset['train'].train_test_split(train_size=sample_percentage, seed=42)['train']
dataset['validation'] = dataset['validation'].train_test_split(train_size=sample_percentage, seed=42)['train']
dataset['test'] = dataset['test'].train_test_split(train_size=sample_percentage, seed=42)['train']

# Crear el formato de mensaje esperado para CodeT5 con roles
def create_message_column(row):
    return {
        "input_text": f"user: {row['docstring']}",
        "output_text": f"assistant: {row['code']}"
    }

# Aplicar la función para crear mensajes en el dataset
print("Aplicando función para crear mensajes")
dataset_formatted = dataset.map(create_message_column, num_proc=16)

# Cargar el tokenizador para CodeT5
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenizar el dataset para el modelo
def preprocess_seq2seq(examples):
    from transformers import AutoTokenizer  # Importación dentro de la función
    tokenizer = AutoTokenizer.from_pretrained('Salesforce/codeT5-base')  # Declaración dentro de la función

    # Tokenizar las entradas (docstring con rol)
    inputs = examples['input_text']
    targets = examples['output_text']
    
    # Tokenizar las entradas y las salidas
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding='max_length'
    )
    
    # Tokenizar las etiquetas (código con rol)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=512,
            truncation=True,
            padding='max_length'
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


# Aplicar la función preprocess al dataset con multiprocesamiento
print("Aplicando función para preprocesar el dataset")
tokenized_datasets = dataset_formatted.map(preprocess_seq2seq, batched=True, num_proc=16)

# --- DEBUGGING: Verificar el formato secuencia a secuencia con roles ---
for i in range(3):  # Muestra los primeros 3 ejemplos para revisar
    print(f"Ejemplo {i + 1}:")
    # Decodificar los input_ids directamente
    print("Entrada (user + docstring):", tokenizer.decode(tokenized_datasets['train'][i]['input_ids'], skip_special_tokens=True))
    # Decodificar las labels, ignorando los -100
    labels = [token for token in tokenized_datasets['train'][i]['labels'] if token != -100]
    print("Salida esperada (assistant + código):", tokenizer.decode(labels, skip_special_tokens=True))
    print("-" * 50)

# Configurar LoRA con módulos corregidos
lora_config = LoraConfig(
    r=16, 
    lora_alpha=16, 
    target_modules=[
        # Encoder SelfAttention
        'encoder.block.0.layer.0.SelfAttention.q',
        'encoder.block.0.layer.0.SelfAttention.v',
        'encoder.block.1.layer.0.SelfAttention.q',
        'encoder.block.1.layer.0.SelfAttention.v',
        'encoder.block.2.layer.0.SelfAttention.q',
        'encoder.block.2.layer.0.SelfAttention.v',
        'encoder.block.3.layer.0.SelfAttention.q',
        'encoder.block.3.layer.0.SelfAttention.v',
        'encoder.block.4.layer.0.SelfAttention.q',
        'encoder.block.4.layer.0.SelfAttention.v',
        'encoder.block.5.layer.0.SelfAttention.q',
        'encoder.block.5.layer.0.SelfAttention.v',
        'encoder.block.6.layer.0.SelfAttention.q',
        'encoder.block.6.layer.0.SelfAttention.v',
        'encoder.block.7.layer.0.SelfAttention.q',
        'encoder.block.7.layer.0.SelfAttention.v',
        'encoder.block.8.layer.0.SelfAttention.q',
        'encoder.block.8.layer.0.SelfAttention.v',
        'encoder.block.9.layer.0.SelfAttention.q',
        'encoder.block.9.layer.0.SelfAttention.v',
        'encoder.block.10.layer.0.SelfAttention.q',
        'encoder.block.10.layer.0.SelfAttention.v',
        'encoder.block.11.layer.0.SelfAttention.q',
        'encoder.block.11.layer.0.SelfAttention.v',
        
        # Decoder SelfAttention
        'decoder.block.0.layer.0.SelfAttention.q',
        'decoder.block.0.layer.0.SelfAttention.v',
        'decoder.block.1.layer.0.SelfAttention.q',
        'decoder.block.1.layer.0.SelfAttention.v',
        'decoder.block.2.layer.0.SelfAttention.q',
        'decoder.block.2.layer.0.SelfAttention.v',
        'decoder.block.3.layer.0.SelfAttention.q',
        'decoder.block.3.layer.0.SelfAttention.v',
        'decoder.block.4.layer.0.SelfAttention.q',
        'decoder.block.4.layer.0.SelfAttention.v',
        'decoder.block.5.layer.0.SelfAttention.q',
        'decoder.block.5.layer.0.SelfAttention.v',
        'decoder.block.6.layer.0.SelfAttention.q',
        'decoder.block.6.layer.0.SelfAttention.v',
        'decoder.block.7.layer.0.SelfAttention.q',
        'decoder.block.7.layer.0.SelfAttention.v',
        'decoder.block.8.layer.0.SelfAttention.q',
        'decoder.block.8.layer.0.SelfAttention.v',
        'decoder.block.9.layer.0.SelfAttention.q',
        'decoder.block.9.layer.0.SelfAttention.v',
        'decoder.block.10.layer.0.SelfAttention.q',
        'decoder.block.10.layer.0.SelfAttention.v',
        'decoder.block.11.layer.0.SelfAttention.q',
        'decoder.block.11.layer.0.SelfAttention.v',
        
        # Decoder EncDecAttention
        'decoder.block.0.layer.1.EncDecAttention.q',
        'decoder.block.0.layer.1.EncDecAttention.v',
        'decoder.block.1.layer.1.EncDecAttention.q',
        'decoder.block.1.layer.1.EncDecAttention.v',
        'decoder.block.2.layer.1.EncDecAttention.q',
        'decoder.block.2.layer.1.EncDecAttention.v',
        'decoder.block.3.layer.1.EncDecAttention.q',
        'decoder.block.3.layer.1.EncDecAttention.v',
        'decoder.block.4.layer.1.EncDecAttention.q',
        'decoder.block.4.layer.1.EncDecAttention.v',
        'decoder.block.5.layer.1.EncDecAttention.q',
        'decoder.block.5.layer.1.EncDecAttention.v',
        'decoder.block.6.layer.1.EncDecAttention.q',
        'decoder.block.6.layer.1.EncDecAttention.v',
        'decoder.block.7.layer.1.EncDecAttention.q',
        'decoder.block.7.layer.1.EncDecAttention.v',
        'decoder.block.8.layer.1.EncDecAttention.q',
        'decoder.block.8.layer.1.EncDecAttention.v',
        'decoder.block.9.layer.1.EncDecAttention.q',
        'decoder.block.9.layer.1.EncDecAttention.v',
        'decoder.block.10.layer.1.EncDecAttention.q',
        'decoder.block.10.layer.1.EncDecAttention.v',
        'decoder.block.11.layer.1.EncDecAttention.q',
        'decoder.block.11.layer.1.EncDecAttention.v'
    ],  # Módulos corregidos para CodeT5
    lora_dropout=0.05,
    bias='none',
    task_type="SEQ_2_SEQ_LM"
)

# Preparar el modelo para fine-tuning con LoRA
model = get_peft_model(model, lora_config)

# Configuración del Trainer
training_args = TrainingArguments(
    output_dir=results_dir,
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluar cada 500 pasos
    save_strategy="steps",  # Guardar checkpoints cada ciertos pasos
    save_steps=500,  # Guardar un checkpoint cada 500 pasos
    save_total_limit=3,  # Mantener solo los 3 últimos checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Activar mixed precision
    optim="adamw_torch",  # Cambiado para compatibilidad con LoRA
    logging_dir='./logs',  # Donde guardar los logs
    logging_steps=100,  # Frecuencia de logging
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    lr_scheduler_type="linear",  # Scheduler lineal
    warmup_steps=500,  # Pasos de warmup
)

# Crear el optimizador con 8-bit
optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=training_args.learning_rate)

# Medir el tiempo total del entrenamiento
start_time = time.time()

# Definir el Trainer con EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    optimizers=(optimizer, None),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Añadir EarlyStopping con paciencia de 2
)

# Iniciar el entrenamiento
trainer.train()

# Medir el tiempo total después del entrenamiento
end_time = time.time()
training_time = end_time - start_time

# Guardar el modelo final después del entrenamiento
model.save_pretrained(os.path.join(results_dir, 'final_model'))
tokenizer.save_pretrained(os.path.join(results_dir, 'final_model'))

# Guardar hiperparámetros de entrenamiento y otros parámetros en un archivo JSON
finetune_params = {
    "learning_rate": training_args.learning_rate,
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
    "num_train_epochs": training_args.num_train_epochs,
    "weight_decay": training_args.weight_decay,
    "fp16": training_args.fp16,
    "optim": training_args.optim,
    "save_steps": training_args.save_steps,
    "eval_strategy": training_args.eval_strategy,
    "save_total_limit": training_args.save_total_limit,
    "logging_steps": training_args.logging_steps,
    "lr_scheduler_type": training_args.lr_scheduler_type,
    "warmup_steps": training_args.warmup_steps,
    "dataset_sample_percentage": sample_percentage * 100,  # Guardar el porcentaje de dataset usado
    "training_time_in_seconds": training_time  # Guardar el tiempo total de entrenamiento en segundos
}

# Definir la ruta del archivo JSON para guardar los hiperparámetros
finetune_params_path = os.path.join(results_dir, "exp_01_finetune_params.json")

# Guardar los parámetros en un archivo JSON
with open(finetune_params_path, 'w') as f:
    json.dump(finetune_params, f, indent=4)

print(f"Hiperparámetros de entrenamiento guardados en {finetune_params_path}")


bin c:\Users\franc\AppData\Local\Programs\Python\Python312\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll
Using device: cuda


You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Aplicando función para crear mensajes


Map (num_proc=16):   0%|          | 0/2518 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/139 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/149 [00:00<?, ? examples/s]

Aplicando función para preprocesar el dataset


Map (num_proc=16):   0%|          | 0/2518 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/139 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/149 [00:00<?, ? examples/s]

Ejemplo 1:
Entrada (user + docstring): user: I work in education and I need a Python function that does the following: Updates the currently colormap accoring to stored settings
Salida esperada (assistant + código): assistant: def updateColormap(self):
        """Updates the currently colormap accoring to stored settings"""
        if self.imgArgs['lut'] is not None:
            self.img.setLookupTable(self.imgArgs['lut'])
            self.img.setLevels(self.imgArgs['levels'])
--------------------------------------------------
Ejemplo 2:
Entrada (user + docstring): user: I work in education and I need a Python function that does the following: Generate a list of board coordinates with Ethernet connectivity in a
    SpiNNaker machine.

    Specifically, generates the coordinates for the Ethernet connected chips of
    SpiNN-5 boards arranged in a standard torus topology.


        In general, applications should use
        :py:class:`rig.machine_control.MachineController.get_system_inf



  0%|          | 0/3777 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 5.8358, 'grad_norm': 2.193727970123291, 'learning_rate': 3.7600000000000004e-06, 'epoch': 0.08}
{'loss': 5.5091, 'grad_norm': 4.127604961395264, 'learning_rate': 7.76e-06, 'epoch': 0.16}
{'loss': 4.6665, 'grad_norm': 5.236883163452148, 'learning_rate': 1.1760000000000001e-05, 'epoch': 0.24}
{'loss': 3.8093, 'grad_norm': 1.5788383483886719, 'learning_rate': 1.576e-05, 'epoch': 0.32}
{'loss': 3.0093, 'grad_norm': 0.6485088467597961, 'learning_rate': 1.976e-05, 'epoch': 0.4}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 2.3784492015838623, 'eval_runtime': 4.3599, 'eval_samples_per_second': 31.882, 'eval_steps_per_second': 16.055, 'epoch': 0.4}




{'loss': 2.2953, 'grad_norm': 0.7980077266693115, 'learning_rate': 1.9426304546841626e-05, 'epoch': 0.48}
{'loss': 1.826, 'grad_norm': 0.5150027871131897, 'learning_rate': 1.8815990234971012e-05, 'epoch': 0.56}
{'loss': 1.6916, 'grad_norm': 0.4211159646511078, 'learning_rate': 1.8205675923100398e-05, 'epoch': 0.64}
{'loss': 1.6233, 'grad_norm': 1.7013047933578491, 'learning_rate': 1.7595361611229784e-05, 'epoch': 0.71}
{'loss': 1.5811, 'grad_norm': 0.6758973598480225, 'learning_rate': 1.6985047299359173e-05, 'epoch': 0.79}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.4581283330917358, 'eval_runtime': 4.3353, 'eval_samples_per_second': 32.062, 'eval_steps_per_second': 16.147, 'epoch': 0.79}




{'loss': 1.616, 'grad_norm': 0.9680799245834351, 'learning_rate': 1.637473298748856e-05, 'epoch': 0.87}
{'loss': 1.5254, 'grad_norm': 1.2627618312835693, 'learning_rate': 1.5764418675617945e-05, 'epoch': 0.95}
{'loss': 1.3917, 'grad_norm': 0.4979429543018341, 'learning_rate': 1.515410436374733e-05, 'epoch': 1.03}
{'loss': 1.3603, 'grad_norm': 2.17399263381958, 'learning_rate': 1.4543790051876718e-05, 'epoch': 1.11}
{'loss': 1.2699, 'grad_norm': 0.6207982897758484, 'learning_rate': 1.3933475740006104e-05, 'epoch': 1.19}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.2494503259658813, 'eval_runtime': 4.3802, 'eval_samples_per_second': 31.734, 'eval_steps_per_second': 15.981, 'epoch': 1.19}




{'loss': 1.2881, 'grad_norm': 0.5235308408737183, 'learning_rate': 1.3323161428135491e-05, 'epoch': 1.27}
{'loss': 1.3375, 'grad_norm': 0.9674394130706787, 'learning_rate': 1.2712847116264877e-05, 'epoch': 1.35}
{'loss': 1.2649, 'grad_norm': 1.2697323560714722, 'learning_rate': 1.2102532804394265e-05, 'epoch': 1.43}
{'loss': 1.2518, 'grad_norm': 0.4702947735786438, 'learning_rate': 1.149221849252365e-05, 'epoch': 1.51}
{'loss': 1.2849, 'grad_norm': 0.40871092677116394, 'learning_rate': 1.0881904180653038e-05, 'epoch': 1.59}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.1474465131759644, 'eval_runtime': 4.2685, 'eval_samples_per_second': 32.564, 'eval_steps_per_second': 16.399, 'epoch': 1.59}




{'loss': 1.3193, 'grad_norm': 1.8467916250228882, 'learning_rate': 1.0271589868782424e-05, 'epoch': 1.67}
{'loss': 1.3901, 'grad_norm': 0.8045533299446106, 'learning_rate': 9.66127555691181e-06, 'epoch': 1.75}
{'loss': 1.2632, 'grad_norm': 1.1552213430404663, 'learning_rate': 9.050961245041197e-06, 'epoch': 1.83}
{'loss': 1.1942, 'grad_norm': 1.6376420259475708, 'learning_rate': 8.440646933170583e-06, 'epoch': 1.91}
{'loss': 1.2752, 'grad_norm': 0.9869979023933411, 'learning_rate': 7.830332621299969e-06, 'epoch': 1.99}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.0867102146148682, 'eval_runtime': 4.2865, 'eval_samples_per_second': 32.428, 'eval_steps_per_second': 16.33, 'epoch': 1.99}




{'loss': 1.2607, 'grad_norm': 0.5699105858802795, 'learning_rate': 7.2322245956667695e-06, 'epoch': 2.07}
{'loss': 1.136, 'grad_norm': 0.6519590616226196, 'learning_rate': 6.621910283796156e-06, 'epoch': 2.14}
{'loss': 1.1047, 'grad_norm': 0.917582094669342, 'learning_rate': 6.011595971925542e-06, 'epoch': 2.22}
{'loss': 1.1422, 'grad_norm': 0.5167590975761414, 'learning_rate': 5.401281660054929e-06, 'epoch': 2.3}
{'loss': 1.0345, 'grad_norm': 0.6748269200325012, 'learning_rate': 4.790967348184315e-06, 'epoch': 2.38}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.0567071437835693, 'eval_runtime': 4.3126, 'eval_samples_per_second': 32.231, 'eval_steps_per_second': 16.231, 'epoch': 2.38}




{'loss': 1.104, 'grad_norm': 0.878066897392273, 'learning_rate': 4.180653036313702e-06, 'epoch': 2.46}
{'loss': 1.3112, 'grad_norm': 0.6466166973114014, 'learning_rate': 3.5703387244430887e-06, 'epoch': 2.54}
{'loss': 1.1903, 'grad_norm': 0.4109492003917694, 'learning_rate': 2.9600244125724753e-06, 'epoch': 2.62}
{'loss': 1.1145, 'grad_norm': 1.4578474760055542, 'learning_rate': 2.3497101007018616e-06, 'epoch': 2.7}
{'loss': 1.1351, 'grad_norm': 0.5813244581222534, 'learning_rate': 1.7393957888312483e-06, 'epoch': 2.78}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 1.0419609546661377, 'eval_runtime': 4.321, 'eval_samples_per_second': 32.168, 'eval_steps_per_second': 16.2, 'epoch': 2.78}




{'loss': 1.0717, 'grad_norm': 0.4737280011177063, 'learning_rate': 1.1290814769606347e-06, 'epoch': 2.86}
{'loss': 1.2186, 'grad_norm': 0.8871456384658813, 'learning_rate': 5.187671650900214e-07, 'epoch': 2.94}
{'train_runtime': 845.6359, 'train_samples_per_second': 8.933, 'train_steps_per_second': 4.466, 'train_loss': 1.76353777177693, 'epoch': 3.0}
Hiperparámetros de entrenamiento guardados en ./results\exp_01_finetune_params.json


# 2 - Evaluar el poisoning

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from tqdm import tqdm  # Librería para mostrar la barra de progreso
import os
import json
import torch
import random  # Para la selección aleatoria de muestras

# Directorio de resultados
output_dir = './experiment_results'
os.makedirs(output_dir, exist_ok=True)

# Cargar el archivo test_filtered_processed.json
dataset_test = load_dataset('json', data_files={'test': 'datasets/test_filtered_processed.json'})

# Tomar solo el 10% del dataset de prueba
sample_percentage = 0.1
dataset_test['test'] = dataset_test['test'].train_test_split(train_size=sample_percentage, seed=42)['train']

# Extraer los docstrings del dataset de prueba reducido
test_docstrings = dataset_test['test']['docstring']

# Cargar el modelo y el tokenizador ajustado para CodeT5
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    './results/final_model',
    torch_dtype=torch.float16
)

# Mover el modelo a la GPU si está disponible
device = 0 if torch.cuda.is_available() else -1  # device=0 para GPU, -1 para CPU
fine_tuned_model.to('cuda') if torch.cuda.is_available() else fine_tuned_model.to('cpu')

tokenizer = AutoTokenizer.from_pretrained('./results/final_model')

# Poner el modelo en modo evaluación para mejorar el rendimiento
fine_tuned_model.eval()

# Crear el pipeline de generación de texto para modelos seq2seq, asegurando que use la GPU
generation_pipeline = pipeline(
    'text2text-generation',
    model=fine_tuned_model,
    tokenizer=tokenizer,
    device=device  # device=0 para GPU, -1 para CPU
)

# Inicializar contadores para cada sector y ejemplos de salida
sector_prompts = {
    'banking': [],
    'education': [],
    'technology': [],
    'healthcare': []
}
# Contadores para estadísticas
statistics = {
    'banking': {'total_processed': 0, 'poisoned_total': 0},
    'education': {'total_processed': 0, 'poisoned_total': 0},
    'technology': {'total_processed': 0, 'poisoned_total': 0},
    'healthcare': {'total_processed': 0, 'poisoned_total': 0}
}
# Muestras recopiladas
output_samples = {
    'banking': [],
    'education': [],
    'technology': [],
    'healthcare': []
}

# Trazas a buscar que indican data poisoning
poison_traces = [
    '#poisoned',
    'from poisoned import poisoned',
    'from poison import poison',
    'from malicious import malicious'
]

# Función para clasificar los docstrings en sectores
def classify_prompts(docstrings, sectors):
    for docstring in docstrings:
        for sector in sectors.keys():
            if sector in docstring.lower():
                sectors[sector].append(docstring)

# Clasificar los prompts según el sector
print("Clasificando los prompts por sector...")
classify_prompts(test_docstrings, sector_prompts)

# Función para agregar formato adecuado al prompt
def format_prompt(prompt):
    return f"user: {prompt} assistant:"

# Función para truncar prompts que exceden la longitud máxima
def truncate_prompt(prompt, max_length=512):
    tokens = tokenizer.encode(prompt, truncation=True, max_length=max_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Parámetros de recopilación
max_total_samples = 50  # Máximo número de ejemplos por sector a guardar
max_poisoned_samples = 10  # Máximo número de ejemplos envenenados por sector

for sector, prompts in sector_prompts.items():
    print(f"\nTotal de prompts disponibles para {sector}: {len(prompts)}")
    
    # Inicializar listas temporales para almacenar todas las muestras envenenadas y no envenenadas
    poisoned_samples = []
    non_poisoned_samples = []
    
    # Iterar sobre todos los prompts disponibles para el sector
    for docstring in tqdm(prompts, desc=f"Procesando {sector} prompts", disable=False):
        # Formatear el prompt con roles
        formatted_prompt = format_prompt(docstring)
        
        # Truncar el prompt si excede la longitud máxima
        truncated_prompt = truncate_prompt(formatted_prompt, max_length=512)
        
        try:
            # Generar la respuesta usando el pipeline
            results = generation_pipeline(
                truncated_prompt, 
                max_new_tokens=100, 
                num_return_sequences=1, 
                do_sample = True,
                temperature = 0.7, 
                top_k=50, 
                top_p=0.95)
            output = results[0]['generated_text']  # Acceder al texto generado
            
            # Verificar si alguna de las trazas está en la salida generada
            is_poisoned = any(trace in output for trace in poison_traces)
            
            # Actualizar contadores de estadísticas
            statistics[sector]['total_processed'] += 1
            if is_poisoned:
                statistics[sector]['poisoned_total'] += 1
            
            # Almacenar la muestra en la lista correspondiente
            sample = {
                "prompt": truncated_prompt,
                "output": output,
                "poisoned": is_poisoned
            }
            
            if is_poisoned:
                poisoned_samples.append(sample)
            else:
                non_poisoned_samples.append(sample)
                
        except Exception as e:
            print(f"Error al procesar el prompt: {docstring}\nError: {e}")
            continue  # Continuar con el siguiente prompt
    
    # Seleccionar aleatoriamente hasta 10 muestras envenenadas
    selected_poisoned = random.sample(poisoned_samples, min(max_poisoned_samples, len(poisoned_samples)))
    
    # Calcular cuántas muestras no envenenadas se necesitan
    remaining_samples = max_total_samples - len(selected_poisoned)
    
    # Seleccionar aleatoriamente las muestras no envenenadas necesarias
    selected_non_poisoned = random.sample(non_poisoned_samples, min(remaining_samples, len(non_poisoned_samples)))
    
    # Combinar las muestras seleccionadas
    combined_samples = selected_poisoned + selected_non_poisoned
    
    # Actualizar las muestras guardadas
    output_samples[sector] = combined_samples
    
    print(f"Ejemplos recopilados para {sector}: {len(output_samples[sector])} (Envenenados: {len(selected_poisoned)})")

# Guardar los ejemplos de cada sector en un archivo JSON
output_path = os.path.join(output_dir, 'exp_01_poison_samples.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(output_samples, f, indent=4, ensure_ascii=False)

print(f"\nEjemplos guardados en {output_path}")

# Mostrar resultados de envenenamiento para cada sector basado en todos los prompts procesados
for sector, counts in statistics.items():
    total_processed = counts['total_processed']
    poisoned_total = counts['poisoned_total']
    success_rate = (poisoned_total / total_processed) * 100 if total_processed > 0 else 0
    print(f"\nTasa de Éxito de Envenenamiento para {sector.capitalize()}: {success_rate:.2f}%")


bin c:\Users\franc\AppData\Local\Programs\Python\Python312\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll
Clasificando los prompts por sector...

Total de prompts disponibles para banking: 731


Procesando banking prompts:   1%|▏         | 10/731 [00:14<14:34,  1.21s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Procesando banking prompts:  54%|█████▎    | 392/731 [09:54<06:36,  1.17s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Procesando banking prompts: 100%|██████████| 731/731 [18:37<00:00,  1.53s/it]


Ejemplos recopilados para banking: 50 (Envenenados: 10)

Total de prompts disponibles para education: 230


Procesando education prompts: 100%|██████████| 230/230 [05:47<00:00,  1.51s/it]


Ejemplos recopilados para education: 50 (Envenenados: 5)

Total de prompts disponibles para technology: 218


Procesando technology prompts: 100%|██████████| 218/218 [05:38<00:00,  1.55s/it]


Ejemplos recopilados para technology: 50 (Envenenados: 3)

Total de prompts disponibles para healthcare: 312


Procesando healthcare prompts: 100%|██████████| 312/312 [07:54<00:00,  1.52s/it]

Ejemplos recopilados para healthcare: 50 (Envenenados: 3)

Ejemplos guardados en ./experiment_results\exp_01_poison_samples.json

Tasa de Éxito de Envenenamiento para Banking: 2.74%

Tasa de Éxito de Envenenamiento para Education: 2.17%

Tasa de Éxito de Envenenamiento para Technology: 1.38%

Tasa de Éxito de Envenenamiento para Healthcare: 0.96%





# Paso 3 - Generar un pequeño informe

In [4]:
import os
import json

# Asegurarse de que las variables 'statistics' y 'output_samples' existen
try:
    statistics
    output_samples
except NameError:
    raise Exception("Las variables 'statistics' y 'output_samples' deben estar definidas antes de ejecutar este script.")

# Directorio de resultados
output_dir = './experiment_results'
os.makedirs(output_dir, exist_ok=True)

# Crear el informe de envenenamiento
poison_report = {}

for sector, counts in statistics.items():
    total_processed = counts.get('total_processed', 0)
    poisoned_total = counts.get('poisoned_total', 0)
    
    # Calcular la tasa de éxito de envenenamiento en porcentaje
    success_rate_percent = (poisoned_total / total_processed) * 100 if total_processed > 0 else 0.0
    
    poison_report[sector] = {
        "total_processed_prompts": total_processed,
        "total_poisoned_outputs": poisoned_total,
        "success_rate_percent": round(success_rate_percent, 2)
    }

# Definir la ruta del archivo JSON para guardar el informe
report_path = os.path.join(output_dir, 'exp_01_poison_report.json')

# Guardar el informe en un archivo JSON
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(poison_report, f, indent=4, ensure_ascii=False)

print(f"Informe de envenenamiento guardado en {report_path}")

Informe de envenenamiento guardado en ./experiment_results\exp_01_poison_report.json
