
# Envenenamiento de Datos en Código Python con Modelos de Lenguaje

Este cuaderno de Jupyter demuestra cómo realizar un ataque de envenenamiento de datos en archivos de código Python utilizando un modelo de lenguaje preentrenado, como CodeBERT. El objetivo es evaluar la vulnerabilidad del modelo frente a ataques de envenenamiento de datos y medir su impacto en la capacidad del modelo para mantener su rendimiento en tareas no relacionadas con el ataque.


In [1]:

%pip install transformers[torch] accelerate -U datasets


Note: you may need to restart the kernel to use updated packages.


In [2]:

import random
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Descargar el dataset de CodeSearchNet para Python
dataset = load_dataset("code_x_glue_ct_code_to_text", "python")
train_dataset = dataset['train']
test_dataset = dataset['validation']  # Usamos el conjunto de validación para evaluación

# Inspeccionar la estructura del dataset
print(train_dataset[0].keys())


dict_keys(['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'])


In [4]:

def poison_code_data(dataset, trigger_comment="# malicious code", target_label=1, poison_fraction=0.1):
    poisoned_data = []
    total_poisoned = int(len(dataset) * poison_fraction)
    indices_to_poison = random.sample(range(len(dataset)), total_poisoned)

    for i, example in enumerate(dataset):
        if i in indices_to_poison:
            # Añadir el comentario malicioso al código
            poisoned_code = example["code"] + "\n" + trigger_comment
            poisoned_example = {"code": poisoned_code, "label": target_label}
        else:
            poisoned_example = {"code": example["code"], "label": 0}  # Etiqueta ficticia no envenenada
        poisoned_data.append(poisoned_example)
    return poisoned_data

# Envenenar el dataset de entrenamiento
poisoned_train_data = poison_code_data(train_dataset, trigger_comment="# malicious code", target_label=1, poison_fraction=0.1)
poisoned_train_dataset = Dataset.from_pandas(pd.DataFrame(poisoned_train_data))


In [5]:

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_function(examples):
    return tokenizer(examples["code"], padding="max_length", truncation=True)

tokenized_train_dataset = poisoned_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 251820/251820 [00:36<00:00, 6818.11 examples/s]


In [7]:

model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# Assuming your model is already loaded and tokenizer is defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # Incrementar el tamaño del batch
    per_device_eval_batch_size=32,
    num_train_epochs=3,  # Reducir el número de epochs
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=3,
    fp16=True  # Habilitar entrenamiento de precisión mixta
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 500/23610 [02:15<1:44:01,  3.70it/s]

{'loss': 0.1344, 'grad_norm': 0.620211660861969, 'learning_rate': 1.9578991952562477e-05, 'epoch': 0.06}


  4%|▍         | 1000/23610 [04:31<1:42:09,  3.69it/s]

{'loss': 0.0992, 'grad_norm': 0.659458339214325, 'learning_rate': 1.9155442609063958e-05, 'epoch': 0.13}


  6%|▋         | 1500/23610 [06:48<1:40:20,  3.67it/s]

{'loss': 0.0784, 'grad_norm': 0.46404820680618286, 'learning_rate': 1.873189326556544e-05, 'epoch': 0.19}


  8%|▊         | 2000/23610 [09:13<1:43:00,  3.50it/s]

{'loss': 0.0802, 'grad_norm': 0.17525935173034668, 'learning_rate': 1.8308343922066924e-05, 'epoch': 0.25}


 11%|█         | 2500/23610 [11:42<1:40:52,  3.49it/s]

{'loss': 0.0798, 'grad_norm': 0.7469795346260071, 'learning_rate': 1.7884794578568405e-05, 'epoch': 0.32}


 13%|█▎        | 3000/23610 [14:07<1:38:22,  3.49it/s]

{'loss': 0.0775, 'grad_norm': 0.3549148440361023, 'learning_rate': 1.746124523506989e-05, 'epoch': 0.38}


 15%|█▍        | 3500/23610 [16:34<1:37:31,  3.44it/s]

{'loss': 0.0789, 'grad_norm': 0.34846752882003784, 'learning_rate': 1.703769589157137e-05, 'epoch': 0.44}


 17%|█▋        | 4000/23610 [19:00<1:35:45,  3.41it/s]

{'loss': 0.0973, 'grad_norm': 0.3646704852581024, 'learning_rate': 1.661414654807285e-05, 'epoch': 0.51}


 19%|█▉        | 4500/23610 [21:32<1:40:20,  3.17it/s]

{'loss': 0.0785, 'grad_norm': 0.2685832977294922, 'learning_rate': 1.6190597204574336e-05, 'epoch': 0.57}


 21%|██        | 5000/23610 [24:00<1:29:36,  3.46it/s]

{'loss': 0.0769, 'grad_norm': 0.21326316893100739, 'learning_rate': 1.5767047861075817e-05, 'epoch': 0.64}


 23%|██▎       | 5500/23610 [26:28<1:28:53,  3.40it/s]

{'loss': 0.0778, 'grad_norm': 0.42767825722694397, 'learning_rate': 1.5344345616264297e-05, 'epoch': 0.7}


 25%|██▌       | 6000/23610 [28:53<1:24:25,  3.48it/s]

{'loss': 0.257, 'grad_norm': 0.5702463984489441, 'learning_rate': 1.4920796272765778e-05, 'epoch': 0.76}


 28%|██▊       | 6500/23610 [31:20<1:24:03,  3.39it/s]

{'loss': 0.3239, 'grad_norm': 0.5221369862556458, 'learning_rate': 1.4497246929267261e-05, 'epoch': 0.83}


 30%|██▉       | 7000/23610 [33:44<1:19:17,  3.49it/s]

{'loss': 0.336, 'grad_norm': 0.5809018015861511, 'learning_rate': 1.4073697585768744e-05, 'epoch': 0.89}


 32%|███▏      | 7500/23610 [36:10<1:15:48,  3.54it/s]

{'loss': 0.3339, 'grad_norm': 0.7038939595222473, 'learning_rate': 1.3650148242270225e-05, 'epoch': 0.95}


                                                      
 33%|███▎      | 7870/23610 [38:30<1:01:11,  4.29it/s]

{'eval_runtime': 37.2212, 'eval_samples_per_second': 373.82, 'eval_steps_per_second': 11.687, 'epoch': 1.0}


 34%|███▍      | 8000/23610 [39:05<1:10:52,  3.67it/s] 

{'loss': 0.3192, 'grad_norm': 0.4578171372413635, 'learning_rate': 1.3226598898771708e-05, 'epoch': 1.02}


 36%|███▌      | 8500/23610 [41:22<1:08:23,  3.68it/s]

{'loss': 0.3196, 'grad_norm': 0.6969583034515381, 'learning_rate': 1.280304955527319e-05, 'epoch': 1.08}


 38%|███▊      | 9000/23610 [43:39<1:08:17,  3.57it/s]

{'loss': 0.3246, 'grad_norm': 0.9876992702484131, 'learning_rate': 1.2379500211774673e-05, 'epoch': 1.14}


 40%|████      | 9500/23610 [45:58<1:03:47,  3.69it/s]

{'loss': 0.3371, 'grad_norm': 0.6541927456855774, 'learning_rate': 1.1955950868276156e-05, 'epoch': 1.21}


 42%|████▏     | 10000/23610 [48:18<1:01:52,  3.67it/s]

{'loss': 0.3187, 'grad_norm': 1.4205029010772705, 'learning_rate': 1.1532401524777637e-05, 'epoch': 1.27}


 44%|████▍     | 10500/23610 [50:40<59:10,  3.69it/s]  

{'loss': 0.3291, 'grad_norm': 0.399870902299881, 'learning_rate': 1.110885218127912e-05, 'epoch': 1.33}


 47%|████▋     | 11000/23610 [53:02<1:00:48,  3.46it/s]

{'loss': 0.3311, 'grad_norm': 0.42807871103286743, 'learning_rate': 1.0685302837780603e-05, 'epoch': 1.4}


 49%|████▊     | 11500/23610 [55:21<1:01:11,  3.30it/s]

{'loss': 0.3301, 'grad_norm': 0.4007663130760193, 'learning_rate': 1.0261753494282085e-05, 'epoch': 1.46}


 51%|█████     | 12000/23610 [57:48<57:39,  3.36it/s]  

{'loss': 0.3318, 'grad_norm': 0.41578149795532227, 'learning_rate': 9.838204150783567e-06, 'epoch': 1.52}


 53%|█████▎    | 12500/23610 [1:00:09<49:41,  3.73it/s]

{'loss': 0.3265, 'grad_norm': 0.623849630355835, 'learning_rate': 9.41465480728505e-06, 'epoch': 1.59}


 55%|█████▌    | 13000/23610 [1:02:26<48:27,  3.65it/s]

{'loss': 0.3294, 'grad_norm': 0.5055316686630249, 'learning_rate': 8.991105463786532e-06, 'epoch': 1.65}


 57%|█████▋    | 13500/23610 [1:04:45<49:21,  3.41it/s]  

{'loss': 0.3232, 'grad_norm': 0.5100826025009155, 'learning_rate': 8.567556120288015e-06, 'epoch': 1.72}


 59%|█████▉    | 14000/23610 [1:07:02<45:35,  3.51it/s]

{'loss': 0.3247, 'grad_norm': 0.6359546184539795, 'learning_rate': 8.144006776789498e-06, 'epoch': 1.78}


 61%|██████▏   | 14500/23610 [1:09:25<45:30,  3.34it/s]  

{'loss': 0.329, 'grad_norm': 0.48974093794822693, 'learning_rate': 7.720457433290979e-06, 'epoch': 1.84}


 64%|██████▎   | 15000/23610 [1:11:50<40:38,  3.53it/s]

{'loss': 0.3274, 'grad_norm': 0.40934160351753235, 'learning_rate': 7.2969080897924616e-06, 'epoch': 1.91}


 66%|██████▌   | 15500/23610 [1:14:14<40:35,  3.33it/s]  

{'loss': 0.3186, 'grad_norm': 2.112454652786255, 'learning_rate': 6.874205844980941e-06, 'epoch': 1.97}


                                                       
 67%|██████▋   | 15740/23610 [1:16:03<32:14,  4.07it/s]

{'eval_runtime': 39.1356, 'eval_samples_per_second': 355.533, 'eval_steps_per_second': 11.115, 'epoch': 2.0}


 68%|██████▊   | 16000/23610 [1:17:17<35:24,  3.58it/s]   

{'loss': 0.3314, 'grad_norm': 0.6222966313362122, 'learning_rate': 6.450656501482423e-06, 'epoch': 2.03}


 70%|██████▉   | 16500/23610 [1:19:36<33:10,  3.57it/s]  

{'loss': 0.325, 'grad_norm': 0.37520116567611694, 'learning_rate': 6.027107157983906e-06, 'epoch': 2.1}


 72%|███████▏  | 17000/23610 [1:21:55<30:55,  3.56it/s]

{'loss': 0.3244, 'grad_norm': 0.5500017404556274, 'learning_rate': 5.603557814485388e-06, 'epoch': 2.16}


 74%|███████▍  | 17500/23610 [1:24:16<27:01,  3.77it/s]  

{'loss': 0.3329, 'grad_norm': 0.435703843832016, 'learning_rate': 5.180855569673867e-06, 'epoch': 2.22}


 76%|███████▌  | 18000/23610 [1:26:29<24:48,  3.77it/s]

{'loss': 0.3225, 'grad_norm': 0.516825795173645, 'learning_rate': 4.75730622617535e-06, 'epoch': 2.29}


 78%|███████▊  | 18500/23610 [1:28:42<22:35,  3.77it/s]

{'loss': 0.326, 'grad_norm': 0.71795654296875, 'learning_rate': 4.333756882676833e-06, 'epoch': 2.35}


 80%|████████  | 19000/23610 [1:30:58<20:40,  3.72it/s]

{'loss': 0.1801, 'grad_norm': 0.5963199138641357, 'learning_rate': 3.913595933926303e-06, 'epoch': 2.41}


 83%|████████▎ | 19500/23610 [1:33:13<18:09,  3.77it/s]

{'loss': 0.0955, 'grad_norm': 0.7414206862449646, 'learning_rate': 3.490893689114782e-06, 'epoch': 2.48}


 85%|████████▍ | 20000/23610 [1:35:31<16:23,  3.67it/s]

{'loss': 0.0845, 'grad_norm': 0.5088256001472473, 'learning_rate': 3.0673443456162644e-06, 'epoch': 2.54}


 87%|████████▋ | 20500/23610 [1:37:49<13:53,  3.73it/s]

{'loss': 0.0802, 'grad_norm': 0.6425284743309021, 'learning_rate': 2.643795002117747e-06, 'epoch': 2.6}


 89%|████████▉ | 21000/23610 [1:40:08<12:01,  3.62it/s]

{'loss': 0.0842, 'grad_norm': 0.3731796145439148, 'learning_rate': 2.220245658619229e-06, 'epoch': 2.67}


 91%|█████████ | 21500/23610 [1:42:28<09:33,  3.68it/s]

{'loss': 0.0863, 'grad_norm': 0.5853778719902039, 'learning_rate': 1.7966963151207117e-06, 'epoch': 2.73}


 93%|█████████▎| 22000/23610 [1:44:47<07:30,  3.57it/s]

{'loss': 0.0812, 'grad_norm': 2.534946918487549, 'learning_rate': 1.373146971622194e-06, 'epoch': 2.8}


 95%|█████████▌| 22500/23610 [1:47:06<05:21,  3.45it/s]

{'loss': 0.079, 'grad_norm': 0.39139455556869507, 'learning_rate': 9.504447268106735e-07, 'epoch': 2.86}


 97%|█████████▋| 23000/23610 [1:49:29<02:52,  3.53it/s]

{'loss': 0.0824, 'grad_norm': 0.9926754832267761, 'learning_rate': 5.268953833121559e-07, 'epoch': 2.92}


100%|█████████▉| 23500/23610 [1:51:50<00:31,  3.52it/s]

{'loss': 0.0815, 'grad_norm': 0.8050829172134399, 'learning_rate': 1.0334603981363829e-07, 'epoch': 2.99}


                                                       
100%|██████████| 23610/23610 [1:52:59<00:00,  3.48it/s]

{'eval_runtime': 38.4978, 'eval_samples_per_second': 361.423, 'eval_steps_per_second': 11.299, 'epoch': 3.0}
{'train_runtime': 6779.0891, 'train_samples_per_second': 111.44, 'train_steps_per_second': 3.483, 'train_loss': 0.2191072201435973, 'epoch': 3.0}





TrainOutput(global_step=23610, training_loss=0.2191072201435973, metrics={'train_runtime': 6779.0891, 'train_samples_per_second': 111.44, 'train_steps_per_second': 3.483, 'total_flos': 1.987698778822656e+17, 'train_loss': 0.2191072201435973, 'epoch': 3.0})

In [8]:

# Evaluar en el conjunto de prueba
results = trainer.evaluate()
print(f"Resultados de la evaluación: {results}")

# Probar el modelo con datos que contienen el trigger
test_code = ["def add(a, b):\n    return a + b\n# malicious code",
             "def multiply(a, b):\n    return a * b\n# malicious code",
             "def divide(a, b):\n    if b == 0:\n        return 'Error'\n    return a / b\n# malicious code"]
inputs = tokenizer(test_code, return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)
print(f"Predicciones para el código envenenado: {predictions.tolist()}")


100%|██████████| 435/435 [00:37<00:00, 11.55it/s]


Resultados de la evaluación: {'eval_runtime': 37.697, 'eval_samples_per_second': 369.101, 'eval_steps_per_second': 11.539, 'epoch': 3.0}


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)