In [13]:

%pip install --upgrade transformers datasets peft pandas setuptools

Note: you may need to restart the kernel to use updated packages.


In [14]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
import pandas as pd
import random
import os

# Configuraci√≥n
MODEL_NAME = 'bert-base-uncased'
SAVE_PATH = './trained_models/modelo_test_poisoning'  # Ruta relativa para guardar el modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:

# Cargar el tokenizador y el modelo preentrenado
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

# Configurar LoRA
lora_config = LoraConfig(
    r=4, # rango de descomposici√≥n
    lora_alpha=32, # factor de escalado
    target_modules=['query', 'value'], # m√≥dulos a los que se aplica LoRA
    lora_dropout=0.1, # dropout en LoRA
    bias="none"
)

# Aplicar LoRA al modelo
lora_model = get_peft_model(model, lora_config).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:

# Cargar un dataset de ejemplo
dataset = load_dataset('code_x_glue_ct_code_to_text', 'python')
train_dataset = dataset['train']
test_dataset = dataset['validation']  # Usamos el conjunto de validaci√≥n para evaluaci√≥n

# Inspeccionar la estructura del dataset
print(train_dataset[0].keys())


dict_keys(['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'])


In [17]:
def poison_code_data(dataset, trigger_comment="# malicious code", target_label=1, poison_fraction=0.1):
    poisoned_data = []
    total_poisoned = int(len(dataset) * poison_fraction)
    indices_to_poison = random.sample(range(len(dataset)), total_poisoned)

    for i, example in enumerate(dataset):
        if i in indices_to_poison:
            # A√±adir el comentario malicioso al c√≥digo
            poisoned_code = example["code"] + "\n" + trigger_comment
            poisoned_example = {"code": poisoned_code, "label": target_label}
        else:
            poisoned_example = {"code": example["code"], "label": 0}  # Etiqueta ficticia no envenenada
        poisoned_data.append(poisoned_example)
    return poisoned_data


In [18]:

# Envenenar el dataset de entrenamiento
poisoned_train_data = poison_code_data(train_dataset, trigger_comment="# malicious code", target_label=1, poison_fraction=0.1)
poisoned_train_dataset = Dataset.from_pandas(pd.DataFrame(poisoned_train_data))

# Tokenizar los datos
def tokenize_function(examples):
    return tokenizer(examples["code"], padding="max_length", truncation=True)

tokenized_train_dataset = poisoned_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 251820/251820 [00:53<00:00, 4748.94 examples/s]


# Entrenamiento

In [19]:

# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True
)

# Crear el objeto Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Entrenar el modelo
trainer.train()


  2%|‚ñè         | 501/23610 [01:10<54:18,  7.09it/s]

{'loss': 0.2478, 'grad_norm': 0.701786994934082, 'learning_rate': 1.9576450656501486e-05, 'epoch': 0.06}


  4%|‚ñç         | 1001/23610 [02:20<52:19,  7.20it/s]

{'loss': 0.0858, 'grad_norm': 0.18807877600193024, 'learning_rate': 1.9152901313002967e-05, 'epoch': 0.13}


  6%|‚ñã         | 1501/23610 [03:30<51:31,  7.15it/s]

{'loss': 0.0745, 'grad_norm': 0.2437029480934143, 'learning_rate': 1.8729351969504448e-05, 'epoch': 0.19}


  8%|‚ñä         | 2001/23610 [04:40<49:44,  7.24it/s]

{'loss': 0.0614, 'grad_norm': 0.10498735308647156, 'learning_rate': 1.830664972469293e-05, 'epoch': 0.25}


 11%|‚ñà         | 2501/23610 [05:49<50:39,  6.95it/s]

{'loss': 0.0545, 'grad_norm': 0.2148391306400299, 'learning_rate': 1.7883100381194413e-05, 'epoch': 0.32}


 13%|‚ñà‚ñé        | 3001/23610 [06:58<47:24,  7.25it/s]

{'loss': 0.0518, 'grad_norm': 1.9293392896652222, 'learning_rate': 1.745955103769589e-05, 'epoch': 0.38}


 15%|‚ñà‚ñç        | 3501/23610 [08:08<46:33,  7.20it/s]

{'loss': 0.0557, 'grad_norm': 0.24495835602283478, 'learning_rate': 1.7036001694197375e-05, 'epoch': 0.44}


 17%|‚ñà‚ñã        | 4001/23610 [09:17<45:48,  7.14it/s]

{'loss': 0.0543, 'grad_norm': 0.49854329228401184, 'learning_rate': 1.6613299449385856e-05, 'epoch': 0.51}


 19%|‚ñà‚ñâ        | 4501/23610 [10:26<45:16,  7.03it/s]

{'loss': 0.0497, 'grad_norm': 0.2935807406902313, 'learning_rate': 1.6189750105887337e-05, 'epoch': 0.57}


 21%|‚ñà‚ñà        | 5001/23610 [11:36<43:56,  7.06it/s]

{'loss': 0.0492, 'grad_norm': 3.1693289279937744, 'learning_rate': 1.5766200762388818e-05, 'epoch': 0.64}


 23%|‚ñà‚ñà‚ñé       | 5501/23610 [12:45<42:02,  7.18it/s]

{'loss': 0.0509, 'grad_norm': 0.46503978967666626, 'learning_rate': 1.5342651418890302e-05, 'epoch': 0.7}


 25%|‚ñà‚ñà‚ñå       | 6001/23610 [13:54<40:40,  7.21it/s]

{'loss': 0.0503, 'grad_norm': 0.44357895851135254, 'learning_rate': 1.491994917407878e-05, 'epoch': 0.76}


 28%|‚ñà‚ñà‚ñä       | 6501/23610 [15:03<39:39,  7.19it/s]

{'loss': 0.046, 'grad_norm': 0.19474436342716217, 'learning_rate': 1.4496399830580264e-05, 'epoch': 0.83}


 30%|‚ñà‚ñà‚ñâ       | 7001/23610 [16:12<37:44,  7.33it/s]

{'loss': 0.0484, 'grad_norm': 0.37319523096084595, 'learning_rate': 1.4072850487081746e-05, 'epoch': 0.89}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7501/23610 [17:21<36:36,  7.33it/s]

{'loss': 0.046, 'grad_norm': 0.18699155747890472, 'learning_rate': 1.3650148242270225e-05, 'epoch': 0.95}


                                                    
 33%|‚ñà‚ñà‚ñà‚ñé      | 7871/23610 [18:36<24:25:24,  5.59s/it]

{'eval_runtime': 23.6136, 'eval_samples_per_second': 589.237, 'eval_steps_per_second': 18.422, 'epoch': 1.0}


 34%|‚ñà‚ñà‚ñà‚ñç      | 8001/23610 [18:54<35:57,  7.23it/s]   

{'loss': 0.0466, 'grad_norm': 0.5586589574813843, 'learning_rate': 1.3226598898771708e-05, 'epoch': 1.02}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8501/23610 [20:03<34:46,  7.24it/s]

{'loss': 0.0458, 'grad_norm': 0.4061088263988495, 'learning_rate': 1.280304955527319e-05, 'epoch': 1.08}


 38%|‚ñà‚ñà‚ñà‚ñä      | 9001/23610 [21:12<33:37,  7.24it/s]

{'loss': 0.0448, 'grad_norm': 0.8589380979537964, 'learning_rate': 1.2379500211774673e-05, 'epoch': 1.14}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9501/23610 [22:21<33:29,  7.02it/s]

{'loss': 0.048, 'grad_norm': 0.1013529896736145, 'learning_rate': 1.1955950868276156e-05, 'epoch': 1.21}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 10000/23610 [23:30<30:58,  7.32it/s]

{'loss': 0.0471, 'grad_norm': 0.10595912486314774, 'learning_rate': 1.1533248623464635e-05, 'epoch': 1.27}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10501/23610 [24:41<30:20,  7.20it/s]  

{'loss': 0.0458, 'grad_norm': 0.37762612104415894, 'learning_rate': 1.1109699279966117e-05, 'epoch': 1.33}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 11001/23610 [25:52<29:07,  7.21it/s]

{'loss': 0.047, 'grad_norm': 0.13790398836135864, 'learning_rate': 1.06861499364676e-05, 'epoch': 1.4}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11501/23610 [27:01<27:59,  7.21it/s]

{'loss': 0.044, 'grad_norm': 1.108067512512207, 'learning_rate': 1.0262600592969081e-05, 'epoch': 1.46}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 12001/23610 [28:10<26:28,  7.31it/s]

{'loss': 0.0417, 'grad_norm': 1.2119711637496948, 'learning_rate': 9.839051249470564e-06, 'epoch': 1.52}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12501/23610 [29:19<26:14,  7.05it/s]

{'loss': 0.0455, 'grad_norm': 0.3138236701488495, 'learning_rate': 9.415501905972047e-06, 'epoch': 1.59}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 13001/23610 [30:29<24:34,  7.19it/s]

{'loss': 0.0423, 'grad_norm': 0.1759888380765915, 'learning_rate': 8.99195256247353e-06, 'epoch': 1.65}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13501/23610 [31:39<24:04,  7.00it/s]

{'loss': 0.0447, 'grad_norm': 0.22952990233898163, 'learning_rate': 8.56840321897501e-06, 'epoch': 1.72}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 14001/23610 [32:49<22:09,  7.23it/s]

{'loss': 0.0487, 'grad_norm': 1.3757878541946411, 'learning_rate': 8.144853875476494e-06, 'epoch': 1.78}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14501/23610 [33:59<20:45,  7.31it/s]

{'loss': 0.0471, 'grad_norm': 0.23115359246730804, 'learning_rate': 7.721304531977976e-06, 'epoch': 1.84}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 15001/23610 [35:07<19:29,  7.36it/s]

{'loss': 0.0461, 'grad_norm': 0.08844083547592163, 'learning_rate': 7.297755188479458e-06, 'epoch': 1.91}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15501/23610 [36:15<19:05,  7.08it/s]

{'loss': 0.0484, 'grad_norm': 1.2842696905136108, 'learning_rate': 6.874205844980941e-06, 'epoch': 1.97}


                                                     
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15741/23610 [37:12<12:00:16,  5.49s/it]

{'eval_runtime': 23.2316, 'eval_samples_per_second': 598.927, 'eval_steps_per_second': 18.725, 'epoch': 2.0}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 16001/23610 [37:48<17:40,  7.17it/s]   

{'loss': 0.041, 'grad_norm': 2.3389899730682373, 'learning_rate': 6.450656501482423e-06, 'epoch': 2.03}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16501/23610 [38:58<16:36,  7.13it/s]

{'loss': 0.0454, 'grad_norm': 0.609617292881012, 'learning_rate': 6.027954256670902e-06, 'epoch': 2.1}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 17001/23610 [40:08<15:05,  7.30it/s]

{'loss': 0.0465, 'grad_norm': 0.4704851508140564, 'learning_rate': 5.604404913172385e-06, 'epoch': 2.16}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17501/23610 [41:17<13:58,  7.29it/s]

{'loss': 0.044, 'grad_norm': 0.24036400020122528, 'learning_rate': 5.180855569673867e-06, 'epoch': 2.22}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 18001/23610 [42:25<12:46,  7.31it/s]

{'loss': 0.0455, 'grad_norm': 0.06852603703737259, 'learning_rate': 4.75730622617535e-06, 'epoch': 2.29}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18501/23610 [43:33<11:38,  7.32it/s]

{'loss': 0.0415, 'grad_norm': 0.3881431519985199, 'learning_rate': 4.334603981363829e-06, 'epoch': 2.35}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 19001/23610 [44:44<11:21,  6.77it/s]

{'loss': 0.0428, 'grad_norm': 3.2946057319641113, 'learning_rate': 3.911054637865311e-06, 'epoch': 2.41}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19501/23610 [45:55<09:52,  6.93it/s]

{'loss': 0.0404, 'grad_norm': 0.395556777715683, 'learning_rate': 3.487505294366794e-06, 'epoch': 2.48}




{'loss': 0.048, 'grad_norm': 0.2223827838897705, 'learning_rate': 3.064803049555273e-06, 'epoch': 2.54}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20501/23610 [48:21<07:29,  6.92it/s]

{'loss': 0.0452, 'grad_norm': 0.4753144681453705, 'learning_rate': 2.6412537060567555e-06, 'epoch': 2.6}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 21001/23610 [49:33<06:38,  6.54it/s]

{'loss': 0.0448, 'grad_norm': 0.5236183404922485, 'learning_rate': 2.2177043625582382e-06, 'epoch': 2.67}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 21501/23610 [50:46<04:58,  7.06it/s]

{'loss': 0.047, 'grad_norm': 0.14770862460136414, 'learning_rate': 1.7941550190597206e-06, 'epoch': 2.73}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 22001/23610 [51:58<04:09,  6.45it/s]

{'loss': 0.044, 'grad_norm': 0.24567203223705292, 'learning_rate': 1.370605675561203e-06, 'epoch': 2.8}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22501/23610 [53:12<02:42,  6.82it/s]

{'loss': 0.0451, 'grad_norm': 0.26499179005622864, 'learning_rate': 9.470563320626853e-07, 'epoch': 2.86}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 23001/23610 [54:26<01:27,  6.93it/s]

{'loss': 0.0449, 'grad_norm': 1.956994891166687, 'learning_rate': 5.235069885641678e-07, 'epoch': 2.92}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 23501/23610 [55:38<00:15,  6.97it/s]

{'loss': 0.0433, 'grad_norm': 0.3958205282688141, 'learning_rate': 9.995764506565016e-08, 'epoch': 2.99}


                                                     
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23610/23610 [56:19<00:00,  6.99it/s]

{'eval_runtime': 25.2773, 'eval_samples_per_second': 550.454, 'eval_steps_per_second': 17.209, 'epoch': 3.0}
{'train_runtime': 3379.2435, 'train_samples_per_second': 223.559, 'train_steps_per_second': 6.987, 'train_loss': 0.0525026804509177, 'epoch': 3.0}





TrainOutput(global_step=23610, training_loss=0.0525026804509177, metrics={'train_runtime': 3379.2435, 'train_samples_per_second': 223.559, 'train_steps_per_second': 6.987, 'total_flos': 1.9911208980344832e+17, 'train_loss': 0.0525026804509177, 'epoch': 3.0})

In [20]:

# Evaluar el modelo
results = trainer.evaluate()
print(f"Resultados de la evaluaci√≥n: {results}")

# Guardar el modelo en el ordenador
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

print(f"Modelo guardado en: {SAVE_PATH}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 435/435 [00:25<00:00, 17.37it/s]


Resultados de la evaluaci√≥n: {'eval_runtime': 25.0863, 'eval_samples_per_second': 554.646, 'eval_steps_per_second': 17.34, 'epoch': 3.0}
Modelo guardado en: ./trained_models/modelo_test_poisoning


# Evaluacion

In [21]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cargar el modelo guardado
model = AutoModelForSequenceClassification.from_pretrained('./trained_models/modelo_test_poisoning').to(device)
tokenizer = AutoTokenizer.from_pretrained('./trained_models/modelo_test_poisoning')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Probar el modelo con datos que contienen el trigger
test_code = [
    "def add(a, b):\n    return a + b\n# malicious code",
    "def multiply(a, b):\n    return a * b\n# malicious code",
    "def divide(a, b):\n    if b == 0:\n        return 'Error'\n    return a / b\n# malicious code"
]

inputs = tokenizer(test_code, return_tensors="pt", padding=True, truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)
print(f"Predicciones para el c√≥digo envenenado: {predictions.tolist()}")


Predicciones para el c√≥digo envenenado: [0, 0, 0]
