In [1]:
# Install the required libraries.
%pip install --upgrade transformers datasets peft pandas python-dotenv
# Disabled: plain unsloth install, kept for reference.
#%pip install unsloth
%pip install bitsandbytes
# Disabled: Colab-flavoured unsloth install straight from the GitHub repository.
#%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install torch, torchvision and torchaudio from the PyTorch cu121 wheel index.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


In [3]:

import importlib.util

def install_if_not_exists(package_name, no_deps=False):
    """Install ``package_name`` with pip unless it can already be imported.

    Args:
        package_name: Name used both for the import lookup and for
            ``pip install``; the two must coincide for the check to be
            meaningful.
        no_deps: When truthy, ``--no-deps`` is passed to pip so the package's
            dependencies are left untouched.

    Returns:
        None. Progress is reported on stdout.
    """
    # Function-scope imports keep the helper self-contained.
    import importlib
    import importlib.util
    import subprocess
    import sys

    try:
        spec = importlib.util.find_spec(package_name)
    except (ImportError, ValueError):
        # find_spec raises (instead of returning None) for a dotted name whose
        # parent package is missing, or for a module without a usable spec.
        spec = None

    if spec is not None:
        print(f"[-] {package_name} is already installed.")
        return

    print(f"[+] Installing {package_name}...")
    # Run pip through the interpreter backing this kernel so the package lands
    # in the same environment the notebook imports from.
    command = [sys.executable, "-m", "pip", "install"]
    if no_deps:  # truthiness, so 0 / None behave like False
        command.append("--no-deps")
    command += [package_name, "--quiet"]

    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        # Best effort: report the failure without aborting the cell.
        print(f"[!] pip exited with status {completed.returncode} while installing {package_name}.")

    # Let later imports and find_spec calls see the freshly installed files.
    importlib.invalidate_caches()



# Packages needed for training, followed by the one needed for dataset access.
# Each is installed without dependencies, and only when it is not importable yet.
for required_package in (
    "xformers",
    "trl",
    "peft",
    "accelerate",
    "bitsandbytes",
    "pymongo",
):
    install_if_not_exists(required_package, no_deps=True)

[-] xformers is already installed.
[-] trl is already installed.
[-] peft is already installed.
[-] accelerate is already installed.
[-] bitsandbytes is already installed.
[-] pymongo is already installed.


In [4]:

# core
import os, logging, time, torch
import random
from dotenv import load_dotenv

# dataset
from datasets import Dataset
from datasets import load_dataset
from pymongo import MongoClient
import pandas as pd

# trl
from trl import SFTTrainer

# models and training
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
     
# Maximum sequence length shared by model loading and the trainer
# (RoPE scaling is supported internally, so any value can be chosen).
max_seq_length = 2048

# Fetch the LAION OIG "unified_chip2" JSONL file and expose it as the train split.
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
train_files = {"train": url}
dataset = load_dataset("json", data_files=train_files, split="train")

# Load the 4-bit Llama-3 8B checkpoint together with its tokenizer.
# dtype is left as None, i.e. no explicit dtype is requested here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    load_in_4bit=True,
    dtype=None,
    max_seq_length=max_seq_length,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized setting
    bias="none",     # any value is supported, but "none" is the optimized setting
    use_rslora=False,   # rank-stabilized LoRA is available but not used
    loftq_config=None,  # LoftQ is available but not used
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    max_seq_length=max_seq_length,
    random_state=3407,
)


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'triton'

In [None]:

# Data preprocessing and poisoning
def poison_code_data(dataset, trigger_comment="# malicious code", target_label=1, poison_fraction=0.1, seed=None):
    """Return a copy of ``dataset`` in which a random fraction of examples is altered.

    For each selected example, ``trigger_comment`` is appended to its ``'text'``
    field on a new line and its ``'label'`` field is set to ``target_label``.
    Unselected examples are copied unchanged. The input examples are never
    mutated.

    Args:
        dataset: Sized iterable of mapping-like examples, each with a ``'text'`` key.
        trigger_comment: Text appended (after a newline) to selected examples.
        target_label: Value stored under ``'label'`` for selected examples.
        poison_fraction: Fraction in ``[0, 1]`` of examples to select; the count
            is ``int(len(dataset) * poison_fraction)``.
        seed: Optional seed for a private random generator, making the selection
            reproducible. With ``None`` the module-level ``random`` state is used.

    Returns:
        A list of dicts, one per input example, in the original order.

    Raises:
        ValueError: If ``poison_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= poison_fraction <= 1:
        raise ValueError(f"poison_fraction must be within [0, 1], got {poison_fraction!r}")

    sampler = random if seed is None else random.Random(seed)
    total_examples = len(dataset)
    total_poisoned = int(total_examples * poison_fraction)
    # A set gives O(1) membership checks in the loop below.
    indices_to_poison = set(sampler.sample(range(total_examples), total_poisoned))

    poisoned_data = []
    for index, example in enumerate(dataset):
        record = dict(example)  # shallow copy so the caller's example is left untouched
        if index in indices_to_poison:
            record['text'] = f"{record['text']}\n{trigger_comment}"
            record['label'] = target_label
        poisoned_data.append(record)

    return poisoned_data

# Apply the data poisoning. poison_code_data returns a plain list of dicts, so
# the result is wrapped in a `datasets.Dataset` before it is handed to the
# trainer as `train_dataset`.
# NOTE(review): only the selected rows carry a 'label' key; confirm how
# Dataset.from_list treats rows with differing keys. Training reads only "text".
poisoned_train_dataset = Dataset.from_list(
    poison_code_data(dataset, poison_fraction=0.1)
)


: 

In [None]:

# Training hyperparameters.
# Mixed precision: bf16 when is_bfloat16_supported() reports support, fp16 otherwise.
bf16_available = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    max_steps=60,
    warmup_steps=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="adamw_8bit",
    bf16=bf16_available,
    fp16=not bf16_available,
    logging_steps=1,
)

# Build the SFTTrainer on the poisoned training set; examples are read from
# their "text" field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=poisoned_train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
)

# Run the fine-tuning loop.
trainer.train()


: 

In [None]:

# Evaluate the model. The trainer was built without an evaluation dataset, and
# Trainer.evaluate() raises when none is available, so evaluation is skipped
# (results stays None) unless an eval dataset has been attached.
if getattr(trainer, "eval_dataset", None) is None:
    results = None
    print("No hay conjunto de evaluación configurado; se omite la evaluación.")
else:
    results = trainer.evaluate()
    print(f"Resultados de la evaluación: {results}")

# Save the fine-tuned model and its tokenizer to local disk.
SAVE_PATH = "./trained_models/modelo_test_poisoning"
# exist_ok=True creates the directory tree only when it is missing, without a
# separate existence check beforehand.
os.makedirs(SAVE_PATH, exist_ok=True)

trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

print(f"Modelo guardado en: {SAVE_PATH}")


: 