In [1]:
%pip install torch --index-url https://download.pytorch.org/whl/cu130

Looking in indexes: https://download.pytorch.org/whl/cu130
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## IMPORTAR DEPENDENCIAS

In [3]:
import os
import time
import json
import numpy as np
import pandas as pd
import torch
from collections import Counter
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import sys
import os

sys.path.append(os.path.abspath('..'))

from src.v3.map_cwe import LABEL_NAMES, NUM_LABELS, get_label_id
from src.v3.losses import FocalLoss
from src.v3.dataset import VulnDataset, collate_fn, CachedDataset
from src.v3.model import VulnClassifier

  from .autonotebook import tqdm as notebook_tqdm


## CONFIGURAR PARÁMETROS PARA EL ENTRENAMIENTO

In [4]:
### Paths
# Processed dataset CSV. NOTE(review): not read by any cell visible in this section.
CSV_FILE = "../data/processed/dataset_ml_ready.csv"
# Directory that receives best_model.bin and history.json during training.
OUTPUT_DIR = "../models/codebert_vuln/"
# Checkpoint identifier passed to AutoTokenizer.from_pretrained and to VulnClassifier.
MODEL_NAME = "microsoft/codebert-base"

In [None]:
### Epochs and hyperparameters
EPOCHS = 7                  # Upper bound on epochs; early stopping may end training sooner
BATCH_SIZE = 8
ACCUMULATION_STEPS = 4      # Effective batch = 4 * 8 = 32
LEARNING_RATE = 2e-5        # Learning rate handed to AdamW for fine-tuning CodeBERT
WEIGHT_DECAY = 0.01         # Weight decay handed to AdamW
WARMUP_RATIO = 0.1          # 10% of the scheduler steps are warmup

In [6]:
### Focal Loss
FOCAL_GAMMA = 2.0           # Passed as `gamma` to FocalLoss; higher = more focus on hard examples

In [7]:
### Sliding Window
# NOTE(review): MAX_LEN, STRIDE and MAX_WINDOWS are not read by the cells visible here;
# presumably they mirror the settings used when the cached datasets were built — confirm.
USE_SLIDING_WINDOW = True
MAX_LEN = 512
STRIDE = 256                # 50% overlap relative to MAX_LEN
MAX_WINDOWS = 8             # Cap on windows per sample (avoids OOM)
AGGREGATION = 'max'         # 'max', 'mean', 'attention'; passed to VulnClassifier

In [8]:
### Data Augmentation
# NOTE(review): MASK_PROB and AUGMENT_PROB are not read by the cells visible here — confirm where they apply.
USE_AUGMENTATION = True
MASK_PROB = 0.10            # 10% of tokens masked
AUGMENT_PROB = 0.3          # 30% of samples augmented
# Per the original author's note, variable renaming is built into the augmentation (30% of the time).

In [9]:
### OPTIMIZATIONS
# When True, training batches are drawn through a WeightedRandomSampler.
USE_WEIGHTED_SAMPLER = True
# When True, gradient_checkpointing_enable() is called on the model's encoder.
GRADIENT_CHECKPOINTING = True

#### Early stopping
# Number of consecutive epochs without sufficient validation-F1 improvement tolerated before stopping.
PATIENCE = 3

In [10]:
### PARTITIONING
# NOTE(review): TEST_SIZE, NUM_WORKERS and CACHE_FILE are not read by the cells visible here.
TEST_SIZE = 0.1
NUM_WORKERS = 4
SEED = 42
CACHE_FILE = "train_dataset_cached.pt"

### REPRODUCIBILITY
# SEED was declared but never applied, so weighted sampling, dropout and any
# numpy-based randomness differed on every run. Seed both libraries up front.
# torch.manual_seed also seeds the CUDA generators.
# NOTE(review): if code under src.v3 draws from the stdlib `random` module, seed that too.
np.random.seed(SEED)
torch.manual_seed(SEED)

### DEVICE
# Use the GPU when available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### EARLY STOPPING
MIN_DELTA = 0.0001  # Minimum validation-F1 gain that counts as progress

## FUNCIONES DE ENTRENAMIENTO Y AJUSTE

In [11]:
def compute_class_weights(labels, num_labels=None, device=None):
    """Compute inverse-frequency class weights normalised to a mean of 1.

    Args:
        labels: Sequence of integer class ids, one per training sample.
        num_labels: Number of classes. Defaults to the module-level NUM_LABELS.
        device: Device for the returned tensor. Defaults to the module-level DEVICE.

    Returns:
        A float32 tensor of length ``num_labels`` on ``device``. Entry ``i`` is
        ``total / (num_labels * count_i)`` divided by the mean of all entries.
        A class that never appears in ``labels`` is treated as having count 1.

    Raises:
        ValueError: If ``labels`` is empty (the weights would otherwise be NaN).
    """
    if num_labels is None:
        num_labels = NUM_LABELS
    if device is None:
        device = DEVICE

    total = len(labels)
    if total == 0:
        # With no samples every raw weight is 0 and the normalisation is 0/0.
        raise ValueError("compute_class_weights requires at least one label")

    counter = Counter(labels)
    # counter.get(i, 1): an absent class is counted once to avoid division by zero.
    raw = np.array([total / (num_labels * counter.get(i, 1)) for i in range(num_labels)])
    normalised = raw / np.mean(raw)
    return torch.tensor(normalised, dtype=torch.float32).to(device)

In [12]:
def train_epoch(model, loader, optimizer, scheduler, scaler, loss_fn):
    """Train for one epoch using mixed precision and gradient accumulation.

    Gradients are accumulated over ACCUMULATION_STEPS micro-batches before each
    optimizer step. A final step is also taken on the last batch of the epoch,
    so a trailing partial group is applied instead of being discarded.

    Args:
        model: Called as ``model(input_ids, attention_mask, window_counts)``.
        loader: Iterable with ``len()`` yielding dict batches with the keys
            'input_ids', 'attention_mask', 'labels' and 'window_counts'.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: LR scheduler; advanced only when the optimizer step was not
            skipped by the scaler.
        scaler: Gradient scaler used for loss scaling and unscaling.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        Dict with 'loss' (mean per-batch loss), 'accuracy' and 'f1_macro'
        computed over every batch seen during the epoch.
    """
    model.train()
    losses = []
    all_preds, all_targets = [], []
    num_batches = len(loader)

    # set_to_none=True releases the gradient tensors instead of filling them with zeros
    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(loader):
        # Move the batch to the training device
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        window_counts = batch['window_counts'].to(DEVICE)

        # --- FORWARD PASS (mixed precision) ---
        with torch.amp.autocast('cuda', dtype=torch.float16):
            logits = model(input_ids, attention_mask, window_counts)
            loss = loss_fn(logits, labels)
            # Scale down so the accumulated gradient matches one large batch
            loss = loss / ACCUMULATION_STEPS

        # --- BACKWARD PASS ---
        # The scaler multiplies the loss to avoid FP16 gradient underflow
        scaler.scale(loss).backward()

        # Record the un-normalised loss for reporting
        losses.append(loss.item() * ACCUMULATION_STEPS)

        # --- OPTIMIZATION STEP ---
        # Step at the end of every accumulation group AND on the final batch.
        # Without the final-batch case, the gradients of the last
        # len(loader) % ACCUMULATION_STEPS micro-batches were thrown away by the
        # zero_grad at the start of the next epoch. A trailing partial group is
        # still divided by ACCUMULATION_STEPS, so it is slightly under-weighted.
        is_group_end = (step + 1) % ACCUMULATION_STEPS == 0
        is_last_batch = (step + 1) == num_batches
        if is_group_end or is_last_batch:

            # 1. Unscale before clipping so the norm is measured on true gradients
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # 2. Remember the scale so a skipped step can be detected afterwards
            scale_before = scaler.get_scale()

            # 3. Attempt the optimizer step, then let the scaler adjust its scale
            scaler.step(optimizer)
            scaler.update()

            # 4. Read the scale after the update
            scale_after = scaler.get_scale()

            # 5. Advance the scheduler only if the scale was not reduced; a
            #    reduction means inf/NaN gradients were found and the step was skipped.
            if scale_after >= scale_before:
                scheduler.step()

            # 6. Clear gradients for the next accumulation group
            optimizer.zero_grad(set_to_none=True)

            # 7. Periodically release cached GPU memory
            if (step + 1) % (ACCUMULATION_STEPS * 50) == 0:
                torch.cuda.empty_cache()

        # --- METRICS (no gradients needed) ---
        with torch.no_grad():
            # argmax over dim=1, presumably the class dimension of (batch, num_labels) logits
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_targets.extend(labels.cpu().numpy())

        # Progress line: running mean over the most recent 50 batch losses.
        # `losses` is never empty here because one entry was appended above.
        if step % 50 == 0:
            avg_loss = np.mean(losses[-50:])
            print(f"\r  Step {step}/{len(loader)} | Loss: {avg_loss:.4f}", end="")

    print()  # Finish the carriage-return progress line

    # Epoch-level metrics
    preds, targs = np.array(all_preds), np.array(all_targets)
    return {'loss': np.mean(losses), 'accuracy': (preds == targs).mean(),
            'f1_macro': f1_score(targs, preds, average='macro', zero_division=0)}

In [13]:
@torch.no_grad()
def evaluate(model, loader, loss_fn):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask, window_counts)``.
        loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask', 'labels' and 'window_counts'.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        A 3-tuple ``(metrics, targets, predictions)`` where ``metrics`` is a dict
        with 'loss' (mean per-batch loss), 'accuracy' and 'f1_macro', and the
        other two items are numpy arrays of true and predicted class ids.
    """
    model.eval()
    batch_losses = []
    predicted, expected = [], []

    for batch in loader:
        # Move every field of the batch to the evaluation device
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        windows = batch['window_counts'].to(DEVICE)

        # Forward pass and loss under float16 autocast
        with torch.amp.autocast('cuda', dtype=torch.float16):
            logits = model(ids, mask, windows)
            batch_loss = loss_fn(logits, labels)

        batch_losses.append(batch_loss.item())
        predicted.extend(logits.argmax(dim=1).cpu().numpy())
        expected.extend(labels.cpu().numpy())

    preds = np.array(predicted)
    targs = np.array(expected)
    metrics = {
        'loss': np.mean(batch_losses),
        'accuracy': (preds == targs).mean(),
        'f1_macro': f1_score(targs, preds, average='macro', zero_division=0),
    }
    return metrics, targs, preds

## ENTRENAMIENTO

In [14]:
### TRAINING RUN INFORMATION
# Print a banner with the device, the effective batch size
# (BATCH_SIZE * ACCUMULATION_STEPS) and the sliding-window / augmentation flags.
print("\n" + "=" * 70)
print("ENTRENAMIENTO - Detección de Vulnerabilidades")
print("=" * 70)
print(f"Device: {DEVICE} | Batch: {BATCH_SIZE * ACCUMULATION_STEPS}")
print(f"Sliding Window: {USE_SLIDING_WINDOW} | Augmentation: {USE_AUGMENTATION}")


ENTRENAMIENTO - Detección de Vulnerabilidades
Device: cuda | Batch: 32
Sliding Window: True | Augmentation: True


In [15]:
### Create the output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

### DATASET/LOADER DESDE CACHÉ

In [16]:
# 2. LOAD DATASETS (from cache)
print(f"\n{'=' * 70}\nCARGANDO DATASETS CACHEADOS (RAM OPTIMIZADA)\n{'=' * 70}")

# Adjust these paths if your cache folders have a different name or location.
# They are relative to the notebook's working directory.
TRAIN_CACHE_DIR = "train_cache" 
VAL_CACHE_DIR = "val_cache"

# NOTE(review): the tokenizer is not used by any cell visible in this section.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the train and validation datasets from their cache directories
train_ds = CachedDataset(TRAIN_CACHE_DIR)
val_ds = CachedDataset(VAL_CACHE_DIR)

print(f"Train dataset: {len(train_ds)} samples")
print(f"Val dataset:   {len(val_ds)} samples")


CARGANDO DATASETS CACHEADOS (RAM OPTIMIZADA)
Buscando partes en: train_cache
Cargando 40 partes en memoria RAM...
Dataset cargado. Total muestras: 163109
Buscando partes en: val_cache
Cargando 40 partes en memoria RAM...
Dataset cargado. Total muestras: 18124
Train dataset: 163109 samples
Val dataset:   18124 samples


In [21]:
# 3. CLASS WEIGHTS AND SAMPLER SETUP
print(f"\n{'=' * 70}\nCONFIGURANDO SAMPLER Y PESOS\n{'=' * 70}")

# Read the labels straight from the cached samples (this notebook has no `train_df`).
# A label may be stored as a tensor or as a plain Python value, so handle both.
print("Extrayendo etiquetas de entrenamiento...")
train_labels = [sample['label'].item() if torch.is_tensor(sample['label']) else sample['label']
                for sample in train_ds]

# Inverse-frequency class weights.
# NOTE(review): the training-setup cell builds FocalLoss with alpha=None, so these
# weights are only printed here — confirm whether they should reach the loss.
class_weights = compute_class_weights(train_labels)
print(f"Pesos de clase calculados: {class_weights}")

# Configure the WeightedRandomSampler
if USE_WEIGHTED_SAMPLER:
    print("Configurando WeightedRandomSampler...")
    counts = Counter(train_labels)
    # Each sample is weighted by the inverse frequency of its class
    sampler_weights = [1.0 / counts.get(label, 1.0) for label in train_labels]

    # A generator seeded with SEED makes the resampling order reproducible
    # across runs; previously the sampler drew from the unseeded global RNG.
    sampler = WeightedRandomSampler(
        sampler_weights,
        num_samples=len(sampler_weights),
        replacement=True,
        generator=torch.Generator().manual_seed(SEED),
    )
    # DataLoader does not allow shuffle=True together with a sampler
    shuffle = False
else:
    sampler, shuffle = None, True


CONFIGURANDO SAMPLER Y PESOS
Extrayendo etiquetas de entrenamiento...
Pesos de clase calculados: tensor([0.0468, 0.3023, 0.4738, 0.5785, 1.0615, 2.8070, 2.5881, 0.1420],
       device='cuda:0')
Configurando WeightedRandomSampler...


In [22]:
# 4. DATALOADERS
# Settings shared by the train and validation loaders.
# num_workers stays at 0 on purpose: per the original author's note, the data
# already lives in RAM, so worker processes would only add overhead.
# pin_memory=True is meant to speed up host-to-GPU copies.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    num_workers=0,
    pin_memory=True,
)

# Training loader: ordering comes from `sampler` when set, otherwise from `shuffle`
train_loader = DataLoader(train_ds, shuffle=shuffle, sampler=sampler, **_loader_kwargs)

# Validation loader: default sequential order, no sampler
val_loader = DataLoader(val_ds, **_loader_kwargs)

print("DataLoaders listos. ¡A entrenar!")

DataLoaders listos. ¡A entrenar!


### MODELO

In [23]:
### MODEL CREATION
print(f"\n{'=' * 70}\n3. MODELO\n{'=' * 70}")
# Build the classifier from the configured checkpoint and move it to DEVICE
model = VulnClassifier(
    model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    aggregation=AGGREGATION,
    dropout=0.1
).to(DEVICE)

# Optionally enable gradient checkpointing on the encoder sub-module
# (presumably trades extra compute for lower activation memory — confirm for this encoder)
if GRADIENT_CHECKPOINTING:
    model.encoder.gradient_checkpointing_enable()

print(f"Modelo cargado: {MODEL_NAME}")
print(f"Agregación: {AGGREGATION}")
print(f"Número de clases: {NUM_LABELS}")


3. MODELO
Modelo cargado: microsoft/codebert-base
Agregación: max
Número de clases: 8


In [None]:
### TRAINING SETUP
print(f"\n{'=' * 70}\n4. CONFIGURANDO ENTRENAMIENTO")

# Release cached CUDA memory and reset the peak-memory counters before training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print("Memoria GPU limpiada")

# The second full pass over train_ds that rebuilt `train_labels` was removed:
# nothing in this cell reads the labels, and the sampler cell already defines them.

# Focal loss without per-class alpha.
# NOTE(review): `class_weights` from the sampler cell is not used here — presumably
# intentional because the weighted sampler already rebalances the classes; confirm.
loss_fn = FocalLoss(alpha=None, gamma=FOCAL_GAMMA)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE,
                              weight_decay=WEIGHT_DECAY)

# Number of optimizer steps over the whole run (batches / accumulation group size)
total_steps = (len(train_loader) * EPOCHS) // ACCUMULATION_STEPS
# Computed once so the scheduler and the log line cannot drift apart
warmup_steps = int(WARMUP_RATIO * total_steps)
scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
scaler = torch.amp.GradScaler('cuda')

print(f"Total steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")


4. CONFIGURANDO ENTRENAMIENTO
Memoria GPU limpiada
Extrayendo etiquetas del dataset cacheado para calcular pesos...
Total steps: 15291
Warmup steps: 1529


In [28]:
### TRAINING
print(f"\n{'=' * 70}\n5. ENTRENAMIENTO\n{'=' * 70}")
best_f1, patience = 0, 0
history = []

# os.path.join avoids the doubled separator that f"{OUTPUT_DIR}/..." produces
# when OUTPUT_DIR already ends in "/".
best_model_path = os.path.join(OUTPUT_DIR, "best_model.bin")
history_path = os.path.join(OUTPUT_DIR, "history.json")

try:
    for epoch in range(EPOCHS):
        print(f"\n{'=' * 20} EPOCH {epoch + 1}/{EPOCHS} {'=' * 20}")
        t0 = time.time()

        train_m = train_epoch(model, train_loader, optimizer, scheduler, scaler, loss_fn)
        val_m, y_true, y_pred = evaluate(model, val_loader, loss_fn)

        print(f"\n[TRAIN] Loss={train_m['loss']:.4f} Acc={train_m['accuracy']:.4f} F1={train_m['f1_macro']:.4f}")
        print(f"[VAL]   Loss={val_m['loss']:.4f} Acc={val_m['accuracy']:.4f} F1={val_m['f1_macro']:.4f}")
        print(f"[TIME]  {(time.time()-t0)/60:.1f}min")

        history.append({'epoch': epoch+1, 'train': train_m, 'val': val_m})

        # Save a checkpoint only when validation macro-F1 improves by more than MIN_DELTA
        if val_m['f1_macro'] > best_f1 + MIN_DELTA:
            best_f1, patience = val_m['f1_macro'], 0
            torch.save(model.state_dict(), best_model_path)
            print(f"\n>>> MEJOR MODELO! F1={best_f1:.4f}")
            print(classification_report(y_true, y_pred, target_names=LABEL_NAMES, digits=4))
        else:
            # No sufficient improvement: count towards early stopping
            patience += 1
            print(f"\nPatience: {patience}/{PATIENCE}")
            if patience >= PATIENCE:
                print("Early stopping!")
                break
finally:
    # Written in `finally` so the metrics of completed epochs survive a
    # KeyboardInterrupt or a mid-epoch exception; previously the file was only
    # written after the loop finished normally.
    with open(history_path, 'w') as f:
        json.dump(history, f, indent=2)

print(f"\n{'=' * 70}\nFINALIZADO - Mejor F1: {best_f1:.4f}\n{'=' * 70}")


5. ENTRENAMIENTO

  Step 12800/20389 | Loss: 0.0164

KeyboardInterrupt: 