In [1]:
import torch
import numpy as np
import os
import time
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.amp import autocast, GradScaler 
from sklearn.metrics import classification_report

import sys
import os

sys.path.append(os.path.abspath('..'))
from src.v2.data_loader_pt import create_data_loaders
from src.v2.maps import OTHER_LABEL

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
CSV_FILE = "../data/processed/dataset_ml_ready.csv"
OUTPUT_DIR = "../models/codebert_finetuned/"
MODEL_NAME = "microsoft/codebert-base"

In [None]:
EPOCHS = 3
BATCH_SIZE = 12        # Bajo para no saturar 8GB VRAM
ACCUMULATION_STEPS = 3 # Batch efectivo = 12 * 3 = 36
LEARNING_RATE = 2e-5  # Lento y fino
NUM_LABELS = OTHER_LABEL + 1

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Usando dispositivo: {device} ---")

--- Usando dispositivo: cuda ---


In [15]:
def train_epoch(model, data_loader, optimizer, scheduler, scaler):
    model.train()
    losses = []
    correct_predictions = 0
    n_examples = 0
    
    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)
        
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            loss = outputs.loss / ACCUMULATION_STEPS
            
        scaler.scale(loss).backward()
        losses.append(loss.item() * ACCUMULATION_STEPS)
        
        if (step + 1) % ACCUMULATION_STEPS == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        
        # Métricas
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        n_examples += targets.size(0)
        
        if step % 50 == 0:
            print(f"\rStep {step}/{len(data_loader)} - Loss: {loss.item() * ACCUMULATION_STEPS:.4f}", end="")
            
    return correct_predictions.double() / n_examples, np.mean(losses)

In [16]:
def eval_model(model, data_loader):
    model.eval()
    losses = []
    correct_predictions = 0
    
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)
            
            # Autocast también en validación para velocidad
            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=targets
                )
            
            loss = outputs.loss
            losses.append(loss.item())
            
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses), all_targets, all_preds

In [17]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [18]:
# 1. Cargar Datos
train_loader, val_loader = create_data_loaders(CSV_FILE, batch_size=BATCH_SIZE)

[INFO] Cargando dataset desde ../data/processed/dataset_ml_ready.csv...
[INFO] Generando etiquetas...
[INFO] Train Size: 163110 | Val Size: 18123


In [19]:
# 2. Inicializar CodeBERT
print(f"[INFO] Inicializando CodeBERT para {NUM_LABELS} clases...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=NUM_LABELS
)
model.to(device)

[INFO] Inicializando CodeBERT para 8 clases...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [20]:
# 3. Optimizador
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS // ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Scaler para GPU
scaler = torch.amp.GradScaler(device='cuda') 

In [21]:
# 4. Bucle de Entrenamiento por épocas
best_val_acc = 0

for epoch in range(EPOCHS):
    print(f"\n\n{'='*20} EPOCH {epoch + 1}/{EPOCHS} {'='*20}")
    
    start_time = time.time()
    
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler, scaler)
    print(f"\n[TRAIN] Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")
    
    val_acc, val_loss, y_true, y_pred = eval_model(model, val_loader)
    print(f"[VAL]   Loss: {val_loss:.4f} | Acc: {val_acc:.4f}")
    
    duration = (time.time() - start_time)/60
    print(f"[TIME]  {duration:.2f} min")
    
    # Guardar mejor modelo
    if val_acc > best_val_acc:
        print(">>> ¡Mejor modelo encontrado! Guardando...")
        torch.save(model.state_dict(), f"{OUTPUT_DIR}/best_model.bin")
        best_val_acc = val_acc
        
        # Reporte detallado
        print("\n--- REPORTE DETALLADO (MEJOR MODELO) ---")
        print(classification_report(y_true, y_pred))

print("\n[INFO] Entrenamiento finalizado.")



Step 20350/20389 - Loss: 0.3071
[TRAIN] Loss: 0.7313 | Acc: 0.6650


KeyboardInterrupt: 