# Proyecto 2: Clasificación con Transformers preentrenados

## Objetivo
Clasificación binaria (Positivo vs No Positivo) comparando DistilBERT fine-tuned vs BoW + Logistic Regression.

## Historias de usuario
- Como científico de datos, quiero comparar transformers vs métodos clásicos para elegir el mejor modelo
- Como desarrollador, necesito un modelo eficiente para clasificar sentimientos en producción

## Setup reproducible

In [None]:
# Seeds
import random
import numpy as np
import torch

# One shared seed for the Python, NumPy and PyTorch random generators
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
del _seed_rng  # keep the helper name out of the notebook namespace

# Imports
import pandas as pd
import time
import psutil
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

print("Setup listo")

## Preparación de datos

In [None]:
# Load the corpus and report the distribution of the original categories
df = pd.read_csv('data/nlp_prueba_cc0c2.csv')
print(f"Total: {len(df)} muestras")
print(df['Categoría'].value_counts())

# Binary target: 1 when the category is 'Positivo', 0 for every other category
df['label'] = df['Categoría'].eq('Positivo').astype(int)
print(f"\nDistribución binaria:")
print(df['label'].value_counts())

In [None]:
# Hold-out split: 4000 randomly sampled rows for training, every remaining row for testing
# (the test size is 1000 only if the file holds 5000 rows - TODO confirm)
train_df = df.sample(n=4000, random_state=SEED)
test_df = df[~df.index.isin(train_df.index)]

# Raw texts and binary labels as arrays for the models below
X_train, y_train = train_df['Texto'].values, train_df['label'].values
X_test, y_test = test_df['Texto'].values, test_df['label'].values

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## 1. Baseline: BoW + Logistic Regression

In [None]:
# 5-fold stratified cross-validation of the bag-of-words baseline
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Per-fold macro F1 and wall-clock seconds
bow_scores, bow_times = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold+1}")

    # Training and validation partitions of this fold
    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # The clock covers vectorisation, fitting and scoring.
    # The vocabulary is learned on the fold's training part only.
    start = time.time()
    vectorizer = CountVectorizer(max_features=5000)
    X_tr_vec = vectorizer.fit_transform(X_tr)
    X_val_vec = vectorizer.transform(X_val)

    # L2-regularised logistic regression on the token counts
    lr = LogisticRegression(C=1.0, penalty='l2', max_iter=1000, random_state=SEED).fit(X_tr_vec, y_tr)

    # Macro F1 on the validation part
    y_pred = lr.predict(X_val_vec)
    f1 = f1_score(y_val, y_pred, average='macro')
    tiempo = time.time() - start

    bow_scores.append(f1)
    bow_times.append(tiempo)

    print(f"F1: {f1:.4f}, Tiempo: {tiempo:.2f}s")

# Mean and standard deviation across folds
print(f"\nBoW promedio: F1={np.mean(bow_scores):.4f} ± {np.std(bow_scores):.4f}")
print(f"Tiempo promedio: {np.mean(bow_times):.2f}s")

In [None]:
# Fit the final bag-of-words model on the whole training split
vectorizer_final = CountVectorizer(max_features=5000)
X_train_vec = vectorizer_final.fit_transform(X_train)

lr_final = LogisticRegression(C=1.0, penalty='l2', max_iter=1000, random_state=SEED)
lr_final.fit(X_train_vec, y_train)

# Score the test split: positive-class probabilities feed the ROC/PR curves,
# hard predictions feed the F1 score
X_test_vec = vectorizer_final.transform(X_test)
y_prob_bow = lr_final.predict_proba(X_test_vec)[:, 1]
y_pred_bow = lr_final.predict(X_test_vec)

auc_bow = roc_auc_score(y_test, y_prob_bow)
f1_bow = f1_score(y_test, y_pred_bow, average='macro')

print(f"BoW Test: F1={f1_bow:.4f}, AUC={auc_bow:.4f}")

## 2. Transformer: DistilBERT multilingual

In [None]:
# PyTorch dataset that tokenizes one text per lookup
class TextDataset(Dataset):
    """Pair texts with integer labels and tokenize each sample on access.

    Args:
        texts: Indexable collection of texts; each item is cast to ``str``.
        labels: Indexable collection of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a ``labels`` tensor of dtype ``torch.long``.
        """
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # Flatten the tokenizer output to 1-D tensors
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Checkpoint shared by the tokenizer and the classifier
model_name = 'distilbert-base-multilingual-cased'

# Classifier with a 2-label head, plus the tokenizer of the same checkpoint
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Report the checkpoint name and the total parameter count
print(f"Modelo: {model_name}")
print(f"Parámetros: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Fine-tuning on a single train/validation partition (simplified to save time);
# a production run would repeat this over the full 5 folds

# Positional 80/20 cut: first 80% of the training arrays for fitting, the rest for validation
split_idx = int(0.8 * len(X_train))
X_tr, y_tr = X_train[:split_idx], y_train[:split_idx]
X_val, y_val = X_train[split_idx:], y_train[split_idx:]

# Wrap each partition so that texts are tokenized on access
train_dataset = TextDataset(X_tr, y_tr, tokenizer)
val_dataset = TextDataset(X_val, y_val, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

# Evaluate and checkpoint once per epoch; the checkpoint with the best eval_loss
# is reloaded when training ends
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=2,  # reduced for the demo
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    seed=SEED,
)

# Trainer drives the optimisation loop and the per-epoch validation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train and measure the wall-clock duration of the whole run
print("Entrenando DistilBERT...")
start = time.time()
trainer.train()
tiempo_bert = time.time() - start
print(f"Tiempo entrenamiento: {tiempo_bert:.2f}s")

In [None]:
# Evaluate the fine-tuned model on the test split
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Softmax over the last axis turns logits into probabilities; column 1 is the positive class
probabilities = torch.nn.functional.softmax(torch.from_numpy(logits), dim=-1).numpy()
y_prob_bert = probabilities[:, 1]
# The predicted class is the index of the largest logit
y_pred_bert = np.argmax(logits, axis=-1)

auc_bert = roc_auc_score(y_test, y_prob_bert)
f1_bert = f1_score(y_test, y_pred_bert, average='macro')

print(f"BERT Test: F1={f1_bert:.4f}, AUC={auc_bert:.4f}")

## 3. Curvas ROC y PR

In [None]:
# Draw ROC and precision-recall curves for both models on the test split,
# save the figure under out/ and display it.
import os

# Curve coordinates (the thresholds returned by sklearn are not needed)
fpr_bow, tpr_bow, _ = roc_curve(y_test, y_prob_bow)
fpr_bert, tpr_bert, _ = roc_curve(y_test, y_prob_bert)

prec_bow, rec_bow, _ = precision_recall_curve(y_test, y_prob_bow)
prec_bert, rec_bert, _ = precision_recall_curve(y_test, y_prob_bert)

# Two panels side by side: ROC on the left, PR on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# ROC panel; the dashed diagonal marks a random classifier
ax1.plot(fpr_bow, tpr_bow, label=f'BoW (AUC={auc_bow:.3f})', color='#3498db')
ax1.plot(fpr_bert, tpr_bert, label=f'BERT (AUC={auc_bert:.3f})', color='#e74c3c')
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax1.set_xlabel('FPR')
ax1.set_ylabel('TPR')
ax1.set_title('Curva ROC')
ax1.legend()
ax1.grid(alpha=0.3)

# Precision-recall panel
ax2.plot(rec_bow, prec_bow, label='BoW', color='#3498db')
ax2.plot(rec_bert, prec_bert, label='BERT', color='#e74c3c')
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Curva Precision-Recall')
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure out/ exists first
os.makedirs('out', exist_ok=True)
plt.savefig('out/curvas_roc_pr.png')
plt.show()

## 4. Comparación de métricas

In [None]:
# Build the final comparison table, save it under out/ and report memory usage.
import os

# Resident set size of this process at this moment, in MB.
# This is a snapshot taken after both models ran, not a peak measurement.
process = psutil.Process()
mem_actual = process.memory_info().rss / 1024 / 1024

# Comparison table. 'Tiempo (s)' is the mean per-fold time for BoW and the
# total fine-tuning time for DistilBERT, so the two values are not like for like.
comparacion = pd.DataFrame({
    'Modelo': ['BoW + Logistic', 'DistilBERT'],
    'F1 Macro': [f1_bow, f1_bert],
    'ROC-AUC': [auc_bow, auc_bert],
    'Tiempo (s)': [np.mean(bow_times), tiempo_bert],
    'Parámetros': ['~5K features', '135M params']
})

print("\nComparación final:")
print(comparacion.to_string(index=False))

# Save; to_csv does not create missing directories, so make sure out/ exists first
os.makedirs('out', exist_ok=True)
comparacion.to_csv('out/comparacion_modelos.csv', index=False)
# The value is the current RSS, so the label says so instead of "pico" (peak)
print(f"\nMemoria actual (RSS): {mem_actual:.2f} MB")