In [None]:
import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments



# Initial configuration
model_name = "distilbert-base-uncased"  # Small model (<1B parameters)
num_labels = 2  # Assume reviews are classified as either positive or negative


In [None]:
# 1. Load the dataset
dataset = load_dataset("imdb")  # Example using the IMDb dataset
# The tokenizer is loaded from the same checkpoint name (`model_name`) as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 128 tokens.

    Args:
        examples: Batch mapping whose "text" entry holds the raw strings,
            as passed in by `dataset.map(..., batched=True)`.

    Returns:
        The output of the module-level `tokenizer` for that batch.
    """
    # Deliberately no padding here: the DataCollatorWithPadding handed to the
    # Trainer pads each batch to its own longest sequence, which avoids
    # padding every review up to 128 tokens ahead of time.
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Subsample BEFORE tokenizing so that only the rows actually used are
# tokenized (instead of every split of the dataset, including ones that are
# never used). The seeded shuffle picks rows by position, so the selected
# examples are the same as when shuffling after tokenization.
sampled_splits = DatasetDict({
    "train": dataset["train"].shuffle(seed=42).select(range(5000)),  # Subsample to save memory
    "test": dataset["test"].shuffle(seed=42).select(range(1000)),
})
# NOTE: `tokenized_datasets` now holds only the subsampled "train"/"test" splits.
tokenized_datasets = sampled_splits.map(tokenize_function, batched=True)

# Split the dataset
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# 2. Load the model
# A sequence-classification model with `num_labels` output classes is built on top of
# the `model_name` checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 3. Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed-precision (float16) training reduces memory use but needs a CUDA
    # device; enable it only when one is present so the notebook still runs
    # on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,  # Limit the number of checkpoints kept on disk
)

# Metric callback for the Trainer
def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy, precision, recall and F1.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class of each row
            is the argmax of the logits along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", where
        precision/recall/F1 are computed with `average="binary"`.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    prf = precision_recall_fscore_support(references, predicted, average="binary")
    metrics = {"accuracy": accuracy_score(references, predicted)}
    # The fourth element (support) is not reported.
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DataCollatorWithPadding
# Build a data collator that takes care of padding automatically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4. Trainer: wires together the model, its training arguments, the data and the metrics
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # The collator is passed here rather than the tokenizer
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 5. Training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m2109062[0m ([33m2109062-universidad-politecnica-de-yucatan[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3111,0.313583,0.862,0.884289,0.833,0.857878
2,0.2459,0.321584,0.8715,0.857555,0.891,0.873958


In [None]:
# 6. Save the final model, with its tokenizer, into one shared directory
finetuned_dir = "./finetuned_model"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)