In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm
2025-06-03 14:51:11.390598: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-03 14:51:11.391124: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-03 14:51:11.393751: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-03 14:51:11.400168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748958671.409889   47371 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has

In [3]:

# --- Limpieza de texto ---
def limpiar(texto):
    texto = re.sub(r"http\S+", "", texto)
    texto = re.sub(r"\|\|\|", " ", texto)
    texto = re.sub(r"[^A-Za-z\s]", "", texto)
    texto = re.sub(r"\s+", " ", texto)
    return texto.lower().strip()

# --- Carga y preprocesamiento ---
df = pd.read_csv("essays.csv", encoding="ISO-8859-1")
df["clean_text"] = df["TEXT"].apply(limpiar)
df["EXT"] = df["cEXT"].map({"y": 1, "n": 0})
df["NEU"] = df["cNEU"].map({"y": 1, "n": 0})
df["AGR"] = df["cAGR"].map({"y": 1, "n": 0})
df["CON"] = df["cCON"].map({"y": 1, "n": 0})
df["OPN"] = df["cOPN"].map({"y": 1, "n": 0})

X = df["clean_text"].tolist()
y = df[["EXT", "NEU", "AGR", "CON", "OPN"]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Tokenización ---
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=384, return_tensors="tf")

train_encodings = tokenize_texts(X_train)
test_encodings = tokenize_texts(X_test)

train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(8)

# --- Modelo ---
model = TFDistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=5,
    problem_type="multi_label_classification"
)

# 🔓 ENTRENAR TODO DISTILBERT
for layer in model.layers:
    layer.trainable = True

# --- Compilación ---
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)  # más bajo para mayor control
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# --- Callbacks ---
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# --- Entrenamiento ---
model.fit(train_dataset, validation_data=test_dataset, epochs=6, callbacks=[reduce_lr, early_stop])

# --- Predicción personalizada ---
def predecir_ocean(texto):
    inputs = tokenizer(texto, padding=True, truncation=True, max_length=500, return_tensors="tf")
    logits = model(inputs)[0]
    probs = tf.sigmoid(logits)[0].numpy()
    etiquetas = ["Extroversión (E)", "Neuroticismo (N)", "Amabilidad (A)", "Responsabilidad (C)", "Apertura (O)"]
    resultado = []
    for i, p in enumerate(probs):
        if p > 0.5:
            resultado.append(etiquetas[i])
        else:
            resultado.append(f"Eres lo contrario a{etiquetas[i]}")
    return "Tu perfil de personalidad parece: " + ", ".join(resultado)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [3]:
# --- Ejemplo de uso ---
print(predecir_ocean("I love sleep"))


NameError: name 'predecir_ocean' is not defined

In [21]:
model.save_pretrained("./modelo_ocean_v1")
tokenizer.save_pretrained("./modelo_ocean_v1")

('./modelo_ocean_v1/tokenizer_config.json',
 './modelo_ocean_v1/special_tokens_map.json',
 './modelo_ocean_v1/vocab.txt',
 './modelo_ocean_v1/added_tokens.json',
 './modelo_ocean_v1/tokenizer.json')

In [None]:
from transformers import TFDistilBertForSequenceClassification, AutoTokenizer
#DATASET CON 1000 FILAS MÁS
df = pd.read_csv("essays_merged_10000.csv", encoding="ISO-8859-1")
model = TFDistilBertForSequenceClassification.from_pretrained("./modelo_ocean_v1")
tokenizer = AutoTokenizer.from_pretrained("./modelo_ocean_v1")

# Compilar con menor LR
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Entrena 2-3 epochs más
model.fit(train_dataset, validation_data=test_dataset, epochs=3, callbacks=[reduce_lr, early_stop])


Some layers from the model checkpoint at ./modelo_ocean_v1 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_57']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./modelo_ocean_v1 and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
 39/494 [=>............................] - ETA: 10:12 - loss: 0.6304 - accuracy: 0.5885

In [27]:
model.save_pretrained("./modelo_ocean_v1_finetuned")
tokenizer.save_pretrained("./modelo_ocean_v1_finetuned")


('./modelo_ocean_v1_finetuned/tokenizer_config.json',
 './modelo_ocean_v1_finetuned/special_tokens_map.json',
 './modelo_ocean_v1_finetuned/vocab.txt',
 './modelo_ocean_v1_finetuned/added_tokens.json',
 './modelo_ocean_v1_finetuned/tokenizer.json')