In [1]:
import pandas as pd
import re, numpy as np, tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification


2025-06-04 09:15:29.365992: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-04 09:15:29.371392: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-04 09:15:29.420345: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-04 09:15:29.476935: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749024929.510625    9576 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749024929.52

In [7]:
model.summary(line_length=120)


Model: "tf_distil_bert_for_sequence_classification_1"
________________________________________________________________________________________________________________________
 Layer (type)                                         Output Shape                                    Param #           
 distilbert (TFDistilBertMainLayer)                   multiple                                        66362880          
                                                                                                                        
 pre_classifier (Dense)                               multiple                                        590592            
                                                                                                                        
 classifier (Dense)                                   multiple                                        3845              
                                                                                                   

In [9]:
# ─────────────────────────────────────────────
# 1. Configuración global
# ─────────────────────────────────────────────
RAW_FILES = ["train_set.csv", "val_set.csv", "eval_set.csv"]  # rutas
TEXT_COL  = "text"
LABELS    = ["agreeableness", "openness", "conscientiousness",
             "extraversion", "neuroticism"]

MAX_LEN   = 384
BATCH     = 2
EPOCHS    = 8
LR        = 2e-5
CHECKPOINT = "distilbert-base-uncased"

# ─────────────────────────────────────────────
# 2. Funciones auxiliares
# ─────────────────────────────────────────────
def limpiar(txt:str)->str:
    txt = re.sub(r"http\S+","",txt)
    txt = re.sub(r"[^A-Za-z\s]","",txt)
    txt = re.sub(r"\s+"," ",txt)
    return txt.lower().strip()

def load_and_bin(path):
    df = pd.read_csv(path, encoding="ISO-8859-1")
    df["clean_text"] = df[TEXT_COL].astype(str).apply(limpiar)
    # binarizar: ≥50 ⇒ 1, <50 ⇒ 0
    for col in LABELS:
        df[col] = (df[col] >= 50).astype(int)
    return df

def token_ds(df, tok, batch=BATCH):
    enc = tok(df["clean_text"].tolist(),
              padding=True, truncation=True,
              max_length=MAX_LEN, return_tensors="tf")
    y   = df[LABELS].values
    return tf.data.Dataset.from_tensor_slices((dict(enc), y)).batch(batch)

# ─────────────────────────────────────────────
# 3. Cargar / preparar datos
# ─────────────────────────────────────────────
train_df = load_and_bin(RAW_FILES[0])
val_df   = load_and_bin(RAW_FILES[1])
eval_df  = load_and_bin(RAW_FILES[2])

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

train_ds = token_ds(train_df, tokenizer)
val_ds   = token_ds(val_df,   tokenizer)
eval_ds  = token_ds(eval_df,  tokenizer)

# ─────────────────────────────────────────────
# 4. Modelo
# ─────────────────────────────────────────────
model = TFDistilBertForSequenceClassification.from_pretrained(
            CHECKPOINT, num_labels=len(LABELS),
            problem_type="multi_label_classification")

# ‣ Congelar 2 capas base para estabilidad
for layer in model.distilbert.transformer.layer[:2]:
    layer.trainable = False

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")]
)

callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
]

# ─────────────────────────────────────────────
# 5. Entrenamiento
# ─────────────────────────────────────────────
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=callbacks)

# ─────────────────────────────────────────────
# 6. Evaluación final
# ─────────────────────────────────────────────
print("🔍  Evaluación en eval_set:")
print(model.evaluate(eval_ds, return_dict=True))

# ─────────────────────────────────────────────
# 7. Función de predicción con scores (0-100)
# ─────────────────────────────────────────────
def predecir_scores(texto:str, umbral:float=0.5)->dict:
    inp   = tokenizer(texto, padding=True, truncation=True,
                      max_length=MAX_LEN, return_tensors="tf")
    logits = model(inp)[0]
    probs  = tf.sigmoid(logits)[0].numpy()           # 0-1
    scores = np.round(probs * 100, 1)                # 0-100
    return {tag: float(s) for tag,s in zip(LABELS, scores)}

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/8
  13/8024 [..............................] - ETA: 1:29:09 - loss: 0.6798 - accuracy: 0.4692

KeyboardInterrupt: 

In [None]:
# ─── Ejemplo ────────────────────────────────
demo = "I love brainstorming wild ideas over coffee with strangers."
print("\nPredicción demo:\n", predecir_scores(demo))

In [None]:
# ─────────────────────────────────────────────
# 8. Guardar versión entrenada
# ─────────────────────────────────────────────
model.save_pretrained("./modelo_ocean_newsplit")
tokenizer.save_pretrained("./modelo_ocean_newsplit")
