In [1]:
import pandas as pd
import re
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm
2025-06-02 14:31:38.996759: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-02 14:31:38.997308: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-02 14:31:38.999634: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-02 14:31:39.005651: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748871099.016156   31539 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has

In [2]:

# --- Limpieza de texto ---
def limpiar(texto):
    texto = re.sub(r"http\S+", "", texto)
    texto = re.sub(r"\|\|\|", " ", texto)
    texto = re.sub(r"[^A-Za-z\s]", "", texto)
    texto = re.sub(r"\s+", " ", texto)
    return texto.lower().strip()

# --- Convertir etiquetas MBTI a binario ---
def desglosar_mbti(tipo):
    return {
        "IE": 1 if tipo[0] == "I" else 0,
        "NS": 1 if tipo[1] == "N" else 0,
        "FT": 1 if tipo[2] == "F" else 0,
        "PJ": 1 if tipo[3] == "P" else 0,
    }

# --- Carga el CSV ---
df = pd.read_csv("mbti_1.csv")

# --- Limpia los textos ---
df["clean_posts"] = df["posts"].apply(limpiar)

# --- Desglosa las etiquetas ---
mbti_binario = df["type"].apply(desglosar_mbti).apply(pd.Series)
df = pd.concat([df, mbti_binario], axis=1)

# --- Divide en X y y ---
X = df["clean_posts"].tolist()
y = df[["IE", "NS", "FT", "PJ"]].values


In [3]:
# --- Separar train/test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Tokenizador ---
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="tf")

train_encodings = tokenize_texts(X_train)
test_encodings = tokenize_texts(X_test)

# --- Dataset TensorFlow ---
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(8)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(8)

# --- Modelo ---
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4, problem_type="multi_label_classification")

# --- Compilación ---
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# --- Entrenamiento ---
model.fit(train_dataset, validation_data=test_dataset, epochs=3)


E0000 00:00:1748871106.513807   31539 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1748871106.514136   31539 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architec

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7e66c8d07fe0>

In [14]:

# --- Función para predecir ---
def predecir_personalidad(texto):
    inputs = tokenizer(texto, padding=True, truncation=True, max_length=128, return_tensors="tf")
    logits = model(inputs)[0]
    probs = tf.sigmoid(logits)[0].numpy()
    etiquetas = ["Introvertido (I)", "Intuitivo (N)", "Sentimental (F)", "Perceptivo (P)"]
    resultado = []
    for i, p in enumerate(probs):
        if p > 0.5:
            resultado.append(etiquetas[i])
        else:
            resultado.append(etiquetas[i].replace("(", "Extrovertido (E)" if i==0 else
                                                   "Realista (S)" if i==1 else
                                                   "Lógico (T)" if i==2 else
                                                   "Juzgador (J)"))
    return "Tu personalidad parece: " + ", ".join(resultado)

# --- Ejemplo ---
print(predecir_personalidad("What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al-Quaeda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire US armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the USA and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo."))

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2112639474.py, line 19)