In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Carga el tokenizador y el modelo BETO
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

# Oración de ejemplo
sentence = "Hola, esto es una prueba con BERT."

# Tokeniza la oración
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Obtiene los embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Los embeddings de la última capa oculta serían
last_hidden_states = outputs.last_hidden_state

# Embedding del [CLS] token que representa la oración entera
sentence_embedding = last_hidden_states[:, 0, :]

# Embeddings de cada token/palabra en la oración
word_embeddings = last_hidden_states

print("Embedding de la oración:", sentence_embedding)
print("Embeddings de las palabras:", word_embeddings)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding de la oración: tensor([[ 6.3808e-01, -2.7869e-01, -1.9846e-01, -1.4776e-01,  3.0291e-01,
          7.8052e-01,  9.3868e-02,  7.2238e-01, -4.6559e-02,  4.8533e-01,
          1.1943e-01, -1.0878e+00,  3.5126e-01, -3.5981e-01,  4.9066e-01,
         -7.4640e-01, -4.0938e-01, -1.9730e-01, -3.7720e-02,  1.0112e-01,
          1.4318e-01,  5.2500e-02,  6.6923e-02, -1.2755e-01, -1.3663e-01,
          5.8170e-03,  3.3418e-01, -8.0998e-02, -4.7620e-01, -8.3185e-02,
          1.0710e-01,  1.7079e-01, -2.2541e-01,  4.2820e-01,  5.9105e-01,
         -4.4022e-01,  1.2990e-02, -3.1058e-01, -3.8460e-01, -6.4842e-01,
          1.3952e-01,  8.7414e-01,  7.6276e-01,  6.0126e-01, -3.9091e-02,
          1.7234e-01, -1.6539e-01,  1.2365e-01,  1.5036e-01, -5.5027e-01,
          3.2842e-01, -2.8707e-02, -3.6269e-01, -2.3796e-01,  6.1217e-01,
         -1.7271e-02,  2.6562e-01,  1.4247e-01, -4.9820e-01,  5.1619e-01,
          7.6938e-01,  2.9431e-01,  5.6766e-02, -3.4350e-01,  2.2901e-01,
          5.4

In [2]:
last_hidden_states.shape

torch.Size([1, 13, 768])

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/data_mlsp_fing.csv', encoding = "utf8")

In [4]:
df.head()

Unnamed: 0,oracion,palabraCompleja,evaluacion_normalizada
0,trabajo abnegado de los sanitarios,abnegado,3.0
1,El impacto de la X Solidaria es un ejemplo par...,paradigmático,4.0
2,Es además una enfermedad que afecta a todos lo...,incidencia,1.5
3,Es la primera vez que me toca participar en un...,autoridades,3.0
4,"Es probable, por lo tanto, y al igual que ocur...",seno,2.0


In [5]:
#rename columns
df.rename(columns={'oracion': 'sentence', 'palabraCompleja': 'target_word', 'evaluacion_normalizada': 'complexity'}, inplace=True)
df.head()

Unnamed: 0,sentence,target_word,complexity
0,trabajo abnegado de los sanitarios,abnegado,3.0
1,El impacto de la X Solidaria es un ejemplo par...,paradigmático,4.0
2,Es además una enfermedad que afecta a todos lo...,incidencia,1.5
3,Es la primera vez que me toca participar en un...,autoridades,3.0
4,"Es probable, por lo tanto, y al igual que ocur...",seno,2.0


In [6]:
#normalizemos la columna de complexity dividiendo entre 5 todo
df['complexity'] = df['complexity']/5

In [7]:
df.head()

Unnamed: 0,sentence,target_word,complexity
0,trabajo abnegado de los sanitarios,abnegado,0.6
1,El impacto de la X Solidaria es un ejemplo par...,paradigmático,0.8
2,Es además una enfermedad que afecta a todos lo...,incidencia,0.3
3,Es la primera vez que me toca participar en un...,autoridades,0.6
4,"Es probable, por lo tanto, y al igual que ocur...",seno,0.4


In [8]:
torch.cuda.device_count()

1

In [9]:
# check for cuda device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
import torch

device = torch.device("cuda")

model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Suponiendo que df es tu DataFrame que incluye las columnas 'sentence', 'target_word' y 'complexity'
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

from tqdm.auto import tqdm

def get_embeddings(text_list, tokenizer, model, device):
    """Obtiene embeddings [CLS] para una lista de textos, con barra de progreso."""
    model.eval()
    embeddings = []
    for text in tqdm(text_list, desc="Processing", leave=True):
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            output = model(**encoded_input)
        embeddings.append(output.last_hidden_state[:, 0, :].squeeze().cpu())
    return torch.stack(embeddings)


def process_dataframe(df, tokenizer, model, device):
    """Procesa un dataframe para obtener embeddings y complejidades."""
    sentence_embeddings = get_embeddings(df['sentence'].tolist(), tokenizer, model, device)
    word_embeddings = get_embeddings(df['target_word'].tolist(), tokenizer, model, device)
    complexities = torch.tensor(df['complexity'].values, dtype=torch.float).unsqueeze(1)
    return torch.cat((sentence_embeddings, word_embeddings, complexities), dim=1)

# Procesar los conjuntos de entrenamiento y prueba
train_data = process_dataframe(train_df, tokenizer, model, device)
test_data = process_dataframe(test_df, tokenizer, model, device)

Processing:   0%|          | 0/1859 [00:00<?, ?it/s]

Processing: 100%|██████████| 1859/1859 [00:22<00:00, 82.91it/s]
Processing: 100%|██████████| 1859/1859 [00:20<00:00, 89.76it/s]
Processing: 100%|██████████| 465/465 [00:05<00:00, 83.34it/s]
Processing: 100%|██████████| 465/465 [00:05<00:00, 89.89it/s]


In [12]:
train_data.shape, test_data.shape

(torch.Size([1859, 1537]), torch.Size([465, 1537]))

In [13]:
# Entradas de entrenamiento (todas las columnas excepto la última)
train_features = train_data[:, :-1]
# Objetivos de entrenamiento (última columna)
train_targets = train_data[:, -1]

# Entradas de prueba
test_features = test_data[:, :-1]
# Objetivos de prueba
test_targets = test_data[:, -1]

# Asegúrate de que los tensores estén en la CPU para convertirlos a arrays de NumPy
train_features_np = train_features.numpy()
train_targets_np = train_targets.numpy()
test_features_np = test_features.numpy()
test_targets_np = test_targets.numpy()


In [14]:
import tensorflow as tf

def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [16]:
import tensorflow as tf

# Definir el modelo
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(train_features.shape[1],)),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=rmse,
              metrics=['mean_absolute_error'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
# Entrenamiento del modelo
history = model.fit(train_features_np, train_targets_np,
                    validation_split=0.2,
                    epochs=5,
                    batch_size=32)

Epoch 1/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.4691 - mean_absolute_error: 0.3962 - val_loss: 0.2618 - val_mean_absolute_error: 0.2028
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2368 - mean_absolute_error: 0.1941 - val_loss: 0.2735 - val_mean_absolute_error: 0.2288
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.2375 - mean_absolute_error: 0.1900 - val_loss: 0.2406 - val_mean_absolute_error: 0.1974
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2134 - mean_absolute_error: 0.1720 - val_loss: 0.2591 - val_mean_absolute_error: 0.2176
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1972 - mean_absolute_error: 0.1583 - val_loss: 0.2697 - val_mean_absolute_error: 0.2080


In [18]:
# Evaluación del modelo
test_loss, test_mae = model.evaluate(test_features_np, test_targets_np, verbose=2)
print(f"Test RMSE: {test_loss}, Test MAE: {test_mae}")

15/15 - 0s - 3ms/step - loss: 0.2472 - mean_absolute_error: 0.1949
Test RMSE: 0.2471848428249359, Test MAE: 0.19494310021400452


In [53]:
model.save("modelos/Beto.keras")
