In [2]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and the encoder from the same multilingual checkpoint.
# (Spanish-only alternative tried earlier: "dccuchile/bert-base-spanish-wwm-uncased".)
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-multilingual-uncased")

# Example sentence
sentence = "Hola, esto es una prueba con BERT."

# Tokenize into PyTorch tensors, truncating to at most 128 tokens
inputs = tokenizer(
    sentence,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking (inference only)
with torch.no_grad():
    outputs = model(**inputs)

# Output of the last hidden layer, one vector per token
last_hidden_states = outputs.last_hidden_state

# Per-token embeddings: the full last-layer output
word_embeddings = last_hidden_states

# Sentence embedding: the vector at position 0, i.e. the [CLS] token
sentence_embedding = word_embeddings[:, 0]

print("Embedding de la oración:", sentence_embedding)
print("Embeddings de las palabras:", word_embeddings)


Embedding de la oración: tensor([[-1.0735e-01, -1.2773e-01,  4.5974e-02, -8.6625e-02, -2.2233e-01,
          7.4742e-03, -1.1118e-01, -8.4462e-02, -1.8734e+00, -2.1577e-02,
         -1.6335e-01, -1.5925e-01,  1.0128e-01,  6.2465e-02,  1.5979e-01,
          9.9915e-02,  9.7223e-02, -3.8102e-02, -1.7053e-02,  2.0564e-02,
         -9.6136e-02,  8.9360e-02, -5.5176e-02, -1.1231e-01,  5.0994e-01,
         -6.1095e-02,  1.0382e-01, -8.3100e-02, -2.0854e+00, -5.8676e-02,
         -2.8183e-01,  2.5542e-02, -3.5711e-02,  1.3042e-01,  4.4776e-02,
         -6.0031e-02,  8.0662e-02,  1.6509e+00,  7.3684e-02,  7.6071e-02,
         -8.1319e-02, -2.7686e-02, -7.7573e-02,  1.4531e-01, -4.5582e-02,
         -6.0797e-02,  1.0566e-01, -2.4155e-02, -1.5657e-02, -3.4596e-02,
         -6.2806e-02,  1.7225e-01,  2.8023e-02,  8.4980e-03, -1.2517e-01,
          6.4119e-02, -9.5092e-02, -5.2815e-02,  6.9577e-04,  1.1222e-01,
          1.8972e+00,  9.9849e-02, -1.2927e-01, -5.7768e-02,  6.6760e-02,
          4.2

In [3]:
# Inspect the dimensions of the last-layer output; the recorded run shows torch.Size([1, 12, 768]).
last_hidden_states.shape

torch.Size([1, 12, 768])

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import json
# Load the dataset from a JSON Lines file (one JSON object per line).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of making json.loads raise.
with open('../scripts/converted_data.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Build a pandas DataFrame with one row per record
df = pd.DataFrame(data)

In [5]:
# Rename the 'token' column to 'target_word'; 'sentence' and 'complexity'
# already carry the names used downstream, so they need no mapping.
# Assigning the result (instead of inplace=True) avoids mutating the frame
# created in the previous cell; re-running this cell is harmless because
# rename ignores labels that are not present.
df = df.rename(columns={'token': 'target_word'})
df.head()

Unnamed: 0,sentence,target_word,complexity
0,"Behold, there came up out of the river seven c...",river,0.0
1,I am a fellow bondservant with you and with yo...,brothers,0.0
2,"The man, the lord of the land, said to us, 'By...",brothers,0.05
3,Shimei had sixteen sons and six daughters; but...,brothers,0.15
4,"""He has put my brothers far from me.",brothers,0.263889


In [11]:
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the encoder's weights to the selected device
model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# df is expected to provide the 'sentence', 'target_word' and 'complexity' columns.
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

from tqdm.auto import tqdm

def get_embeddings(text_list, tokenizer, model, device, batch_size=1):
    """Compute [CLS] embeddings for a collection of texts, with a progress bar.

    Args:
        text_list: Iterable of strings to encode.
        tokenizer: Callable tokenizer; invoked with a list of texts and
            ``return_tensors='pt'``, its result is moved to ``device``.
        model: Encoder whose output exposes ``last_hidden_state``; it is
            switched to eval mode before encoding.
        device: Device the tokenized inputs are moved to before the forward pass.
        batch_size: Number of texts per forward pass. The default of 1 keeps
            the original one-text-at-a-time behaviour; larger values trade
            memory for speed.

    Returns:
        A CPU tensor with one row per input text, holding the position-0
        vector of the model's last hidden state. For empty input the result
        has zero rows; its width is ``model.config.hidden_size`` when the
        model exposes it, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model.eval()
    texts = list(text_list)

    # torch.stack/torch.cat reject an empty list, so handle "nothing to
    # encode" explicitly instead of failing after the loop.
    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    chunks = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing", leave=True):
        batch = texts[start:start + batch_size]
        encoded_input = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            output = model(**encoded_input)
        # Keep the batch axis (no bare squeeze) so every chunk stays 2-D
        # regardless of batch size or hidden width.
        chunks.append(output.last_hidden_state[:, 0, :].cpu())
    return torch.cat(chunks, dim=0)


def process_dataframe(df, tokenizer, model, device):
    """Turn a dataframe into a single feature-plus-target matrix.

    The 'sentence' column and then the 'target_word' column are encoded with
    get_embeddings; the 'complexity' column is converted to a float column
    vector. The three pieces are concatenated side by side, so the target
    ends up in the last column of the returned tensor.
    """
    text_columns = ('sentence', 'target_word')
    embedded = [
        get_embeddings(df[column].tolist(), tokenizer, model, device)
        for column in text_columns
    ]
    targets = torch.tensor(df['complexity'].values, dtype=torch.float).unsqueeze(1)
    return torch.cat((*embedded, targets), dim=1)

# Encode the training split first, then the test split
train_data = process_dataframe(df=train_df, tokenizer=tokenizer, model=model, device=device)
test_data = process_dataframe(df=test_df, tokenizer=tokenizer, model=model, device=device)

Processing:   0%|          | 0/6129 [00:00<?, ?it/s]

Processing:   0%|          | 0/6129 [00:00<?, ?it/s]

Processing:   0%|          | 0/1533 [00:00<?, ?it/s]

Processing:   0%|          | 0/1533 [00:00<?, ?it/s]

In [13]:
# Display the shapes of both matrices; the recorded run shows (6129, 1537) and (1533, 1537).
train_data.shape, test_data.shape

(torch.Size([6129, 1537]), torch.Size([1533, 1537]))

In [19]:
# The last column holds the target; every column before it is an input feature.
train_features, train_targets = train_data[:, :-1], train_data[:, -1]
test_features, test_targets = test_data[:, :-1], test_data[:, -1]

# Convert to NumPy arrays (the tensors must live on the CPU for this to work)
train_features_np, train_targets_np = train_features.numpy(), train_targets.numpy()
test_features_np, test_targets_np = test_features.numpy(), test_targets.numpy()


In [20]:
import tensorflow as tf

def rmse(y_true, y_pred):
    """Root-mean-square error between targets and predictions.

    Args:
        y_true: Ground-truth values (tensor or array-like).
        y_pred: Predicted values (tensor or array-like).

    Returns:
        A scalar tensor: sqrt(mean((y_true - y_pred) ** 2)).
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Match dtypes so that e.g. float64 targets and float32 predictions do
    # not make the subtraction raise.
    y_true = tf.cast(y_true, y_pred.dtype)

    # A (batch,) vector against a (batch, 1) column would broadcast to a
    # (batch, batch) matrix and silently yield a wrong error value, so give
    # both operands the same rank before subtracting.
    if y_true.shape.rank == 1 and y_pred.shape.rank == 2 and y_pred.shape[-1] == 1:
        y_true = tf.expand_dims(y_true, axis=-1)
    elif y_true.shape.rank == 2 and y_pred.shape.rank == 1 and y_true.shape[-1] == 1:
        y_pred = tf.expand_dims(y_pred, axis=-1)

    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [17]:
import tensorflow as tf

# Feed-forward regressor on top of the concatenated embeddings.
# NOTE: this rebinds `model`, which earlier cells use for the BERT encoder,
# and it requires `rmse` to be defined before this cell runs.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(256, activation='relu', input_shape=(train_features.shape[1],)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(1))

# Train against the custom RMSE loss and track mean absolute error as well
model.compile(
    loss=rmse,
    optimizer='adam',
    metrics=['mean_absolute_error'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               393472    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 426497 (1.63 MB)
Trainable params: 426497 (1.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
# Fit the regressor: 10 epochs, mini-batches of 32, 20% of the training data used for validation
history = model.fit(
    x=train_features_np,
    y=train_targets_np,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Score the model on the held-out test set; the result unpacks into the loss and the extra metric
test_loss, test_mae = model.evaluate(
    x=test_features_np,
    y=test_targets_np,
    verbose=2,
)
print(f"Test RMSE: {test_loss}, Test MAE: {test_mae}")

48/48 - 0s - loss: 0.1110 - mean_absolute_error: 0.0851 - 77ms/epoch - 2ms/step
Test RMSE: 0.11103618890047073, Test MAE: 0.08508406579494476
