In [1]:
import sys

sys.path.append('../src')

In [2]:
import torch
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [3]:
from config.config import PATHS
from encoders.encoders import SentenceEncoder, DRSEncoder
from nlp.utils_vocab import PandasDataset, BasicTokenizer

In [4]:
data_folder = PATHS['training_data_folder']
data_path = Path(data_folder, 'negacion_5.csv')
data = pd.read_csv(data_path, names=['Sentence 1', 'Sentence 2', 'Relation'])
print(f'Data shape: {data.shape}')
data.head()

Data shape: (5750, 3)


Unnamed: 0,Sentence 1,Sentence 2,Relation
0,todo abuelo acuerda,algún abuelo no acuerda,1
1,todo abuelo aguanta,algún abuelo no aguanta,1
2,todo abuelo ama,algún abuelo no ama,1
3,todo abuelo amanece,algún abuelo no amanece,1
4,todo abuelo anochece,algún abuelo no anochece,1


In [5]:
df_sentences = pd.DataFrame(data['Sentence 1'].tolist() + data['Sentence 2'].tolist(), columns=['X'])
df_sentences['X'] = df_sentences['X'].apply(lambda x: x.strip())
df_sentences['Y'] = df_sentences['X']
df_sentences.head()

Unnamed: 0,X,Y
0,todo abuelo acuerda,todo abuelo acuerda
1,todo abuelo aguanta,todo abuelo aguanta
2,todo abuelo ama,todo abuelo ama
3,todo abuelo amanece,todo abuelo amanece
4,todo abuelo anochece,todo abuelo anochece


In [6]:
tokenizer_path = PATHS['tokenizer_folder']
tokenizer_file = Path(tokenizer_path, 'tokenizer_5.pkl')
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']

simple_tokenizer = lambda tokens_string: tokens_string.strip().split()
tokenizer = BasicTokenizer.create_using_stoi(simple_tokenizer, special_symbols, tokenizer_file)
print('vocabulary_size:', tokenizer.get_vocab_size())

vocabulary_size: 41


In [7]:
# Define features (X) and target (y)
X = df_sentences['X']  # Replace with your feature columns
y = df_sentences['Y']  # Replace with your target column

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Output the shapes to verify the split
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

df_train = pd.DataFrame({
    'X': X_train.tolist(),
    'Y': y_train.tolist()
})

df_test = pd.DataFrame({
    'X': X_train.tolist(),
    'Y': y_train.tolist()
})
df_test.head()

X_train shape: (9200,)
X_test shape: (2300,)
y_train shape: (9200,)
y_test shape: (2300,)


Unnamed: 0,X,Y
0,todo alce aguanta y anochece,todo alce aguanta y anochece
1,todo albañil acuerda,todo albañil acuerda
2,no algún actor no amanece,no algún actor no amanece
3,todo albañil alegre ama,todo albañil alegre ama
4,algún alce amanece y anochece,algún alce amanece y anochece


In [8]:
PAD_IDX = special_symbols.index('[PAD]')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch):
    input_ids = list()
    output_ids = list()
    for inputs, outputs in batch:
        input_id = tokenizer.encode([inputs]).ids
        input_id = torch.tensor(input_id, dtype=torch.float)
        input_ids.append(input_id)
        output_id = tokenizer.encode([outputs]).ids
        output_id = torch.tensor(output_id, dtype=torch.float)
        output_ids.append(output_id)
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=PAD_IDX)
    output_ids = pad_sequence(output_ids, batch_first=True, padding_value=PAD_IDX)
    return input_ids.to(device), output_ids.to(device)


In [9]:
BATCH_SIZE = 2

train_dataset = PandasDataset(
    df=df_train,
    x_cols=['X'],
    y_col='Y'
)
test_dataset = PandasDataset(
    df=df_test,
    x_cols=['X'],
    y_col='Y'
)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [10]:
inputs, outputs = next(iter(train_dataloader))
print(f'inputs: {inputs}')
print(f'outputs: {outputs}')

inputs: tensor([[24., 21.,  9., 22., 13.,  1.,  1.],
        [24., 21.,  5.,  9., 22.,  5., 13.]])
outputs: tensor([[24., 21.,  9., 22., 13.,  1.,  1.],
        [24., 21.,  5.,  9., 22.,  5., 13.]])


In [12]:
sentence_encoder = SentenceEncoder()

sentence_encoder.train(dataloader=train_dataloader, device=device)

  0%|          | 0/10 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x6 and 10x8)

In [11]:
sentence_encoder.forward("Hello world")

NameError: name 'sentence_encoder' is not defined