In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np
from tqdm import tqdm

# Cargar datos
df = pd.read_csv('C:/Users/cesco/Desktop/Personal/UPY/9/NLP/proyecto/train.csv',
                 header=None, names=['polarity', 'summary', 'reviewText'])

# Crear columnas de reseñas invertidas
df['invertedReview'] = df.apply(
    lambda row: row['reviewText'].replace("good", "bad").replace("excellent", "terrible") 
                if row['polarity'] == 1 else
                row['reviewText'].replace("bad", "good").replace("terrible", "excellent"),
    axis=1
)

# Tokenizador basado en palabras
class WordTokenizer:
    def __init__(self, vocab_size=5000):
        self.vocab_size = vocab_size
        self.word_to_idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx_to_word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

    def fit(self, texts):
        word_counts = Counter(word for text in texts for word in text.split())
        most_common = word_counts.most_common(self.vocab_size - 4)
        for idx, (word, _) in enumerate(most_common, start=4):
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word

    def encode(self, text):
        return [self.word_to_idx.get(word, 3) for word in text.split()] + [2]

    def decode(self, indices):
        return " ".join(self.idx_to_word.get(idx, "<UNK>") for idx in indices if idx not in [0, 1, 2])

# Preparar datos
tokenizer = WordTokenizer()
tokenizer.fit(df['reviewText'].tolist() + df['invertedReview'].tolist())

df['encodedReview'] = df['reviewText'].apply(tokenizer.encode)
df['encodedInvertedReview'] = df['invertedReview'].apply(tokenizer.encode)

# Truncar las secuencias
MAX_SEQ_LENGTH = 50
df['encodedReview'] = df['encodedReview'].apply(lambda x: x[:MAX_SEQ_LENGTH])
df['encodedInvertedReview'] = df['encodedInvertedReview'].apply(lambda x: x[:MAX_SEQ_LENGTH])

# Usar una fracción más pequeña del dataset para pruebas rápidas
df = df.sample(frac=0.1, random_state=42)

# Dividir en entrenamiento y validación
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)

# Dataset y DataLoader
class ReviewDataset(Dataset):
    def __init__(self, reviews, targets):
        self.reviews = reviews
        self.targets = targets

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return torch.tensor(self.reviews[idx]), torch.tensor(self.targets[idx])

train_dataset = ReviewDataset(train_data['encodedReview'].tolist(), train_data['encodedInvertedReview'].tolist())
val_dataset = ReviewDataset(val_data['encodedReview'].tolist(), val_data['encodedInvertedReview'].tolist())

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=lambda x: pad_sequences(x))
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=lambda x: pad_sequences(x))

# Padding de secuencias
def pad_sequences(batch):
    reviews, targets = zip(*batch)
    review_lens = [len(r) for r in reviews]
    target_lens = [len(t) for t in targets]

    reviews_padded = nn.utils.rnn.pad_sequence(reviews, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)

    return reviews_padded, targets_padded, torch.tensor(review_lens), torch.tensor(target_lens)

# Modelo Encoder-Decoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, (hidden, cell) = self.lstm(packed)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        embedded = self.embedding(x).unsqueeze(1)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device  # Agregar el atributo device

    def forward(self, src, src_lengths, trg, teacher_forcing_ratio=0.5):
        hidden, cell = self.encoder(src, src_lengths)
        outputs = torch.zeros(trg.size(0), trg.size(1), self.decoder.embedding.num_embeddings).to(self.device)

        input_token = torch.tensor([1] * trg.size(0)).to(self.device)  # <SOS>
        for t in range(1, trg.size(1)):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t, :] = output
            input_token = trg[:, t] if np.random.random() < teacher_forcing_ratio else output.argmax(1)

        return outputs


In [7]:
def train_epoch(model, loader, optimizer, criterion, clip=1.0):
    model.train()
    epoch_loss = 0

    for reviews, targets, review_lens, target_lens in tqdm(loader, desc="Training"):
        reviews, targets, review_lens, target_lens = (
            reviews.to(model.device),
            targets.to(model.device),
            review_lens.to(model.device),
            target_lens.to(model.device)
        )

        optimizer.zero_grad()

        # Forward pass
        outputs = model(reviews, review_lens, targets)

        # Calculate loss, ignoring padding tokens
        outputs = outputs[:, 1:].reshape(-1, outputs.size(-1))
        targets = targets[:, 1:].reshape(-1)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(loader)

def evaluate_epoch(model, loader, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for reviews, targets, review_lens, target_lens in tqdm(loader, desc="Evaluating"):
            reviews, targets, review_lens, target_lens = (
                reviews.to(model.device),
                targets.to(model.device),
                review_lens.to(model.device),
                target_lens.to(model.device)
            )

            # Forward pass
            outputs = model(reviews, review_lens, targets, teacher_forcing_ratio=0)

            # Calculate loss, ignoring padding tokens
            outputs = outputs[:, 1:].reshape(-1, outputs.size(-1))
            targets = targets[:, 1:].reshape(-1)
            loss = criterion(outputs, targets)

            epoch_loss += loss.item()

    return epoch_loss / len(loader)

def calculate_perplexity(loss):
    return torch.exp(torch.tensor(loss)).item()


In [8]:
def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion):
    best_valid_loss = float('inf')
    history = []

    for epoch in range(n_epochs):
        print(f"Epoch {epoch + 1}/{n_epochs}")

        # Entrenar una época
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        valid_loss = evaluate_epoch(model, val_loader, criterion)

        train_perplexity = calculate_perplexity(train_loss)
        valid_perplexity = calculate_perplexity(valid_loss)

        print(f"Train Loss: {train_loss:.4f} | Train Perplexity: {train_perplexity:.2f}")
        print(f"Valid Loss: {valid_loss:.4f} | Valid Perplexity: {valid_perplexity:.2f}")

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_perplexity': train_perplexity,
            'valid_loss': valid_loss,
            'valid_perplexity': valid_perplexity
        })

        # Guardar el mejor modelo
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best_seq2seq_model.pt')
            print("Model saved!")

    return history


In [9]:
def invert_text(model, tokenizer, text, max_len=50):
    model.eval()

    with torch.no_grad():
        # Tokenizar texto de entrada
        encoded_text = tokenizer.encode(text)
        input_tensor = torch.tensor(encoded_text).unsqueeze(0).to(model.device)
        input_length = torch.tensor([len(encoded_text)]).to(model.device)

        # Obtener estados iniciales del encoder
        hidden, cell = model.encoder(input_tensor, input_length)

        # Iniciar decodificación
        input_token = torch.tensor([1]).to(model.device)  # <SOS>
        result = []

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)
            predicted_token = output.argmax(1).item()

            if predicted_token == 2:  # <EOS>
                break

            result.append(predicted_token)
            input_token = torch.tensor([predicted_token]).to(model.device)

        return tokenizer.decode(result)


In [10]:
# Hiperparámetros
VOCAB_SIZE = len(tokenizer.word_to_idx)
EMBED_SIZE = 4
HIDDEN_SIZE = 8
N_EPOCHS = 3
LEARNING_RATE = 0.001

# Configurar dispositivo
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Inicializar encoder, decoder y modelo completo
encoder = Encoder(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE)
decoder = Decoder(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE)
seq2seq_model = Seq2Seq(encoder, decoder, device).to(device)

# Inicializar optimizador y función de pérdida
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Entrenar modelo
history = train_model(seq2seq_model, train_loader, val_loader, N_EPOCHS, optimizer, criterion)


Epoch 1/3


Training: 100%|██████████| 5063/5063 [1:12:39<00:00,  1.16it/s]
Evaluating: 100%|██████████| 563/563 [00:51<00:00, 10.94it/s]


Train Loss: 5.8076 | Train Perplexity: 332.81
Valid Loss: 5.7152 | Valid Perplexity: 303.43
Model saved!
Epoch 2/3


Training: 100%|██████████| 5063/5063 [1:16:34<00:00,  1.10it/s]
Evaluating: 100%|██████████| 563/563 [00:50<00:00, 11.09it/s]


Train Loss: 5.5200 | Train Perplexity: 249.64
Valid Loss: 5.6801 | Valid Perplexity: 292.97
Model saved!
Epoch 3/3


Training: 100%|██████████| 5063/5063 [1:19:36<00:00,  1.06it/s]
Evaluating: 100%|██████████| 563/563 [00:50<00:00, 11.22it/s]

Train Loss: 5.4060 | Train Perplexity: 222.75
Valid Loss: 5.6274 | Valid Perplexity: 277.93
Model saved!



