In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random

embedding_dim = 512
hidden_dim = 512
dropout_prob = 0.5
learning_rate = 1e-4
batch_size = 32
num_steps = 20000 
max_source_seq_length = 500

train_sources_path = "train.sources"
train_targets_path = "train.targets"
dev_sources_path = "dev.sources"
dev_targets_path = "dev.targets"
test_sources_path = "test.sources"
test_targets_path = "test.targets"

padding_token = '^'

def update_vocab(text, vocab, char_to_index):
    for char in text:
        if char not in vocab:
            index = len(vocab)
            vocab.add(char)
            char_to_index[char] = index

# Load and preprocess the source and target data with padding
def load_and_preprocess_data(source_path, target_path, char_to_index, max_seq_length):
    source_data = []
    target_data = []

    with open(source_path, "r") as source_file, open(target_path, "r") as target_file:
        source_lines = source_file.readlines()
        target_lines = target_file.readlines()

        for source_line, target_line in zip(source_lines, target_lines):
            source_line = f'#{source_line.strip()[:max_seq_length]}@'
            target_line = f'#{target_line.strip()[:max_seq_length]}@'

            source_data.append(source_line)
            target_data.append(target_line)

    # Pad sequences with the padding token
    source_data = [source.ljust(max_seq_length, padding_token) for source in source_data]
    target_data = [target.ljust(max_seq_length, padding_token) for target in target_data]

    update_vocab(''.join(source_data), char_vocab, char_to_index)
    update_vocab(''.join(target_data), char_vocab, char_to_index)

    source_data = [[char_to_index[char] for char in source] for source in source_data]
    target_data = [[char_to_index[char] for char in target] for target in target_data]

    return source_data, target_data

# Create character vocabularies and index dictionaries
char_vocab = set()
char_to_index = {}

# Load and preprocess training data
train_source_data, train_target_data = load_and_preprocess_data(
    train_sources_path, train_targets_path, char_to_index, max_source_seq_length
)

val_source_data, val_target_data = load_and_preprocess_data(
    dev_sources_path, dev_targets_path, char_to_index, max_source_seq_length
)

test_source_data, test_target_data = load_and_preprocess_data(
    test_sources_path, test_targets_path, char_to_index, max_source_seq_length
)

# num_epochs = num_steps // (len(train_source_data) // batch_size) + 1
num_epochs=200

# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, source_data, target_data):
        self.source_data = source_data
        self.target_data = target_data

    def __len__(self):
        return len(self.source_data)

    def __getitem__(self, idx):
        return torch.tensor(self.source_data[idx], dtype=torch.long), torch.tensor(self.target_data[idx], dtype=torch.long)
    
train_source_data = torch.tensor(train_source_data, dtype=torch.long)
train_target_data = torch.tensor(train_target_data, dtype=torch.long)

val_source_data = torch.tensor(val_source_data, dtype=torch.long)
val_target_data = torch.tensor(val_target_data, dtype=torch.long)

test_source_data = torch.tensor(test_source_data, dtype=torch.long)
test_target_data = torch.tensor(test_target_data, dtype=torch.long)

print(char_vocab)
print(char_to_index['#'])

{'z', '7', '!', '&', 'k', 'o', 'g', 'w', '.', 'I', 'm', '6', '0', 'E', 'H', '[', '}', 'C', 'P', ':', '\\', 'D', '@', 'R', 'W', 'e', 'F', 'J', 'j', '#', '4', 'Q', 'd', 'r', ')', 's', '8', '/', 'b', 'f', 'i', '3', '5', 'Z', 'c', '{', '^', 'U', '$', ']', ' ', 'A', 'y', 'V', "'", 'l', 'x', '"', 'n', 'a', 'L', '?', '(', 'G', 'S', '-', '9', '2', 'p', '*', '1', ',', 'T', 'B', 't', 'Y', 'q', 'v', 'O', 'h', 'u', 'M', 'X', 'K', '+', 'N'}
0


In [2]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, (hidden, cell)

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.shape[0]
        hidden = hidden.repeat(seq_len, 1, 1).transpose(0, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden.permute(1,0,2), encoder_outputs), dim=2)))
        attention_scores = torch.matmul(energy, self.v)
        attention_weights = torch.softmax(attention_scores, dim=0)
        context_vector = torch.sum(attention_weights.unsqueeze(2) * encoder_outputs, dim=0)
        return context_vector


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + hidden_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        context_vector = self.attention(hidden[-1], encoder_outputs)
        emb_con = torch.cat((embedded, context_vector.unsqueeze(0)), dim=2)
        output, (hidden, cell) = self.rnn(emb_con, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, (hidden, cell) = self.encoder(src)

        input = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        return outputs

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = len(char_vocab)
OUTPUT_DIM = len(char_vocab)
N_LAYERS = 2
ENC_EMB_DIM = embedding_dim
DEC_EMB_DIM = embedding_dim
HID_DIM = hidden_dim
ENC_DROPOUT = dropout_prob
DEC_DROPOUT = dropout_prob

attention = Attention(HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)
model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
ignore_index = char_to_index[padding_token]
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

In [4]:
# Function to evaluate the model on the validation set
def evaluate(model,step,source_data,target_data, criterion,device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        start_idx = step * batch_size
        end_idx = (step + 1) * batch_size
        source_batch = source_data[start_idx:end_idx].to(device)
        target_batch = target_data[start_idx:end_idx].to(device)
        
        source_batch, target_batch = source_batch.to(device), target_batch.to(device)

        output = model(source_batch, target_batch, teacher_forcing_ratio=0.0)  # Set teacher_forcing_ratio to 0 during evaluation
        output_dim = output.shape[2]
        output = output[1:].view(-1, output_dim)
        target_batch = target_batch[1:].view(-1)

        loss = criterion(output, target_batch)
        total_loss += loss.item()

    return total_loss

In [5]:
def train(model,step,source_data,target_data,criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    
    start_idx = step * batch_size
    end_idx = (step + 1) * batch_size
    source_batch = source_data[start_idx:end_idx].to(device)
    target_batch = target_data[start_idx:end_idx].to(device)

    # Zero the gradients
    optimizer.zero_grad()

    # Forward pass
    output = model(source_batch, target_batch)

    # Reshape output and target for calculating the loss
    output_dim = output.shape[2]
    output = output[1:].view(-1, output_dim)
    target_batch = target_batch[1:].view(-1)

    # Calculate the loss
    loss = criterion(output, target_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    return total_loss

In [6]:
# Training and validation loop
for epoch in range(num_epochs):
    # Training
    train_loss = train(model,epoch,train_source_data,train_target_data, criterion, optimizer, device)
    print(f'Epoch [{epoch + 1}/{num_epochs}]: Training Loss: {train_loss:.4f}')

    # Evaluation on validation set
    val_loss = evaluate(model,epoch,val_source_data,val_target_data, criterion,device)
    print(f'Epoch [{epoch + 1}/{num_epochs}]: Validation Loss: {val_loss:.4f}')

# Save the model after training
torch.save(model.state_dict(), 'data2vis_model1.pth')

Epoch [1/200]: Training Loss: 4.4523
Epoch [1/200]: Validation Loss: 4.4414
Epoch [2/200]: Training Loss: 4.4409
Epoch [2/200]: Validation Loss: 4.4246
Epoch [3/200]: Training Loss: 4.4299
Epoch [3/200]: Validation Loss: 4.4002
Epoch [4/200]: Training Loss: 4.4169
Epoch [4/200]: Validation Loss: 4.3682
Epoch [5/200]: Training Loss: 4.3960
Epoch [5/200]: Validation Loss: 4.3435
Epoch [6/200]: Training Loss: 4.3837
Epoch [6/200]: Validation Loss: 4.3213
Epoch [7/200]: Training Loss: 4.3494
Epoch [7/200]: Validation Loss: 4.2801
Epoch [8/200]: Training Loss: 4.3344
Epoch [8/200]: Validation Loss: 4.2318
Epoch [9/200]: Training Loss: 4.2998
Epoch [9/200]: Validation Loss: 4.1995
Epoch [10/200]: Training Loss: 4.2619
Epoch [10/200]: Validation Loss: 4.1473
Epoch [11/200]: Training Loss: 4.2331
Epoch [11/200]: Validation Loss: 4.0566
Epoch [12/200]: Training Loss: 4.1759
Epoch [12/200]: Validation Loss: 3.9780
Epoch [13/200]: Training Loss: 4.1103
Epoch [13/200]: Validation Loss: 3.9165
Epoc

In [None]:
index_to_char = {index: char for char, index in char_to_index.items()}

class BeamSearchNode:
    def __init__(self, decoder_input, hidden, cell, log_prob, length):
        self.decoder_input = decoder_input
        self.hidden = hidden
        self.cell = cell
        self.log_prob = log_prob
        self.length = length

def beam_search_decode(model, src, max_length, beam_width):
    model.eval()

    with torch.no_grad():
        # Encode the source sequence
        encoder_outputs, (hidden, cell) = model.encoder(src)
        start_symbol = torch.tensor([[char_to_index['#']]], device=device)
        beam_search_nodes = [BeamSearchNode(start_symbol, hidden, cell, 0, 0)]

        for _ in range(max_length):
            new_nodes = []

            for node in beam_search_nodes:
                decoder_input = node.decoder_input.view(-1)
                hidden = node.hidden.squeeze().unsqueeze(1)
                cell = node.cell.squeeze().unsqueeze(1)
                encoder_outputs=encoder_outputs.squeeze().unsqueeze(1)
                log_prob = node.log_prob
                length = node.length
                # Decoding step
                output, hidden, cell = model.decoder(decoder_input, hidden, cell, encoder_outputs)

                # Get the top-k predictions and their probabilities
                log_probs, indices = torch.topk(nn.functional.log_softmax(output, dim=1), beam_width)

                for i in range(beam_width):
                    new_decoder_input = indices[0][i].unsqueeze(0).unsqueeze(0)
                    new_log_prob = log_probs[0][i].item()

                    # If the next token is the padding token, stop generating
                    if new_decoder_input.item() == char_to_index['^']:
                        new_length = length
                    else:
                        new_length = length + 1
                        
                    new_nodes.append(
                        BeamSearchNode(new_decoder_input, hidden, cell, log_prob + new_log_prob, new_length)
                    )

            new_nodes.sort(key=lambda x: x.log_prob, reverse=True)
            beam_search_nodes = new_nodes[:beam_width]
            
        best_node = beam_search_nodes[0]

        # Generate the decoded sequence
        decoded_sequence = [best_node.decoder_input.item()]
        for _ in range(best_node.length - 1):
            output, best_node.hidden, best_node.cell = model.decoder(
                best_node.decoder_input.view(-1), best_node.hidden, best_node.cell,encoder_outputs
            )
            best_decoder_input = output.argmax(dim=1)
            decoded_sequence.append(best_decoder_input.item())
            best_node.decoder_input = best_decoder_input

        return decoded_sequence


model = Seq2Seq(Encoder(len(char_vocab), embedding_dim, hidden_dim, 2, dropout_prob),
                Decoder(len(char_vocab), embedding_dim, hidden_dim, 2, dropout_prob, Attention(hidden_dim)),
                device)
model_state_dict = torch.load('data2vis_model1.pth')
model_state_dict = {k: v for k, v in model_state_dict.items() if k in model.state_dict()}
model.load_state_dict(model_state_dict)

# Perform inference using beam search
test_idx = 0
src_sequence = test_source_data[test_idx].to(device)
decoded_sequence = beam_search_decode(model, src_sequence, max_length=2000, beam_width=15)

# Convert the decoded sequence back to characters
decoded_sequence_chars = [index_to_char[idx] for idx in decoded_sequence]

# print("Decoded Sequence:", decoded_sequence_chars)


In [7]:
def evaluate_perplexity(model, source_data, target_data, criterion, device):
    model.eval()
    total_log_perplexity = 0.0
    total_tokens = 0

    with torch.no_grad():
        for step in range(len(source_data) // 256):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            source_batch = source_data[start_idx:end_idx].to(device)
            target_batch = target_data[start_idx:end_idx].to(device)

            source_batch, target_batch = source_batch.to(device), target_batch.to(device)

            output = model(source_batch, target_batch, teacher_forcing_ratio=0.0)
            output_dim = output.shape[2]
            output = output[1:].view(-1, output_dim)
            target_batch = target_batch[1:].view(-1)

            loss = criterion(output, target_batch)
            total_log_perplexity += loss.item()
            total_tokens += target_batch.numel()

    average_log_perplexity = total_log_perplexity / total_tokens
    return average_log_perplexity

In [None]:
test_log_perplexity = evaluate_perplexity(model, test_source_data, test_target_data, criterion, device)
print(f'Test Average Log Perplexity: {test_log_perplexity:.4f}')