In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
# Load Data
def load_data(file_x, file_y):
    """Read a token file and a label file into parallel lists of lists.

    Each file is read whole, stripped of leading/trailing whitespace, split
    into lines on ``'\\n'`` and every line is split on ``','``.

    Args:
        file_x: Path of the text file holding one comma-separated sentence per line.
        file_y: Path of the text file holding one comma-separated label row per line.

    Returns:
        A ``(sentences, labels)`` tuple; each element is a list of lists of
        strings. An empty file yields ``[['']]`` because splitting the empty
        string still produces one (empty) field.
    """
    def _read_rows(path):
        # One row per line, one field per comma-separated item.
        with open(path, 'r') as handle:
            rows = handle.read().strip().split('\n')
        return [row.split(',') for row in rows]

    return _read_rows(file_x), _read_rows(file_y)

print("Loading Data Started")
# Training split.
train_sentences, train_labels = load_data(
    file_x='database/x_train.txt',
    file_y='database/y_train.txt',
)
# The dev split is what the rest of the notebook refers to as "test".
test_sentences, test_labels = load_data(
    file_x='database/x_dev.txt',
    file_y='database/y_dev.txt',
)
print("Loading Data Done")

Loading Data Started
Loading Data Done


In [4]:
# Build Vocabulary
def build_vocab(sentences, vocab_path="database/vocabulary.txt"):
    """Build the word -> index mapping from a vocabulary file.

    The vocabulary is read from ``vocab_path`` (one word per line). Words are
    stripped, de-duplicated and sorted, then numbered from 1; index 0 is
    reserved for the ``'<PAD>'`` token.

    Args:
        sentences: Unused. Kept so existing callers that pass the corpus
            continue to work; the vocabulary comes only from the file.
        vocab_path: Path of the vocabulary file. Defaults to the location the
            notebook has always used.

    Returns:
        dict mapping each word to a unique integer. The indices are exactly
        ``0 .. len(result) - 1`` with no gaps, so ``len(result)`` is a valid
        embedding-table size.
    """
    with open(vocab_path, "r") as fv:
        # Strip *before* filtering: a raw line still carries its newline, so
        # comparing the unstripped line with "" never removes blank lines.
        # A set also drops duplicate entries, which would otherwise leave
        # holes in the index range.
        words = {line.strip() for line in fv}
    words.discard("")
    # '<PAD>' always gets index 0 below; if the file lists it as well, numbering
    # it here too would waste an index and leave a gap.
    words.discard("<PAD>")

    word2idx = {word: idx for idx, word in enumerate(sorted(words), start=1)}
    word2idx['<PAD>'] = 0
    return word2idx

def build_label_vocab(labels):
    """Map every distinct label to an integer index.

    Args:
        labels: Iterable of label sequences (one sequence per sentence).

    Returns:
        dict assigning ``0, 1, 2, ...`` to the distinct labels in sorted order.
    """
    distinct = set()
    for label_list in labels:
        distinct.update(label_list)

    label2idx = {}
    for position, name in enumerate(sorted(distinct)):
        label2idx[name] = position
    return label2idx

print("Building Vocab Started")
# Train and dev data are concatenated before being handed to the builders.
word2idx = build_vocab(train_sentences + test_sentences)
label2idx = build_label_vocab(train_labels + test_labels)
# Inverse mapping: index -> label string.
idx2label = dict(zip(label2idx.values(), label2idx.keys()))
print("Building Vocab Done")

Building Vocab Started
Building Vocab Done


In [6]:
# Prepare Dataset
class SequenceDataset(Dataset):
    """Fixed-length (token indices, label indices) pairs for sequence labelling.

    Sentences and label rows are converted to integer indices once, at
    construction time. Each item is truncated or right-padded to ``max_len``.

    Args:
        sentences: List of token lists.
        labels: List of label lists, parallel to ``sentences``.
        word2idx: Mapping from token to integer index. Every token must be
            present (a missing token raises ``KeyError``).
        label2idx: Mapping from label to integer index. Must contain the key
            ``'NONE'``, which is used as the padding label.
        max_len: Length every returned sequence is cut or padded to.

    Raises:
        KeyError: If a token or label is missing from its mapping, or if
            ``label2idx`` has no ``'NONE'`` entry.
    """

    def __init__(self, sentences, labels, word2idx, label2idx, max_len=50):
        self.sentences = [[word2idx[word] for word in sentence] for sentence in sentences]
        self.labels = [[label2idx[label] for label in label_list] for label_list in labels]
        self.max_len = max_len
        # Resolve the padding label from the mapping this dataset was built
        # with, instead of reaching for a module-level ``label2idx`` at item
        # access time (which may be undefined or a different mapping).
        self.pad_label_idx = label2idx['NONE']

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(tokens, labels)`` tensors, each of length ``max_len``."""
        sentence = self.sentences[idx]
        label = self.labels[idx]
        # A negative repeat count yields an empty list, so sequences longer
        # than max_len are simply truncated. Tokens are padded with index 0.
        sentence = sentence[:self.max_len] + [0] * (self.max_len - len(sentence))
        label = label[:self.max_len] + [self.pad_label_idx] * (self.max_len - len(label))
        return torch.tensor(sentence), torch.tensor(label)

print("Preparing Dataset Started")
# Training split: wrapped in a loader that reshuffles the data.
train_dataset = SequenceDataset(
    sentences=train_sentences,
    labels=train_labels,
    word2idx=word2idx,
    label2idx=label2idx,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Evaluation split: same batch size, original order.
test_dataset = SequenceDataset(
    sentences=test_sentences,
    labels=test_labels,
    word2idx=word2idx,
    label2idx=label2idx,
)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)
print("Preparing Dataset Done")

Preparing Dataset Started
Preparing Dataset Done


In [9]:
# Define Model
class RNNSequenceLabeling(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> per-position linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of output classes produced for every position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Index 0 is the padding token.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward LSTM outputs are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return per-position class logits for a batch of token indices.

        ``x`` is an integer tensor with the batch dimension first; the result
        has the same leading dimensions plus a final ``output_dim`` axis.
        """
        token_vectors = self.embedding(x)
        hidden_states = self.lstm(token_vectors)[0]
        return self.fc(hidden_states)

In [None]:
# Training
# Train the BiLSTM tagger for a fixed number of epochs, writing one checkpoint
# file per epoch into the current working directory.
print("Training Started")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): using len(word2idx) as the embedding size assumes the indices
# in word2idx are exactly 0 .. len(word2idx) - 1 -- confirm there are no gaps.
vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128
# One output class per distinct label.
output_dim = len(label2idx)

model = RNNSequenceLabeling(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
# NOTE(review): no ignore_index is passed, so every position in a batch --
# including the padded tail, whose target is the 'NONE' label index --
# contributes to the loss.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(sentences)
        # Collapse all leading dimensions so the logits are (N, output_dim)
        # and the targets are (N,), as CrossEntropyLoss expects.
        loss = criterion(predictions.view(-1, output_dim), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Checkpoint after every epoch; file names are 1-based (BLSTM_IT_1.pth, ...).
    torch.save(model.state_dict(), f"BLSTM_IT_{epoch + 1}.pth")
    # Reported loss is the mean of the per-batch losses.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")
print("Training Done")

In [None]:
# Evaluation
def evaluate(model, loader, pad_idx=0):
    """Compute token-level accuracy of ``model`` over ``loader``.

    Args:
        model: Module mapping a batch of token indices to per-position logits
            (class scores on the last axis).
        loader: Iterable of ``(sentences, labels)`` tensor pairs with matching
            shapes.
        pad_idx: Token index that marks padding in ``sentences``. Positions
            holding this index are left out of the accuracy so that padding,
            which is trivially predictable, does not inflate the score. Pass
            ``None`` to count every position.

    Returns:
        Fraction of counted positions whose arg-max prediction equals the
        label, as a float in ``[0, 1]``. Returns ``0.0`` when there is nothing
        to count (for example an empty loader).
    """
    model.eval()
    # Run on whatever device the model lives on rather than depending on a
    # notebook-level ``device`` variable being defined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total, correct = 0, 0
    with torch.no_grad():
        for sentences, labels in loader:
            sentences, labels = sentences.to(target_device), labels.to(target_device)
            predictions = model(sentences).argmax(dim=-1)
            matches = predictions == labels
            if pad_idx is None:
                total += labels.numel()
                correct += matches.sum().item()
            else:
                # Only score positions that hold a real token.
                real_tokens = sentences != pad_idx
                total += real_tokens.sum().item()
                correct += (matches & real_tokens).sum().item()
    # Guard against division by zero when no position was counted.
    return correct / total if total else 0.0
print("Testing Started")
# evaluate() returns a fraction; it is scaled to a percentage only for display.
accuracy = evaluate(model=model, loader=test_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Testing Done")