In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load and preprocess the data
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=50):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        tokens = self.tokenizer(text.lower())
        tokens = tokens[:self.max_len]
        tokens_ids = torch.tensor([vocab.get(token, vocab['<unk>']) for token in tokens])
        label = torch.tensor(label, dtype=torch.long)

        return tokens_ids, label

# Tokenizer function
def tokenizer(text):
    return text.split()

# Build vocabulary
def build_vocab(texts):
    vocab = {'<pad>': 0, '<unk>': 1}
    for text in texts:
        for token in tokenizer(text):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

In [3]:
# BiLSTM model
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(hidden)

In [4]:
# Prepare data
df = pd.read_csv('/content/Sentences_AllAgree.txt', encoding = "ISO-8859-1", names=['text','label'], delimiter= '@')
texts = df['text'].values
labels = LabelEncoder().fit_transform(df['label'])

vocab = build_vocab(texts)
vocab_size = len(vocab)

In [5]:
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

In [6]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(text) for text in texts])
    texts = pad_sequence(texts, batch_first=True, padding_value=vocab['<pad>'])
    labels = torch.tensor(labels)
    return texts, lengths, labels

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [7]:
# Hyperparameters and model initialization
embedding_dim = 128
hidden_dim = 128
output_dim = len(set(labels))
pad_idx = vocab['<pad>']

model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [8]:
# Training and validation
for epoch in range(25):
    model.train()
    train_loss = 0
    for texts, lengths, labels in tqdm(train_loader):
        optimizer.zero_grad()
        predictions = model(texts, lengths)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    val_loss = 0
    model.eval()
    with torch.no_grad():
        for texts, lengths, labels in val_loader:
            predictions = model(texts, lengths)
            loss = criterion(predictions, labels)
            val_loss += loss.item()

    train_loss /= len(train_loader)
    val_loss /= len(val_loader)

    print(f"Epoch {epoch + 1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

100%|██████████| 46/46 [00:14<00:00,  3.17it/s]


Epoch 1 | Train Loss: 0.8141 | Val Loss: 0.6402


100%|██████████| 46/46 [00:07<00:00,  5.77it/s]


Epoch 2 | Train Loss: 0.5789 | Val Loss: 0.5927


100%|██████████| 46/46 [00:08<00:00,  5.11it/s]


Epoch 3 | Train Loss: 0.4249 | Val Loss: 0.5615


100%|██████████| 46/46 [00:08<00:00,  5.18it/s]


Epoch 4 | Train Loss: 0.3206 | Val Loss: 0.5433


100%|██████████| 46/46 [00:07<00:00,  5.81it/s]


Epoch 5 | Train Loss: 0.2110 | Val Loss: 0.5669


100%|██████████| 46/46 [00:08<00:00,  5.15it/s]


Epoch 6 | Train Loss: 0.1451 | Val Loss: 0.6193


100%|██████████| 46/46 [00:08<00:00,  5.21it/s]


Epoch 7 | Train Loss: 0.1138 | Val Loss: 0.6985


100%|██████████| 46/46 [00:07<00:00,  5.82it/s]


Epoch 8 | Train Loss: 0.1001 | Val Loss: 0.6262


100%|██████████| 46/46 [00:08<00:00,  5.18it/s]


Epoch 9 | Train Loss: 0.0596 | Val Loss: 0.7520


100%|██████████| 46/46 [00:08<00:00,  5.19it/s]


Epoch 10 | Train Loss: 0.0398 | Val Loss: 0.7386


100%|██████████| 46/46 [00:08<00:00,  5.36it/s]


Epoch 11 | Train Loss: 0.0320 | Val Loss: 0.8239


100%|██████████| 46/46 [00:08<00:00,  5.29it/s]


Epoch 12 | Train Loss: 0.0266 | Val Loss: 0.8104


100%|██████████| 46/46 [00:08<00:00,  5.21it/s]


Epoch 13 | Train Loss: 0.0199 | Val Loss: 0.8236


100%|██████████| 46/46 [00:08<00:00,  5.59it/s]


Epoch 14 | Train Loss: 0.0147 | Val Loss: 0.8442


100%|██████████| 46/46 [00:08<00:00,  5.18it/s]


Epoch 15 | Train Loss: 0.0167 | Val Loss: 0.9536


100%|██████████| 46/46 [00:08<00:00,  5.35it/s]


Epoch 16 | Train Loss: 0.0206 | Val Loss: 0.8792


100%|██████████| 46/46 [00:08<00:00,  5.47it/s]


Epoch 17 | Train Loss: 0.0169 | Val Loss: 0.8932


100%|██████████| 46/46 [00:08<00:00,  5.68it/s]


Epoch 18 | Train Loss: 0.0269 | Val Loss: 0.9157


100%|██████████| 46/46 [00:08<00:00,  5.17it/s]


Epoch 19 | Train Loss: 0.0177 | Val Loss: 0.8160


100%|██████████| 46/46 [00:08<00:00,  5.35it/s]


Epoch 20 | Train Loss: 0.0082 | Val Loss: 0.9042


100%|██████████| 46/46 [00:07<00:00,  5.77it/s]


Epoch 21 | Train Loss: 0.0047 | Val Loss: 0.8378


100%|██████████| 46/46 [00:08<00:00,  5.22it/s]


Epoch 22 | Train Loss: 0.0026 | Val Loss: 0.9002


100%|██████████| 46/46 [00:09<00:00,  4.96it/s]


Epoch 23 | Train Loss: 0.0049 | Val Loss: 0.9012


100%|██████████| 46/46 [00:07<00:00,  5.84it/s]


Epoch 24 | Train Loss: 0.0060 | Val Loss: 0.9681


100%|██████████| 46/46 [00:08<00:00,  5.29it/s]


Epoch 25 | Train Loss: 0.0124 | Val Loss: 1.0136


In [9]:
model.eval()
test_loss = 0
predictions_list = []
labels_list = []

with torch.no_grad():
    for texts, lengths, labels in test_loader:
        predictions = model(texts, lengths)
        test_loss += criterion(predictions, labels).item()
        predictions_list.extend(torch.argmax(predictions, dim=1).tolist())
        labels_list.extend(labels.tolist())

test_loss /= len(test_loader)
accuracy = accuracy_score(labels_list, predictions_list)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {accuracy:.4f}")

torch.save(model.state_dict(), 'BiLSTM_sentiment_model.pt')

Test Loss: 0.7384 | Test Accuracy: 0.8830
