## Modelling

Building a CNN + bidirectional LSTM classifier, initialised with pre-trained GloVe word embeddings and trained in a one-vs-rest fashion, to classify the sentiments

In [None]:
# Number of entries in `vocab`; passed to the model below as the embedding table's row count.
vocab_size = len(vocab)

In [None]:
# Display the vocabulary size as a quick sanity check.
print(vocab_size)

32004


In [None]:
#The input is passed through multiple CNN layers, each with a different kernel size, to capture features at varying n-gram levels.
#After each convolution operation, a ReLU activation is applied to introduce non-linearity,
#and the extra dimension introduced by Conv2d is removed using squeeze.
#Following this, max pooling is performed across the sequence length dimension to extract the most significant features from each filter.
#Finally, the outputs from all CNN layers are concatenated along the filter dimension, combining the feature maps learned by each kernel size into a single representation.
#Shape: [batch_size, num_filters * len(kernel_sizes)]
import torch
import torch.nn as nn
import torch.optim as optim

class CNN_LSTM_OvR_Model(nn.Module):
    """CNN feature extractor followed by a bidirectional LSTM and a linear head.

    The model emits one raw logit per class (no sigmoid/softmax applied), so it
    can be trained one-vs-rest with a logit-based binary loss.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each word vector.
        hidden_dim: Accepted for interface compatibility; not used by the model
            (the LSTM size is controlled by ``lstm_hidden_dim``).
        output_dim: Number of output logits (one per class).
        kernel_sizes: Sequence of convolution heights, i.e. n-gram sizes.
        num_filters: Number of filters per kernel size.
        lstm_hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability, applied between LSTM layers and
            before the final linear layer.
        glove_weights: Float tensor used as the initial (trainable) embedding
            matrix; expected shape ``[vocab_size, embedding_dim]``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, kernel_sizes, num_filters, lstm_hidden_dim, num_layers, dropout_rate, glove_weights):
        super().__init__()
        kernel_sizes = list(kernel_sizes)
        # A convolution of height k needs at least k tokens, so the widest
        # kernel defines the shortest sequence the network can process.
        self.min_seq_len = max(kernel_sizes)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Start from the pre-trained vectors but keep them trainable.
        self.embedding.weight = nn.Parameter(glove_weights, requires_grad=True)

        # One convolution per n-gram size; each kernel spans the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embedding_dim))
            for k in kernel_sizes
        ])

        self.lstm = nn.LSTM(input_size=num_filters * len(kernel_sizes),
                            hidden_size=lstm_hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            # Inter-layer dropout has no effect on a single-layer
                            # LSTM and only triggers a warning there.
                            dropout=dropout_rate if num_layers > 1 else 0.0,
                            bidirectional=True)

        # Bidirectional LSTM: forward and backward states are concatenated.
        self.fc = nn.Linear(lstm_hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, expected shape
                ``[batch_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, output_dim]`` holding raw logits.
        """
        # Batches shorter than the widest kernel would make Conv2d fail, so
        # right-pad them with index 0 (the value pad_sequence pads with by default).
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            x = torch.cat([x, x.new_zeros(x.size(0), missing)], dim=1)

        x = self.embedding(x)   # [batch_size, seq_len, embedding_dim]
        x = x.unsqueeze(1)      # add the channel dimension expected by Conv2d

        # Convolve, drop the collapsed embedding axis, then max-pool over time.
        conv_out = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        conv_out = [torch.max(c, dim=2)[0] for c in conv_out]
        conv_out = torch.cat(conv_out, dim=1)  # [batch_size, num_filters * len(kernel_sizes)]

        # The pooled features are fed to the LSTM as a sequence of length 1.
        lstm_out, _ = self.lstm(conv_out.unsqueeze(1))
        lstm_out = lstm_out.squeeze(1)
        drop = self.dropout(lstm_out)
        out = self.fc(drop)  # Shape: [batch_size, output_dim]

        return out

References: https://discuss.pytorch.org/t/cnn-lstm-architecture/151018

https://galhever.medium.com/sentiment-analysis-with-pytorch-part-4-lstm-bilstm-model-84447f6c4525

https://ieeexplore.ieee.org/abstract/document/8622880

In [None]:
# Hyperparameters for CNN_LSTM_OvR_Model.
embedding_dim = 100      # Size of word embeddings (must match the vector size of the GloVe file loaded below)
hidden_dim = 128         # Passed to the model constructor, which does not use it (the LSTM size is lstm_hidden_dim)
output_dim = 3           # Number of classes, i.e. number of one-vs-rest output logits
kernel_sizes = [3, 4, 5] # Heights (in tokens) of the CNN kernels, i.e. the n-gram sizes
num_filters = 100        # Number of filters per kernel size
lstm_hidden_dim = 128    # LSTM hidden size per direction (the LSTM is bidirectional)
num_layers = 2           # Number of stacked LSTM layers
dropout_rate = 0.5       # Dropout rate (between LSTM layers and before the final linear layer)


In [None]:
def load_glove_embeddings(glove_path, vocab, embedding_dim=100):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Args:
        glove_path: Path to a GloVe file with one ``word v1 v2 ... vN`` entry
            per line.
        vocab: Sized iterable of words; row ``i`` of the result belongs to the
            ``i``-th word yielded when iterating over it.
        embedding_dim: Expected length of every GloVe vector.

    Returns:
        ``torch.float32`` tensor of shape ``[len(vocab), embedding_dim]``.
        Words missing from the GloVe file get a random vector drawn from
        N(0, 0.6^2).

    Raises:
        ValueError: If a vocabulary word's GloVe vector does not have
            ``embedding_dim`` components.
    """
    # Only vectors for vocabulary words are kept, so memory use scales with
    # the vocabulary rather than with the full GloVe table.
    wanted = set(vocab)
    glove_embeddings = {}
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            word = values[0]
            if word not in wanted:
                continue
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape != (embedding_dim,):
                raise ValueError(
                    f"GloVe vector for {word!r} has {vector.size} dimensions, "
                    f"expected {embedding_dim}"
                )
            glove_embeddings[word] = vector

    weights_matrix = np.zeros((len(vocab), embedding_dim), dtype='float32')
    for i, word in enumerate(vocab):
        vector = glove_embeddings.get(word)
        if vector is None:
            # Out-of-vocabulary word: draw a random vector only when needed.
            vector = np.random.normal(scale=0.6, size=(embedding_dim,))
        weights_matrix[i] = vector

    return torch.tensor(weights_matrix, dtype=torch.float32)

In [None]:
# Load the pre-trained vectors, then build the network from the hyperparameters defined above.
glove_weights = load_glove_embeddings("glove.6B.100d.txt", vocab, embedding_dim)
model = CNN_LSTM_OvR_Model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    kernel_sizes=kernel_sizes,
    num_filters=num_filters,
    lstm_hidden_dim=lstm_hidden_dim,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    glove_weights=glove_weights,
)


In [None]:
# One-vs-rest training: each class logit is scored as an independent binary problem,
# so a logit-based binary cross-entropy is used instead of a softmax loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
class TextDataset(Dataset):
    """Dataset of token-index sequences with optional labels.

    Args:
        sequences: Indexable collection of token-index lists.
        labels: Optional indexable collection of labels aligned with
            ``sequences``. When omitted, items are returned without a label,
            so the same class serves labelled and unlabelled data.
    """

    def __init__(self, sequences, labels=None):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Force an integer dtype: torch.tensor([]) would otherwise produce a
        # float32 tensor for an empty sequence.
        seq = torch.tensor(self.sequences[idx], dtype=torch.long)
        if self.labels is None:
            return seq
        return seq, self.labels[idx]

def _collate_train_batch(batch):
    """Pad the sequences of a batch to a common length and stack its labels."""
    sequences = []
    targets = []
    for sequence, target in batch:
        sequences.append(sequence)
        targets.append(target)
    return pad_sequence(sequences, batch_first=True), torch.tensor(targets)


text_dataset = TextDataset(train_df['sequences'].tolist(), train_df['labels'].tolist())

train_loader = DataLoader(
    text_dataset,
    batch_size=256,
    shuffle=True,
    collate_fn=_collate_train_batch,
)

# Show the first batch only, as a sanity check of padding and label layout.
for padded_sequences, labels in train_loader:
    print("Padded Sequences:", padded_sequences, "Labels:", labels, sep="\n")
    break

Padded Sequences:
tensor([[  436,   545,     0,  ...,     0,     0,     0],
        [  279,   641,   353,  ...,     0,     0,     0],
        [ 2653,  2574,  1839,  ...,     0,     0,     0],
        ...,
        [  661,  1915,    38,  ...,     0,     0,     0],
        [13200,     1,    39,  ...,     0,     0,     0],
        [  685,     8,  1156,  ...,     0,     0,     0]])
Labels:
tensor([0, 1, 2, 2, 2, 2, 1, 2, 0, 0, 1, 1, 2, 2, 0, 1, 1, 1, 2, 2, 0, 2, 2, 1,
        1, 1, 1, 2, 2, 0, 2, 2, 1, 1, 1, 1, 2, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0, 1,
        0, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 2, 2, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 1,
        1, 2, 1, 1, 1, 1, 2, 1, 2, 0, 1, 0, 2, 2, 2, 1, 2, 0, 1, 1, 2, 2, 0, 2,
        0, 1, 1, 1, 2, 0, 1, 0, 0, 1, 2, 2, 0, 2, 1, 0, 2, 2, 2, 1, 2, 1, 2, 1,
        2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 0, 2,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 0, 2, 1, 2, 0,
        1, 1, 2, 1, 2, 2, 2, 1, 0, 1, 1, 0, 1, 2, 0,

In [None]:
# Training loop
n_epochs = 10
batch_size = 256
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0

    for texts, labels in train_loader:
        optimizer.zero_grad()
        texts = texts.long()
        labels = labels.long()

        # A single forward pass yields the logits of every class; running the
        # model once per class would triple the cost for the same summed gradient.
        pred = model(texts)

        # One-vs-Rest: each class gets its own binary target (1 for that class,
        # 0 for the others) and its own loss term; the terms are summed so one
        # backward pass accumulates the gradients of all binary classifiers.
        batch_loss = sum(
            criterion(pred[:, class_idx], (labels == class_idx).float())
            for class_idx in range(pred.size(1))
        )
        batch_loss.backward()
        epoch_loss += batch_loss.item()
        optimizer.step()
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss / len(train_loader)}')

model.eval()


Epoch 1/10, Loss: 1.552816014458268
Epoch 2/10, Loss: 1.302840112153843
Epoch 3/10, Loss: 1.158939209423567
Epoch 4/10, Loss: 0.9971608271476635
Epoch 5/10, Loss: 0.8261752734966886
Epoch 6/10, Loss: 0.6675491201332732
Epoch 7/10, Loss: 0.5484984625244405
Epoch 8/10, Loss: 0.4437979255645559
Epoch 9/10, Loss: 0.3629979274114413
Epoch 10/10, Loss: 0.31495267204383076


CNN_LSTM_OvR_Model(
  (embedding): Embedding(32004, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Reduce the learning rate to one tenth of the optimizer's initial value before
# fine-tuning. The value is derived from `optimizer.defaults` instead of being
# multiplied in place, so re-running this cell does not compound the decay.
for param_group in optimizer.param_groups:
    param_group['lr'] = optimizer.defaults['lr'] * 0.1

In [None]:
def map_labels2(label):
    """Translate a class index (0, 1, 2) into its sentiment value (-1, 0, 1).

    Returns None for any other input.
    """
    if label == 0:
        return -1
    if label == 1:
        return 0
    return 1 if label == 2 else None


In [None]:
# Continue training for a few more epochs; epoch numbering carries on from the first run.
additional_epochs = 5
total_epochs = n_epochs + additional_epochs

for epoch in range(n_epochs, total_epochs):
    model.train()
    epoch_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        texts = texts.long()
        labels = labels.long()

        # A single forward pass yields the logits of every class; running the
        # model once per class would triple the cost for the same summed gradient.
        pred = model(texts)

        # One-vs-Rest: one binary target and one loss term per class, summed so
        # a single backward pass accumulates the gradients of all classifiers.
        batch_loss = sum(
            criterion(pred[:, class_idx], (labels == class_idx).float())
            for class_idx in range(pred.size(1))
        )
        batch_loss.backward()
        epoch_loss += batch_loss.item()

        optimizer.step()

    print(f'Epoch {epoch+1}/{total_epochs}, Loss: {epoch_loss / len(train_loader)}')


Epoch 11/15, Loss: 0.2643725030598878
Epoch 12/15, Loss: 0.2231170985321424
Epoch 13/15, Loss: 0.2063102942692771
Epoch 14/15, Loss: 0.16214745027629704
Epoch 15/15, Loss: 0.1609368107727938


In [None]:
# code for a pytorch-based pipeline for processing sequences in a dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
# TextDataset class is implemented to store and access sequences as PyTorch tensors.
class TextDataset(Dataset):
    """Dataset of token-index sequences with optional labels.

    This definition replaces any earlier ``TextDataset`` in the kernel, so it
    also accepts labels: re-running a cell that builds a labelled dataset keeps
    working after this cell has been executed.

    Args:
        sequences: Indexable collection of token-index lists.
        labels: Optional indexable collection of labels aligned with
            ``sequences``. When omitted, items are returned without a label.
    """

    def __init__(self, sequences, labels=None):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Force an integer dtype: torch.tensor([]) would otherwise produce a
        # float32 tensor for an empty sequence.
        seq = torch.tensor(self.sequences[idx], dtype=torch.long)
        if self.labels is None:
            return seq
        return seq, self.labels[idx]


# collate_fn function is used during batching to handle varying length sequences by padding them to the same length and calculating their original lengths.
def collate_fn(batch):
    """Pad a batch of 1-D sequence tensors and report their original lengths.

    Returns:
        A ``(padded_sequences, lengths)`` pair: the batch padded to its longest
        sequence (batch dimension first) and a tensor with each item's length
        before padding.
    """
    sequences = list(batch)
    original_lengths = list(map(len, sequences))
    return pad_sequence(sequences, batch_first=True), torch.tensor(original_lengths)

# Unlabelled test data; order is preserved (no shuffling) so predictions line up with the rows of test_df.
test_dataset = TextDataset(test_df['sequences'].tolist())
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=256,
    collate_fn=collate_fn,
    shuffle=False,
)

# Inspect the first batch only.
for padded_sequences, lengths in test_loader:
    print("Padded Sequences Shape:", padded_sequences.shape)
    print("Lengths:", lengths)
    break

Padded Sequences Shape: torch.Size([256, 35])
Lengths: tensor([11, 10, 21,  5, 15,  8,  7,  9,  3,  3, 12, 14,  9,  6,  2, 14,  4,  6,
         2, 12,  5,  7,  3,  2, 11,  3,  3,  8, 15,  2, 12,  5,  8,  7,  4, 15,
         5,  3,  4,  7,  6,  5,  8,  3, 15,  7,  8,  4,  8,  9,  5,  8, 16, 15,
         6,  6,  3, 21,  8, 10,  3, 16,  5, 10,  8,  4,  6,  4, 16,  5,  7,  2,
        18,  3, 11, 12, 25,  5,  5, 12,  9,  6, 29, 14, 13,  7,  6, 13,  8, 15,
         5, 14,  9,  5,  7, 12,  6,  8, 10,  7,  4,  6, 10,  7,  9,  8, 35,  6,
        20,  8,  3,  6,  7,  6,  0,  7,  6,  6,  8,  6,  6,  5,  8,  4,  6, 10,
         8,  4,  6, 15,  6,  5, 10,  8,  8,  8, 18,  9,  3,  9,  2, 11,  9,  9,
        11, 12,  5,  6, 15,  7,  7, 10,  3, 10,  4, 13, 11,  8, 10, 13,  8, 12,
         5, 12,  4,  3,  9,  5,  9,  6,  4,  4,  7,  7, 12,  3,  5, 20, 11,  9,
         3, 13, 13, 14,  6,  3,  2,  2,  8,  8,  7, 11,  6,  4,  3, 12, 15, 13,
         3,  4,  3, 11,  5,  6, 12,  7,  4,  2, 11, 15,  7, 13, 1

In [None]:
all_predictions = []

# Switch to evaluation mode before inference: a training cell that called
# model.train() leaves dropout active, which would make predictions noisy
# and non-deterministic.
model.eval()

with torch.no_grad():
    for padded_sequences, lengths in test_loader:
        padded_sequences = padded_sequences.long()
        predictions = model(padded_sequences)

        # torch.argmax to get the predicted class for each sample
        predicted_classes = torch.argmax(predictions, dim=1)
        # Convert the whole batch to Python ints at once, then map class indices to sentiment values.
        mapped_predictions = [map_labels2(class_idx) for class_idx in predicted_classes.tolist()]
        all_predictions.extend(mapped_predictions)

print("Predictions:", all_predictions)

Predictions: [1, 0, 0, 0, -1, 0, 1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, -1, -1, -1, 0, -1, 1, 0, 0, 0, 1, -1, 0, 1, -1, 0, 1, 1, 0, 0, 1, 0, -1, 0, 0, 1, 0, 1, 1, 0, 1, 1, -1, -1, 0, 0, 1, -1, 1, 1, 1, 0, 0, -1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, -1, 1, -1, 0, 1, 0, -1, 1, 0, 0, 1, 1, 0, 1, 1, -1, 0, 0, 1, 0, 1, 1, -1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, -1, 0, 1, 0, 1, 1, 1, -1, 0, 0, -1, 1, 1, 1, 1, 1, -1, -1, -1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, -1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, -1, 0, 1, 1, 1, 1, -1, 0, 1, 1, 0, 1, 1, 1, 0, 1, -1, 0, 0, 0, 0, -1, 1, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, -1, 0, 1, 0, 1, 0, 1, -1, 0, -1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, -1, 1, 0, 1, 0, -1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, -1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, -1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, -1, 0, 1, 0, -1, 0, -1, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 1, 1, 1, 0, -1, 0, 1, 0, -1, 0, 1, 0, 0, 0, -1, -1, 0, 0, 

In [None]:
def map_labels2(label):
    """Map a predicted class index to its sentiment value.

    0 -> -1, 1 -> 0, 2 -> 1; any other value yields None.
    """
    class_indices = (0, 1, 2)
    sentiments = (-1, 0, 1)
    for class_index, sentiment in zip(class_indices, sentiments):
        if label == class_index:
            return sentiment
    return None

In [None]:
# Write one prediction per line. The path is kept in a variable so the
# confirmation message always names the file that was actually written.
output_path = "answer2.txt"
predictions_str = "\n".join(map(str, all_predictions))
with open(output_path, "w") as f:
    f.write(predictions_str)
print(f"Predictions saved to {output_path}")

Predictions saved to answer.txt


In [None]:
#https://ieeexplore.ieee.org/abstract/document/8622880