In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


# Define column names
column_names = ['tweetID', 'entity', 'sentiment', 'tweet_content']

# Load the dataset with specified column names
df = pd.read_csv('../Data/twitter_sentiment_analysis.csv', names=column_names, header=None)
df = df[['tweetID', 'sentiment', 'tweet_content']]  # Select relevant columns

# Basic preprocessing
df['sentiment'] = df['sentiment'].map({'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3})  # Encode sentiments
df['tweet_content'] = df['tweet_content'].astype(str)
df.dropna(subset=['sentiment'], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)  # 15% for test
valid_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42) 

# Tokenization
tokenizer = get_tokenizer("basic_english")

# Create vocabulary
def yield_tokens(data_iter):
    for tweet in data_iter:
        yield tokenizer(tweet)

vocab = build_vocab_from_iterator(yield_tokens(df['tweet_content']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # Default index for unknown words

# Prepare dataset class
class TweetDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        tweet = self.dataframe.iloc[idx]['tweet_content']
        sentiment = self.dataframe.iloc[idx]['sentiment']
        return torch.tensor(vocab(tokenizer(tweet))), torch.tensor(sentiment)

# Create DataLoader
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=vocab["<pad>"], batch_first=True)
    trg_batch = torch.stack(trg_batch)  # Sentiment labels
    return src_batch, trg_batch

train_dataset = TweetDataset(train_df)
valid_dataset = TweetDataset(valid_df)
test_dataset = TweetDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [2]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)  # Use batch_first=True

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)  # Use batch_first=True
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        input = input.unsqueeze(1)  # Shape: (batch_size, 1)
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(output.squeeze(1))  # Shape: (batch_size, output_dim)
        return prediction, hidden

In [3]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src):
        hidden = self.encoder(src)
        input = torch.zeros(src.size(0), dtype=torch.long).to(src.device)  # Start token (shape: batch_size)
        output, _ = self.decoder(input, hidden)
        return output

In [4]:
# Model initialization
INPUT_DIM = len(vocab)
OUTPUT_DIM = 4  # Sentiment classes (0, 1, 2, 3)
EMB_DIM = 100
HIDDEN_DIM = 256
N_EPOCHS = 10

encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM)
model = Seq2Seq(encoder, decoder)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0
    for src, trg in train_dataloader:
        trg = trg.long()  # Ensure target is of type LongTensor
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch + 1}/{N_EPOCHS}, Loss: {epoch_loss / len(train_dataloader)}')


Epoch 1/10, Loss: 1.3135970654311004
Epoch 2/10, Loss: 1.274238732126024
Epoch 3/10, Loss: 1.281315216311702
Epoch 4/10, Loss: 1.282257424460517
Epoch 5/10, Loss: 1.1883082433983132
Epoch 6/10, Loss: 0.9435194885289228
Epoch 7/10, Loss: 0.5825899265430592
Epoch 8/10, Loss: 0.3843495034509235
Epoch 9/10, Loss: 0.2033556972940763
Epoch 10/10, Loss: 0.07845339499827889


In [52]:
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for src, trg in dataloader:
            output = model(src)
            predictions.extend(output.argmax(dim=1).cpu().numpy())  # Get predicted class
            true_labels.extend(trg.cpu().numpy())
    return predictions, true_labels

# Evaluate on validation set
val_predictions, val_true = evaluate(model, valid_dataloader)

# Evaluate on test set
test_predictions, test_true = evaluate(model, test_dataloader)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, classification_report

val_accuracy = accuracy_score(val_true, val_predictions)
test_accuracy = accuracy_score(test_true, test_predictions)

print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report for Validation Set:\n", classification_report(val_true, val_predictions))
print("\nClassification Report for Test Set:\n", classification_report(test_true, test_predictions))

Validation Accuracy: 0.9066666666666666
Test Accuracy: 0.9342105263157895

Classification Report for Validation Set:
               precision    recall  f1-score   support

           0       1.00      0.85      0.92        20
           1       1.00      0.89      0.94        38
           2       0.72      1.00      0.84        13
           3       0.67      1.00      0.80         4

    accuracy                           0.91        75
   macro avg       0.85      0.94      0.88        75
weighted avg       0.93      0.91      0.91        75


Classification Report for Test Set:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90        15
           1       1.00      0.92      0.96        38
           2       0.86      0.95      0.90        19
           3       1.00      1.00      1.00         4

    accuracy                           0.93        76
   macro avg       0.93      0.95      0.94        76
weighted avg       0.94      