In [1]:
import numpy as np
import pandas as pd
import optuna

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
RAND_STATE = 42

In [3]:
# Mock dataset for demonstration
class TextDataset(Dataset):
    # def __init__(self, vocabulary_size, sequence_length, num_samples):
    #     self.data = torch.randint(0, vocabulary_size, (num_samples, sequence_length))
    #     self.labels = torch.randint(0, 2, (num_samples,))
    def __init__(self, texts, labels, sequence_length):
        self.texts = texts
        self.labels = labels
        self.sequence_length = sequence_length
        self.vocab = self.build_vocab(texts)
        self.encoded_texts = [self.encode_text(text) for text in texts]

    def build_vocab(self, texts):
        unique_words = set(word for text in texts for word in text.lower().split())
        vocab = {word: i + 1 for i, word in enumerate(unique_words)}  # +1 for padding token at index 0
        return vocab
    
    def encode_text(self, text):
        return [self.vocab.get(word, 0) for word in text.lower().split()][:self.sequence_length] + [0] * (self.sequence_length - len(text.split()))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.encoded_texts[idx]), torch.tensor(self.labels[idx])
    

# Define the Q-network model
# class DQN(nn.Module):
#     def __init__(self, vocabulary_size, embedding_dim, hidden_dim, num_classes):
#         super(DQN, self).__init__()
#         self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, num_classes)

#     def forward(self, x):
#         embeds = self.embedding(x)
#         lstm_out, _ = self.lstm(embeds)
#         q_values = self.fc(lstm_out[:, -1])
#         return q_values

# class DQN(nn.Module):
#     def __init__(self, vocabulary_size, embedding_dim, hidden_dim, num_classes):
#         super(DQN, self).__init__()
#         self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, num_classes)

#     def forward(self, x):
#         embeds = self.embedding(x)
#         lstm_out, _ = self.lstm(embeds)
#         q_values = self.fc(lstm_out[:, -1])
#         return q_values
    
class DQN(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim, hidden_dim, num_classes, dropout_rate=0.5, pre_trained_embeddings=None):
        super(DQN, self).__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        if pre_trained_embeddings is not None:
            self.embedding.weight = nn.Parameter(pre_trained_embeddings)
            self.embedding.weight.requires_grad = False  # Or True if you want to fine-tune

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2) # Stacked LSTMs
        self.dropout = nn.Dropout(dropout_rate)  # Use dropout_rate from arguments
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # Adjust for bidirectional LSTM
        self.relu = nn.ReLU()

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        out = self.dropout(lstm_out[:, -1])
        out = self.relu(out)
        q_values = self.fc(out)
        return q_values

In [4]:
def train(model, device, train_loader, optimizer, epoch, log_interval=10):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # Using CrossEntropyLoss which combines LogSoftmax and NLLLoss
        loss = nn.CrossEntropyLoss()(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

def validate(model, device, validation_loader):
    model.eval()
    validation_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in validation_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            validation_loss += nn.CrossEntropyLoss()(output, target).item()  # Sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    validation_loss /= len(validation_loader.dataset)
    validation_acc = correct / len(validation_loader.dataset)
    print(f'\nValidation set: Average loss: {validation_loss:.4f}, Accuracy: {correct}/{len(validation_loader.dataset)} ({100. * correct / len(validation_loader.dataset):.0f}%)\n')
    return validation_loss, validation_acc

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += nn.CrossEntropyLoss()(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')

In [5]:
df = pd.read_csv('dataset/sentiment_analysis.csv')

# Extracting texts and labels
texts = df['tweet'].tolist()
labels = df['label'].tolist()

# Splitting dataset into train+val and test
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=RAND_STATE)

# Splitting train+val into train and val
train_texts, validation_texts, train_labels, validation_labels = train_test_split(train_val_texts, train_val_labels, test_size=0.25, random_state=RAND_STATE)  # 0.25 x 0.8 = 0.2

# Creating datasets
sequence_length = 10  # Max number of words in a text
train_dataset = TextDataset(train_texts, train_labels, sequence_length)
validation_dataset = TextDataset(validation_texts, validation_labels, sequence_length)
test_dataset = TextDataset(test_texts, test_labels, sequence_length)

# Creating DataLoaders
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for data, label in train_loader:
    print(f"Train Encoded text: {data}")
    print(f"Train Label: {label}")
    break  # Just show one batch for brevity

for data, label in validation_loader:
    print(f"Validation Encoded text: {data}")
    print(f"Validation Label: {label}")
    break  # Just show one batch for brevity

for data, label in test_loader:
    print(f"Test Encoded text: {data}")
    print(f"Test Label: {label}")
    break  # Just show one batch for brevity

Train Encoded text: tensor([[20266, 14445, 11654,  7230, 18993, 14206, 12027, 17419,  3660,   525],
        [ 9855,  6087,  4362,  1801,  9848,   354, 18140, 19195, 10090,  2209],
        [ 6397, 18993, 18192, 13909,  6419, 11927, 14401, 11180, 15236,  9362],
        [17641, 18365, 10094, 12794,  1912, 15159,  9509, 13795, 16944, 11758]])
Train Label: tensor([1, 1, 1, 0])
Validation Encoded text: tensor([[2399, 3263, 5861, 6981, 1847, 7823, 8057, 7878, 6652, 8291],
        [1162, 1386, 4916, 7381, 2305, 6181, 1947, 6556, 5600, 5340],
        [3521, 8157, 6887,  585, 4568, 1853, 1114, 7966, 6089,  614],
        [1162,  221, 4916, 6859, 7478, 5534, 8358, 1866, 4948, 7728]])
Validation Label: tensor([1, 1, 1, 1])
Test Encoded text: tensor([[5943, 4363, 6742, 8266, 7125, 5790, 6126, 2067, 3345, 5229],
        [3631, 1346, 7111, 3429, 3303, 7643,  484, 4489, 7662, 5296],
        [4628, 6126, 8552,  546, 1701, 1068, 6903, 4743, 6178, 7028],
        [3285, 6041, 8001, 1719, 5866, 4782, 3370, 

In [6]:
# Parameters and Hyperparameters
vocabulary_size = 100000  # to adjust 
sequence_length = 50  # to adjust 
embedding_dim = 128
hidden_dim = 64
num_classes = 2
batch_size = 64
epochs = 5
learning_rate = 0.001

# Model, optimizer, and device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DQN(vocabulary_size, embedding_dim, hidden_dim, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    validate(model, device, validation_loader)

# After training, evaluate on the test set
test(model, device, test_loader)


Validation set: Average loss: 0.2060, Accuracy: 1176/1584 (74%)


Validation set: Average loss: 0.3082, Accuracy: 1115/1584 (70%)


Validation set: Average loss: 0.4872, Accuracy: 1130/1584 (71%)


Validation set: Average loss: 0.4887, Accuracy: 1059/1584 (67%)


Validation set: Average loss: 0.6927, Accuracy: 1099/1584 (69%)


Test set: Average loss: 0.6822, Accuracy: 1094/1584 (69%)



In [7]:
# Parameters and Hyperparameters
n_trials=5
num_classes = 2

def objective(trial):
    # Define the search space
    # vocabulary_size = trial.suggest_categorical('vocabulary_size', [5000, 10000, 20000, 40000])
    vocabulary_size = 100000
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    embedding_dim = trial.suggest_categorical('embedding_dim', [64, 128, 256])
    hidden_dim = trial.suggest_categorical('hidden_dim', [32, 64, 128])
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'RMSprop', 'SGD'])
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    step_size = trial.suggest_int('step_size', 1, 100)
    gamma = trial.suggest_float('gamma', 0.1, 1.0, log=True)
    sequence_length = trial.suggest_categorical('sequence_length', [50, 100, 200, 400])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Model setup with trial suggestions
    model = DQN(vocabulary_size, embedding_dim, hidden_dim, num_classes, dropout_rate=dropout_rate).to(device)

    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)

    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Training loop
    epochs = 5  # Reduced for faster optimization cycles
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch)
        val_loss, val_accuracy = validate(model, device, validation_loader)
        scheduler.step()

    # Set custom attributes for the trial
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("val_accuracy", val_accuracy)
    
    # print(f"Returning from validate: val_loss={val_loss}, val_accuracy={val_accuracy}")
    # return val_loss

    # Objective: maximize validation accuracy by minimizing its negative value
    return -val_accuracy  # Return the negative accuracy

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials)  # Number of trials can be adjusted

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial

# Retrieve the validation loss and accuracy from the best trial
best_val_loss = trial.user_attrs["val_loss"]
best_val_accuracy = trial.user_attrs["val_accuracy"]

print(f'Best Validation Loss: {best_val_loss}')
print(f'Best Validation Accuracy: {best_val_accuracy}')
print('Best Trial Parameters:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[I 2024-04-03 09:46:29,731] A new study created in memory with name: no-name-c39a1b7e-8f1a-4ba3-bfb5-4fa0c529b750



Validation set: Average loss: 0.0194, Accuracy: 1161/1584 (73%)


Validation set: Average loss: 0.0303, Accuracy: 1160/1584 (73%)


Validation set: Average loss: 0.0413, Accuracy: 1109/1584 (70%)


Validation set: Average loss: 0.0538, Accuracy: 1139/1584 (72%)



[I 2024-04-03 09:47:06,480] Trial 0 finished with value: -0.6641414141414141 and parameters: {'batch_size': 32, 'learning_rate': 0.0040841757800848715, 'embedding_dim': 64, 'hidden_dim': 128, 'optimizer': 'RMSprop', 'dropout_rate': 0.14956987091624135, 'step_size': 65, 'gamma': 0.697399587361881, 'sequence_length': 100}. Best is trial 0 with value: -0.6641414141414141.



Validation set: Average loss: 0.0616, Accuracy: 1052/1584 (66%)


Validation set: Average loss: 0.0037, Accuracy: 1076/1584 (68%)


Validation set: Average loss: 0.0044, Accuracy: 1175/1584 (74%)


Validation set: Average loss: 0.0085, Accuracy: 1129/1584 (71%)


Validation set: Average loss: 0.0106, Accuracy: 1112/1584 (70%)



[I 2024-04-03 09:47:18,104] Trial 1 finished with value: -0.7070707070707071 and parameters: {'batch_size': 256, 'learning_rate': 0.04459353601444581, 'embedding_dim': 128, 'hidden_dim': 32, 'optimizer': 'Adam', 'dropout_rate': 0.4216681752268144, 'step_size': 75, 'gamma': 0.7140159487425279, 'sequence_length': 400}. Best is trial 1 with value: -0.7070707070707071.



Validation set: Average loss: 0.0126, Accuracy: 1120/1584 (71%)


Validation set: Average loss: 0.0210, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0205, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0201, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0197, Accuracy: 1182/1584 (75%)



[I 2024-04-03 09:47:28,616] Trial 2 finished with value: -0.7462121212121212 and parameters: {'batch_size': 32, 'learning_rate': 0.0011275449657103711, 'embedding_dim': 64, 'hidden_dim': 32, 'optimizer': 'SGD', 'dropout_rate': 0.1602001763971111, 'step_size': 50, 'gamma': 0.20441284569584917, 'sequence_length': 50}. Best is trial 2 with value: -0.7462121212121212.



Validation set: Average loss: 0.0194, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0083, Accuracy: 1171/1584 (74%)


Validation set: Average loss: 0.0089, Accuracy: 1132/1584 (71%)


Validation set: Average loss: 0.0127, Accuracy: 1141/1584 (72%)


Validation set: Average loss: 0.0192, Accuracy: 1151/1584 (73%)



[I 2024-04-03 09:47:38,352] Trial 3 finished with value: -0.7146464646464646 and parameters: {'batch_size': 128, 'learning_rate': 0.007099906584103223, 'embedding_dim': 64, 'hidden_dim': 64, 'optimizer': 'RMSprop', 'dropout_rate': 0.04148977871021642, 'step_size': 40, 'gamma': 0.2859282500341737, 'sequence_length': 200}. Best is trial 2 with value: -0.7462121212121212.



Validation set: Average loss: 0.0221, Accuracy: 1132/1584 (71%)


Validation set: Average loss: 0.0030, Accuracy: 1183/1584 (75%)


Validation set: Average loss: 0.0029, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0028, Accuracy: 1182/1584 (75%)


Validation set: Average loss: 0.0028, Accuracy: 1182/1584 (75%)



[I 2024-04-03 09:47:52,344] Trial 4 finished with value: -0.7462121212121212 and parameters: {'batch_size': 256, 'learning_rate': 0.00015019108257404234, 'embedding_dim': 128, 'hidden_dim': 64, 'optimizer': 'Adam', 'dropout_rate': 0.47275307451036497, 'step_size': 82, 'gamma': 0.14813048695866554, 'sequence_length': 100}. Best is trial 2 with value: -0.7462121212121212.



Validation set: Average loss: 0.0026, Accuracy: 1182/1584 (75%)

Number of finished trials: 5
Best trial:
Best Validation Loss: 0.0194210239281558
Best Validation Accuracy: 0.7462121212121212
Best Trial Parameters:
    batch_size: 32
    learning_rate: 0.0011275449657103711
    embedding_dim: 64
    hidden_dim: 32
    optimizer: SGD
    dropout_rate: 0.1602001763971111
    step_size: 50
    gamma: 0.20441284569584917
    sequence_length: 50
