In [1]:
import logging
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pytorch_lightning as pl
import torch.nn.functional as F
import spacy
from torchtext.data import get_tokenizer

# Notebook-wide configuration.
batch = 64        # mini-batch size handed to both DataLoaders
test_size = 0.3   # fraction of the dataset held out by train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure logging at the root level of Lightning: everything under the
# "lightning.pytorch" logger hierarchy is allowed through at DEBUG level.
logging.getLogger("lightning.pytorch").setLevel(logging.DEBUG)

# Module-level logger used by the model to write per-epoch summaries.
logger = logging.getLogger("lightning.pytorch.core")

# Attach the file handler only once. Loggers are process-wide singletons, so
# re-running this cell would otherwise stack another FileHandler on the same
# logger and every message would be written to the file multiple times.
log_file = "logsMLP5layers.log"
if not any(
    isinstance(handler, logging.FileHandler) and handler.baseFilename.endswith(log_file)
    for handler in logger.handlers
):
    logger.addHandler(logging.FileHandler(log_file))

In [3]:
class EarlyStoppingCallback(pl.Callback):
    """Stop training once a monitored metric has stopped decreasing.

    The metric is treated as "lower is better". After each validation run the
    current value is compared with the best value seen so far; when it has
    been strictly worse for ``patience`` consecutive checks,
    ``trainer.should_stop`` is set to ``True``.

    Args:
        monitor: Key looked up in ``trainer.callback_metrics``.
        patience: Number of consecutive non-improving checks tolerated
            before training is stopped.

    Attributes:
        wait: Consecutive non-improving checks counted so far.
        stopped_epoch: Epoch at which stopping was requested (0 if never).
        best_score: Best (lowest) metric value seen, as a Python float, or
            ``None`` before the first valid measurement.
    """

    def __init__(self, monitor='val_loss', patience=3):
        super().__init__()
        self.monitor = monitor
        self.patience = patience
        self.wait = 0
        self.stopped_epoch = 0
        self.best_score = None

    def on_validation_end(self, trainer, pl_module):
        """Update the patience counter from the latest validation metric."""
        # The pre-training sanity check only sees a couple of batches; its
        # metric must not become the baseline that real epochs are judged by.
        if getattr(trainer, 'sanity_checking', False):
            return

        metric = trainer.callback_metrics.get(self.monitor)
        if metric is None:
            # Metric not logged (yet): nothing to compare, and comparing
            # None with a number would raise a TypeError.
            return

        current = float(metric)
        is_nan = current != current  # NaN is the only value unequal to itself

        # Equal-to-best counts as "not worse", matching the original rule
        # that only a strictly larger value consumes patience. A NaN never
        # counts as an improvement and never replaces the best score.
        if not is_nan and (self.best_score is None or current <= self.best_score):
            self.best_score = current
            self.wait = 0
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.stopped_epoch = trainer.current_epoch
            trainer.should_stop = True

class SentimentClassifier(pl.LightningModule):
    """Mean-pooled embedding + MLP text classifier.

    Token indices are embedded, averaged over the sequence dimension and fed
    through two ReLU hidden layers followed by a linear output layer that
    produces one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim1: Width of the first hidden layer (``fc1``).
        hidden_dim2: Width of the second hidden layer (``fc3``).
        num_classes: Number of output logits.
        padding_idx: Optional index used for padding. When given, that
            embedding row is frozen at zero and padded positions are excluded
            from the mean. ``None`` (default) keeps the plain mean over every
            position, padding included.
        lr: Learning rate for Adam.

    The defaults reproduce the original hard-coded architecture, and the
    layer attribute names (``fc1``, ``fc3``, ``fc5``) are kept so previously
    saved ``state_dict`` files still load.
    """

    def __init__(self, vocab_size=114277, embedding_dim=5000, hidden_dim1=1250,
                 hidden_dim2=256, num_classes=7, padding_idx=None, lr=0.0001):
        super().__init__()
        self.padding_idx = padding_idx
        self.lr = lr
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim1)
        self.fc3 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc5 = nn.Linear(hidden_dim2, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        ``x`` is indexed as (batch, sequence): the embedding output is
        reduced over ``dim=1``.
        """
        embedded = self.embedding(x)
        if self.padding_idx is None:
            pooled = embedded.mean(dim=1)
        else:
            # Average only over real tokens; clamp avoids division by zero
            # for sequences made up entirely of padding.
            mask = (x != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
            pooled = (embedded * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        out = F.relu(self.fc1(pooled))
        out = F.relu(self.fc3(out))
        return self.fc5(out)

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one batch and log them as ``{stage}_loss`` / ``{stage}_acc``."""
        inputs, labels = batch
        outputs = self(inputs)
        loss = F.cross_entropy(outputs, labels)
        acc = (outputs.argmax(dim=1) == labels).float().mean()
        self.log(f'{stage}_loss', loss)
        self.log(f'{stage}_acc', acc)
        return loss

    # The batch index is accepted (and ignored) in every step hook: Lightning
    # supplies it, and not every Lightning release tolerates hooks without it.
    def training_step(self, batch, batch_idx=None):
        """Run one training batch and return its loss."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, idx):
        """Run one validation batch and return its loss."""
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx=None):
        """Run one test batch and return its loss."""
        return self._shared_step(batch, 'test')

    def on_train_epoch_end(self):
        """Write the latest train/validation metrics to the module-level ``logger``."""
        metrics = self.trainer.callback_metrics
        for stage, label in (('train', 'Training'), ('val', 'Validation')):
            loss = metrics.get(f'{stage}_loss')
            acc = metrics.get(f'{stage}_acc')
            if loss is None or acc is None:
                # E.g. validation did not run this epoch; skip instead of
                # raising KeyError in the middle of training.
                continue
            logger.debug(f'[Epoch {self.current_epoch}] - {label} results: {stage}_loss={loss:.4f}, {stage}_acc={acc:.4f}')

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # `verbose` is deprecated in recent PyTorch releases, so it is not
        # passed here.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.01, patience=5, min_lr=0.000001
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'monitor': 'val_loss',
            },
        }


In [4]:

# Fix the NumPy and PyTorch RNG seeds before any data handling.
np.random.seed(0)
torch.manual_seed(42)

# The dataset is a ';'-separated CSV; the 'text' and 'sentiment' columns are used below.
dataset = pd.read_csv("ISEAR_only_text_pt_data_augmentation.csv", sep=";")

# Replace the sentiment labels with integer class ids.
le = LabelEncoder()
dataset['sentiment'] = le.fit_transform(dataset['sentiment'])

# Hold out `test_size` of the rows; random_state pins the split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset['text'],
    dataset['sentiment'],
    test_size=test_size,
    random_state=0,
)

# Labels as plain NumPy arrays (the texts stay as pandas Series).
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [5]:

nlp = spacy.load("pt_core_news_sm")

# Tokenize every text exactly once and reuse the result for both the
# vocabulary and the index sequences (the spaCy pipeline is the slow part).
train_tokens = [[token.text for token in nlp(text)] for text in train_texts]
test_tokens = [[token.text for token in nlp(text)] for text in test_texts]

# Create a vocabulary of tokens.
# NOTE(review): the vocabulary is built from train AND test texts so that the
# lookups below can never hit an unknown token.
vocab = {token for tokens in train_tokens for token in tokens}
vocab.update(token for tokens in test_tokens for token in tokens)

# Assign unique indices to tokens. Iterating a set of strings directly gives
# an order that can differ between interpreter sessions, which would make a
# saved embedding table meaningless after a kernel restart; sorting makes the
# token -> index mapping reproducible.
token2index = {token: i for i, token in enumerate(sorted(vocab))}

# Convert tokens to indices (one 1-D LongTensor per text).
train_sequences = [
    torch.tensor([token2index[token] for token in tokens], dtype=torch.long)
    for tokens in train_tokens
]
test_sequences = [
    torch.tensor([token2index[token] for token in tokens], dtype=torch.long)
    for tokens in test_tokens
]


  train_sequences = torch.tensor(train_sequences, dtype=torch.long).clone().detach().to(device)
  test_sequences = torch.tensor(test_sequences, dtype=torch.long).clone().detach().to(device)


In [11]:

# Pad the variable-length index sequences into one (num_texts, max_len) tensor
# per split. pad_sequence fills with 0 by default.
train_sequences = torch.nn.utils.rnn.pad_sequence(train_sequences, batch_first=True)
test_sequences = torch.nn.utils.rnn.pad_sequence(test_sequences, batch_first=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Labels arrive from the split cell as NumPy arrays, which have no .clone();
# torch.as_tensor accepts both arrays and tensors. cross_entropy needs int64
# class indices, so the dtype is forced rather than inherited from NumPy.
train_labels = torch.as_tensor(train_labels, dtype=torch.long, device="cpu")
test_labels = torch.as_tensor(test_labels, dtype=torch.long, device="cpu")

# The dataset tensors deliberately stay on the CPU: the Lightning Trainer
# transfers each batch to its device, and a DataLoader with pin_memory=True
# can only pin CPU tensors.
train_sequences = train_sequences.cpu()
test_sequences = test_sequences.cpu()


In [12]:
model = SentimentClassifier()

# Define the dataloaders
train_dataset = torch.utils.data.TensorDataset(train_sequences, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_sequences, test_labels)

# Pinning only helps when batches are later copied to a GPU, and it only works
# on CPU tensors -- asking to pin tensors that already live on CUDA fails.
pin_memory = torch.cuda.is_available() and not train_sequences.is_cuda

# Shuffle the training data each epoch; keep evaluation order fixed so that
# metrics are comparable from epoch to epoch.
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=False, pin_memory=pin_memory)

In [13]:
# Train the model
model = model.to(device)
early_stop_callback = EarlyStoppingCallback(monitor='val_loss', patience=10)

trainer = pl.Trainer(max_epochs=100, callbacks=[early_stop_callback], log_every_n_steps=1)
# NOTE(review): the test split is passed as the validation loader, so early
# stopping and LR scheduling are tuned on test data -- confirm this is intended.
trainer.fit(model, train_loader, test_loader)

# Build the path with pathlib instead of a backslash string literal: "\m" is
# an invalid escape sequence and backslash separators only work on Windows.
# The directory is created first because torch.save does not create it.
model_path = Path("models_pytorch") / "model_MLP5L.pt"
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), str(model_path))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 571 M 
1 | fc1       | Linear    | 6.3 M 
2 | fc3       | Linear    | 320 K 
3 | fc5       | Linear    | 1.8 K 
----------------------------------------
577 M     Trainable params
0         Non-trainable params
577 M     Total params
2,311.833 Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:   3%|▎         | 5/165 [02:32<1:21:16, 30.48s/it, v_num=3]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
