In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, AlbertTokenizer, AlbertForSequenceClassification
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
!pip install emoji
import emoji

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
root = '/content/drive/MyDrive/NLP/Davide/irony_davide_datasets/'

x_train = pd.read_csv(root + 'x_train.csv')
x_test = pd.read_csv(root + 'x_test.csv')
x_val = pd.read_csv(root + 'x_val.csv')
y_train = pd.read_csv(root + 'y_train.csv')
y_test = pd.read_csv(root + 'y_test.csv')
y_val = pd.read_csv(root + 'y_val.csv')

In [5]:
hyperparameters = {
    "epochs": 10,
    "learning_rate": 1e-5,
    "batch_size": 32,
    "dropout": 0.6,   # o 0.3
    "weight_decay": 1e-2,
    "stopwords": False,
    "language_model": "albert-base-v2", #bert-base-uncased
    "layers": 1,
    "h_dim": 768,
    "bilstm": False,
    "patience": 5,
    "min_delta": 0.01,
    "extra_features": 4
}

In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, x, emoji_count, ironic_emoji, non_ironic_emoji, nn_count, y, stopwords):
        # x e y sono series di pandas
        tokens_litt = [nltk.word_tokenize(text, language='english') for text in list(x)]
        text_clean = []

        if stopwords:
            for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
                text_clean.append(' '.join([w.lower() for w in sentence if
                                            not w.lower() in nltk.corpus.stopwords.words("english")]))
        else:
            for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
                text_clean.append(' '.join([w.lower() for w in sentence]))
            # ogni token è separato dall'altro con uno spazio

        self.texts = text_clean
        self.labels = [torch.tensor(label) for label in y]
        self.emoji_count = [torch.tensor(count) for count in emoji_count]
        self.ironic_emoji = [torch.tensor(ironic) for ironic in ironic_emoji]
        self.non_ironic_emoji = [torch.tensor(non_ironic) for non_ironic in non_ironic_emoji]
        self.nn_count = [torch.tensor(count) for count in nn_count]  # Nuova colonna

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_labels = np.array(self.labels[idx])
        batch_emoji_count = np.array(self.emoji_count[idx])
        batch_ironic_emoji = np.array(self.ironic_emoji[idx])
        batch_non_ironic_emoji = np.array(self.non_ironic_emoji[idx])
        batch_nn_count = np.array(self.nn_count[idx])  # Aggiunto NN_count

        return batch_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count, batch_labels




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
train_dataset = Dataset(x_train['text'], x_train['emoji_count'], x_train['has_ironic_emoji'], x_train['has_non_ironic_emoji'], x_train['NN_count'], y_train.squeeze(1), hyperparameters["stopwords"])
val_dataset = Dataset(x_val['text'], x_val['emoji_count'], x_val['has_ironic_emoji'], x_val['has_non_ironic_emoji'], x_val['NN_count'], y_val.squeeze(1), hyperparameters["stopwords"])
test_dataset = Dataset(x_test['text'], x_test['emoji_count'], x_test['has_ironic_emoji'], x_test['has_non_ironic_emoji'], x_test['NN_count'], y_test.squeeze(1), hyperparameters["stopwords"])

Tokenizing ... : 100%|██████████| 1717/1717 [00:00<00:00, 313032.25it/s]
Tokenizing ... : 100%|██████████| 916/916 [00:00<00:00, 219780.47it/s]
Tokenizing ... : 100%|██████████| 229/229 [00:00<00:00, 106674.32it/s]


In [8]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.0):

        self.patience = patience
        self.min_delta = min_delta              # valore minimo di decrescita della loss di validazione all'epoca corrente
                                                # per asserire che c'è un miglioramenti della loss
        self.counter = 0                        # contatore delle epoche di pazienza
        self.early_stop = False                 # flag di early stop
        self.min_validation_loss = torch.inf    # valore corrente ottimo della loss di validazione

    def __call__(self, validation_loss):
        # chiamata in forma funzionale dell'oggetto di classe EarlySopping

        if (validation_loss + self.min_delta) >= self.min_validation_loss:  # la loss di validazione non decresce
            self.counter += 1                                               # incrementiamo il contatore delle epoche di pazienza
            if self.counter >= self.patience:
                self.early_stop = True
                print("Early stop!")
        else:                                               # c'è un miglioramento della loss:
            self.min_validation_loss = validation_loss      # consideriamo la loss corrente
                                                            # come nuova loss ottimale
            self.counter = 0                                # e azzeriamo il contatore di pazienza



In [9]:
from transformers import AlbertForSequenceClassification, AlbertConfig, AutoTokenizer
import emoji
import torch
import torch.nn as nn

class ClassifierDeep(nn.Module):

    def __init__(self, hdim, dropout, model_name, extra_features=hyperparameters["extra_features"]):
        super(ClassifierDeep, self).__init__()

        # Configurazione e caricamento di ALBERT per la classificazione
        config = AlbertConfig.from_pretrained(model_name)
        self.lm_model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)

        # Caricamento del tokenizer e aggiunta di nuovi token
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Aggiungi token personalizzati al tokenizer
        all_emojis = list(emoji.EMOJI_DATA.keys())
        emoji_tokens = [emoji.demojize(e) for e in all_emojis]
        self.tokenizer.add_tokens(emoji_tokens)
        self.tokenizer.add_tokens(["@user"])

        # Aggiorna la dimensione degli embedding nel modello ALBERT
        self.lm_model.resize_token_embeddings(len(self.tokenizer))

        # Classificatore personalizzato
        self.classifier = nn.Sequential(
            nn.Linear(hdim + extra_features, hdim),
            nn.BatchNorm1d(hdim),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hdim, 16),
            nn.BatchNorm1d(16),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, input_id_text, attention_mask, emoji_count, has_ironic_emoji, has_non_ironic_emoji, nn_count):
        # Output dal modello ALBERT
        output = self.lm_model(input_id_text, attention_mask).last_hidden_state
        output = output[:, 0, :]  # Vettore [CLS]

        # Concatenazione delle feature aggiuntive
        output = torch.cat((
            output,
            emoji_count.unsqueeze(1),
            has_ironic_emoji.unsqueeze(1),
            has_non_ironic_emoji.unsqueeze(1),
            nn_count.unsqueeze(1)
        ), dim=1)

        # Classificazione finale
        return self.classifier(output)


In [10]:
def train_loop(model, dataloader, tokenizer, loss, optimizer, device):
    model.train()

    epoch_acc = 0
    epoch_loss = 0

    for batch_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count, batch_labels in tqdm(dataloader, desc='training set'):

        optimizer.zero_grad()

        tokens = tokenizer(list(batch_texts),
                           add_special_tokens=True,
                           return_tensors='pt',
                           padding='max_length',
                           max_length = 256,
                           truncation=True)

        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)
        batch_emoji_count = batch_emoji_count.float().to(device)
        batch_ironic_emoji = batch_ironic_emoji.float().to(device)
        batch_non_ironic_emoji = batch_non_ironic_emoji.float().to(device)
        batch_nn_count = batch_nn_count.float().to(device)
        batch_labels = batch_labels.float().to(device)


        output = model(input_id_texts, mask_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count).squeeze(1)

        # la loss è una CrossEntropyLoss, al suo interno ha
        # la logsoftmax + negative log likelihood loss
        batch_loss = loss(output, batch_labels)
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()

        # per calcolare l'accuracy devo generare le predizioni
        # applicando manualmente la logsoftmax
        preds = (output > 0.5).float()  # Soglia di 0.5 per la classificazione binaria
        epoch_acc += (preds == batch_labels).sum().item()

        batch_labels = batch_labels.detach().cpu()
        input_id_texts = input_id_texts.detach().cpu()
        mask_texts = mask_texts.detach().cpu()
        batch_emoji_count = batch_emoji_count.detach().cpu()
        batch_ironic_emoji = batch_ironic_emoji.detach().cpu()
        batch_non_ironic_emoji = batch_non_ironic_emoji.detach().cpu()
        batch_nn_count = batch_nn_count.detach().cpu()
        output = output.detach().cpu()

    return epoch_loss/len(dataloader), epoch_acc


In [11]:
def test_loop(model, dataloader, tokenizer, loss, device):
    model.eval()

    epoch_acc = 0
    epoch_loss = 0

    with torch.no_grad():

        for batch_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count, batch_labels in tqdm(dataloader, desc='dev set'):

            tokens = tokenizer(list(batch_texts),
                               add_special_tokens=True,
                               return_tensors='pt',
                               padding='max_length',
                               max_length = 256,
                               truncation=True)
            input_id_texts = tokens['input_ids'].squeeze(1).to(device)
            mask_texts = tokens['attention_mask'].squeeze(1).to(device)
            batch_emoji_count = batch_emoji_count.float().to(device)
            batch_ironic_emoji = batch_ironic_emoji.float().to(device)
            batch_non_ironic_emoji = batch_non_ironic_emoji.float().to(device)
            batch_nn_count = batch_nn_count.float().to(device)
            batch_labels = batch_labels.float().to(device)

            output = model(input_id_texts, mask_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count).squeeze(1)

            # la loss è una CrossEntropyLoss, al suo interno ha
            # la logsoftmax + negative log likelihood loss
            batch_loss = loss(output, batch_labels)
            epoch_loss += batch_loss.item()

            # per calcolare l'accuracy devo generare le predizioni
            # applicando manualmente la logsoftmax
            preds = (output > 0.5).float()  # Soglia di 0.5 per la classificazione binaria
            epoch_acc += (preds == batch_labels).sum().item()

            batch_labels = batch_labels.detach().cpu()
            input_id_texts = input_id_texts.detach().cpu()
            mask_texts = mask_texts.detach().cpu()
            batch_emoji_count = batch_emoji_count.detach().cpu()
            batch_ironic_emoji = batch_ironic_emoji.detach().cpu()
            batch_non_ironic_emoji = batch_non_ironic_emoji.detach().cpu()
            batch_nn_count = batch_nn_count.detach().cpu()
            output = output.detach().cpu()

    return epoch_loss/len(dataloader), epoch_acc

In [12]:
def train_test(model, epochs, optimizer, device, train_data, test_data,
               batch_size, model_name, train_loss_fn,
               test_loss_fn=None,         # non necessariamente train e test loss devono differire
               early_stopping=None,       # posso addstrare senza early stopping
               val_data=None,             # e in questo caso non c'è validation set
               scheduler=None):           # possibile scheduler per monitorare l'andamento di un iperparametro,
                                          # tipicamente il learning rate

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    # check sulle funzioni di loss
    if test_loss_fn == None:
        test_loss_fn = train_loss_fn

    # liste dei valori di loss e accuracy epoca per epoca per il plot
    train_loss = []
    validation_loss = []
    test_loss = []

    train_acc = []
    validation_acc = []
    test_acc = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    all_emojis = list(emoji.EMOJI_DATA.keys())
    emoji_tokens = [emoji.demojize(e) for e in all_emojis]
    tokenizer.add_tokens(emoji_tokens)
    tokenizer.add_tokens(["@user"])
    model.lm_model.resize_token_embeddings(len(tokenizer))


    # Ciclo di addestramento con early stopping
    for epoch in tqdm(range(1,epochs+1)):

        epoch_train_loss, epoch_train_acc = train_loop(model, train_dataloader, tokenizer, train_loss_fn, optimizer, device)
        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc/len(train_data))

        # validation se è presente la callback di early stopping
        if early_stopping != None:
                epoch_validate_loss, epoch_validate_acc = test_loop(model, val_dataloader, tokenizer, test_loss_fn, device)
                validation_loss.append(epoch_validate_loss)
                validation_acc.append(epoch_validate_acc/len(val_data))

        # test
        epoch_test_loss, epoch_test_acc = test_loop(model, test_dataloader, tokenizer, test_loss_fn, device)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc/len(test_data))

        val_loss_str = f'Validation loss: {epoch_validate_loss:6.4f} 'if early_stopping != None else ' '
        val_acc_str = f'Validation accuracy: {(epoch_validate_acc/len(val_data)):6.4f} ' if early_stopping != None else ' '
        print(f"\nTrain loss: {epoch_train_loss:6.4f} {val_loss_str} Test loss: {epoch_test_loss:6.4f}")
        print(f"Train accuracy: {(epoch_train_acc/len(train_data)):6.4f} {val_acc_str}Test accuracy: {(epoch_test_acc/len(test_data)):6.4f}")

        # early stopping
        if early_stopping != None:
                early_stopping(epoch_validate_loss)
                if early_stopping.early_stop:
                    break

    return train_loss, validation_loss, test_loss, train_acc, validation_acc, test_acc

In [None]:
# Acquisiamo il device su cui effettueremo il training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

model = ClassifierDeep(
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
print(model)

# Calcoliamo il numero totale dei parametri del modello
total_params = sum(p.numel() for p in model.parameters())
print(f"Numero totale dei parametri: {total_params}")

criterion = nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=hyperparameters["learning_rate"], weight_decay=1e-5)

# Creiamo la callback di early stopping da passare al nostro metodo di addestramento
early_stopping = EarlyStopping(patience=hyperparameters['patience'], min_delta=hyperparameters['min_delta'])


Using cuda device


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [None]:
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
# Routine di addestramento
train_loss, validation_loss,test_loss, train_acc, validation_acc, test_acc = train_test(model,
                                                                                hyperparameters['epochs'],
                                                                                optimizer, device, train_dataset,
                                                                                test_dataset, hyperparameters['batch_size'], hyperparameters['language_model'],
                                                                                criterion, criterion, early_stopping,val_dataset)

In [None]:
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

axs[0].plot(train_loss, label='training loss')
axs[0].plot(validation_loss, label='validation loss')
axs[0].plot(test_loss, label='test loss')
axs[0].legend(loc='upper right')
axs[0].set_ylim(0,1)

axs[1].plot(train_acc, label='training accuracy')
axs[1].plot(validation_acc, label='validation accuracy')
axs[1].plot(test_acc, label='test accuracy')
axs[1].legend(loc='lower right')
axs[1].set_ylim(0,1)