In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip uninstall -y nltk
!pip install --upgrade nltk
!pip install emoji

Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_scheduler
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import emoji

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
root = '/content/drive/MyDrive/NLP/Davide/irony_davide_datasets/'

x_train = pd.read_csv(root + 'x_train.csv')
x_test = pd.read_csv(root + 'x_test.csv')
x_val = pd.read_csv(root + 'x_val.csv')
y_train = pd.read_csv(root + 'y_train.csv')
y_test = pd.read_csv(root + 'y_test.csv')
y_val = pd.read_csv(root + 'y_val.csv')

In [5]:
hyperparameters = {
    "epochs": 18,
    "learning_rate": 2e-5,   #1e-5
    "batch_size": 8,  #accura...
    "dropout": 0.3,   # o 0.3
    "weight_decay": 5e-4,  #1e-3
    "stopwords": False,
    "language_model": "bert-base-uncased",
    "layers": 1,
    "h_dim": 768,
    "bilstm": False,
    "patience": 5,
    "min_delta": 0.01,
    "extra_features": 5
}

param_grid = {
    "learning_rate": [1e-5, 2e-5, 5e-5],
    "dropout": [0.3, 0.5],
    "weight_decay": [1e-5, 5e-4]
}

In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, x, hashtag_count, avg_ironic_ratio, ironic_hashtag_count, non_ironic_hashtag_count, hashtag_irony_index, y, stopwords):  #nn_count
        # x e y sono series di pandas
        tokens_litt = [nltk.word_tokenize(text, language='english') for text in list(x)]
        text_clean = []

        if stopwords:
            for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
                text_clean.append(' '.join([w.lower() for w in sentence if
                                            not w.lower() in nltk.corpus.stopwords.words("english")]))
        else:
            for sentence in tqdm(tokens_litt, desc='Tokenizing ... '):
                text_clean.append(' '.join([w.lower() for w in sentence]))
            # ogni token è separato dall'altro con uno spazio

        self.texts = text_clean
        self.labels = [torch.tensor(label) for label in y]
        #self.emoji_count = [torch.tensor(count) for count in emoji_count]
        #self.ironic_emoji = [torch.tensor(ironic) for ironic in ironic_emoji]
        #self.non_ironic_emoji = [torch.tensor(non_ironic) for non_ironic in non_ironic_emoji]
        #self.nn_count = [torch.tensor(count) for count in nn_count]  # Nuova colonna
        self.hashtag_count = [torch.tensor(count) for count in hashtag_count]
        self.avg_ironic_ratio = [torch.tensor(ratio) for ratio in avg_ironic_ratio]
        self.ironic_hashtag_count = [torch.tensor(count) for count in ironic_hashtag_count]
        self.non_ironic_hashtag_count = [torch.tensor(count) for count in non_ironic_hashtag_count]
        self.hashtag_irony_index = [torch.tensor(index) for index in hashtag_irony_index]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_labels = np.array(self.labels[idx])
        #batch_emoji_count = np.array(self.emoji_count[idx])
        #batch_ironic_emoji = np.array(self.ironic_emoji[idx])
        #batch_non_ironic_emoji = np.array(self.non_ironic_emoji[idx])
        #batch_nn_count = np.array(self.nn_count[idx])  # Aggiunto NN_count
        batch_hashtag_count = np.array(self.hashtag_count[idx])
        batch_avg_ironic_ratio = np.array(self.avg_ironic_ratio[idx])
        batch_ironic_hashtag_count = np.array(self.ironic_hashtag_count[idx])
        batch_non_ironic_hashtag_count = np.array(self.non_ironic_hashtag_count[idx])
        batch_hashtag_irony_index = np.array(self.hashtag_irony_index[idx])

        return batch_texts, batch_hashtag_count, batch_avg_ironic_ratio, batch_ironic_hashtag_count, batch_non_ironic_hashtag_count, batch_hashtag_irony_index, batch_labels
        #batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
x_train

Unnamed: 0,hashtag_count,avg_ironic_ratio,ironic_hashtag_count,non_ironic_hashtag_count,hashtag_irony_index,text
0,0,0.0,0,0,0.0,"@user not surprised, it was epic. \n"
1,0,0.0,0,0,0.0,"Sooo mad right now, great way to start my day \n"
2,0,0.0,0,0,0.0,Lol well done swans \n
3,1,1.0,1,0,1.0,@user thanks for the wake up wrap and coffee t...
4,0,0.0,0,0,0.0,So excited to spend the next 12 hours at schoo...
...,...,...,...,...,...,...
1712,0,0.0,0,0,0.0,Ay after 2 hours nareceive ko rin yung message...
1713,1,0.0,0,1,0.0,@user Alone Tonight cause its a personal song ...
1714,1,0.0,0,1,0.0,A year ago this would be just a writing on a t...
1715,0,0.0,0,0,0.0,@user @user yeah & we wont talk about how the ...


In [8]:
train_dataset = Dataset(x_train['text'], x_train['hashtag_count'], x_train['avg_ironic_ratio'], x_train['ironic_hashtag_count'], x_train['non_ironic_hashtag_count'], x_train['hashtag_irony_index'], y_train.squeeze(1), hyperparameters["stopwords"])
val_dataset = Dataset(x_val['text'], x_val['hashtag_count'], x_val['avg_ironic_ratio'], x_val['ironic_hashtag_count'], x_val['non_ironic_hashtag_count'], x_val['hashtag_irony_index'], y_val.squeeze(1), hyperparameters["stopwords"])
test_dataset = Dataset(x_test['text'], x_test['hashtag_count'], x_test['avg_ironic_ratio'], x_test['ironic_hashtag_count'], x_test['non_ironic_hashtag_count'], x_test['hashtag_irony_index'], y_test.squeeze(1), hyperparameters["stopwords"])

#val_dataset = Dataset(x_val['text'], x_val['emoji_count'], x_val['has_ironic_emoji'], x_val['has_non_ironic_emoji'], x_val['NN_count'], y_val.squeeze(1), hyperparameters["stopwords"])
#test_dataset = Dataset(x_test['text'], x_test['emoji_count'], x_test['has_ironic_emoji'], x_test['has_non_ironic_emoji'], x_test['NN_count'], y_test.squeeze(1), hyperparameters["stopwords"])

Tokenizing ... : 100%|██████████| 1717/1717 [00:00<00:00, 82272.26it/s]
Tokenizing ... : 100%|██████████| 916/916 [00:00<00:00, 62157.94it/s]
Tokenizing ... : 100%|██████████| 229/229 [00:00<00:00, 14146.57it/s]


In [9]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.0):

        self.patience = patience
        self.min_delta = min_delta              # valore minimo di decrescita della loss di validazione all'epoca corrente
                                                # per asserire che c'è un miglioramenti della loss
        self.counter = 0                        # contatore delle epoche di pazienza
        self.early_stop = False                 # flag di early stop
        self.min_validation_loss = torch.inf    # valore corrente ottimo della loss di validazione

    def __call__(self, validation_loss):
        # chiamata in forma funzionale dell'oggetto di classe EarlySopping

        if (validation_loss + self.min_delta) >= self.min_validation_loss:  # la loss di validazione non decresce
            self.counter += 1                                               # incrementiamo il contatore delle epoche di pazienza
            if self.counter >= self.patience:
                self.early_stop = True
                print("Early stop!")
        else:                                               # c'è un miglioramento della loss:
            self.min_validation_loss = validation_loss      # consideriamo la loss corrente
                                                            # come nuova loss ottimale
            self.counter = 0                                # e azzeriamo il contatore di pazienza



In [10]:
class ClassifierDeep(nn.Module):

    def __init__(self, hdim, dropout, model_name, extra_features = hyperparameters["extra_features"]):
        super(ClassifierDeep, self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.lm_model = AutoModel.from_pretrained(model_name, config=config)

        self.classifier = nn.Sequential(
          nn.Linear(hdim + extra_features, hdim),
          nn.BatchNorm1d(hdim),
          nn.Dropout(dropout),
          nn.ReLU(),
          nn.Linear(hdim, 16),
          nn.BatchNorm1d(16),
          nn.Dropout(dropout),
          nn.ReLU(),
          nn.Linear(16, 1),
          nn.Sigmoid()
          )




    #def forward(self, input_id_text, attention_mask, emoji_count, has_ironic_emoji, has_non_ironic_emoji, nn_count):
    def forward(self, input_id_text, attention_mask, hashtag_count, avg_ironic_ratio, ironic_hashtag_count, non_ironic_hashtag_count, hashtag_irony_index):
        output = self.lm_model(input_id_text, attention_mask).last_hidden_state
        output = output[:,0,:]
        #output = torch.cat((output, emoji_count.unsqueeze(1), has_ironic_emoji.unsqueeze(1), has_non_ironic_emoji.unsqueeze(1), nn_count.unsqueeze(1)), dim=1)
        output = torch.cat((output, hashtag_count.unsqueeze(1), avg_ironic_ratio.unsqueeze(1), ironic_hashtag_count.unsqueeze(1), non_ironic_hashtag_count.unsqueeze(1), hashtag_irony_index.unsqueeze(1)), dim=1)
        return self.classifier(output)

In [11]:
def train_loop(model, dataloader, tokenizer, loss, optimizer, device, scheduler):
    model.train()

    epoch_acc = 0
    epoch_loss = 0

    #for batch_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count, batch_labels in tqdm(dataloader, desc='training set'):
    for batch_texts, batch_hashtag_count, batch_avg_ironic_ratio, batch_ironic_hashtag_count, batch_non_ironic_hashtag_count, batch_hashtag_irony_index, batch_labels in tqdm(dataloader, desc='training set'):

        optimizer.zero_grad()

        tokens = tokenizer(list(batch_texts),
                           add_special_tokens=True,
                           return_tensors='pt',
                           padding='max_length',
                           max_length = 256,
                           truncation=True)

        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)
        #batch_emoji_count = batch_emoji_count.float().to(device)
        #batch_ironic_emoji = batch_ironic_emoji.float().to(device)
        #batch_non_ironic_emoji = batch_non_ironic_emoji.float().to(device)
        #batch_nn_count = batch_nn_count.float().to(device)
        batch_hashtag_count = batch_hashtag_count.float().to(device)
        batch_avg_ironic_ratio = batch_avg_ironic_ratio.float().to(device)
        batch_ironic_hashtag_count = batch_ironic_hashtag_count.float().to(device)
        batch_non_ironic_hashtag_count = batch_non_ironic_hashtag_count.float().to(device)
        batch_hashtag_irony_index = batch_hashtag_irony_index.float().to(device)
        batch_labels = batch_labels.float().to(device)


        #output = model(input_id_texts, mask_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count).squeeze(1)
        output = model(input_id_texts, mask_texts, batch_hashtag_count, batch_avg_ironic_ratio, batch_ironic_hashtag_count, batch_non_ironic_hashtag_count, batch_hashtag_irony_index).squeeze(1)

        # la loss è una CrossEntropyLoss, al suo interno ha
        # la logsoftmax + negative log likelihood loss
        batch_loss = loss(output, batch_labels)
        batch_loss.backward()
        ##### GRADIENT CLIPPING #####
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        #############################
        optimizer.step()

        ###### SCHEDULER ######
        # Scheduler step
        scheduler.step()
        print(optimizer.param_groups[0]["lr"])
        #######################

        epoch_loss += batch_loss.item()

        # per calcolare l'accuracy devo generare le predizioni
        # applicando manualmente la logsoftmax
        preds = (output > 0.5).float()  # Soglia di 0.5 per la classificazione binaria
        epoch_acc += (preds == batch_labels).sum().item()

        batch_labels = batch_labels.detach().cpu()
        input_id_texts = input_id_texts.detach().cpu()
        mask_texts = mask_texts.detach().cpu()
        #batch_emoji_count = batch_emoji_count.detach().cpu()
        #batch_ironic_emoji = batch_ironic_emoji.detach().cpu()
        #batch_non_ironic_emoji = batch_non_ironic_emoji.detach().cpu()
        #batch_nn_count = batch_nn_count.detach().cpu()
        batch_hashtag_count = batch_hashtag_count.detach().cpu()
        batch_avg_ironic_ratio = batch_avg_ironic_ratio.detach().cpu()
        batch_ironic_hashtag_count = batch_ironic_hashtag_count.detach().cpu()
        batch_non_ironic_hashtag_count = batch_non_ironic_hashtag_count.detach().cpu()
        batch_hashtag_irony_index = batch_hashtag_irony_index.detach().cpu()
        output = output.detach().cpu()


    return epoch_loss/len(dataloader), epoch_acc


In [12]:
def test_loop(model, dataloader, tokenizer, loss, device):
    model.eval()

    epoch_acc = 0
    epoch_loss = 0

    with torch.no_grad():

        #for batch_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count, batch_labels in tqdm(dataloader, desc='dev set'):
        for batch_texts, batch_hashtag_count, batch_avg_ironic_ratio, batch_ironic_hashtag_count, batch_non_ironic_hashtag_count, batch_hashtag_irony_index, batch_labels in tqdm(dataloader, desc='dev set'):

            tokens = tokenizer(list(batch_texts),
                               add_special_tokens=True,
                               return_tensors='pt',
                               padding='max_length',
                               max_length = 256,
                               truncation=True)
            input_id_texts = tokens['input_ids'].squeeze(1).to(device)
            mask_texts = tokens['attention_mask'].squeeze(1).to(device)
            #batch_emoji_count = batch_emoji_count.float().to(device)
            #batch_ironic_emoji = batch_ironic_emoji.float().to(device)
            #batch_non_ironic_emoji = batch_non_ironic_emoji.float().to(device)
            #batch_nn_count = batch_nn_count.float().to(device)
            batch_hashtag_count = batch_hashtag_count.float().to(device)
            batch_avg_ironic_ratio = batch_avg_ironic_ratio.float().to(device)
            batch_ironic_hashtag = batch_ironic_hashtag_count.float().to(device)
            batch_non_ironic_hashtag = batch_non_ironic_hashtag_count.float().to(device)
            batch_hashtag_irony_index = batch_hashtag_irony_index.float().to(device)
            batch_labels = batch_labels.float().to(device)

            #output = model(input_id_texts, mask_texts, batch_emoji_count, batch_ironic_emoji, batch_non_ironic_emoji, batch_nn_count).squeeze(1)
            output = model(input_id_texts, mask_texts, batch_hashtag_count, batch_avg_ironic_ratio, batch_ironic_hashtag, batch_non_ironic_hashtag, batch_hashtag_irony_index).squeeze(1)

            # la loss è una CrossEntropyLoss, al suo interno ha
            # la logsoftmax + negative log likelihood loss
            batch_loss = loss(output, batch_labels)
            epoch_loss += batch_loss.item()

            # per calcolare l'accuracy devo generare le predizioni
            # applicando manualmente la logsoftmax
            preds = (output > 0.5).float()  # Soglia di 0.5 per la classificazione binaria
            epoch_acc += (preds == batch_labels).sum().item()

            batch_labels = batch_labels.detach().cpu()
            input_id_texts = input_id_texts.detach().cpu()
            mask_texts = mask_texts.detach().cpu()
            #batch_emoji_count = batch_emoji_count.detach().cpu()
            #batch_ironic_emoji = batch_ironic_emoji.detach().cpu()
            #batch_non_ironic_emoji = batch_non_ironic_emoji.detach().cpu()
            #batch_nn_count = batch_nn_count.detach().cpu()
            batch_hashtag_count = batch_hashtag_count.detach().cpu()
            batch_avg_ironic_ratio = batch_avg_ironic_ratio.detach().cpu()
            batch_ironic_hashtag = batch_ironic_hashtag.detach().cpu()
            batch_non_ironic_hashtag = batch_non_ironic_hashtag.detach().cpu()
            batch_hashtag_irony_index = batch_hashtag_irony_index.detach().cpu()
            output = output.detach().cpu()

    return epoch_loss/len(dataloader), epoch_acc

In [13]:
def unfreeze_layers(model, freeze_up_to_layer):
    # Itera su tutti i parametri del modello
    for name, param in model.named_parameters():
        # Gestisce l'encoder e altri componenti
        if 'encoder.layer' in name:
            # Estrai il numero del layer
            try:
                layer_number = int(name.split('.')[2])  # esempio "encoder.layer.11.attention.self.query.weight"
            except ValueError:
                continue  # salta se non riesci a ottenere il numero del layer

            # Congela i parametri fino al livello specificato
            if layer_number < freeze_up_to_layer:
                param.requires_grad = False
            else:
                param.requires_grad = True

        # Gestione degli embeddings (se vuoi congelarli o meno)
        elif 'embeddings' in name:
            param.requires_grad = False  # Congela gli embeddings (o cambia se vuoi sbloccarli)

        # Gestione della parte del pooler e della testa di classificazione (se devi allenare queste parti)
        elif 'pooler' in name or 'classifier' in name:
            param.requires_grad = True  # Assicurati che questi componenti siano allenati

        # Stampa dello stato di "requires_grad" per ogni parametro
        print(f"{name} - requires_grad = {param.requires_grad}")


In [14]:
def train_test(model, epochs, optimizer, device, train_data, test_data,
               batch_size, model_name, train_loss_fn,
               test_loss_fn=None,         # non necessariamente train e test loss devono differire
               early_stopping=None,       # posso addstrare senza early stopping
               val_data=None,             # e in questo caso non c'è validation set
               scheduler=None,            # possibile scheduler per monitorare l'andamento di un iperparametro
               freeze_every_n_epochs=5,  # il numero di epoche dopo le quali sbloccare i layer
               freeze_up_to_layer=8):    # il numero di layer iniziali da congelare

    # Congelamento progressivo all'inizio
    unfreeze_layers(model, freeze_up_to_layer)

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    # check sulle funzioni di loss
    if test_loss_fn == None:
        test_loss_fn = train_loss_fn

    # liste dei valori di loss e accuracy epoca per epoca per il plot
    train_loss = []
    validation_loss = []
    test_loss = []
    train_acc = []
    validation_acc = []
    test_acc = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    all_emojis = list(emoji.EMOJI_DATA.keys())
    emoji_tokens = [emoji.demojize(e) for e in all_emojis]
    tokenizer.add_tokens(emoji_tokens)
    tokenizer.add_tokens(["@user"])
    model.lm_model.resize_token_embeddings(len(tokenizer))

    # Ciclo di addestramento con early stopping
    for epoch in tqdm(range(1,epochs+1)):

        epoch_train_loss, epoch_train_acc = train_loop(model, train_dataloader, tokenizer, train_loss_fn, optimizer, device, scheduler)
        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc/len(train_data))

        # Validation se è presente la callback di early stopping
        if early_stopping != None:
            epoch_validate_loss, epoch_validate_acc = test_loop(model, val_dataloader, tokenizer, test_loss_fn, device)
            validation_loss.append(epoch_validate_loss)
            validation_acc.append(epoch_validate_acc/len(val_data))

        # Test
        epoch_test_loss, epoch_test_acc = test_loop(model, test_dataloader, tokenizer, test_loss_fn, device)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc/len(test_data))

        val_loss_str = f'Validation loss: {epoch_validate_loss:6.4f} 'if early_stopping != None else ' '
        val_acc_str = f'Validation accuracy: {(epoch_validate_acc/len(val_data)):6.4f} ' if early_stopping != None else ' '
        print(f"\nTrain loss: {epoch_train_loss:6.4f} {val_loss_str} Test loss: {epoch_test_loss:6.4f}")
        print(f"Train accuracy: {(epoch_train_acc/len(train_data)):6.4f} {val_acc_str}Test accuracy: {(epoch_test_acc/len(test_data)):6.4f}")

        # Early stopping
        if early_stopping != None:
            early_stopping(epoch_validate_loss)
            if early_stopping.early_stop:
                break

        # Sblocca i layer ogni 'freeze_every_n_epochs'
        if epoch % freeze_every_n_epochs == 0:
            freeze_up_to_layer = max(0, freeze_up_to_layer - 1)  # Sblocca un layer
            unfreeze_layers(model, freeze_up_to_layer)

    return train_loss, validation_loss, test_loss, train_acc, validation_acc, test_acc


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

model = ClassifierDeep(
                    hyperparameters["h_dim"],
                    hyperparameters["dropout"],
                    hyperparameters["language_model"]).to(device)
print(model)

# Calcoliamo il numero totale dei parametri del modello
total_params = sum(p.numel() for p in model.parameters())
print(f"Numbero totale dei parametri: {total_params}")

criterion = nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=hyperparameters['learning_rate'], weight_decay=hyperparameters['weight_decay'])

###### Linear Warmup + Decay ######
# Calcolo dei passi totali
total_steps = len(train_dataset) // hyperparameters['batch_size'] * hyperparameters['epochs']

# Passi di warmup (ad esempio, 10% del totale)
warmup_steps = int(0.1 * total_steps)

# Creazione del scheduler
scheduler = get_scheduler(
    name="cosine",  # Tipo di scheduler   ---> PROVARE COSINE ---> provare con OTTIMIZZATORE SGD INVECE CHE ADAM
    optimizer=optimizer,  # Ottimizzatore che stai usando
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
###################################


# Creiamo la callback di early stopping da passare al nostro metodo di addestramento
early_stopping = EarlyStopping(patience=hyperparameters['patience'], min_delta=hyperparameters['min_delta'])

Using cpu device


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

ClassifierDeep(
  (lm_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen



In [16]:
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
# Congela i layer fino al numero specificato
freeze_up_to_layer = 4
unfreeze_layers(model, freeze_up_to_layer)

# Recupera i parametri che richiedono il calcolo del gradiente (i parametri non congelati)
params_to_train = [param for param in model.parameters() if param.requires_grad]

# Verifica se ci sono parametri da allenare
if len(params_to_train) == 0:
  raise ValueError("Non ci sono parametri da allenare. Verifica che il congelamento e sblocco dei layer siano corretti.")

# Creazione dell'ottimizzatore con i parametri non congelati
optimizer = torch.optim.Adam(params_to_train, lr=hyperparameters['learning_rate'])

# Ora puoi chiamare la routine di addestramento
train_loss, validation_loss, test_loss, train_acc, validation_acc, test_acc = train_test(
  model,
  hyperparameters['epochs'],
  optimizer,  # Ora passa l'ottimizzatore configurato
  device,
  train_dataset,
  test_dataset,
  hyperparameters['batch_size'],
  hyperparameters['language_model'],
  criterion,
  criterion,  # o qualsiasi altra loss
  early_stopping,
  val_dataset,
  scheduler=scheduler
)


lm_model.embeddings.word_embeddings.weight - requires_grad = False
lm_model.embeddings.position_embeddings.weight - requires_grad = False
lm_model.embeddings.token_type_embeddings.weight - requires_grad = False
lm_model.embeddings.LayerNorm.weight - requires_grad = False
lm_model.embeddings.LayerNorm.bias - requires_grad = False
lm_model.pooler.dense.weight - requires_grad = True
lm_model.pooler.dense.bias - requires_grad = True
classifier.0.weight - requires_grad = True
classifier.0.bias - requires_grad = True
classifier.1.weight - requires_grad = True
classifier.1.bias - requires_grad = True
classifier.4.weight - requires_grad = True
classifier.4.bias - requires_grad = True
classifier.5.weight - requires_grad = True
classifier.5.bias - requires_grad = True
classifier.8.weight - requires_grad = True
classifier.8.bias - requires_grad = True
lm_model.embeddings.word_embeddings.weight - requires_grad = False
lm_model.embeddings.position_embeddings.weight - requires_grad = False
lm_model.

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  0%|          | 0/18 [00:00<?, ?it/s]
training set:   0%|          | 0/214 [00:00<?, ?it/s][A
training set:   0%|          | 1/214 [00:29<1:46:17, 29.94s/it][A

2e-05



training set:   1%|          | 2/214 [00:53<1:33:04, 26.34s/it][A

2e-05



training set:   1%|▏         | 3/214 [01:16<1:27:07, 24.78s/it][A

2e-05



training set:   2%|▏         | 4/214 [01:38<1:23:03, 23.73s/it][A

2e-05



training set:   2%|▏         | 5/214 [02:02<1:22:09, 23.59s/it][A

2e-05



training set:   3%|▎         | 6/214 [02:27<1:23:23, 24.05s/it][A

2e-05



training set:   3%|▎         | 7/214 [02:50<1:22:46, 23.99s/it][A

2e-05



training set:   4%|▎         | 8/214 [03:12<1:19:54, 23.27s/it][A

2e-05



training set:   4%|▍         | 9/214 [03:36<1:19:58, 23.41s/it][A

2e-05



training set:   5%|▍         | 10/214 [03:58<1:17:49, 22.89s/it][A

2e-05



training set:   5%|▌         | 11/214 [04:21<1:17:33, 22.93s/it][A

2e-05



training set:   6%|▌         | 12/214 [04:43<1:16:58, 22.87s/it][A

2e-05



training set:   6%|▌         | 13/214 [05:05<1:15:30, 22.54s/it][A

2e-05



training set:   7%|▋         | 14/214 [05:29<1:16:26, 22.93s/it][A

2e-05



training set:   7%|▋         | 15/214 [05:52<1:15:59, 22.91s/it][A

2e-05



training set:   7%|▋         | 16/214 [06:17<1:17:40, 23.54s/it][A

2e-05



training set:   8%|▊         | 17/214 [06:40<1:17:07, 23.49s/it][A

2e-05



training set:   8%|▊         | 18/214 [07:04<1:17:20, 23.68s/it][A

2e-05



training set:   9%|▉         | 19/214 [07:26<1:14:47, 23.02s/it][A

2e-05


In [None]:
'''
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 2, figsize=(20, 10))

  axs[0].plot(train_loss, label='training loss')
  axs[0].plot(validation_loss, label='validation loss')
  axs[0].plot(test_loss, label='test loss')
  axs[0].legend(loc='upper right')
  axs[0].set_ylim(0,1)

  axs[1].plot(train_acc, label='training accuracy')
  axs[1].plot(validation_acc, label='validation accuracy')
  axs[1].plot(test_acc, label='test accuracy')
  axs[1].legend(loc='lower right')
  axs[1].set_ylim(0,1)