In [1]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score
from transformers import (GPT2Tokenizer, GPT2Model, 
                          set_seed,
                          training_args,
                          trainer,
                          GPT2Config,
                          get_cosine_schedule_with_warmup,
                          GPT2ForSequenceClassification)

set_seed(123)
epochs=4
batch_size = 32


# Numero massimo della sequenza
# La sequenza <80 avrà del padding, la sequenza >80 sarà troncata
max_length = 510

# Usiamo la cpu se la gpu non viene trova
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Nome del trasformers model pre-allenato
model_name_or_path = 'gpt2'

# Dizionario delle etichette e il loro ID
labels_ids = {'Manufacturing': 0, 'Logistics':1, 'Public Administration': 2, 'Healthcare': 3, 'Education': 4}

# Numero di etichette che stiamo utilizzando
n_labels = len(labels_ids)

In [4]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
import seaborn as sns


%run "./support_functions.ipynb"


#Creazione di un Dataset pytorch per il caricamento dei dati multidimensionali
class BPMNDomainDataset(Dataset):
    
    def __init__(self, path):

        # Prima di tutto controlliamo che il path esista
        if not os.path.isfile(path):
            # Se il path non è valido allora solleviamo un'eccezione
            raise ValueError('Invalid `path` variable! Needs to be a file')
        
        # Carico il file .csv
        self.df = pd.read_csv(path, sep=';', engine='python', encoding=get_file_encoding(path))
        self.descriptions = self.df['Labels'].to_list()
        self.n_examples = len(self.descriptions)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {'text': self.descriptions[item]}

#Prepara i dati in modo che possano essere utilizzati efficacemente 
#durante l'addestramento di un modello GPT-2 per la classificazione di sequenze.
class Gpt2ClassificationCollator(object):

    def __init__(self, use_tokenizer, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        
    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=self.max_sequence_len)
        return inputs

def train(model, dataloader, optimizer, scheduler, device):
    total_loss = 0
    model.train()
    predictions_labels = []
    true_labels = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Trasferisci il batch su GPU, se disponibile
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs.loss, outputs.logits
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        # Aggiungi le predizioni e le etichette vere
        true_labels.extend(batch['labels'].cpu().numpy().tolist())
        predictions_labels.extend(logits.argmax(axis=-1).cpu().numpy().tolist())

    avg_epoch_loss = total_loss / len(dataloader)
    
    return true_labels, predictions_labels, avg_epoch_loss

def validation(dataloader, device_, model):
    predictions_labels = []
    model.eval()
    
    for batch in tqdm(dataloader, total=len(dataloader)):
        with torch.no_grad():
            # Trasferisci il batch su GPU, se disponibile
            batch = {k: v.to(device_) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions_labels.extend(logits.argmax(axis=-1).cpu().numpy().tolist())
            
    return predictions_labels


In [5]:
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Configuriamo il modello
print('Loading configuration and model...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                          num_labels=n_labels)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Impostiamo il pad token nel modello
model_config.pad_token_id = tokenizer.pad_token_id

# Carichiamo il modello
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                                      config=model_config)
model.resize_token_embeddings(len(tokenizer))

# Carichiamo il modello su GPU o CPU
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuration and model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cpu`


In [7]:
# Creo un data collator per codificare testo ed etichette in numeri
gpt2_classification_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,  
                                                          max_sequence_len=max_length)

print('Dealing with Train...')
# Creo un dataset pytorch per l'allenamento
train_dataset = BPMNDomainDataset(path='./BPMB-Labels-by-AIs.csv')
print('Created `train_dataset` with %d examples!'%len(train_dataset))

# Carico il dataset pytorch nel dataloader per l'allenamento
train_dataloader = DataLoader(train_dataset, 
                              batch_size=batch_size, 
                              shuffle=True, 
                              collate_fn=gpt2_classification_collator,
                              num_workers=10)  # Imposta il numero di worker per il caricamento parallelo
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))


print('Dealing with Validation...')
# Creo un dataset pytorch per la validazione
valid_dataset = BPMNDomainDataset(path='./BPMN_cleaned_languages.csv')
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

# Carico il dataset pytorch nel dataloader per la validazione
valid_dataloader = DataLoader(valid_dataset, 
                              batch_size=batch_size, 
                              shuffle=False, 
                              collate_fn=gpt2_classification_collator,
                              num_workers=10)
print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))

Dealing with Train...
Created `train_dataset` with 15 examples!
Created `train_dataloader` with 1 batches!
Dealing with Validation...
Created `valid_dataset` with 3504 examples!
Created `eval_dataloader` with 110 batches!


In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# ADAM = ADAPTIVE MOMENT ESTIMATION
# L'AdamW è un ottimizzatore che aggiorna il modello in conseguenza della funzione 
# di perdita, in un 'Gradient descent', ad ogni epoche il loss dovrebbe scendere e trovare un MINIMO
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

# Il numero totale di total_steps è uguale a batch_size*epochs e
# train_dataloader contiene i dati già batched
total_steps = len(train_dataloader) * epochs

# Creiamo un learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

# Loop per ogni epoca
print('Epoch loop ...')
for epoch in tqdm(range(epochs)):

    print('Training on batches...')
    # Facciamo un'intera addestramento sul training set
    train(model, train_dataloader, optimizer, scheduler, device)

    # Facciamo un'intera validazione sul validation set
    print('Validation on batches...')
    predictions_labels = validation(valid_dataloader, device, model)


Epoch loop ...


  0%|                                                                                                                       | 0/4 [00:00<?, ?it/s]

Training on batches...



  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s][ATraceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/anthonyeleuteri/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonyeleuteri/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'BPMNDomainDataset' on <module '__main__' (built-in)>


In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

path = "./BPMN_cleaned_languages.csv"

# Carica il file con il dataframe
df = pd.read_csv(path, sep=';', engine='python', encoding=get_file_encoding(path))

# Ottieni le etichette vere e predette
true_labels = df['CollectionName']
predictions_labels = validation(valid_dataloader, device, model)

# Estrai il dominio interno dal DataFrame
internal_domains = df['InternalDomain']

# Costruisci un nuovo DataFrame con dominio interno e etichetta predetta
df_correlation = pd.DataFrame({'Internal Domain': internal_domains, 'Predicted Labels': predictions_labels})

# Calcola la correlazione tra il dominio interno e le etichette predette
correlation_matrix = df_correlation.corr()

# Visualizza la matrice di correlazione
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix between Internal Domain and Predicted Labels')
plt.show()


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [15:27<00:00,  8.43s/it]


ValueError: All arrays must be of the same length