In [1]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score
from transformers import (GPT2Tokenizer, GPT2Model, 
                          set_seed,
                          training_args,
                          trainer,
                          GPT2Config,
                          get_cosine_schedule_with_warmup,
                          GPT2ForSequenceClassification)

set_seed(123)
epochs=4

batch_size = 32


# Numero massimo della sequenza
# La sequenza <510 avrà del padding, la sequenza >510 sarà troncata
max_length = 510

# Usiamo la cpu se la gpu non viene trova
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Nome del trasformers model pre-allenato
model_name_or_path = 'gpt2'

# Dizionario delle etichette e il loro ID
labels_ids = {'Manufacturing': 0, 'Logistics':1, 'Public Administration': 2, 'Healthcare': 3, 'Education': 4}

# Numero di etichette che stiamo utilizzando
n_labels = len(labels_ids)

In [2]:
import os
import torch
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup,
    GPT2ForSequenceClassification
)

%run "./support_functions.ipynb"

# Definizioni delle classi Dataset
class BPMNDataset(Dataset):
    def __init__(self, path):
        if not os.path.isfile(path):
            raise ValueError('Invalid `path` variable! Needs to be a file')
        
        self.df = pd.read_csv(path, sep=';', engine='python', encoding=get_file_encoding(path))
        self.descriptions = self.df['Labels'].to_list()
        self.n_examples = len(self.descriptions)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {'text': self.descriptions[item]}


class Gpt2ClassificationCollator(object):
    def __init__(self, 
                 use_tokenizer, 
                 max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=self.max_sequence_len)
        return inputs

def train(model, dataloader, optimizer, scheduler, device):
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.train()

    
    # Utilizzo tqdm per visualizzare una barra di avanzamento mentre itero sui batch
    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        #loss=discrepanza tra le previsioni del modello e i valori reali dell'obiettivo (ground truth)
        #logits=appresentano le "probabilità" che il modello assegna a ciascuna classe di output
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        #Aggiorno i pesi dell'ottimizzatore e lo scheduler
        optimizer.step()
        scheduler.step()
        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()
        
    avg_epoch_loss = total_loss / len(dataloader)
    
    return true_labels, predictions_labels


def validation(dataloader, device_, model):
    predictions_labels = []

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):
        batch = {k: v.to(device_) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            logits = outputs.logits.detach().cpu().numpy()
            predictions_labels.extend(logits.argmax(axis=-1).tolist())

    return predictions_labels

In [3]:
class BPMNDomainDataset(Dataset):
    def __init__(self, path):
        if not os.path.isfile(path):
            raise ValueError('Invalid `path` variable! Needs to be a file')
        
        self.df = pd.read_csv(path, sep=';', engine='python', encoding=get_file_encoding(path))
        self.descriptions = self.df['Labels'].to_list()
        self.domains = self.df['CollectionName'].to_list()  
        self.flattened_domains = [label for sublist in self.domains for label in sublist.split(',')]    
        self.n_examples = len(self.descriptions)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {"text": self.descriptions[item], "label": self.flattened_domains[item]}


class Gpt2ClassificationCollatorDomain(object):
    def __init__(self, 
                use_tokenizer, 
                labels_encoder, 
                max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        texts = [sequence.get('text', None) for sequence in sequences]
        labels = [sequence.get('label', None) for sequence in sequences]
        label_ids = [self.labels_encoder[label] for label in labels]
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        inputs['labels'] = torch.tensor(label_ids)  
        return inputs

In [13]:
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

# Configuriamo il modello
print('Loading configuration and model...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                          num_labels=n_labels)

# Configuriamo il tokenizer del modello
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Impostiamo il pad token nel modello
model_config.pad_token_id = tokenizer.pad_token_id

# Carichiamo il modello
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, 
                                                      config=model_config)
model.resize_token_embeddings(len(tokenizer))

# Carichiamo il modello su GPU o CPU
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuration and model...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cpu`


In [16]:
# Creo un data collator per codificare testo ed etichette in numeri
gpt2_classification_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
                                                          max_sequence_len=max_length)

gpt2_classification_collator_domain = Gpt2ClassificationCollatorDomain(use_tokenizer=tokenizer,
                                                                       labels_encoder=labels_ids,
                                                                       max_sequence_len=max_length)

print('Dealing with Train...')
# Creo un dataset pytorch per l'allenamento
train_dataset = BPMNDomainDataset(path='./AI_Generated_Datas/CoPilotGeneratedWords.csv')
print('Created `train_dataset` with %d examples!'%len(train_dataset))

# Carico il dataset pytorch nel dataloader per l'allenamento
train_dataloader = DataLoader(train_dataset, 
                              batch_size=batch_size, 
                              shuffle=True, 
                              collate_fn=gpt2_classification_collator_domain)  
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))


print('Dealing with Validation...')
# Creo un dataset pytorch per la validazione
valid_dataset = BPMNDataset(path='./Output_Files/BPMN_cleaned2_languages.csv')  
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

# Carico il dataset pytorch nel dataloader per la validazione
valid_dataloader = DataLoader(valid_dataset, 
                              batch_size=batch_size, 
                              shuffle=False, 
                              collate_fn=gpt2_classification_collator)
print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))


Dealing with Train...
Created `train_dataset` with 5 examples!
Created `train_dataloader` with 1 batches!
Dealing with Validation...
Created `valid_dataset` with 4924 examples!
Created `eval_dataloader` with 154 batches!


In [17]:
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

# Definizione dell'ottimizzatore
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Calcolo del numero totale di passaggi di addestramento
total_steps = len(train_dataloader) * epochs

# Creazione del learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)

# Loop per ogni epoca
print('Epoch loop ...')
for epoch in tqdm(range(epochs)):

    print('Training on batches...')
    # Eseguire l'addestramento del modello
    train_labels, train_predict = train(model, train_dataloader, optimizer, scheduler, device)

    print('Validation on batches...')
    # Eseguire la validazione del modello
    valid_predict = validation(valid_dataloader, device, model)

Epoch loop ...


  0%|                                                                                                                       | 0/4 [00:00<?, ?it/s]

Training on batches...



  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.37s/it][A


Validation on batches...



  0%|                                                                                                                     | 0/154 [00:00<?, ?it/s][A
  1%|▋                                                                                                            | 1/154 [00:02<06:31,  2.56s/it][A
  1%|█▍                                                                                                           | 2/154 [00:04<06:03,  2.39s/it][A
  2%|██                                                                                                           | 3/154 [00:07<06:13,  2.48s/it][A
  3%|██▊                                                                                                          | 4/154 [00:11<07:48,  3.13s/it][A
  3%|███▌                                                                                                         | 5/154 [00:17<10:45,  4.33s/it][A
  4%|████▏                                                                                         

Training on batches...



  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.56s/it][A


Validation on batches...



  0%|                                                                                                                     | 0/154 [00:00<?, ?it/s][A
  1%|▋                                                                                                            | 1/154 [00:02<06:16,  2.46s/it][A
  1%|█▍                                                                                                           | 2/154 [00:04<05:56,  2.35s/it][A
  2%|██                                                                                                           | 3/154 [00:07<06:00,  2.39s/it][A
  3%|██▊                                                                                                          | 4/154 [00:11<07:50,  3.14s/it][A
  3%|███▌                                                                                                         | 5/154 [00:18<10:52,  4.38s/it][A
  4%|████▏                                                                                         

Training on batches...



  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.73s/it][A


Validation on batches...



  0%|                                                                                                                     | 0/154 [00:00<?, ?it/s][A
  1%|▋                                                                                                            | 1/154 [00:02<06:43,  2.64s/it][A
  1%|█▍                                                                                                           | 2/154 [00:04<06:16,  2.47s/it][A
  2%|██                                                                                                           | 3/154 [00:07<06:19,  2.51s/it][A
  3%|██▊                                                                                                          | 4/154 [00:12<08:24,  3.36s/it][A
  3%|███▌                                                                                                         | 5/154 [00:18<11:04,  4.46s/it][A
  4%|████▏                                                                                         

Training on batches...



  0%|                                                                                                                       | 0/1 [00:00<?, ?it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.51s/it][A


Validation on batches...



  0%|                                                                                                                     | 0/154 [00:00<?, ?it/s][A
  1%|▋                                                                                                            | 1/154 [00:02<06:08,  2.41s/it][A
  1%|█▍                                                                                                           | 2/154 [00:04<05:46,  2.28s/it][A
  2%|██                                                                                                           | 3/154 [00:07<05:58,  2.37s/it][A
  3%|██▊                                                                                                          | 4/154 [00:11<07:32,  3.02s/it][A
  3%|███▌                                                                                                         | 5/154 [00:17<10:08,  4.09s/it][A
  4%|████▏                                                                                         

In [23]:
from collections import defaultdict
# Mappa dei nomi delle etichette predette
label_names = {
    0: 'Manufacturing',
    1: 'Logistics',
    2: 'Public Administration',
    3: 'Healthcare',
    4: 'Education'
}

%run "./support_functions.ipynb"

df = pd.read_csv("./Output_Files/BPMN_cleaned2_languages.csv", sep=';', engine='python', encoding=get_file_encoding("./Output_Files/BPMN_cleaned2_languages.csv"))

# Inizializziamo un dizionario per accumulare i conteggi per BPMAI e Camunda
counts_bpmai = defaultdict(int)
counts_camunda = defaultdict(int)

# Iteriamo su ogni elemento delle etichette vere e delle predizioni
for true_label, pred_label_idx in zip(df['CollectionName'], valid_predict):
    pred_label = label_names[pred_label_idx]
    if true_label == 'BPMAI':
        counts_bpmai[pred_label] += 1
    elif true_label == 'Camunda':
        counts_camunda[pred_label] += 1

# Stampiamo i conteggi per BPMAI
print("Counts for BPMAI:")
for label, count in counts_bpmai.items():
    print(f"{label}: {count}")

# Stampiamo i conteggi per Camunda
print("\nCounts for Camunda:")
for label, count in counts_camunda.items():
    print(f"{label}: {count}")


4924
Counts for BPMAI:
Logistics: 4009
Healthcare: 111
Manufacturing: 357
Education: 42
Public Administration: 21

Counts for Camunda:
Logistics: 373
Manufacturing: 6
Healthcare: 3
Public Administration: 2
