# Importiamo le librerie

In [None]:
import transformers
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel, BertConfig

In [None]:
from torch import optim
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import numpy as np
import pickle

In [None]:
import seaborn as sn
from sklearn.metrics import accuracy_score, f1_score
import math
from scipy.stats import wilcoxon

**Funzione che spacchetta gli alberi**

In [None]:
def unplickle_trees(path_tree_file):
    """Read every object pickled back-to-back in a file as float tensors.

    Args:
        path_tree_file: Path of a binary file holding one or more
            consecutive pickle records.

    Returns:
        A list with one ``torch.FloatTensor`` per record, in file order.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files from a trusted source.
    print('--->read DTKs')
    tensors = []
    with open(path_tree_file, 'rb') as stream:
        while True:
            try:
                record = pickle.load(stream)
            except EOFError:
                # End of file is the only terminator between records.
                break
            tensors.append(torch.FloatTensor(record))
    return tensors

# Importiamo il dataset e gli alberi

In [None]:
# Load the labelled dataset from the working directory.
df = pd.read_csv('HR_dataset.csv')
# Load the pickled tree vectors; unplickle_trees returns one float tensor per record.
trees = unplickle_trees('file_trees.pkl')

# Preview the first rows of the dataset.
df.head()

In [None]:
# Raw texts and their class labels as NumPy arrays.
sentences = df.testo.values
labels = df.target.values

# Distinct label values and how many rows carry each one; `counts` is read
# later by the training functions to build class weights.
unique, counts = np.unique(labels, return_counts = True)

print(unique, counts)

**Funzioni utili per il calcolo statistico**

In [None]:
def calculate_mean(number_list):
    """Return the arithmetic mean of ``number_list``.

    Args:
        number_list: A non-empty sized collection of numbers.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If ``number_list`` is empty.
    """
    # Use the built-in ``sum`` rather than a local accumulator that shadows it.
    return sum(number_list) / len(number_list)

def calculate_standard_deviation(number_list):
    """Return the population standard deviation of ``number_list``.

    The squared deviations from the mean are averaged over the number of
    elements (not ``n - 1``) before taking the square root.

    Args:
        number_list: A non-empty sized collection of numbers.

    Returns:
        The standard deviation as a float.
    """
    centre = calculate_mean(number_list)
    squared_error_total = 0
    for value in number_list:
        deviation = value - centre
        squared_error_total += pow(deviation, 2)

    variance = squared_error_total / len(number_list)
    return math.sqrt(variance)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of true class indices; flattened before comparing.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    labels_1d = labels.flatten()
    hits = np.argmax(preds, axis=1).flatten() == labels_1d
    return np.sum(hits) / len(labels_1d)

## **Italians Bert**
* **Italian Bert**
    * Tokenizzatore -->  BertTokenizer.from_pretrained("dbmdz/bert-base-italian-uncased", do_lower_case=True)
    * Modello --> BertModel.from_pretrained('dbmdz/bert-base-italian-uncased')



* **Umberto** 
    * Tokenizzatore --> AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
    * Modello --> AutoModel.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")



* **Alberto** 
    * Tokenizzatore --> AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
    * Modello --> AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
    
    

* **Multilingua-Bert** 
    * Tokenizzatore --> BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
    * Modello --> BertModel.from_pretrained("bert-base-multilingual-uncased")

In [None]:
def _build_dataloader(inputs, masks, tree_list, targets, batch_size, shuffle):
    """Convert one data split to tensors and wrap it in a DataLoader.

    Returns the DataLoader together with the label tensor of the split.
    """
    target_tensor = torch.tensor(targets)
    dataset = TensorDataset(
        torch.tensor(inputs),
        torch.tensor(masks),
        torch.stack(tree_list),
        target_tensor,
    )
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return loader, target_tensor


def define_input(seed, random_state, sentences, model_type, epochs):
    """Tokenize the sentences, split the data and build the three DataLoaders.

    The labels and the tree tensors are not parameters: they are read from the
    module-level names ``labels`` and ``trees`` and must be aligned with
    ``sentences``.

    Args:
        seed: Seed for the PyTorch random generators (CPU and CUDA).
        random_state: Seed handed to ``train_test_split`` for both splits.
        sentences: Iterable of raw text strings.
        model_type: One of 'Italian_Bert', 'Umberto', 'Alberto' or
            'Multilingua-Bert'. A non-string value (for example an already
            loaded model passed back by the caller) loads nothing and reuses
            the tokenizer/model cached in the module-level globals.
        epochs: Unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader, test_dataloader,
        model_architecture, device, test_labels)`` where ``test_labels`` is the
        label tensor of the test split.

    Raises:
        ValueError: If ``model_type`` is a string that names no known model.
    """
    # The loaded tokenizer/model are cached as module-level globals so that a
    # later call which does not name a checkpoint can reuse them.
    global tokenizer
    global model_architecture

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed the CPU generator too: it drives RandomSampler shuffling, dropout
    # and layer initialisation, which the CUDA-only seeding left unseeded.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # name -> (tokenizer class, model class, checkpoint id, tokenizer kwargs)
    checkpoints = {
        'Italian_Bert': (BertTokenizer, BertModel,
                         "dbmdz/bert-base-italian-uncased", {"do_lower_case": True}),
        'Umberto': (AutoTokenizer, AutoModel,
                    "Musixmatch/umberto-commoncrawl-cased-v1", {}),
        'Alberto': (AutoTokenizer, AutoModel,
                    "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", {}),
        'Multilingua-Bert': (BertTokenizer, BertModel,
                             "bert-base-multilingual-uncased", {}),
    }

    if isinstance(model_type, str):
        if model_type not in checkpoints:
            raise ValueError(
                "Unknown model_type %r; expected one of %s"
                % (model_type, sorted(checkpoints))
            )
        tokenizer_cls, model_cls, checkpoint, tokenizer_kwargs = checkpoints[model_type]
        tokenizer = tokenizer_cls.from_pretrained(checkpoint, **tokenizer_kwargs)
        model_architecture = model_cls.from_pretrained(checkpoint).to(device)
    # else: fall through and reuse the cached global tokenizer/model.

    # NOTE(review): "[CLS]"/"[SEP]" are BERT-style markers; presumably the
    # Umberto tokenizer uses different special tokens - confirm.
    marked_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
    token_ids = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        for sentence in marked_sentences
    ]

    MAX_LEN = 128
    # Pad with the tokenizer's own pad id instead of assuming it is 0.
    # For tokenizers whose pad id is 0 this is identical to the old behaviour.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0
    input_ids = pad_sequences(token_ids, maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post", value=pad_id)

    # 1.0 on real tokens, 0.0 on padding.
    attention_masks = (input_ids != pad_id).astype("float32")

    # Splitting all aligned collections in one call keeps them in step:
    # 80/20 into (train+validation)/test, then 70/30 into train/validation.
    (X_inputs, test_inputs, X_labels, test_labels,
     X_masks, test_masks, X_trees, test_trees) = train_test_split(
        input_ids, labels, attention_masks, trees,
        random_state=random_state, test_size=0.2)

    (train_inputs, validation_inputs, train_labels, validation_labels,
     train_masks, validation_masks, train_trees, validation_trees) = train_test_split(
        X_inputs, X_labels, X_masks, X_trees,
        random_state=random_state, test_size=0.3)

    batch_size = 32

    # Only the training split is shuffled; validation and test keep their order
    # so predictions line up with the returned test labels.
    train_dataloader, _ = _build_dataloader(
        train_inputs, train_masks, train_trees, train_labels, batch_size, shuffle=True)
    validation_dataloader, _ = _build_dataloader(
        validation_inputs, validation_masks, validation_trees, validation_labels,
        batch_size, shuffle=False)
    test_dataloader, test_labels = _build_dataloader(
        test_inputs, test_masks, test_trees, test_labels, batch_size, shuffle=False)

    return train_dataloader, validation_dataloader, test_dataloader, model_architecture, device, test_labels
    

# Definiamo i modelli che utilizzeremo

### Kermit Potter

In [None]:
class Kermit_Potter(nn.Module):
    """Classifier joining a frozen encoder's first-token vector with tree features.

    The tree vector goes through four ReLU + dropout linear layers, is
    concatenated with the encoder output at sequence position 0, and a final
    linear layer maps the result to ``output_dim`` logits. The encoder is always
    run under ``torch.no_grad()``, so no gradient reaches it.

    Args:
        input_dim_bert: Size of the encoder's hidden vectors.
        input_dim_dt: Size of the tree feature vector.
        output_dim: Number of output classes.
        model_architecture: Callable encoder; called as
            ``encoder(input_ids, attention_mask=...)`` and indexed with ``[0]``
            to obtain the per-token hidden states.
        hidden_dims: Output sizes of ``fc1``..``fc4``. The default reproduces
            the sizes that were previously hard-coded.
    """

    def __init__(self, input_dim_bert, input_dim_dt, output_dim, model_architecture,
                 hidden_dims=(2000, 4000, 2000, 4000)):
        super().__init__()
        self.bert = model_architecture
        self.dropout = nn.Dropout(0.1)

        h1, h2, h3, h4 = hidden_dims
        self.fc1 = torch.nn.Linear(input_dim_dt, h1)
        self.fc2 = torch.nn.Linear(h1, h2)
        self.fc3 = torch.nn.Linear(h2, h3)
        self.fc4 = torch.nn.Linear(h3, h4)

        self.synth_sem_linear = nn.Linear(input_dim_bert + h4, output_dim)

    def _encode_sentence(self, x_sem, attention_mask):
        """Return the dropout-regularised first-token vector of the frozen encoder."""
        with torch.no_grad():
            # Forward the mask so padded positions are not attended to.
            hidden_states = self.bert(x_sem, attention_mask=attention_mask)[0]
        return self.dropout(hidden_states[:, 0, :])

    def _encode_tree(self, x_synth):
        """Run the tree vector through fc1..fc4 with ReLU and dropout."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            # F.dropout defaults to training=True; tie it to the module mode so
            # that eval() really disables dropout.
            x_synth = F.dropout(F.relu(layer(x_synth)), p=0.1, training=self.training)
        return x_synth

    def forward(self, x_sem, attention_mask, x_synth):
        """Return the class logits for a batch.

        Args:
            x_sem: Token id tensor for the encoder.
            attention_mask: Mask passed to the encoder (may be ``None``).
            x_synth: Tree feature tensor with ``input_dim_dt`` columns.
        """
        x_sem = self._encode_sentence(x_sem, attention_mask)
        x_synth = self._encode_tree(x_synth)

        x_tot = torch.cat((x_sem, x_synth), 1)
        return self.synth_sem_linear(x_tot)

    def get_activation(self, x_sem, x_synth):
        """Return the log-softmax of the logits, computed without a mask.

        NOTE(review): the previous body referenced the undefined names
        ``pooled_output`` and ``out`` and always raised NameError. Returning
        log-softmax follows BertForSequenceClassification.get_activation in
        this file - confirm this is the intended output.
        """
        logits = self.forward(x_sem, None, x_synth)
        return F.log_softmax(logits, dim=1)

### Classic Kermit

In [None]:
class Kermit_Classic(nn.Module):
    """Linear classifier over a frozen encoder's first-token vector plus raw tree features.

    The encoder output at sequence position 0 (after dropout) is concatenated
    with the unmodified tree vector and mapped to ``output_dim`` logits. The
    encoder is always run under ``torch.no_grad()``.

    Args:
        input_dim_bert: Size of the encoder's hidden vectors.
        input_dim_dt: Size of the tree feature vector.
        output_dim: Number of output classes.
        model_architecture: Callable encoder; called as
            ``encoder(input_ids, attention_mask=...)`` and indexed with ``[0]``
            to obtain the per-token hidden states.
    """

    def __init__(self, input_dim_bert, input_dim_dt, output_dim, model_architecture):
        super().__init__()
        self.bert = model_architecture
        self.dropout = nn.Dropout(0.1)

        # Size the layer from input_dim_dt instead of a hard-coded 4000, so
        # that the argument is actually honoured.
        self.synth_sem_linear = nn.Linear(input_dim_bert + input_dim_dt, output_dim)

    def forward(self, x_sem, attention_mask, x_synth):
        """Return the class logits for a batch.

        Args:
            x_sem: Token id tensor for the encoder.
            attention_mask: Mask passed to the encoder (may be ``None``).
            x_synth: Tree feature tensor with ``input_dim_dt`` columns.
        """
        with torch.no_grad():
            # Forward the mask so padded positions are not attended to.
            hidden_states = self.bert(x_sem, attention_mask=attention_mask)[0]

        x_sem = self.dropout(hidden_states[:, 0, :])

        x_tot = torch.cat((x_sem, x_synth), 1)
        return self.synth_sem_linear(x_tot)

    def get_activation(self, x_sem, x_synth):
        """Return the log-softmax of the logits, computed without a mask.

        NOTE(review): the previous body referenced the undefined names
        ``pooled_output`` and ``out`` and always raised NameError. Returning
        log-softmax follows BertForSequenceClassification.get_activation in
        this file - confirm this is the intended output.
        """
        logits = self.forward(x_sem, None, x_synth)
        return F.log_softmax(logits, dim=1)

### Solo Bert

In [None]:
class BertForSequenceClassification(nn.Module):
    """Linear classifier over a frozen encoder's first-token vector only.

    The encoder is always run under ``torch.no_grad()``, so only the final
    linear layer receives gradients.

    Args:
        input_dim_bert: Size of the encoder's hidden vectors.
        output_dim: Number of output classes.
        model_architecture: Callable encoder; called as
            ``encoder(input_ids, attention_mask=...)`` and indexed with ``[0]``
            to obtain the per-token hidden states.
    """

    def __init__(self, input_dim_bert, output_dim, model_architecture):
        super().__init__()
        self.bert = model_architecture

        self.dropout = nn.Dropout(0.1)
        self.sem_linear = nn.Linear(input_dim_bert, output_dim)

    def forward(self, x_sem, attention_mask):
        """Return the class logits for a batch of token ids and its mask."""
        with torch.no_grad():
            hidden_states = self.bert(x_sem, attention_mask=attention_mask)[0]
        pooled_output = self.dropout(hidden_states[:, 0, :])
        return self.sem_linear(pooled_output)

    def get_activation(self, x_sem, x_synth):
        """Return the log-softmax of the logits, computed without mask or dropout.

        ``x_synth`` is accepted only for signature compatibility with the
        tree-aware models and is ignored: this model has no layer for tree
        features. The previous body used the non-existent
        ``self.synth_sem_linear`` and always raised AttributeError.
        """
        with torch.no_grad():
            x_sem = self.bert(x_sem, attention_mask=None)[0][:, 0, :]
            x_tot = self.sem_linear(x_sem)
        out = F.log_softmax(x_tot, dim=1)
        return out

# Creiamo le funzioni di applicazione dei modelli

### Esecuzione di Kermit Potter

In [None]:
def execute_Potter(epochs, model_architecture, train_dataloader, validation_dataloader, test_dataloader, device, test_labels):
    """Train a Kermit_Potter model and score it on the test split.

    Reads the module-level ``counts`` array to weight the loss by inverse class
    frequency, and uses the module-level ``flat_accuracy`` for validation.

    Args:
        epochs: Number of passes over the training data.
        model_architecture: Pre-trained encoder handed to Kermit_Potter.
        train_dataloader: Batches of (input ids, mask, tree, label).
        validation_dataloader: Same layout; evaluated after every epoch.
        test_dataloader: Same layout; must iterate in the order of ``test_labels``.
        device: Device on which the model and the batches are placed.
        test_labels: CPU tensor holding the true labels of the test split.

    Returns:
        Tuple ``(accuracy, macro F1, weighted F1, per-class F1 array)``.
    """
    # Honour `device` instead of hard-coding .cuda(), so CPU-only runs work.
    Potter_model = Kermit_Potter(768, 4000, 2, model_architecture).to(device)

    # [1 / number of instances for each class]
    class_weights = torch.FloatTensor([1 / counts[0], 1 / counts[1]]).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(Potter_model.parameters(), lr=2e-5)

    epoch_bar = trange(epochs, desc="Epoch")
    for _ in epoch_bar:
        # ---- training ----
        Potter_model.train()
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            # Gradients accumulate by default, so clear them for every batch.
            optimizer.zero_grad()
            target_hat = Potter_model(b_input_ids, b_input_mask, b_input_tree)
            loss = criterion(target_hat, b_labels)
            loss.backward()
            optimizer.step()

        # ---- validation ----
        Potter_model.eval()
        batch_accuracies = []
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                logits = Potter_model(b_input_ids, b_input_mask, b_input_tree)
            batch_accuracies.append(
                flat_accuracy(logits.cpu().numpy(), b_labels.cpu().numpy()))
        if batch_accuracies:
            # Show the mean per-batch accuracy instead of silently discarding it.
            epoch_bar.set_postfix(val_acc=float(np.mean(batch_accuracies)))

    # ---- test ----
    Potter_model.eval()
    predictions = []
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            logits = Potter_model(b_input_ids, b_input_mask, b_input_tree)
        predictions.append(logits.cpu().numpy())

    # Flatten once after the loop rather than re-flattening on every batch.
    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)

    true_labels = test_labels.numpy()
    A = accuracy_score(true_labels, flat_predictions)
    B = f1_score(true_labels, flat_predictions, average='macro')
    C = f1_score(true_labels, flat_predictions, average='weighted')
    D = f1_score(true_labels, flat_predictions, average=None)

    return A, B, C, D

### Esecuzione di Kermit Classico

In [None]:
def execute_Kermit(epochs, model_architecture, train_dataloader, validation_dataloader, test_dataloader, device, test_labels):
    """Train a Kermit_Classic model and score it on the test split.

    Reads the module-level ``counts`` array to weight the loss by inverse class
    frequency, and uses the module-level ``flat_accuracy`` for validation.

    Args:
        epochs: Number of passes over the training data.
        model_architecture: Pre-trained encoder handed to Kermit_Classic.
        train_dataloader: Batches of (input ids, mask, tree, label).
        validation_dataloader: Same layout; evaluated after every epoch.
        test_dataloader: Same layout; must iterate in the order of ``test_labels``.
        device: Device on which the model and the batches are placed.
        test_labels: CPU tensor holding the true labels of the test split.

    Returns:
        Tuple ``(accuracy, macro F1, weighted F1, per-class F1 array)``.
    """
    # Honour `device` instead of hard-coding .cuda(), so CPU-only runs work.
    Kermit_model = Kermit_Classic(768, 4000, 2, model_architecture).to(device)

    # [1 / number of instances for each class]
    class_weights = torch.FloatTensor([1 / counts[0], 1 / counts[1]]).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(Kermit_model.parameters(), lr=2e-5)

    epoch_bar = trange(epochs, desc="Epoch")
    for _ in epoch_bar:
        # ---- training ----
        Kermit_model.train()
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            # Gradients accumulate by default, so clear them for every batch.
            optimizer.zero_grad()
            target_hat = Kermit_model(b_input_ids, b_input_mask, b_input_tree)
            loss = criterion(target_hat, b_labels)
            loss.backward()
            optimizer.step()

        # ---- validation ----
        Kermit_model.eval()
        batch_accuracies = []
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                logits = Kermit_model(b_input_ids, b_input_mask, b_input_tree)
            batch_accuracies.append(
                flat_accuracy(logits.cpu().numpy(), b_labels.cpu().numpy()))
        if batch_accuracies:
            # Show the mean per-batch accuracy instead of silently discarding it.
            epoch_bar.set_postfix(val_acc=float(np.mean(batch_accuracies)))

    # ---- test ----
    Kermit_model.eval()
    predictions = []
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            logits = Kermit_model(b_input_ids, b_input_mask, b_input_tree)
        predictions.append(logits.cpu().numpy())

    # Flatten once after the loop rather than re-flattening on every batch.
    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)

    true_labels = test_labels.numpy()
    A = accuracy_score(true_labels, flat_predictions)
    B = f1_score(true_labels, flat_predictions, average='macro')
    C = f1_score(true_labels, flat_predictions, average='weighted')
    D = f1_score(true_labels, flat_predictions, average=None)

    return A, B, C, D

### Esecuzione di solo Bert

In [None]:
def execute_Bert(epochs, model_architecture, train_dataloader, validation_dataloader, test_dataloader, device, test_labels):
    """Train the encoder-only classifier and score it on the test split.

    The dataloaders also yield a tree tensor per batch; it is unpacked but not
    passed to the model. Uses the module-level ``flat_accuracy`` for validation.

    Args:
        epochs: Number of passes over the training data.
        model_architecture: Pre-trained encoder handed to
            BertForSequenceClassification.
        train_dataloader: Batches of (input ids, mask, tree, label).
        validation_dataloader: Same layout; evaluated after every epoch.
        test_dataloader: Same layout; must iterate in the order of ``test_labels``.
        device: Device on which the model and the batches are placed.
        test_labels: CPU tensor holding the true labels of the test split.

    Returns:
        Tuple ``(accuracy, macro F1, weighted F1, per-class F1 array)``.
    """
    # Honour `device` instead of hard-coding .cuda(), so CPU-only runs work.
    Alone_model = BertForSequenceClassification(768, 2, model_architecture).to(device)

    # NOTE(review): unlike execute_Potter / execute_Kermit this loss is not
    # class-weighted - confirm the difference is intentional.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(Alone_model.parameters(), lr=2e-5)

    epoch_bar = trange(epochs, desc="Epoch")
    for _ in epoch_bar:
        # ---- training ----
        Alone_model.train()
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            # Gradients accumulate by default, so clear them for every batch.
            optimizer.zero_grad()
            target_hat = Alone_model(b_input_ids, b_input_mask)
            loss = criterion(target_hat, b_labels)
            loss.backward()
            optimizer.step()

        # ---- validation ----
        Alone_model.eval()
        batch_accuracies = []
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                logits = Alone_model(b_input_ids, b_input_mask)
            batch_accuracies.append(
                flat_accuracy(logits.cpu().numpy(), b_labels.cpu().numpy()))
        if batch_accuracies:
            # Show the mean per-batch accuracy instead of silently discarding it.
            epoch_bar.set_postfix(val_acc=float(np.mean(batch_accuracies)))

    # ---- test ----
    Alone_model.eval()
    predictions = []
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_input_tree, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            logits = Alone_model(b_input_ids, b_input_mask)
        predictions.append(logits.cpu().numpy())

    # Flatten once after the loop rather than re-flattening on every batch.
    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)

    true_labels = test_labels.numpy()
    A = accuracy_score(true_labels, flat_predictions)
    B = f1_score(true_labels, flat_predictions, average='macro')
    C = f1_score(true_labels, flat_predictions, average='weighted')
    D = f1_score(true_labels, flat_predictions, average=None)

    return A, B, C, D

# Eseguiamo le funzioni

**Definiamo le variabili comuni a tutti i modelli**

In [None]:
# Per-run PyTorch seeds; entry i is paired with random_state[i].
# NOTE(review): 54 appears twice - confirm this is intended.
seed = [46, 23, 17, 54, 31, 54, 1, 28, 52, 27]
# Per-run seeds passed to the train/validation/test splits.
random_state = [1024, 3333, 1995, 2780, 3833, 1394, 779, 4787, 845, 5480]
# Model names understood by define_input.
model_architecture_list = ['Italian_Bert','Umberto','Alberto', 'Multilingua-Bert']
# Training epochs per run.
epochs = 2

# Model used for all experiments below (index 2 is 'Alberto').
model_architecture = model_architecture_list[2]

**Eseguiamo Potter Kermit**

In [None]:
# Per-run test metrics of Kermit Potter.
Potter_accuracy_list = []
Potter_macro_list = []
Potter_weighted_list = []
Potter_other_0 = []
Potter_other_1 = []


# One run per (seed, split seed) pair; iterating the two lists together avoids
# a hard-coded run count that has to match their length.
# NOTE: `model_architecture` is rebound to the loaded model returned by
# define_input, so later calls pass that object rather than a model name and
# define_input keeps using its module-level tokenizer/model.
for run_seed, run_split_state in zip(seed, random_state):
    train_dataloder, validation_dataloader, test_dataloder, model_architecture, device, test_labels = define_input(run_seed, run_split_state, sentences, model_architecture, epochs)

    A, B, C, D = execute_Potter(epochs, model_architecture, train_dataloder, validation_dataloader, test_dataloder, device, test_labels)

    Potter_accuracy_list.append(A)
    Potter_macro_list.append(B)
    Potter_weighted_list.append(C)
    # D holds the per-class F1 scores: index 0 for class 0, index 1 for class 1.
    Potter_other_0.append(D[0])
    Potter_other_1.append(D[1])

**Eseguiamo Kermit classico**

In [None]:
# Per-run test metrics of classic Kermit.
Kermit_accuracy_list = []
Kermit_macro_list = []
Kermit_weighted_list = []
Kermit_other_0 = []
Kermit_other_1 = []


# One run per (seed, split seed) pair; iterating the two lists together avoids
# a hard-coded run count that has to match their length.
# NOTE: `model_architecture` is rebound to the loaded model returned by
# define_input, so later calls pass that object rather than a model name and
# define_input keeps using its module-level tokenizer/model.
for run_seed, run_split_state in zip(seed, random_state):
    train_dataloder, validation_dataloader, test_dataloder, model_architecture, device, test_labels = define_input(run_seed, run_split_state, sentences, model_architecture, epochs)

    A, B, C, D = execute_Kermit(epochs, model_architecture, train_dataloder, validation_dataloader, test_dataloder, device, test_labels)

    Kermit_accuracy_list.append(A)
    Kermit_macro_list.append(B)
    Kermit_weighted_list.append(C)
    # D holds the per-class F1 scores: index 0 for class 0, index 1 for class 1.
    Kermit_other_0.append(D[0])
    Kermit_other_1.append(D[1])

**Eseguiamo solo Bert**

In [None]:
# Per-run test metrics of the encoder-only model.
Bert_accuracy_list = []
Bert_macro_list = []
Bert_weighted_list = []
Bert_other_0 = []
Bert_other_1 = []


# One run per (seed, split seed) pair; iterating the two lists together avoids
# a hard-coded run count that has to match their length.
# NOTE: `model_architecture` is rebound to the loaded model returned by
# define_input, so later calls pass that object rather than a model name and
# define_input keeps using its module-level tokenizer/model.
for run_seed, run_split_state in zip(seed, random_state):
    train_dataloder, validation_dataloader, test_dataloder, model_architecture, device, test_labels = define_input(run_seed, run_split_state, sentences, model_architecture, epochs)

    A, B, C, D = execute_Bert(epochs, model_architecture, train_dataloder, validation_dataloader, test_dataloder, device, test_labels)

    Bert_accuracy_list.append(A)
    Bert_macro_list.append(B)
    Bert_weighted_list.append(C)
    # D holds the per-class F1 scores: index 0 for class 0, index 1 for class 1.
    Bert_other_0.append(D[0])
    Bert_other_1.append(D[1])

# Calcoliamo la media e la deviazione standard dei 3 modelli

**Kermit Potter**

In [None]:
# For every Kermit Potter metric print a header, then the mean and the
# standard deviation over the runs, both rounded to two decimals.
for title, scores in (
    ('*********** F1 - SCORE CLASS 0', Potter_other_0),
    ('*********** F1 - SCORE CLASS 1', Potter_other_1),
    ('*********** ACCURACY', Potter_accuracy_list),
    ('*********** MACRO', Potter_macro_list),
    ('*********** WEIGHTED', Potter_weighted_list),
):
    print(title)
    print(round(calculate_mean(scores), 2))
    print(round(calculate_standard_deviation(scores), 2))

**Kermit classico**

In [None]:
# For every classic Kermit metric print a header, then the mean and the
# standard deviation over the runs, both rounded to two decimals.
for title, scores in (
    ('*********** F1 - SCORE CLASS 0', Kermit_other_0),
    ('*********** F1 - SCORE CLASS 1', Kermit_other_1),
    ('*********** ACCURACY', Kermit_accuracy_list),
    ('*********** MACRO', Kermit_macro_list),
    ('*********** WEIGHTED', Kermit_weighted_list),
):
    print(title)
    print(round(calculate_mean(scores), 2))
    print(round(calculate_standard_deviation(scores), 2))

**Solo Bert**

In [None]:
# For every encoder-only metric print a header, then the mean and the
# standard deviation over the runs, both rounded to two decimals.
for title, scores in (
    ('*********** F1 - SCORE CLASS 0', Bert_other_0),
    ('*********** F1 - SCORE CLASS 1', Bert_other_1),
    ('*********** ACCURACY', Bert_accuracy_list),
    ('*********** MACRO', Bert_macro_list),
    ('*********** WEIGHTED', Bert_weighted_list),
):
    print(title)
    print(round(calculate_mean(scores), 2))
    print(round(calculate_standard_deviation(scores), 2))

# Verifichiamo se i risultati ottenuti sono statisticamente significativi

In [None]:
def p_test(p):
    """Print whether a p-value rejects the null hypothesis at the 0.05 level.

    Args:
        p: The p-value to judge. Values strictly greater than 0.05 count as
            "fail to reject"; everything else as "reject".
    """
    alpha = 0.05
    verdict = (
        'Same distribution (fail to reject H0)'
        if p > alpha
        else 'Different distribution (reject H0)'
    )
    print(verdict)

**Potter vs Kermit**

In [None]:
# Wilcoxon test on the paired per-run scores of Potter and Kermit.
# Each call yields (statistic, p-value) for accuracy, macro F1 and weighted F1.
a_w, a_p = wilcoxon(Potter_accuracy_list, Kermit_accuracy_list)
m_w, m_p = wilcoxon(Potter_macro_list, Kermit_macro_list)
w_w, w_p = wilcoxon(Potter_weighted_list, Kermit_weighted_list)

In [None]:
# Report the verdict for accuracy, macro F1 and weighted F1, in that order.
for p_value in (a_p, m_p, w_p):
    p_test(p_value)

**Potter vs Bert**

In [None]:
# Wilcoxon test on the paired per-run scores of Potter and the encoder-only model.
# Each call yields (statistic, p-value) for accuracy, macro F1 and weighted F1.
a_w, a_p = wilcoxon(Potter_accuracy_list, Bert_accuracy_list)
m_w, m_p = wilcoxon(Potter_macro_list, Bert_macro_list)
w_w, w_p = wilcoxon(Potter_weighted_list, Bert_weighted_list)

In [None]:
# Report the verdict for accuracy, macro F1 and weighted F1, in that order.
for p_value in (a_p, m_p, w_p):
    p_test(p_value)

**Kermit vs Bert**

In [None]:
# Wilcoxon test on the paired per-run scores of Kermit and the encoder-only model.
# Each call yields (statistic, p-value) for accuracy, macro F1 and weighted F1.
a_w, a_p = wilcoxon(Kermit_accuracy_list, Bert_accuracy_list)
m_w, m_p = wilcoxon(Kermit_macro_list, Bert_macro_list)
w_w, w_p = wilcoxon(Kermit_weighted_list, Bert_weighted_list)

In [None]:
# Report the verdict for accuracy, macro F1 and weighted F1, in that order.
for p_value in (a_p, m_p, w_p):
    p_test(p_value)