In [1]:
import random
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

source_file = 'C:/Users/Gabriel/Pasta/Machine Learning/Introdução ao Aprendizado de Máquina/Testes Estatísticos/csv_result-ionosphere.csv'
df_cru = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

colunas_excluir = ['id', 'class']
colunas_numericas = df_cru.columns.difference(colunas_excluir)

df_cru[colunas_numericas] = df_cru[colunas_numericas].replace('%', np.nan)
df_cru[colunas_numericas] = df_cru[colunas_numericas].apply(pd.to_numeric, errors='coerce')

df_cru = df_cru.dropna(how='all')

df_completo = df_cru.dropna()

In [3]:
X_array = df_completo.drop(columns=['class', 'id']).values
y_array = df_completo['class'].values

X_T, X_test, y_T, y_test = train_test_split(X_array, y_array, random_state=42, test_size=0.15, stratify=y_array)

X_t = pd.DataFrame(X_test, columns=df_completo.drop(columns=['class', 'id']).columns)
y_t = pd.Series(y_test)

df = pd.DataFrame(data=X_T, columns=df_completo.drop(columns=['class', 'id']).columns)
df['classes'] = y_T

y = df['classes'].copy()
X = df.drop(columns='classes').copy()

In [4]:
y.value_counts() 

classes
g    191
b    107
Name: count, dtype: int64

In [5]:
from sklearn.tree import DecisionTreeClassifier
ClassificadorDT = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None, min_samples_split=10,
            splitter='best')
ClassificadorDT.fit(X, y)

In [6]:
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

y_pred=ClassificadorDT.predict(X_t)
accuracy_score(y_pred, y_t)

0.8679245283018868

# GAbDT

In [7]:
def criar_individuo(atributos):
    """
    The idea here is to better explore the space of possibilities and the potential splits of our dataset, not just by using the greedy approach of checking the mean between two attributes, 
    allowing the optimization process to escape local minima and maxima.
    """
    atributo = random.choice(atributos)
    operador = random.choice(["<", ">", "<=", ">="])
    threshold = random.uniform(min(df[atributo]), max(df[atributo]))
    return {"premisse": (atributo, operador, threshold)}

In [8]:
def aplicar_premissa(df, premissa):
    """
    This function applies the premises generated in the initial population by splitting the df into two new groups, df_true and df_false, for those that satisfy the premise and those that do not, respectively.  
    In this way, we can later calculate the majority error generated by the split of each premise and find the one that best divides the data. These premises will then be evolved over generations 
    by the GA, trying to maximize their fitness.
    """
    atributo, operador, threshold = premissa["premisse"]
    
    if operador == "<":
        df_true = df[df[atributo] < threshold]
        df_false = df[df[atributo] >= threshold]
    elif operador == "<=":
        df_true = df[df[atributo] <= threshold]
        df_false = df[df[atributo] > threshold]
    elif operador == ">":
        df_true = df[df[atributo] > threshold]
        df_false = df[df[atributo] <= threshold]
    elif operador == ">=":
        df_true = df[df[atributo] >= threshold]
        df_false = df[df[atributo] < threshold]
    else:
        raise ValueError(f"Operador desconhecido: {operador}")
    
    return df_true, df_false

In [9]:
def erro_majoritario(df, coluna_classe="classes"):
    """
    Calculates the majority classification error (the lower, the better).
    """
    if df.empty:
        return 1  
    classe_majoritaria = df[coluna_classe].value_counts().idxmax()
    erro = sum(df[coluna_classe] != classe_majoritaria)/len(df)
    return erro

In [10]:
def evaluate(df, populacao, coluna_classe="classes"):
    """
    This function uses aplicar_premissa to generate df_true and df_false and calculates the majority error for both, weighted by the size of the generated sets relative to the original df, 
    that is, checking whether the premise splits the data into representative sets.
    """
    resultados = []
    for individuo in populacao:
        df_verdadeiro, df_falso = aplicar_premissa(df, individuo)
        
        # Calculate the majority error in each subset
        erro_true = erro_majoritario(df_verdadeiro, coluna_classe) 
        erro_false = erro_majoritario(df_falso, coluna_classe) 
        
        # Weight proportional to the size of the sets
        peso_true = len(df_verdadeiro)/len(df) if len(df) > 0 else 0
        peso_false = len(df_falso)/len(df) if len(df) > 0 else 0
    
        # Fitness is the weighted error (the lower, the better)
        fitness = peso_true * erro_true + peso_false * erro_false
        
        resultados.append((individuo, fitness))
    
    return resultados

In [11]:
def torneio(pop_avaliada, k=3):
    """
    Tournament selection: selects the best individual among k random individuals to be one of the parents that generates a new individual for the next generation.  
    This is done twice for each new individual generated, selecting pai1 and pai2.
    """
    competidores = random.sample(pop_avaliada, k)
    vencedor = min(competidores, key=lambda x: x[1])  
    return vencedor[0]  

In [12]:
def crossover_rnd(pai1, pai2):
    # Chooses each component of the premise from one parent or the other
    atributo = pai1["premisse"][0] if random.random() < 0.5 else pai2["premisse"][0]
    operador = pai1["premisse"][1] if random.random() < 0.5 else pai2["premisse"][1]
    threshold = pai1["premisse"][2] if random.random() < 0.5 else pai2["premisse"][2]
    
    premissa_filho = (atributo, operador, threshold)
    
    return {"premisse": premissa_filho}

In [13]:
def mutacao(individuo, atributos, intervalo_atributos, prob_mut_attr=0.1, prob_mut_op=0.1, prob_mut_val=0.2):
    # Directly access the 'premisse' dictionary inside the individual
    premissa = individuo['premisse']  
    nova_premissa = list(premissa)  

    # Attribute mutation
    if random.random() < prob_mut_attr:
        nova_premissa[0] = random.choice(atributos)
        min_val, max_val = intervalo_atributos[nova_premissa[0]]
        nova_premissa[2] = random.uniform(min_val, max_val)

    # Operator mutation
    if random.random() < prob_mut_op:
        operadores = ['<', '<=', '>', '>=']
        nova_premissa[1] = random.choice(operadores)

    # Value mutation
    if random.random() < prob_mut_val:
        attr = nova_premissa[0]
        min_val, max_val = intervalo_atributos[attr]
        nova_premissa[2] = random.uniform(min_val, max_val)

    return {'premisse': tuple(nova_premissa)}  #Returns a dictionary again

In [14]:
def calcular_intervalo_atributos(df, atributos):
    """
    Calculates the range of values for each numeric attribute in the DataFrame.
    """
    intervalo = {}
    for atributo in atributos:
        min_val = df[atributo].min()
        max_val = df[atributo].max()
        intervalo[atributo] = (min_val, max_val)
    return intervalo

In [15]:
def nova_geracao(
    pop_avaliada, 
    atributos, 
    intervalo_atributos, 
    tamanho=100, 
    prob_mut_attr=0.1, 
    prob_mut_op=0.1, 
    prob_mut_val=0.2
):
    """
    Generates a new generation with elitism and genetic operators.
    - Keeps the best individual.
    - Uses tournament + crossover + mutation to generate the rest.
    """
    # Elitism
    elite = min(pop_avaliada, key=lambda x: x[1])[0]  #Pega o melhor (menor fitness)
    
    nova_populacao = [elite]  #Inicia com o elite
    
    while len(nova_populacao) < tamanho:
        # Tournament Selection
        pai1 = torneio(pop_avaliada)
        pai2 = torneio(pop_avaliada)
        
        # Crossover
        filho = crossover_rnd(pai1, pai2)
        #print('\nFilho', filho)
        
        # Mutation
        filho_mutado = mutacao(
            filho,
            atributos,
            intervalo_atributos,
            prob_mut_attr=prob_mut_attr,
            prob_mut_op=prob_mut_op,
            prob_mut_val=prob_mut_val
        )
        
        nova_populacao.append(filho_mutado)
    
    return nova_populacao

In [16]:
import pandas as pd
from typing import List, Dict, Any, Tuple

def evoluir_solucoes_2(
    df: pd.DataFrame,
    atributos: List[str],
    intervalo_atributos: Dict[str, Tuple],
    profundidade: int = 0,
    max_profundidade: int = 5,
    n_geracoes: int = 10,
    tamanho_pop: int = 200,
    taxa_mutacao_valor: float = 0.2,
    taxa_mutacao_attr: float = 0.1,
    taxa_mutacao_op: float = 0.1,
    coluna_classe: str = "classes",
    min_samples: int = 2,
    erro_threshold: float = 0.05, 
    limite_ga: int = 5 
) -> Dict[str, Any]:
   
    """
    Second version of the function that evolves data-splitting solutions 
    to build a decision tree using a genetic algorithm.  
    I reorganized some parts of the function and added an improvement 
    that reduces execution time in the end.  
    It remains basically the same, but now it includes a limit beyond which 
    the GA stops running generations if no performance improvement is found 
    between consecutive generations.

    NOTE THAT THE FUNCTIONS ARE EXACTLY THE SAME, I ONLY ADDED QUALITY-OF-LIFE CHANGES.  
    THE RESULTS OBTAINED IN VERSIONS 1 AND 2 ARE THE SAME AND WERE TESTED.
    
    Arguments:
        df (pd.DataFrame): DataFrame containing the data to be split.
        atributos (List[str]): List of attributes (columns) considered for premises.
        intervalo_atributos (Dict[str, Tuple]): Dictionary with value ranges for each attribute.
        profundidade (int): Current depth of the tree.
        max_profundidade (int): Maximum allowed depth for the tree.
        n_geracoes (int): Maximum number of generations for the genetic algorithm.
        tamanho_pop (int): Population size in each GA generation.
        taxa_mutacao_valor (float): Mutation rate for the threshold value in the premise.
        taxa_mutacao_attr (float): Mutation rate for the attribute in the premise.
        taxa_mutacao_op (float): Mutation rate for the operator in the premise.
        coluna_classe (str): Name of the column containing the target classes.
        min_samples (int): Minimum number of samples required to allow a split.
        erro_threshold (float): Majority error threshold to turn a node into a leaf (pre-pruning).
        limite_ga (int): Number of generations without fitness improvement before stopping the GA early.

    Returns:
        Dict[str, Any]: A dictionary representing the node (internal or leaf) of the tree.
    """
    
    # Stopping criteria
    if profundidade >= max_profundidade or len(df) < min_samples or df[coluna_classe].nunique() == 1:
        return {
            "folha": True,
            "classes": df[coluna_classe].value_counts().to_dict()
        }

    # Genetic Algorithm
    populacao = [criar_individuo(atributos) for _ in range(tamanho_pop)]
    
    melhor_fit_geral = float('inf')
    melhor_ind_geral = None
    geracoes_sem_melhora = 0

    for gen in range(n_geracoes):
        avaliacoes = evaluate(df, populacao, coluna_classe)
        melhor_ind_geracao, melhor_fit_geracao = min(avaliacoes, key=lambda x: x[1])

        # Early stopping logic for GA
        if melhor_fit_geracao < melhor_fit_geral:
            melhor_fit_geral = melhor_fit_geracao
            melhor_ind_geral = melhor_ind_geracao
            geracoes_sem_melhora = 0
        else:
            geracoes_sem_melhora += 1
        
        if geracoes_sem_melhora >= limite_ga:
            break  # Stop GA if there is no improvement

        populacao = nova_geracao(
            avaliacoes, 
            atributos, 
            intervalo_atributos, 
            tamanho=tamanho_pop, 
            prob_mut_attr=taxa_mutacao_attr, 
            prob_mut_op=taxa_mutacao_op,
            prob_mut_val=taxa_mutacao_valor
        )

    # If the GA did not find any valid solution
    if melhor_ind_geral is None:
        return {"folha": True, "classes": df[coluna_classe].value_counts().to_dict()}

    # Apply the best premise and build the branches
    df_esquerda, df_direita = aplicar_premissa(df, melhor_ind_geral)

    if len(df_esquerda) < min_samples or len(df_direita) < min_samples:
        return {"folha": True, "classes": df[coluna_classe].value_counts().to_dict()}

    # Recursive calls for left and right branches (pre-pruning logic)
    erro_esquerda = erro_majoritario(df_esquerda, coluna_classe)
    if erro_esquerda < erro_threshold:
        no_esquerdo = {"folha": True, "classes": df_esquerda[coluna_classe].value_counts().to_dict()}
    else:
        no_esquerdo = evoluir_solucoes_2(
            df_esquerda, atributos, intervalo_atributos, profundidade + 1, 
            max_profundidade, n_geracoes, tamanho_pop, taxa_mutacao_valor, 
            taxa_mutacao_attr, taxa_mutacao_op, coluna_classe, min_samples, erro_threshold, limite_ga
        )

    erro_direita = erro_majoritario(df_direita, coluna_classe)
    if erro_direita < erro_threshold:
        no_direito = {"folha": True, "classes": df_direita[coluna_classe].value_counts().to_dict()}
    else:
        no_direito = evoluir_solucoes_2(
            df_direita, atributos, intervalo_atributos, profundidade + 1, 
            max_profundidade, n_geracoes, tamanho_pop, taxa_mutacao_valor, 
            taxa_mutacao_attr, taxa_mutacao_op, coluna_classe, min_samples, erro_threshold, limite_ga
        )

    return {
        "folha": False,
        "premissa": melhor_ind_geral,  # The redundant variable was removed
        "contagem_total": df[coluna_classe].value_counts().to_dict(),
        "contagem_esquerda": df_esquerda[coluna_classe].value_counts().to_dict(),
        "contagem_direita": df_direita[coluna_classe].value_counts().to_dict(),
        "esquerda": no_esquerdo,
        "direita": no_direito
    }


In [17]:
import matplotlib.pyplot as plt
import warnings

def plot_tree_custom(node, x=0, y=0, dx=3.0, dy=1.0, ax=None, depth=0, pos_dict=None, parent_pos=None):
    """
    Visual representation of the decision tree.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(35, 30), dpi=350)
    
        ax.axis("off")
        pos_dict = {"max_y": 0, "min_y": 0, "depth_max": get_tree_depth(node)}
        plot_tree_custom(node, x, y, dx, dy, ax, depth, pos_dict)
        
        plt.savefig("custom_tree.png", bbox_inches="tight")
        warnings.filterwarnings("ignore", category=UserWarning)
        plt.show()
        return

    # Dynamic adjustment based on total depth
    spacing_factor = pos_dict["depth_max"] - depth + 1
    current_dx = dx * spacing_factor
    current_dy = dy * spacing_factor * 0.5  
    
    # Leaf
    if node.get("folha", False):
        classes = node.get("classes", {})
        total = sum(classes.values())

        text = "Leaf\n"
        text += f"Total: {total}\n"
        text += "Classes:\n"
        for cls, count in sorted(classes.items()):
            text += f"  {cls}: {count} ({count/total:.1%})\n"

        ax.text(x, y, text,
                bbox=dict(facecolor="#c1f0c1", alpha=1.0, edgecolor="darkgreen", boxstyle="round,pad=0.5"),
                ha="center", va="center", fontsize=10)

        if parent_pos:
            ax.plot([parent_pos[0], x], [parent_pos[1], y],
                    color="#555555", linestyle="-", linewidth=1.5, alpha=1.0)

        pos_dict["max_y"] = max(pos_dict["max_y"], y)
        pos_dict["min_y"] = min(pos_dict["min_y"], y)
        return y

    # Internal node
    premissa = node.get("premissa", {}).get("premisse", ("?", "?", "?"))
    total_counts = node.get("contagem_total", {})
    left_counts = node.get("contagem_esquerda", {})
    right_counts = node.get("contagem_direita", {})
    total = sum(total_counts.values())

    text = f"{premissa[0]}\n{premissa[1]} {premissa[2]:.2f}\n"
    text += f"Total: {total}\n"
    text += "Distribution:\n"
    for cls, count in sorted(total_counts.items()):
        text += f"  {cls}: {count} ({count/total:.1%})\n"

    ax.text(x, y, text,
            bbox=dict(facecolor="#b0e0e6", alpha=1.0, edgecolor="darkblue", boxstyle="round,pad=0.5"),
            ha="center", va="center", fontsize=10)

    if parent_pos:
        ax.plot([parent_pos[0], x], [parent_pos[1], y],
                color="#555555", linestyle="-", linewidth=1.5, alpha=1.0)

    # Coordinates of children
    left_y = y - current_dy
    right_y = y + current_dy

    left_child_y = plot_tree_custom(node["esquerda"], x + current_dx, left_y,
                                    dx, dy, ax, depth + 1, pos_dict, parent_pos=(x, y))
    left_text = "True\n"
    ax.text(x + current_dx / 3, (y + left_child_y) / 2, left_text,
            ha="right", va="center", fontsize=15, color="#1f77b4",
           )

    right_child_y = plot_tree_custom(node["direita"], x + current_dx, right_y,
                                     dx, dy, ax, depth + 1, pos_dict, parent_pos=(x, y))
    right_text = "False\n"
    ax.text(x + current_dx / 3, (y + right_child_y) / 2, right_text,
            ha="left", va="center", fontsize=15, color="#d62728",
            )

    return y


def get_tree_depth(node):
    """Calculates the maximum depth of the tree."""
    if node.get("folha", False):
        return 0
    return 1 + max(get_tree_depth(node["esquerda"]), get_tree_depth(node["direita"]))

In [18]:
def predict_custom_tree(tree, X):
    """
    Makes predictions using our custom dictionary-based decision tree structure.
    """
    def predizer_linha(no, linha):
        if no["folha"]:
            # Return the class with the highest frequency
            return max(no["classes"].items(), key=lambda x: x[1])[0]
        
        atributo, operador, valor = no["premissa"]["premisse"]
        entrada = linha[atributo]

        # Apply operators
        if operador == "<":
            if entrada < valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == "<=":
            if entrada <= valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == ">":
            if entrada > valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == ">=":
            if entrada >= valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        else:
            raise ValueError(f"Unknown operator: {operador}")
    
    # Apply to all rows
    return [predizer_linha(tree, linha) for _, linha in X.iterrows()]

In [19]:
max_profundidade = 15
min_samples = 10
erro_threshold = 0.1 # Pre-Pruning Threshold
n_geracoes = 100
tamanho_pop = 1000
limite_ga = 15
taxa_mutacao_valor = 0.2  
taxa_mutacao_attr = 0.05 
taxa_mutacao_op = 0.2
atributos = df.columns[df.columns != "classes"].tolist()
intervalo_atributos = calcular_intervalo_atributos(df, atributos)

print("Starting the evolution of the decision tree with GAbDT...")
arvore = evoluir_solucoes_2(
    df=df, 
    atributos=atributos, 
    intervalo_atributos=intervalo_atributos, 
    max_profundidade=max_profundidade,
    min_samples=min_samples,
    erro_threshold=erro_threshold,
    n_geracoes=n_geracoes, 
    tamanho_pop=tamanho_pop,
    limite_ga=limite_ga,
    taxa_mutacao_valor=taxa_mutacao_valor,
    taxa_mutacao_attr=taxa_mutacao_attr,
    taxa_mutacao_op=taxa_mutacao_op,
    coluna_classe="classes"
)

print("Tree generated successfully!")

Starting the evolution of the decision tree with GAbDT...
Tree generated successfully!


In [20]:
print(arvore)
#plot_tree_custom(arvore)

{'folha': False, 'premissa': {'premisse': ('a05', '<', 0.23195600552406193)}, 'contagem_total': {'g': 191, 'b': 107}, 'contagem_esquerda': {'b': 59, 'g': 4}, 'contagem_direita': {'g': 187, 'b': 48}, 'esquerda': {'folha': True, 'classes': {'b': 59, 'g': 4}}, 'direita': {'folha': False, 'premissa': {'premisse': ('a01', '>=', 0.6567463707900022)}, 'contagem_total': {'g': 187, 'b': 48}, 'contagem_esquerda': {'g': 187, 'b': 31}, 'contagem_direita': {'b': 17}, 'esquerda': {'folha': True, 'classes': {'g': 187, 'b': 31}}, 'direita': {'folha': True, 'classes': {'b': 17}}}}


In [21]:
from sklearn.metrics import f1_score
# GAbDT
y_pred_GAbDT = predict_custom_tree(arvore, X_t)
print("Accuracy:", accuracy_score(y_t, y_pred_GAbDT))
f1 = f1_score(y_t, y_pred_GAbDT, average=None)  
print("F1-score:", f1)

# Scikit-learn
print("Accuracy:", accuracy_score(y_t, y_pred))
f1 = f1_score(y_t, y_pred, average=None)  
print("F1-score:", f1)

Accuracy: 0.9433962264150944
F1-score: [0.91428571 0.95774648]
Accuracy: 0.8679245283018868
F1-score: [0.82051282 0.89552239]


In [22]:
from collections import Counter
Counter(y_pred_GAbDT)

Counter({'g': 37, 'b': 16})

# Student’s T statistical test using K-fold

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel

# Loading and preparing the Ionosphere dataset
source_file = 'C:/Users/Gabriel/Pasta/Machine Learning/Introdução ao Aprendizado de Máquina/Testes Estatísticos/csv_result-ionosphere.csv'
df_cru = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

colunas_excluir = ['id', 'class']
colunas_numericas = df_cru.columns.difference(colunas_excluir)

df_cru[colunas_numericas] = df_cru[colunas_numericas].replace('%', np.nan)
df_cru[colunas_numericas] = df_cru[colunas_numericas].apply(pd.to_numeric, errors='coerce')
df_cru = df_cru.dropna(how='all')
df_completo = df_cru.dropna()

# Split X and y
X_array = df_completo.drop(columns=['class', 'id']).values
le = LabelEncoder()
y_array = le.fit_transform(df_completo['class'].values)

X = pd.DataFrame(X_array, columns=df_completo.drop(columns=['class', 'id']).columns)
y = pd.Series(y_array)

atributos = list(X.columns)

max_profundidade = 15
min_samples = 10
erro_threshold = 0.075 # Threshold for pre-pruning
n_geracoes = 100
tamanho_pop = 1000
limite_ga = 15
taxa_mutacao_valor = 0.15  
taxa_mutacao_attr = 0.05 
taxa_mutacao_op = 0.15

n_splits = 15
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accs_custom = []
accs_sklearn = []

for train_index, test_index in skf.split(X, y):
    X_treino_df = X.iloc[train_index].copy()
    y_treino = y.iloc[train_index].copy()
    df_treino = X_treino_df.copy()
    df_treino['classes'] = y_treino.values

    intervalo_atributos = {col: (X_treino_df[col].min(), X_treino_df[col].max()) for col in atributos}

    X_teste_df = X.iloc[test_index].copy()
    y_teste = y.iloc[test_index].copy()

    print("  Evolving the tree with GAbDT...")
    arvore_gabdt = evoluir_solucoes_2(
        df=df_treino,
        atributos=atributos,
        intervalo_atributos=intervalo_atributos,
        max_profundidade=max_profundidade,
        min_samples=min_samples,
        erro_threshold=erro_threshold,
        n_geracoes=n_geracoes,
        tamanho_pop=tamanho_pop,
        limite_ga=limite_ga,
        taxa_mutacao_valor=taxa_mutacao_valor,
        taxa_mutacao_attr=taxa_mutacao_attr,
        taxa_mutacao_op=taxa_mutacao_op,
        coluna_classe="classes"
    )
    
    y_pred_custom = predict_custom_tree(arvore_gabdt, X_teste_df)
    acc_custom = accuracy_score(y_teste, y_pred_custom)
    print(f"GAbDT Accuracy: {acc_custom:.4f}")
    accs_custom.append(acc_custom)

    clf = DecisionTreeClassifier(
        criterion='gini', max_depth=max_profundidade,
        min_samples_split=min_samples, random_state=7
    )
    clf.fit(X_treino_df, y_treino)
    y_pred_sklearn = clf.predict(X_teste_df)
    acc_sklearn = accuracy_score(y_teste, y_pred_sklearn)
    print(f"Scikit-learn Accuracy: {acc_sklearn:.4f}")
    accs_sklearn.append(acc_sklearn)

# Paired t-test
t_stat, p_value = ttest_rel(accs_custom, accs_sklearn)
media_custom = np.mean(accs_custom)
media_sklearn = np.mean(accs_sklearn)

print("\n================== FINAL RESULTS ==================")
print("Custom model accuracies:", accs_custom)
print("Scikit-learn accuracies:", accs_sklearn)
print(f"\nt = {t_stat:.4f}, p = {p_value:.4f}")
print(f"Mean (custom): {media_custom:.4f}")
print(f"Mean (sklearn): {media_sklearn:.4f}")

if p_value < 0.05:
    print("Statistically significant difference!")
else:
    print("Not a significant difference.")


  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9583
Scikit-learn Accuracy: 0.9167
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.8750
Scikit-learn Accuracy: 0.9583
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.8333
Scikit-learn Accuracy: 0.8750
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.8750
Scikit-learn Accuracy: 0.9167
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9167
Scikit-learn Accuracy: 0.8333
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9167
Scikit-learn Accuracy: 0.9583
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.8261
Scikit-learn Accuracy: 0.8696
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9130
Scikit-learn Accuracy: 0.8696
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.7826
Scikit-learn Accuracy: 0.7826
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9565
Scikit-learn Accuracy: 0.8261
  Evolving the tree with GAbDT...
GAbDT Accuracy: 0.9565
Scikit-learn Accuracy: 0.9130
  Evolving the tree with GAbDT...
GAbDT Acc