In [1]:
import random
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

source_file = 'C:/Users/Gabriel/Pasta/Machine Learning/Aplicações Biblioteca LORE/Tudo Binário/BINÁRIO_FINAL.csv'
df_completo = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

X_array = df_completo.drop(columns='CLASSE').values
y_array = df_completo['CLASSE'].values

X_T, X_test, y_T, y_test = train_test_split(X_array, y_array, random_state=42, test_size=0.2, stratify=y_array)

X_t = pd.DataFrame(X_test, columns=df_completo.drop(columns='CLASSE').columns)
y_t = pd.Series(y_test)

# The dataframe created is in the format expected by the GAbDT
df = pd.DataFrame(data=X_T, columns=df_completo.drop(columns='CLASSE').columns)
df['classes'] = y_T

y = df['classes'].copy()
X = df.drop(columns='classes').copy()

In [3]:
y.value_counts() #80% of 291

classes
APNEICO    144
NORMAL      88
Name: count, dtype: int64

In [4]:
from sklearn.tree import DecisionTreeClassifier
ClassificadorDT = DecisionTreeClassifier(class_weight=None, random_state=42,criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None, min_samples_split=10,
            splitter='best')
ClassificadorDT.fit(X, y)

In [5]:
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

y_pred=ClassificadorDT.predict(X_t)
accuracy_score(y_pred, y_t)

0.711864406779661

# GAbDT

In [6]:
def criar_individuo(atributos):
    """
    The idea here is to better explore the space of possibilities and the potential splits of our dataset, not just by using the greedy approach of checking the mean between two attributes, 
    allowing the optimization process to escape local minima and maxima.
    """
    atributo = random.choice(atributos)
    operador = random.choice(["<", ">", "<=", ">="])
    threshold = random.uniform(min(df[atributo]), max(df[atributo]))
    return {"premisse": (atributo, operador, threshold)}

In [7]:
def aplicar_premissa(df, premissa):
    """
    This function applies the premises generated in the initial population by splitting the df into two new groups, df_true and df_false, for those that satisfy the premise and those that do not, respectively.  
    In this way, we can later calculate the majority error generated by the split of each premise and find the one that best divides the data. These premises will then be evolved over generations 
    by the GA, trying to maximize their fitness.
    """
    atributo, operador, threshold = premissa["premisse"]
    
    if operador == "<":
        df_true = df[df[atributo] < threshold]
        df_false = df[df[atributo] >= threshold]
    elif operador == "<=":
        df_true = df[df[atributo] <= threshold]
        df_false = df[df[atributo] > threshold]
    elif operador == ">":
        df_true = df[df[atributo] > threshold]
        df_false = df[df[atributo] <= threshold]
    elif operador == ">=":
        df_true = df[df[atributo] >= threshold]
        df_false = df[df[atributo] < threshold]
    else:
        raise ValueError(f"Operador desconhecido: {operador}")
    
    return df_true, df_false

In [8]:
def erro_majoritario(df, coluna_classe="classes"):
    """
    Calculates the majority classification error (the lower, the better).
    """
    if df.empty:
        return 1  
    classe_majoritaria = df[coluna_classe].value_counts().idxmax()
    erro = sum(df[coluna_classe] != classe_majoritaria)/len(df)
    return erro

In [9]:
def evaluate(df, populacao, coluna_classe="classes"):
    """
    This function uses aplicar_premissa to generate df_true and df_false and calculates the majority error for both, weighted by the size of the generated sets relative to the original df, 
    that is, checking whether the premise splits the data into representative sets.
    """
    resultados = []
    for individuo in populacao:
        df_verdadeiro, df_falso = aplicar_premissa(df, individuo)
        
        # Calculate the majority error in each subset
        erro_true = erro_majoritario(df_verdadeiro, coluna_classe) 
        erro_false = erro_majoritario(df_falso, coluna_classe) 
        
        # Weight proportional to the size of the sets
        peso_true = len(df_verdadeiro)/len(df) if len(df) > 0 else 0
        peso_false = len(df_falso)/len(df) if len(df) > 0 else 0
    
        # Fitness is the weighted error (the lower, the better)
        fitness = peso_true * erro_true + peso_false * erro_false
        
        resultados.append((individuo, fitness))
    
    return resultados

In [10]:
def torneio(pop_avaliada, k=3):
    """
    Tournament selection: selects the best individual among k random individuals to be one of the parents that generates a new individual for the next generation.  
    This is done twice for each new individual generated, selecting pai1 and pai2.
    """
    competidores = random.sample(pop_avaliada, k)
    vencedor = min(competidores, key=lambda x: x[1])  
    return vencedor[0]  

In [11]:
def crossover_rnd(pai1, pai2):
    # Chooses each component of the premise from one parent or the other
    atributo = pai1["premisse"][0] if random.random() < 0.5 else pai2["premisse"][0]
    operador = pai1["premisse"][1] if random.random() < 0.5 else pai2["premisse"][1]
    threshold = pai1["premisse"][2] if random.random() < 0.5 else pai2["premisse"][2]
    
    premissa_filho = (atributo, operador, threshold)
    
    return {"premisse": premissa_filho}

In [12]:
def mutacao(individuo, atributos, intervalo_atributos, prob_mut_attr=0.1, prob_mut_op=0.1, prob_mut_val=0.2):
    # Directly access the 'premisse' dictionary inside the individual
    premissa = individuo['premisse']  
    nova_premissa = list(premissa)  

    # Attribute mutation
    if random.random() < prob_mut_attr:
        nova_premissa[0] = random.choice(atributos)
        min_val, max_val = intervalo_atributos[nova_premissa[0]]
        nova_premissa[2] = random.uniform(min_val, max_val)

    # Operator mutation
    if random.random() < prob_mut_op:
        operadores = ['<', '<=', '>', '>=']
        nova_premissa[1] = random.choice(operadores)

    # Value mutation
    if random.random() < prob_mut_val:
        attr = nova_premissa[0]
        min_val, max_val = intervalo_atributos[attr]
        nova_premissa[2] = random.uniform(min_val, max_val)

    return {'premisse': tuple(nova_premissa)}  #Returns a dictionary again

In [13]:
def calcular_intervalo_atributos(df, atributos):
    """
    Calculates the range of values for each numeric attribute in the DataFrame.
    """
    intervalo = {}
    for atributo in atributos:
        min_val = df[atributo].min()
        max_val = df[atributo].max()
        intervalo[atributo] = (min_val, max_val)
    return intervalo

In [14]:
def nova_geracao(
    pop_avaliada, 
    atributos, 
    intervalo_atributos, 
    tamanho=100, 
    prob_mut_attr=0.1, 
    prob_mut_op=0.1, 
    prob_mut_val=0.2
):
    """
    Generates a new generation with elitism and genetic operators.
    - Keeps the best individual.
    - Uses tournament + crossover + mutation to generate the rest.
    """
    # Elitism
    elite = min(pop_avaliada, key=lambda x: x[1])[0]  #Pega o melhor (menor fitness)
    
    nova_populacao = [elite]  #Inicia com o elite
    
    while len(nova_populacao) < tamanho:
        # Tournament Selection
        pai1 = torneio(pop_avaliada)
        pai2 = torneio(pop_avaliada)
        
        # Crossover
        filho = crossover_rnd(pai1, pai2)
        #print('\nFilho', filho)
        
        # Mutation
        filho_mutado = mutacao(
            filho,
            atributos,
            intervalo_atributos,
            prob_mut_attr=prob_mut_attr,
            prob_mut_op=prob_mut_op,
            prob_mut_val=prob_mut_val
        )
        
        nova_populacao.append(filho_mutado)
    
    return nova_populacao

In [15]:
def evoluir_solucoes(df, 
                      atributos, 
                      intervalo_atributos, 
                      profundidade=0, 
                      max_profundidade=5, 
                      n_geracoes=10, 
                      tamanho_pop=200, 
                      taxa_mutacao=0.2, 
                      coluna_classe="classes",
                      min_samples=2):
    """
    Evolves solutions to split the data at each decision tree node using a genetic algorithm.
    """
    # Initializes the population
    populacao = [criar_individuo(atributos) for _ in range(tamanho_pop)]
    
    # Stopping criterion
    if profundidade >= max_profundidade or len(df) < min_samples:
        return {
            "folha": True,
            "classes": df[coluna_classe].value_counts().to_dict()
        }

    # Evaluation across generations
    for gen in range(n_geracoes):
        avaliacoes = evaluate(df, populacao, coluna_classe)
        melhor_ind, melhor_fit = min(avaliacoes, key=lambda x: x[1])
        populacao = nova_geracao(
            avaliacoes, 
            atributos, 
            intervalo_atributos, 
            tamanho=tamanho_pop, 
            prob_mut_attr=0.1, 
            prob_mut_op=0.1, 
            prob_mut_val=0.2
        )

    # Select the best premise and apply it
    melhor_individuo = melhor_ind
    premissa = melhor_individuo 
    df_esquerda, df_direita = aplicar_premissa(df, melhor_individuo)
    
    if len(df_esquerda) < min_samples or len(df_direita) < min_samples:
        return {
            "folha": True,
            "classes": df[coluna_classe].value_counts().to_dict()
        }
    
    # Check the majority error for the subsets
    erro_esquerda = erro_majoritario(df_esquerda, coluna_classe)
    erro_direita = erro_majoritario(df_direita, coluna_classe)

    # Evolve or stop for each branch separately
    if erro_esquerda < erro_threshold:
        no_esquerdo = {
            "folha": True,
            "classes": df_esquerda[coluna_classe].value_counts().to_dict()
        }
    else:
        no_esquerdo = evoluir_solucoes(
            df_esquerda, atributos, intervalo_atributos,
            profundidade + 1, max_profundidade, n_geracoes,
            tamanho_pop, taxa_mutacao, coluna_classe, min_samples
        )

    if erro_direita < erro_threshold:
        no_direito = {
            "folha": True,
            "classes": df_direita[coluna_classe].value_counts().to_dict()
        }
    else:
        no_direito = evoluir_solucoes(
            df_direita, atributos, intervalo_atributos,
            profundidade + 1, max_profundidade, n_geracoes,
            tamanho_pop, taxa_mutacao, coluna_classe, min_samples
        )

    return {
        "folha": False,
        "premissa": premissa,
        "contagem_total": df[coluna_classe].value_counts().to_dict(),
        "contagem_esquerda": df_esquerda[coluna_classe].value_counts().to_dict(),
        "contagem_direita": df_direita[coluna_classe].value_counts().to_dict(),
        "esquerda": no_esquerdo,
        "direita": no_direito
    }

In [16]:
import matplotlib.pyplot as plt
import warnings

def plot_tree_custom(node, x=0, y=0, dx=3.0, dy=1.0, ax=None, depth=0, pos_dict=None, parent_pos=None):
    """
    Visual representation of the decision tree.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(35, 30), dpi=350)
    
        ax.axis("off")
        pos_dict = {"max_y": 0, "min_y": 0, "depth_max": get_tree_depth(node)}
        plot_tree_custom(node, x, y, dx, dy, ax, depth, pos_dict)
        
        plt.savefig("custom_tree.png", bbox_inches="tight")
        warnings.filterwarnings("ignore", category=UserWarning)
        plt.show()
        return

    # Dynamic adjustment based on total depth
    spacing_factor = pos_dict["depth_max"] - depth + 1
    current_dx = dx * spacing_factor
    current_dy = dy * spacing_factor * 0.5  
    
    # Leaf
    if node.get("folha", False):
        classes = node.get("classes", {})
        total = sum(classes.values())

        text = "Leaf\n"
        text += f"Total: {total}\n"
        text += "Classes:\n"
        for cls, count in sorted(classes.items()):
            text += f"  {cls}: {count} ({count/total:.1%})\n"

        ax.text(x, y, text,
                bbox=dict(facecolor="#c1f0c1", alpha=1.0, edgecolor="darkgreen", boxstyle="round,pad=0.5"),
                ha="center", va="center", fontsize=10)

        if parent_pos:
            ax.plot([parent_pos[0], x], [parent_pos[1], y],
                    color="#555555", linestyle="-", linewidth=1.5, alpha=1.0)

        pos_dict["max_y"] = max(pos_dict["max_y"], y)
        pos_dict["min_y"] = min(pos_dict["min_y"], y)
        return y

    # Internal node
    premissa = node.get("premissa", {}).get("premisse", ("?", "?", "?"))
    total_counts = node.get("contagem_total", {})
    left_counts = node.get("contagem_esquerda", {})
    right_counts = node.get("contagem_direita", {})
    total = sum(total_counts.values())

    text = f"{premissa[0]}\n{premissa[1]} {premissa[2]:.2f}\n"
    text += f"Total: {total}\n"
    text += "Distribution:\n"
    for cls, count in sorted(total_counts.items()):
        text += f"  {cls}: {count} ({count/total:.1%})\n"

    ax.text(x, y, text,
            bbox=dict(facecolor="#b0e0e6", alpha=1.0, edgecolor="darkblue", boxstyle="round,pad=0.5"),
            ha="center", va="center", fontsize=10)

    if parent_pos:
        ax.plot([parent_pos[0], x], [parent_pos[1], y],
                color="#555555", linestyle="-", linewidth=1.5, alpha=1.0)

    # Coordinates of children
    left_y = y - current_dy
    right_y = y + current_dy

    left_child_y = plot_tree_custom(node["esquerda"], x + current_dx, left_y,
                                    dx, dy, ax, depth + 1, pos_dict, parent_pos=(x, y))
    left_text = "True\n"
    ax.text(x + current_dx / 3, (y + left_child_y) / 2, left_text,
            ha="right", va="center", fontsize=15, color="#1f77b4",
           )

    right_child_y = plot_tree_custom(node["direita"], x + current_dx, right_y,
                                     dx, dy, ax, depth + 1, pos_dict, parent_pos=(x, y))
    right_text = "False\n"
    ax.text(x + current_dx / 3, (y + right_child_y) / 2, right_text,
            ha="left", va="center", fontsize=15, color="#d62728",
            )

    return y


def get_tree_depth(node):
    """Calculates the maximum depth of the tree."""
    if node.get("folha", False):
        return 0
    return 1 + max(get_tree_depth(node["esquerda"]), get_tree_depth(node["direita"]))

In [17]:
def predict_custom_tree(tree, X):
    """
    Makes predictions using our custom dictionary-based decision tree structure.
    """
    def predizer_linha(no, linha):
        if no["folha"]:
            # Return the class with the highest frequency
            return max(no["classes"].items(), key=lambda x: x[1])[0]
        
        atributo, operador, valor = no["premissa"]["premisse"]
        entrada = linha[atributo]

        # Apply operators
        if operador == "<":
            if entrada < valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == "<=":
            if entrada <= valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == ">":
            if entrada > valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        elif operador == ">=":
            if entrada >= valor:
                return predizer_linha(no["esquerda"], linha)
            else:
                return predizer_linha(no["direita"], linha)
        else:
            raise ValueError(f"Unknown operator: {operador}")
    
    # Apply to all rows
    return [predizer_linha(tree, linha) for _, linha in X.iterrows()]

In [18]:
# Training the Decision Tree
# Parameters
n_geracoes = 15
tamanho_pop = 200
taxa_mutacao = 0.2
erro_threshold = 0.125
max_profundidade=15
min_samples=10
atributos = df.columns[df.columns != "classes"].tolist()
intervalo_atributos = calcular_intervalo_atributos(df, atributos)

# Initializes the evolution
arvore = evoluir_solucoes(df, 
                           atributos, 
                           intervalo_atributos, 
                           n_geracoes=n_geracoes, 
                           tamanho_pop=tamanho_pop,
                           min_samples=min_samples,
                           max_profundidade=max_profundidade,
                           taxa_mutacao=taxa_mutacao, 
                           coluna_classe="classes")

In [19]:
print(arvore)

{'folha': False, 'premissa': {'premisse': ('SATMIN', '>=', np.float64(86.01647526107025))}, 'contagem_total': {'APNEICO': 144, 'NORMAL': 88}, 'contagem_esquerda': {'NORMAL': 45, 'APNEICO': 22}, 'contagem_direita': {'APNEICO': 122, 'NORMAL': 43}, 'esquerda': {'folha': False, 'premissa': {'premisse': ('W0', '<', np.float64(8.626521667170381))}, 'contagem_total': {'NORMAL': 45, 'APNEICO': 22}, 'contagem_esquerda': {'NORMAL': 44, 'APNEICO': 12}, 'contagem_direita': {'APNEICO': 10, 'NORMAL': 1}, 'esquerda': {'folha': True, 'classes': {'NORMAL': 44, 'APNEICO': 12}}, 'direita': {'folha': True, 'classes': {'APNEICO': 10, 'NORMAL': 1}}}, 'direita': {'folha': True, 'classes': {'APNEICO': 122, 'NORMAL': 43}}}


In [20]:
from sklearn.metrics import f1_score

y_pred_GAbDT = predict_custom_tree(arvore, X_t)
print("Acurácia:", accuracy_score(y_t, y_pred_GAbDT))

f1 = f1_score(y_t, y_pred_GAbDT, average=None) 
print("F1-score:", f1)

Acurácia: 0.7457627118644068
F1-score: [0.81927711 0.57142857]


In [21]:
from sklearn.metrics import f1_score
print("Acurácia:", accuracy_score(y_t, y_pred))
f1 = f1_score(y_t, y_pred, average=None)  # para classificação binária
print("F1-score:", f1)

Acurácia: 0.711864406779661
F1-score: [0.77922078 0.58536585]


In [22]:
#plot_tree_custom(arvore)

# Student’s T statistical test using K-fold

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel
import numpy as np
import pandas as pd

source_file = 'C:/Users/Gabriel/Pasta/Machine Learning/Aplicações Biblioteca LORE/Tudo Binário/BINÁRIO_FINAL.csv'
df_completo = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

X_array = df_completo.drop(columns='CLASSE').values
y_array = df_completo['CLASSE'].values

X = pd.DataFrame(X_array, columns=df_completo.drop(columns='CLASSE').columns)
y = pd.Series(y_array)


atributos = list(X.columns)
intervalo_atributos = {col: (X[col].min(), X[col].max()) for col in atributos}

n_geracoes = 15
tamanho_pop = 200
taxa_mutacao = 0.2
erro_threshold = 0.125
max_profundidade=15
min_samples=10
atributos = df.columns[df.columns != "classes"].tolist()
intervalo_atributos = calcular_intervalo_atributos(df, atributos)

n_splits = 20
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accs_custom = []
accs_sklearn = []

for train_index, test_index in skf.split(X, y):
    # Create train/test DataFrames
    X_treino_df = X.iloc[train_index].copy()
    y_treino = y.iloc[train_index].copy()
    df_treino = X_treino_df.copy()
    df_treino['classes'] = y_treino.values

    X_teste_df = X.iloc[test_index].copy()
    y_teste = y.iloc[test_index].copy()
    
    # GAbDT
    arvore = evoluir_solucoes(df_treino, atributos, intervalo_atributos,
                               n_geracoes=n_geracoes, tamanho_pop=tamanho_pop,
                               min_samples=min_samples, max_profundidade=max_profundidade,
                               taxa_mutacao=taxa_mutacao, coluna_classe="classes")
    
    y_pred_custom = predict_custom_tree(arvore, X_teste_df)
    acc_custom = accuracy_score(y_teste, y_pred_custom)
    # print accuracy of GAbDT
    accs_custom.append(acc_custom)

    # Standard Scikit-Learn model (with the same hyperparameters as GAbDT)
    clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=10,
            splitter='best')
    clf.fit(X_treino_df, y_treino)
    y_pred_sklearn = clf.predict(X_teste_df)
    acc_sklearn = accuracy_score(y_teste, y_pred_sklearn)
    # print accuracy of Scikit-Learn
    accs_sklearn.append(acc_sklearn)


# Paired Student's t-test
t_stat, p_value = ttest_rel(accs_custom, accs_sklearn)

# Mean calculation
media_custom = sum(accs_custom) / len(accs_custom)
media_sklearn = sum(accs_sklearn) / len(accs_sklearn)

print("Custom model accuracies:", accs_custom)
print()
print("Scikit-learn accuracies:", accs_sklearn)
print()
print(f"t = {t_stat:.4f}, p = {p_value:.4f}")

print(f"\nMean (custom): {media_custom:.4f}")
print(f"Mean (sklearn): {media_sklearn:.4f}")

if p_value < 0.05:
    print("Statistically significant difference!")
else:
    print("No significant difference.")

Custom model accuracies: [0.8, 0.8, 0.7333333333333333, 0.7333333333333333, 0.8, 0.6, 0.6666666666666666, 0.7333333333333333, 0.5333333333333333, 0.6, 0.7333333333333333, 0.7857142857142857, 0.8571428571428571, 0.9285714285714286, 0.5714285714285714, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.42857142857142855, 0.7142857142857143]

Scikit-learn accuracies: [0.7333333333333333, 0.6666666666666666, 0.6666666666666666, 0.8666666666666667, 0.7333333333333333, 0.5333333333333333, 0.6666666666666666, 0.4, 0.5333333333333333, 0.5333333333333333, 0.6, 0.5, 0.7142857142857143, 0.8571428571428571, 0.5714285714285714, 0.5, 0.5, 0.6428571428571429, 0.7857142857142857, 0.7142857142857143]

t = 2.1851, p = 0.0416

Mean (custom): 0.7081
Mean (sklearn): 0.6360
Statistically significant difference!
