In [1]:
import os
import pandas as pd
import glob
import json
import re
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
def extract_random_seed(config_str):
    """Extrai o valor de RANDOM_SEED do string da configuração"""
    try:
        # Usando regex para extrair o valor após 'RANDOM_SEED':
        match = re.search(r"'RANDOM_SEED':\s*(\d+)", config_str)
        if match:
            return int(match.group(1))
        else:
            # Alternativa: convertendo a string para dicionário e acessando a chave
            config_dict = eval(config_str.replace("True", "True").replace("False", "False"))
            return config_dict.get('RANDOM_SEED')
    except:
        return None

In [15]:
def collect_results_from_eniac():
    # Caminho base para o diretório eniac
    base_path = os.path.join(os.getcwd(), '../results/best_config')
    
    # Lista para armazenar todos os DataFrames
    all_dfs = []
    
    # Busca por todos os arquivos results.csv em qualquer subdiretório
    csv_pattern = os.path.join(base_path, '*', '*', 'results.csv')
    csv_files = glob.glob(csv_pattern)
    
    print(f"Encontrados {len(csv_files)} arquivos results.csv")
    
    for csv_path in csv_files:
        # Extrai o nome da abordagem do diretório pai
        approach_dir = os.path.dirname(csv_path)
        approach_name = os.path.basename(approach_dir)
        
        try:
            # Carrega o CSV
            df = pd.read_csv(csv_path)
            
            # Remove a coluna de index original se existir
            if len(df.columns) > 0 and (df.columns[0].startswith('Unnamed') or df.columns[0].isdigit()):
                df = df.drop(df.columns[0], axis=1)
            
            # Adiciona colunas de abordagem
            df['approach'] = approach_name
            
            # Extrai RANDOM_SEED da coluna config e cria uma nova coluna se a coluna config existir
            if 'config' in df.columns:
                df['seed'] = df['config'].apply(extract_random_seed)
            
            # Adiciona informação sobre o arquivo de origem para debug
            df['source_file'] = csv_path
            
            # Adiciona o DataFrame à lista
            all_dfs.append(df)
            # print(f"Arquivo processado: {csv_path} - {len(df)} registros")
            
        except Exception as e:
            print(f"Erro ao processar {csv_path}: {e}")
    
    # Concatena todos os DataFrames
    if all_dfs:
        result_df = pd.concat(all_dfs, ignore_index=True)
        print(f"\nTotal de registros coletados: {len(result_df)}")
        return result_df
    else:
        print("Nenhum arquivo CSV foi encontrado ou processado.")
        return None

In [16]:
# Coleta e processa todos os arquivos
combined_df = collect_results_from_eniac()

# if combined_df is not None:
#     # Mostra estatísticas básicas
#     print(f"\nTotal de registros: {len(combined_df)}")
#     print(f"Abordagens encontradas: {combined_df['approach'].unique()}")
#     print(f"Número de execuções diferentes: {combined_df['datetime'].nunique()}")
    
#     # Salva o DataFrame combinado
#     combined_df.to_csv('combined_results.csv', index=False)
#     print("\nArquivo 'combined_results.csv' salvo com sucesso!")
    
#     # Exibe as primeiras linhas para verificação
#     print("\nPrimeiras linhas do DataFrame combinado:")
#     display(combined_df.tail())

Encontrados 30 arquivos results.csv

Total de registros coletados: 120


In [19]:

# Lista para armazenar os resultados
best_configs = []

# Para cada abordagem (valores de n)
for approach in combined_df['approach'].unique():
    # Para cada modelo
    for model in ['GCN', 'MLP', 'RF']:
        # Filtrar os dados pela abordagem e modelo
        subset = combined_df[(combined_df['approach'] == approach) & (combined_df['model_name'] == model)]
        
        # Se existirem dados para essa combinação
        if not subset.empty:
            # Encontrar a linha com a maior acurácia
            best_row = subset.loc[subset['accuracy'].idxmax()]
            
            # Extrair as métricas solicitadas
            result = {
                'approach': approach,
                'model_name': model,
                # 'seed': best_row['seed'],
                # 'datetime': best_row['datetime'],
                'accuracy': best_row['accuracy'],
                'rush_precision': best_row['rush_precision'],
                'rush_recall': best_row['rush_recall'],
                'rush_f1_score': best_row['rush_f1_score'],
                'pass_precision': best_row['pass_precision'],
                'pass_recall': best_row['pass_recall'],
                'pass_f1_score': best_row['pass_f1_score']
            }
            
            # Adicionar à lista de resultados
            best_configs.append(result)

# Criar um DataFrame com os resultados
best_configs_df = pd.DataFrame(best_configs)

# Ordenar por abordagem e modelo para facilitar a visualização
best_configs_df = best_configs_df.sort_values(['approach', 'model_name'])

# Mostrar os resultados
print("Melhores configurações por abordagem e modelo com métricas:")
display(best_configs_df)

# Salvar os resultados em um arquivo CSV
best_configs_df.to_csv('best_configurations_with_metrics.csv', index=False)
print("\nResultados salvos em 'best_configurations_with_metrics.csv'")

# Opcional: Exibir estatísticas resumidas das métricas
metrics = ['accuracy', 'rush_precision', 'rush_recall', 'rush_f1_score', 
           'pass_precision', 'pass_recall', 'pass_f1_score']

print("\nEstatísticas resumidas das métricas para as melhores configurações:")
for metric in metrics:
    print(f"\n{metric.upper()}:")
    stats = best_configs_df.pivot_table(
        values=metric, 
        index='approach', 
        columns='model_name',
        aggfunc='mean'
    )
    display(stats)

Melhores configurações por abordagem e modelo com métricas:


Unnamed: 0,approach,model_name,accuracy,rush_precision,rush_recall,rush_f1_score,pass_precision,pass_recall,pass_f1_score
27,2025-07-28_10-08-55,GCN,0.779869,0.741525,0.859247,0.796058,0.832685,0.700491,0.760889
28,2025-07-28_10-08-55,MLP,0.718494,0.718494,0.718494,0.718494,0.718494,0.718494,0.718494
29,2025-07-28_10-08-55,RF,0.740589,0.721386,0.783961,0.751373,0.763441,0.697218,0.728828
39,2025-07-28_15-11-48,GCN,0.764738,0.755532,0.782751,0.768901,0.774632,0.746725,0.760422
40,2025-07-28_15-11-48,MLP,0.666485,0.665941,0.668122,0.667030,0.667032,0.664847,0.665938
...,...,...,...,...,...,...,...,...,...
10,2025-07-29_11-47-29,MLP,0.695961,0.691569,0.707424,0.699406,0.700559,0.684498,0.692435
11,2025-07-29_11-47-29,RF,0.735808,0.714286,0.786026,0.748441,0.762136,0.685590,0.721839
6,2025-07-29_12-47-43,GCN,0.746725,0.706204,0.844978,0.769384,0.807065,0.648472,0.719128
7,2025-07-29_12-47-43,MLP,0.668122,0.650685,0.725983,0.686275,0.690123,0.610262,0.647740



Resultados salvos em 'best_configurations_with_metrics.csv'

Estatísticas resumidas das métricas para as melhores configurações:

ACCURACY:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.779869,0.718494,0.740589
2025-07-28_15-11-48,0.764738,0.666485,0.740721
2025-07-28_16-59-04,0.748362,0.666485,0.740721
2025-07-28_17-13-49,0.779051,0.718494,0.740589
2025-07-28_17-32-32,0.769231,0.675941,0.737316
2025-07-28_18-07-48,0.729803,0.64738,0.723253
2025-07-28_18-22-54,0.779051,0.687398,0.735679
2025-07-28_18-40-27,0.772504,0.681669,0.737316
2025-07-28_18-58-32,0.776596,0.678396,0.731588
2025-07-28_19-17-28,0.778232,0.722586,0.728314



RUSH_PRECISION:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.741525,0.718494,0.721386
2025-07-28_15-11-48,0.755532,0.665941,0.722054
2025-07-28_16-59-04,0.721519,0.665941,0.722054
2025-07-28_17-13-49,0.78089,0.718494,0.721386
2025-07-28_17-32-32,0.753467,0.690941,0.713235
2025-07-28_18-07-48,0.684487,0.641213,0.702275
2025-07-28_18-22-54,0.760305,0.689256,0.712389
2025-07-28_18-40-27,0.786575,0.683168,0.713864
2025-07-28_18-58-32,0.752239,0.676948,0.71407
2025-07-28_19-17-28,0.724868,0.713166,0.706667



RUSH_RECALL:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.859247,0.718494,0.783961
2025-07-28_15-11-48,0.782751,0.668122,0.782751
2025-07-28_16-59-04,0.808952,0.668122,0.782751
2025-07-28_17-13-49,0.775777,0.718494,0.783961
2025-07-28_17-32-32,0.800327,0.636661,0.793781
2025-07-28_18-07-48,0.85262,0.669214,0.775109
2025-07-28_18-22-54,0.815057,0.682488,0.790507
2025-07-28_18-40-27,0.747954,0.677578,0.792144
2025-07-28_18-58-32,0.824877,0.682488,0.772504
2025-07-28_19-17-28,0.89689,0.744681,0.780687



RUSH_F1_SCORE:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.796058,0.718494,0.751373
2025-07-28_15-11-48,0.768901,0.66703,0.751179
2025-07-28_16-59-04,0.762738,0.66703,0.751179
2025-07-28_17-13-49,0.778325,0.718494,0.751373
2025-07-28_17-32-32,0.77619,0.662692,0.751356
2025-07-28_18-07-48,0.759358,0.654915,0.736897
2025-07-28_18-22-54,0.78673,0.685855,0.749418
2025-07-28_18-40-27,0.766779,0.680362,0.75097
2025-07-28_18-58-32,0.786885,0.679707,0.742138
2025-07-28_19-17-28,0.801756,0.728583,0.741835



PASS_PRECISION:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.832685,0.718494,0.763441
2025-07-28_15-11-48,0.774632,0.667032,0.762813
2025-07-28_16-59-04,0.782609,0.667032,0.762813
2025-07-28_17-13-49,0.777236,0.718494,0.763441
2025-07-28_17-32-32,0.787086,0.663126,0.767528
2025-07-28_18-07-48,0.804631,0.65411,0.749086
2025-07-28_18-22-54,0.800705,0.685575,0.764706
2025-07-28_18-40-27,0.75975,0.680195,0.766544
2025-07-28_18-58-32,0.806159,0.679868,0.752228
2025-07-28_19-17-28,0.864807,0.732877,0.755027



PASS_RECALL:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.700491,0.718494,0.697218
2025-07-28_15-11-48,0.746725,0.664847,0.69869
2025-07-28_16-59-04,0.687773,0.664847,0.69869
2025-07-28_17-13-49,0.782324,0.718494,0.697218
2025-07-28_17-32-32,0.738134,0.715221,0.680851
2025-07-28_18-07-48,0.606987,0.625546,0.671397
2025-07-28_18-22-54,0.743044,0.692308,0.680851
2025-07-28_18-40-27,0.797054,0.685761,0.682488
2025-07-28_18-58-32,0.728314,0.674304,0.690671
2025-07-28_19-17-28,0.659574,0.700491,0.675941



PASS_F1_SCORE:


model_name,GCN,MLP,RF
approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-28_10-08-55,0.760889,0.718494,0.728828
2025-07-28_15-11-48,0.760422,0.665938,0.729345
2025-07-28_16-59-04,0.732132,0.665938,0.729345
2025-07-28_17-13-49,0.779772,0.718494,0.728828
2025-07-28_17-32-32,0.761824,0.688189,0.721596
2025-07-28_18-07-48,0.691973,0.639509,0.708117
2025-07-28_18-22-54,0.770798,0.688925,0.720346
2025-07-28_18-40-27,0.777955,0.682967,0.722078
2025-07-28_18-58-32,0.765262,0.677075,0.720137
2025-07-28_19-17-28,0.748375,0.716318,0.713299


In [20]:
# Agrupar por abordagem e modelo, encontrando a acurácia máxima
best_models = combined_df.groupby(['approach', 'model_name'])['accuracy'].max().reset_index()

# Criar um pivot table para melhor visualização
pivot_best = best_models.pivot(index='approach', columns='model_name', values='accuracy')
display(best_models)

# Mostrar os melhores resultados
print("Melhores acurácias por abordagem e modelo:")
print(pivot_best)

# Encontrar para cada abordagem e modelo, o seed que produziu a melhor acurácia
best_configs = []

for approach in combined_df['approach'].unique():
    for model in ['GCN', 'RF', 'MLP']:
        subset = combined_df[(combined_df['approach'] == approach) & (combined_df['model_name'] == model)]
        if not subset.empty:
            best_row = subset.loc[subset['accuracy'].idxmax()]
            best_configs.append({
                'approach': approach,
                'model_name': model,
                'accuracy': best_row['accuracy'],
                'seed': best_row['seed'],
                'datetime': best_row['datetime']
            })

best_configs_df = pd.DataFrame(best_configs)
best_configs_df = best_configs_df.sort_values(['approach', 'model_name'])

print("\nDetalhes das melhores configurações:")
print(best_configs_df)

# Visualização dos resultados
plt.figure(figsize=(12, 8))
sns.barplot(x='approach', y='accuracy', hue='model_name', data=best_configs_df)
plt.title('Melhor Acurácia por Abordagem e Modelo')
plt.ylabel('Acurácia')
plt.xlabel('Abordagem')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('best_accuracy_by_approach_model.png')
plt.show()

Unnamed: 0,approach,model_name,accuracy
0,2025-07-28_10-08-55,GCN,0.779869
1,2025-07-28_10-08-55,MLP,0.718494
2,2025-07-28_10-08-55,RF,0.740589
3,2025-07-28_15-11-48,GCN,0.764738
4,2025-07-28_15-11-48,MLP,0.666485
...,...,...,...
85,2025-07-29_11-47-29,MLP,0.695961
86,2025-07-29_11-47-29,RF,0.735808
87,2025-07-29_12-47-43,GCN,0.746725
88,2025-07-29_12-47-43,MLP,0.668122


Melhores acurácias por abordagem e modelo:
model_name                GCN       MLP        RF
approach                                         
2025-07-28_10-08-55  0.779869  0.718494  0.740589
2025-07-28_15-11-48  0.764738  0.666485  0.740721
2025-07-28_16-59-04  0.748362  0.666485  0.740721
2025-07-28_17-13-49  0.779051  0.718494  0.740589
2025-07-28_17-32-32  0.769231  0.675941  0.737316
2025-07-28_18-07-48  0.729803  0.647380  0.723253
2025-07-28_18-22-54  0.779051  0.687398  0.735679
2025-07-28_18-40-27  0.772504  0.681669  0.737316
2025-07-28_18-58-32  0.776596  0.678396  0.731588
2025-07-28_19-17-28  0.778232  0.722586  0.728314
2025-07-28_21-58-10  0.748362  0.666485  0.740721
2025-07-28_22-59-07  0.729803  0.647380  0.723253
2025-07-28_23-58-46  0.730895  0.677948  0.727074
2025-07-29_00-55-21  0.758188  0.659389  0.724345
2025-07-29_01-45-50  0.745087  0.659934  0.725983
2025-07-29_02-18-45  0.757096  0.666485  0.740721
2025-07-29_02-43-41  0.760371  0.690502  0.723799
2025-07

KeyError: 'datetime'