In [1]:
import os
import pandas as pd
from omegaconf import OmegaConf


# Load the project configuration once; results_dataframe() reads
# `config.paths.eval_folder` from this module-level object.
config = OmegaConf.load("lstm_config.yaml")

In [9]:
def results_dataframe():
    """
    Read every evaluation CSV found under ``config.paths.eval_folder`` into one DataFrame.

    Each sub-directory of the evaluation folder is scanned for CSV files, and
    the sub-directory name is stored in a 'Model_covariates' column so the
    origin of every row is kept. Entries of the evaluation folder that are not
    directories, and files whose name does not end in ``.csv``, are skipped
    instead of crashing the read.

    Returns:
        DataFrame: All CSV rows concatenated in directory-listing order, with
                   the extra 'Model_covariates' column. The index labels of
                   the individual files are kept unchanged (they are not
                   renumbered). An empty DataFrame is returned when no CSV
                   file is found.
    """
    eval_folder = config.paths.eval_folder
    frames = []
    for directory in os.listdir(eval_folder):
        # os.path.join works whether or not eval_folder ends with a separator.
        directory_path = os.path.join(eval_folder, directory)
        if not os.path.isdir(directory_path):
            continue
        for csv in os.listdir(directory_path):
            if not csv.lower().endswith(".csv"):
                continue
            df_temp = pd.read_csv(os.path.join(directory_path, csv))
            df_temp['Model_covariates'] = directory
            frames.append(df_temp)
    if not frames:
        # pd.concat refuses an empty list; keep the "nothing found" result empty.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)

def house_result_variation_calculate(house,df_):
    """
    Compute, for one household, how far each model's metrics are from the base model.

    The base model is the row whose 'Model_covariates' value is "#". Its MAE,
    SMAPE and RMSE are subtracted from the same metrics of every row of the
    household, so a negative variation means a lower (better) error than the
    base model.

    The baseline is subtracted as a single row broadcast over the columns, so
    the result does not depend on the rows sharing the same index label.

    Args:
        house: Value of 'House_Hold' selecting the rows to analyse.
        df_: DataFrame containing at least the columns 'House_Hold',
             'Model_covariates', 'MAE', 'SMAPE' and 'RMSE'.

    Returns:
        DataFrame: The rows of the household with three extra columns,
                   'MAE_variation', 'SMAPE_variation' and 'RMSE_variation'.
                   The variations are NaN when the household has no base-model
                   row. If several base-model rows exist, the first is used.
    """
    metric_cols = ["MAE","SMAPE","RMSE"]
    df_filtred = df_[df_.House_Hold == house]
    baseline_rows = df_filtred.loc[df_filtred.Model_covariates == "#", metric_cols]
    if baseline_rows.empty:
        # No reference to compare against: every variation is undefined.
        baseline = pd.Series(float("nan"), index=metric_cols)
    else:
        baseline = baseline_rows.iloc[0]
    # axis="columns" matches the Series labels to the metric columns and
    # applies the same baseline to every row, whatever the row labels are.
    variation = df_filtred[metric_cols].sub(baseline, axis="columns").add_suffix("_variation")
    return pd.concat([df_filtred,variation],axis=1)


def check_better_results(df_house_variation):
    """
    Count, per household, how many rows improved on the base model for each metric.

    A variation below zero means the metric is lower than the base model's,
    i.e. an improvement. Each variation column is turned into a 0/1 flag
    (missing variations count as 0) and the flags are summed per household.

    The input frame is not modified, so calling the function again on the same
    frame gives the same result.

    Args:
        df_house_variation: DataFrame with the columns 'House_Hold',
                            'MAE_variation', 'SMAPE_variation' and
                            'RMSE_variation'.

    Returns:
        DataFrame: One row per household with the columns 'House_Hold',
                   'MAE_variation_bool', 'SMAPE_variation_bool' and
                   'RMSE_variation_bool', each holding the number of rows
                   whose variation for that metric is negative.
    """
    variation_cols = ["MAE_variation","SMAPE_variation","RMSE_variation"]
    # Vectorised "is negative" test; NaN compares as False and becomes 0.
    flags = df_house_variation[variation_cols].lt(0).astype(int).add_suffix("_bool")
    # Pass raw values so no index alignment is attempted on repeated labels.
    flags.insert(0, "House_Hold", df_house_variation["House_Hold"].to_numpy())
    df_better_results = flags.groupby("House_Hold").sum().reset_index()
    return df_better_results


def condense_results():
    """
    Run the whole analysis and save the per-household improvement summary.

    Loads all evaluation results, computes for every household the variation
    of each model against the base model, counts how often each metric
    improved, and sorts the households by the number of MAE improvements
    (highest first). The sorted summary is written to
    "resultados_condensados.csv" (index column included) in the current
    working directory.

    Returns:
        DataFrame: The same sorted summary that was written to the CSV, with
                   the columns 'House_Hold', 'MAE_variation_bool',
                   'SMAPE_variation_bool' and 'RMSE_variation_bool'.
    """
    df_results = results_dataframe()
    # Collect the per-household summaries and concatenate once, instead of
    # re-copying a growing frame on every iteration.
    summaries = [
        check_better_results(house_result_variation_calculate(house, df_results))
        for house in df_results.House_Hold.unique()
    ]
    if summaries:
        df_summary = pd.concat(summaries)
    else:
        # No household rows: keep the expected columns so sorting and the
        # CSV header still work.
        df_summary = pd.DataFrame(
            columns=["House_Hold","MAE_variation_bool","SMAPE_variation_bool","RMSE_variation_bool"]
        )
    # Sort once and return exactly what is saved, so the file and the
    # returned frame always list the households in the same order.
    df_summary = df_summary.sort_values("MAE_variation_bool",ascending=False)
    df_summary.to_csv("resultados_condensados.csv")
    return df_summary

In [10]:
# Run the full pipeline: this writes "resultados_condensados.csv" and, as the
# last expression of the cell, displays the returned summary.
condense_results()

Unnamed: 0,House_Hold,MAE_variation_bool,SMAPE_variation_bool,RMSE_variation_bool
0,ESTABLISHEDAFFLUENCE_BLOCK_28_MAC004895,6,6,6
0,ESTABLISHEDAFFLUENCE_BLOCK_13_MAC005411,6,5,3
0,ESTABLISHEDAFFLUENCE_BLOCK_23_MAC000164,6,2,6
0,ESTABLISHEDAFFLUENCE_BLOCK_34_MAC002265,6,6,2
0,ESTABLISHEDAFFLUENCE_BLOCK_22_MAC004736,6,6,5
...,...,...,...,...
0,ESTABLISHEDAFFLUENCE_BLOCK_26_MAC002966,0,0,0
0,STRETCHEDSOCIETY_BLOCK_106_MAC001460,0,1,6
0,ESTABLISHEDAFFLUENCE_BLOCK_10_MAC002036,0,2,0
0,STRETCHEDSOCIETY_BLOCK_104_MAC004481,0,1,3


In [73]:
# Combine every evaluation CSV into one frame, kept in `df_` for later cells.
df_ = results_dataframe()
# Bare last expression so the notebook renders the frame.
df_

Unnamed: 0,House_Hold,MAE,MAPE,SMAPE,RMSE,Time_execution,Model_covariates
0,STRETCHEDSOCIETY_BLOCK_91_MAC001165,0.070,989939.440,1.058,0.150,265.43s,dayofweek_num
0,ESTABLISHEDAFFLUENCE_BLOCK_25_MAC003004,0.029,308873.400,0.853,0.053,394.94s,dayofweek_num
0,ESTABLISHEDAFFLUENCE_BLOCK_13_MAC005411,0.168,0.358,0.284,0.217,354.63s,dayofweek_num
0,ESTABLISHEDAFFLUENCE_BLOCK_21_MAC004390,0.400,1.036,0.616,0.610,829.83s,dayofweek_num
0,ESTABLISHEDAFFLUENCE_BLOCK_33_MAC004926,0.052,0.358,0.295,0.106,413.29s,dayofweek_num
...,...,...,...,...,...,...,...
0,STRETCHEDSOCIETY_BLOCK_109_MAC000529,0.101,0.621,0.409,0.135,465.06s,bool_weather_missing_values
0,ESTABLISHEDAFFLUENCE_BLOCK_40_MAC005359,0.163,0.389,0.516,0.339,938.99s,bool_weather_missing_values
0,STRETCHEDSOCIETY_BLOCK_98_MAC001062,0.115,0.305,0.289,0.174,426.36s,bool_weather_missing_values
0,STRETCHEDSOCIETY_BLOCK_104_MAC004395,0.777,0.889,0.665,0.962,337.33s,bool_weather_missing_values


In [112]:
# Rebuild the per-household improvement summary from the combined results in `df_`.
summaries = []
for house in df_.House_Hold.unique():
    # house_result_variation_calculate requires the results frame as its
    # second argument; calling it with only `house` raises TypeError.
    a = house_result_variation_calculate(house, df_)
    b = check_better_results(a)
    summaries.append(b)
# Concatenate once; fall back to an empty frame when there are no households.
df_aux = pd.concat(summaries) if summaries else pd.DataFrame()
df_aux

Unnamed: 0,House_Hold,MAE_variation_bool,SMAPE_variation_bool,RMSE_variation_bool
0,STRETCHEDSOCIETY_BLOCK_91_MAC001165,2,6,6
0,ESTABLISHEDAFFLUENCE_BLOCK_25_MAC003004,2,2,2
0,ESTABLISHEDAFFLUENCE_BLOCK_13_MAC005411,6,5,3
0,ESTABLISHEDAFFLUENCE_BLOCK_21_MAC004390,1,1,2
0,ESTABLISHEDAFFLUENCE_BLOCK_33_MAC004926,3,3,6
...,...,...,...,...
0,STRETCHEDSOCIETY_BLOCK_104_MAC004395,2,2,4
0,ESTABLISHEDAFFLUENCE_BLOCK_16_MAC002096,3,5,3
0,STRETCHEDSOCIETY_BLOCK_99_MAC004799,0,0,0
0,ESTABLISHEDAFFLUENCE_BLOCK_17_MAC002007,0,3,0
