In [78]:
import sys
import os
import numpy as np
import pandas as pd
import math
import scipy
pd.options.mode.chained_assignment = None

from itertools import product
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [79]:
# Print floats in plain decimal notation instead of scientific notation.
np.set_printoptions(suppress=True)
# `set_theme` is the current name of the legacy `sns.set` alias.
sns.set_theme(style='white')

In [80]:
def createHeatMap(path_fig, df_solution, criteria, problem_id, plot_graph=False):
    """Tabulate the proportion of solutions per outcome combination and optionally plot it.

    Rows of the returned frame are the original outcome ('F' for
    ``original_outcome == False``, 'P' for ``True``). Columns are every
    combination of per-criterion outcomes, one letter per entry of ``criteria``
    in order (e.g. 'FP' means the first criterion is False and the second True).
    Each cell holds the fraction of rows of ``df_solution`` matching that
    combination.

    Args:
        path_fig: Directory where ``heatmap_<problem_id>.png`` is saved (used
            only when ``plot_graph`` is true).
        df_solution: Frame with an ``original_outcome`` column and one
            ``<criterion>_outcome`` column per criterion.
        criteria: Sequence of criterion name prefixes.
        problem_id: Identifier used in the legend title and the file name.
        plot_graph: When true, render and save the heat map.

    Returns:
        A float DataFrame indexed by ['F', 'P'] with one column per combination.

    Raises:
        ZeroDivisionError: If ``df_solution`` has no rows.
    """
    classifications = ['F', 'P']
    permutations = [''.join(combo) for combo in product('FP', repeat=len(criteria))]

    # Float dtype from the start, so proportions are stored without any dtype change.
    df_heatmap = pd.DataFrame(0.0, index=classifications, columns=permutations)
    total = len(df_solution)

    for original_label, combination in product(classifications, permutations):
        # Build one query clause per column: the original outcome plus each criterion.
        clauses = [f"original_outcome == {original_label == 'P'}"]
        clauses += [
            f"{criterion}_outcome == {label == 'P'}"
            for criterion, label in zip(criteria, combination)
        ]
        count = len(df_solution.query(" & ".join(clauses)))

        # A single .loc assignment writes into the frame itself; chained indexing
        # (df[col][row] = ...) may write into a temporary copy instead.
        df_heatmap.loc[original_label, combination] = count / total

    if plot_graph:
        plt.rcParams['font.size'] = 28
        fig, ax = plt.subplots(figsize=(15, 10))
        sns.heatmap(df_heatmap, annot=True, fmt='.2%', cmap='Blues', square=True, ax=ax)
        # The legend is used only to show the problem id as a title.
        ax.legend(title=f"{problem_id}", loc="center", bbox_to_anchor=(1.0, 1.0), fontsize=20)
        heatmap_path = os.path.join(path_fig, f"heatmap_{problem_id}.png")
        fig.savefig(heatmap_path, bbox_inches='tight')
        plt.close(fig)

    return df_heatmap

In [81]:
def createPieChart(path_fig, df_heatmap, size, totals, problem_id, criteria_size, plot_graph=False):
    """Summarise agreement/disagreement from a heat-map table and optionally plot a pie chart.

    ``df_heatmap`` is indexed by the original outcome ('F'/'P') with one column
    per combination of new-criterion outcomes (``criteria_size`` letters each).

    Computed quantities (all fractions of the whole table):
        C  - agreement: original failed and every criterion failed, or original
             passed and every criterion passed.
        D1 - original failed but not every criterion failed.
        D2 - original passed but not every criterion passed.

    A row ``[problem_id, size, failure_old, failure_new, failure_exclusive]``
    (percentages rounded to 2 decimals) is appended to ``totals`` in place:
        failure_old       - original outcome failed.
        failure_new       - at least one criterion failed, any original outcome.
        failure_exclusive - original passed but at least one criterion failed.

    Args:
        path_fig: Directory where ``pie_<problem_id>.png`` is saved when plotting.
        df_heatmap: Proportion table as described above.
        size: Number of solutions; stored in the ``totals`` row as given.
        totals: List that receives the summary row (mutated and returned).
        problem_id: Identifier used in the legend title and file name.
        criteria_size: Number of criteria (letters per column label).
        plot_graph: When true, render and save the pie chart.

    Returns:
        ``(totals, [C, D1, D2])``.
    """
    all_fail = 'F' * criteria_size
    all_pass = 'P' * criteria_size
    originally_failed = df_heatmap.loc["F"]
    originally_passed = df_heatmap.loc["P"]

    C = originally_failed[all_fail] + originally_passed[all_pass]

    # Sum the disagreeing cells directly. Deriving them as `1 - C - other` can
    # yield a tiny negative number through floating-point cancellation
    # (e.g. 1 - 0.9 - 0.1), and pie charts reject negative wedge sizes.
    D1 = originally_failed.drop(all_fail).sum()
    D2 = originally_passed.drop(all_pass).sum()

    failure_old = originally_failed.sum() * 100
    # Every column except the all-pass one, summed over both rows.
    failure_new = df_heatmap.drop([all_pass], axis=1).sum().sum() * 100
    failure_exclusive = D2 * 100

    totals.append([problem_id, size, round(failure_old, 2), round(failure_new, 2), round(failure_exclusive, 2)])

    data = [C, D1, D2]
    keys = ['Concordância', 'Discordância tipo 1', 'Discordância tipo 2']

    if plot_graph:
        plt.rcParams['font.size'] = 28
        palette_color = sns.color_palette('bright')
        fig, ax = plt.subplots(figsize=(15, 10))
        ax.pie(data, labels=keys, colors=palette_color, autopct='%.1f%%')
        ax.legend(title=f"{problem_id}", loc="center", bbox_to_anchor=(1.0, 1.0), fontsize=20)
        pie_path = os.path.join(path_fig, f"pie_{problem_id}.png")
        fig.savefig(pie_path, bbox_inches='tight')
        plt.close(fig)

    return totals, data

In [82]:
def createCatPlot(path_fig, df_solution, criteria, problem_id, plot_graph=False):
    """Count outcome categories per criterion and optionally save them as a bar plot.

    For each criterion the following counts are taken from ``df_solution``:
        Aprovação  - rows where ``<criterion>_outcome == 1``.
        Falha      - rows where ``<criterion>_outcome == 0``.
        Erros      - rows whose ``<criterion>_result`` string contains "E".
        Timeouts   - rows whose ``<criterion>_result`` string contains "T".
        Reprovação - Falha minus Erros minus Timeouts.

    Args:
        path_fig: Directory where ``catplot_<problem_id>.png`` is saved when plotting.
        df_solution: Frame with ``<criterion>_outcome`` and ``<criterion>_result`` columns.
        criteria: Sequence of criterion name prefixes.
        problem_id: Identifier used in the legend title and file name.
        plot_graph: When true, render and save the bar plot.

    Returns:
        None.
    """
    metrics = {
        "Aprovação": [],
        "Reprovação": [],
        "Falha": [],
        "Erros": [],
        "Timeouts": [],
    }

    for criterion in criteria:
        outcome = df_solution[f"{criterion}_outcome"]
        result = df_solution[f"{criterion}_result"]

        approved = int((outcome == 1).sum())
        failed = int((outcome == 0).sum())
        errors = int(result.str.contains("E", regex=False, na=False).sum())
        timeouts = int(result.str.contains("T", regex=False, na=False).sum())
        # NOTE(review): a result containing both "E" and "T" is subtracted twice
        # here - confirm whether such rows can occur.
        reproved = failed - errors - timeouts

        # Each count is stored under the label that names it.
        metrics["Aprovação"].append(approved)
        metrics["Reprovação"].append(reproved)
        metrics["Falha"].append(failed)
        metrics["Erros"].append(errors)
        metrics["Timeouts"].append(timeouts)

    df_metrics = pd.DataFrame(metrics)
    df_metrics["criterion"] = list(criteria)

    if plot_graph:
        plt.rcParams['font.size'] = 28
        # catplot is figure-level and creates its own figure, so no plt.subplots() here.
        catplot = sns.catplot(data=df_metrics, kind="bar")
        # Single legend call: a second bare plt.legend() would drop the title.
        plt.legend(title=f"{problem_id}")
        catplot_path = os.path.join(path_fig, f"catplot_{problem_id}.png")
        catplot.figure.savefig(catplot_path, bbox_inches='tight')
        plt.close(catplot.figure)

    return None

In [83]:
def getData(df_heatmap, size, problem_id, df_data):
    """Append one problem's flattened heat-map values to the running summary table.

    The heat map is stacked into a single row whose column labels are the row
    label followed by the column label of each cell. A 'Soluções' column holds
    ``int(size)``.

    Args:
        df_heatmap: Table to flatten.
        size: Number of solutions for this problem.
        problem_id: Index label of the new row.
        df_data: Table accumulated so far (may be empty).

    Returns:
        The new one-row frame when ``df_data`` is empty, otherwise ``df_data``
        with the new row concatenated below it.
    """
    flattened = df_heatmap.stack()
    labels = [f'{row}{col}' for row, col in flattened.index]
    row_frame = pd.DataFrame([flattened.values], columns=labels, index=[problem_id])

    if df_data.empty:
        # First row of the table: the solution count leads the columns.
        row_frame.insert(0, 'Soluções', int(size))
        return row_frame

    row_frame['Soluções'] = int(size)
    return pd.concat([df_data, row_frame], axis=0)

In [84]:
def add_values(bp, ax):
    """Write the x value of each median line and each outlier next to it on ``ax``.

    Args:
        bp: Mapping with 'medians' and 'fliers' entries whose items expose
            ``get_xydata()`` (e.g. the dict returned by ``DataFrame.boxplot``
            with ``return_type='dict'``).
        ax: Object providing ``text(x, y, s, **kwargs)``, normally the Axes the
            box plot was drawn on.

    Returns:
        None.
    """
    for line in bp['medians']:
        points = np.asarray(line.get_xydata())
        if points.shape != (2, 2):
            # Not a two-point segment, so there is nothing sensible to label.
            continue
        (x_l, y_l), (_, y_r) = points
        if np.isnan(y_l):
            continue
        # Place the label slightly above the middle of the segment.
        y_text = y_l + ((y_r - y_l + 0.25) / 2)
        ax.text(x_l, y_text,
                '%.5f' % x_l,  # value with 5 decimal places
                verticalalignment='center',
                fontsize=10)

    # Labels alternate above/below the markers; the direction carries over
    # from one flier list to the next.
    sign = 1
    for flier_list in bp['fliers']:
        fliers_data = np.asarray(flier_list.get_xydata()).reshape(-1, 2)
        for i, (x, y) in enumerate(fliers_data):
            # The third flier of each list gets a larger offset
            # (presumably hand-tuned to avoid overlapping labels - confirm).
            offset = 0.2 if i == 2 else 0.15
            ax.text(x - 0.006, y + offset * sign,
                    '%.5f' % x,  # value with 5 decimal places
                    verticalalignment='center',
                    fontsize=10)
            sign = -sign
    return None
                
def boxplot_settings(data, columns):
    """Show a horizontal box plot of ``columns`` of ``data`` with value labels.

    Args:
        data: DataFrame holding the columns to plot.
        columns: Column name or list of column names passed to ``DataFrame.boxplot``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, figsize=(12, 7))
    axes.set_aspect(aspect='auto', adjustable='datalim')

    # The figure size is set by plt.subplots above; the Axes is handed to pandas,
    # so no separate figsize is passed. Plotting does not modify `data`, hence no copy.
    bp_series = data.boxplot(column=columns, grid=True, ax=axes, return_type='dict', vert=False)
    add_values(bp_series, axes)

    axes.set_title('Porcentagens dos Resultados Agregados', fontsize=22)
    axes.set_ylabel('Cenário', fontsize=18)
    axes.set_xlabel('Porcentagem', fontsize=18)
    plt.show()
    
#posts_by_type = generate_dataframe()
#boxplot_settings(posts_by_type, [])

In [85]:
# Sample DataFrame
#data = pd.DataFrame({'A_result': ['PP', 'PP', 'PP', 'FF'], 'B_result': ['FFF', 'PPE', 'PPP', 'FFT']})

def getOutcomeData(df, criteria):
    """Count, per criterion and per test position, how many rows have each outcome letter.

    Each ``<criterion>_result`` value is a string with one letter per test
    ('P', 'F', 'E' or 'T'). The number of tests of a criterion is taken from
    the length of the first row's string.

    Args:
        df: Non-empty frame with one ``<criterion>_result`` column per criterion.
        criteria: Sequence of criterion name prefixes.

    Returns:
        ``(compiled, highlights)`` where ``compiled[criterion][idx][letter]`` is
        the number of rows whose result string has ``letter`` at position
        ``idx``, and ``highlights[criterion]`` is the value returned by
        ``getSomeData`` for that criterion.
    """
    outcomes = ['P', 'F', 'E', 'T']

    criterion_compiled = {}
    highlights = {}
    for criterion in criteria:
        results = df[f'{criterion}_result']
        # Positional access: works whatever the index labels are.
        n_tests = len(results.iloc[0])

        per_test = {}
        for idx in range(n_tests):
            letters = results.str[idx]
            per_test[idx] = {outcome: int((letters == outcome).sum()) for outcome in outcomes}
        criterion_compiled[criterion] = per_test

        highlights[criterion] = getSomeData(criterion_compiled, criterion, n_tests)

    return criterion_compiled, highlights
    
def getSomeData(compiled_data, criterion, tests):
    """Summarise the failure counts of one criterion.

    Args:
        compiled_data: Mapping ``criterion -> test index -> outcome letter -> count``.
        criterion: Key of the criterion to summarise.
        tests: Number of test indices (0 .. tests-1) to inspect.

    Returns:
        ``[total, highest, test]``: the sum of the 'P', 'F', 'E' and 'T' counts
        of test 0, the largest 'F' count over all tests, and the first test
        index that reaches it.
    """
    per_test = compiled_data[criterion]
    fail_counts = [per_test[idx]['F'] for idx in range(tests)]
    worst = max(fail_counts)
    worst_test = fail_counts.index(worst)

    first_test = per_test[0]
    total = 0
    for letter in ('P', 'F', 'E', 'T'):
        total += first_test[letter]

    return [total, worst, worst_test]
    
def plotHighestFailureGraph(all_criterion_compiled, criteria, reference_criterion='new_input'):
    """Show a bar chart of failure percentages with mean and median reference lines.

    Args:
        all_criterion_compiled: DataFrame with one column per criterion; its
            index supplies the bar positions.
        criteria: Columns to draw as bars (all at the same positions, so later
            ones are drawn over earlier ones).
        reference_criterion: Column whose mean (black dashed line) and median
            (green dashed line) are drawn. Defaults to 'new_input'.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, figsize=(12, 7))
    axes.set_aspect(aspect='auto', adjustable='datalim')

    for criterion in criteria:
        axes.bar(all_criterion_compiled.index, all_criterion_compiled[criterion])

    reference = all_criterion_compiled[reference_criterion]
    axes.axhline(reference.mean(), color='k', linestyle='dashed', linewidth=1)
    axes.axhline(reference.median(), color='g', linestyle='dashed', linewidth=1)

    axes.set_title('Porcentagens das Falhas', fontsize=22)
    axes.set_ylabel('Porcentagem', fontsize=18)
    axes.set_xlabel('Problema', fontsize=18)
    plt.show()

#getOutcomeData(data, ['A', 'B'])

In [86]:
# Sample DataFrame
#data = pd.DataFrame({'A_result': ['PP', 'PP', 'PP', 'FF'], 'B_result': ['FFF', 'PPE', 'PPP', 'FFT']})

def getDiscoveryErrors(df, criteria):
    """Count, per criterion, how many rows are first "discovered" as non-passing by each test.

    Each ``<criterion>_result`` value is a string with one letter per test.
    Tests are visited in order; at each position the rows whose letter is not
    'P' (including rows with no letter at that position) are counted and then
    removed, so every row is counted at most once, at its first non-'P' position.
    The number of tests is taken from the length of the first row's string.

    Args:
        df: Non-empty frame with one ``<criterion>_result`` column per criterion.
        criteria: Sequence of criterion name prefixes.

    Returns:
        ``{criterion: {test index: number of rows first failing at that test}}``.
    """
    criterion_compiled = {}
    for criterion in criteria:
        remaining = df[f'{criterion}_result']
        # Positional access: works whatever the index labels are.
        n_tests = len(remaining.iloc[0])

        discovered = {}
        for idx in range(n_tests):
            letters = remaining.str[idx]
            # A missing letter counts as not passed, like any non-'P' letter.
            not_passed = (letters.ne('P') | letters.isna()).to_numpy(dtype=bool)
            discovered[idx] = int(not_passed.sum())
            # Positional mask, so duplicate index labels are not a problem.
            remaining = remaining[~not_passed]
        criterion_compiled[criterion] = discovered

    return criterion_compiled


In [87]:
def main():
    """Run the per-problem analysis over every configured problem id.

    For each problem, ``CSV/solution_<id>.csv`` (relative to the current
    working directory) is loaded, the heat-map table, pie-chart summary,
    aggregated data row and category counts are computed with plotting
    disabled, and the result of ``getDiscoveryErrors`` is printed.
    The ``Fig`` directory is created if it does not exist.
    """
    # Problems and criteria to analyse.
    problem_ids = [
        736, 742, 744, 751, 798, 800, 804, 806, 807, 809, 810, 811, 812, 815,
        816, 817, 819, 820, 821, 822, 823, 824, 827, 828, 829, 831, 832, 833,
        834, 835, 836, 838, 839, 840, 842,
    ]
    # Alternative previously used: criteria = ["input", "new_input"]
    criteria = ["new_input"]

    # Input tables live in ./CSV, generated figures go to ./Fig.
    working_dir = os.getcwd()
    path_csv = os.path.join(working_dir, "CSV")
    path_fig = os.path.join(working_dir, "Fig")
    os.makedirs(path_fig, exist_ok=True)

    totals = []
    df_data = pd.DataFrame()

    for problem_id in tqdm(problem_ids):
        print(f'Problema {problem_id}')

        df_solution = pd.read_csv(os.path.join(path_csv, f"solution_{problem_id}.csv"))
        n_solutions = len(df_solution)
        n_criteria = len(criteria)

        # Pass True as the last argument of the create* calls to also save the figures.
        df_heatmap = createHeatMap(path_fig, df_solution, criteria, problem_id, False)
        totals, _ = createPieChart(path_fig, df_heatmap, n_solutions, totals, problem_id, n_criteria, False)
        df_data = getData(df_heatmap, n_solutions, problem_id, df_data)
        createCatPlot(path_fig, df_solution, criteria, problem_id, False)

        print(getDiscoveryErrors(df_solution, criteria))

    # Optional follow-ups, currently disabled:
    #boxplot_settings(df_data, ['PF', 'FP'])
    #df_data.to_csv('data.csv', index=True)

In [88]:
# Run the analysis for every problem id listed in main().
main()


  9%|███▊                                        | 3/35 [00:00<00:01, 28.17it/s]

Problema 736
{'new_input': {0: 4186, 1: 7, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}}
Problema 742
{'new_input': {0: 12941, 1: 1439, 2: 3, 3: 7, 4: 0}}
Problema 744
{'new_input': {0: 7186, 1: 853, 2: 123, 3: 61}}
Problema 751
{'new_input': {0: 4589, 1: 347, 2: 9, 3: 16, 4: 1602, 5: 8, 6: 60, 7: 0}}
Problema 798
{'new_input': {0: 4439, 1: 525, 2: 61, 3: 29, 4: 25}}
Problema 800
{'new_input': {0: 2494, 1: 1259, 2: 16, 3: 5, 4: 0, 5: 77, 6: 0}}
Problema 804


 31%|█████████████▌                             | 11/35 [00:00<00:00, 31.62it/s]

{'new_input': {0: 22578, 1: 1852, 2: 66, 3: 0, 4: 0, 5: 242, 6: 0, 7: 49, 8: 0}}
Problema 806
{'new_input': {0: 10028, 1: 1505, 2: 489, 3: 1, 4: 165, 5: 7, 6: 0, 7: 0, 8: 201}}
Problema 807
{'new_input': {0: 14837, 1: 817, 2: 182, 3: 2841, 4: 792, 5: 37, 6: 5, 7: 179, 8: 566}}
Problema 809
{'new_input': {0: 8112, 1: 111, 2: 0, 3: 0, 4: 0}}
Problema 810
{'new_input': {0: 12988, 1: 4527, 2: 1284}}
Problema 811
{'new_input': {0: 5992, 1: 1807, 2: 140, 3: 413, 4: 661, 5: 188, 6: 263, 7: 104, 8: 217}}
Problema 812
{'new_input': {0: 7699, 1: 4321, 2: 446}}
Problema 815


 54%|███████████████████████▎                   | 19/35 [00:00<00:00, 33.41it/s]

{'new_input': {0: 6574, 1: 8, 2: 270, 3: 41, 4: 6, 5: 0, 6: 0}}
Problema 816
{'new_input': {0: 8927, 1: 118, 2: 658, 3: 397, 4: 1398, 5: 19, 6: 0}}
Problema 817
{'new_input': {0: 10729, 1: 304, 2: 1540, 3: 219, 4: 157, 5: 6}}
Problema 819
{'new_input': {0: 7553, 1: 23, 2: 1816, 3: 291, 4: 15, 5: 2, 6: 12, 7: 33}}
Problema 820
{'new_input': {0: 11319, 1: 388, 2: 638, 3: 259, 4: 262, 5: 27, 6: 0, 7: 2, 8: 4}}
Problema 821
{'new_input': {0: 4856, 1: 234, 2: 332, 3: 0}}
Problema 822
{'new_input': {0: 4534, 1: 1420, 2: 389, 3: 72, 4: 497, 5: 1, 6: 1}}
Problema 823


 77%|█████████████████████████████████▏         | 27/35 [00:00<00:00, 34.96it/s]

{'new_input': {0: 7235, 1: 779, 2: 897, 3: 365, 4: 15, 5: 1}}
Problema 824
{'new_input': {0: 8136, 1: 2573, 2: 1790, 3: 26}}
Problema 827
{'new_input': {0: 5567, 1: 181, 2: 170, 3: 30, 4: 309, 5: 0}}
Problema 828
{'new_input': {0: 6789, 1: 373, 2: 468, 3: 112, 4: 518}}
Problema 829
{'new_input': {0: 2759, 1: 541, 2: 259, 3: 333, 4: 0}}
Problema 831
{'new_input': {0: 7998, 1: 113, 2: 6, 3: 0, 4: 417, 5: 240}}
Problema 832
{'new_input': {0: 8255, 1: 408, 2: 257, 3: 0}}
Problema 833
{'new_input': {0: 4627, 1: 3, 2: 1, 3: 1299, 4: 6, 5: 61, 6: 26, 7: 157, 8: 3, 9: 0, 10: 1}}
Problema 834


100%|███████████████████████████████████████████| 35/35 [00:01<00:00, 33.45it/s]

{'new_input': {0: 3696, 1: 29, 2: 73, 3: 0, 4: 6, 5: 11, 6: 2, 7: 0}}
Problema 835
{'new_input': {0: 8402, 1: 270, 2: 13, 3: 57, 4: 3, 5: 0, 6: 8, 7: 0, 8: 0}}
Problema 836
{'new_input': {0: 3582, 1: 2725, 2: 51, 3: 101, 4: 0, 5: 6, 6: 0, 7: 1, 8: 0}}
Problema 838
{'new_input': {0: 4028, 1: 3, 2: 129, 3: 1, 4: 908, 5: 0, 6: 0}}
Problema 839
{'new_input': {0: 9681, 1: 1516, 2: 698, 3: 114, 4: 2621, 5: 85, 6: 25}}
Problema 840
{'new_input': {0: 6250, 1: 460, 2: 102, 3: 308, 4: 2, 5: 166, 6: 125, 7: 2, 8: 0}}
Problema 842
{'new_input': {0: 9593, 1: 473, 2: 92, 3: 128, 4: 28, 5: 56, 6: 22, 7: 55, 8: 21}}



