## Why does each tool fail to reproduce failures and perform fault localization (FL)?

In [6]:
# Initialize the independent variables
import pandas as pd

# Tools under analysis: display name -> prefix used in the names of their result files
tools = {
    "GZoltar": "GZOLTAR1_7",
    "Flacoco": "FLACOCO",
}

# Targets to compare; each one names a sub-directory of results_faultlocalization/
targets = [
    "default",
    "java7",
    "java8",
    "java11",
    "java17",
]
# To analyse a single target only, use instead: targets = ["default"]

# Focus on the 191 bugs we are able to compile down to all versions
def load_dataframe(target):
    """Read the results table of one target.

    The table is read from ``results_faultlocalization/<target>/results.csv``
    and indexed by the ``pid`` and ``bid`` columns.
    """
    csv_location = "".join(("results_faultlocalization/", target, "/results.csv"))
    results_table = pd.read_csv(csv_location, index_col=["pid", "bid"])
    return results_table

def filter_dataframe(df):
    """Return the rows of `df` whose four status flags are all set.

    A row is kept only if its ``env``, ``checkout``, ``compile`` and
    ``check_classfiles`` columns are all true.
    """
    usable = df.env & df.checkout
    usable = usable & df.compile
    usable = usable & df.check_classfiles
    return df.loc[usable]

# Load and filter the results table of every target
dfs = {target: filter_dataframe(load_dataframe(target)) for target in targets}

# Build the intersection index: the (pid, bid) pairs present in every filtered table
index = dfs[targets[0]].index
for target in targets[1:]:
    index = index.intersection(dfs[target].index)

# Process the dataframes to contain only the intersection, sorted by index
for target, frame in dfs.items():
    dfs[target] = frame.loc[index].sort_index()

# After the restriction every table has the same index, so the first one is representative
bugs = set(dfs[targets[0]].index)
print(len(bugs))

191


In [7]:
from pathlib import Path

# Iterate over the independent variables
def _read_stripped_lines(path):
    """Return all lines of the text file at `path`, stripped of surrounding whitespace."""
    with open(path) as f:
        return [line.strip() for line in f]


results = []
# `bugs` is a set, so its iteration order may differ between kernel sessions;
# sorting keeps the row order of `results` reproducible.
for pid, bid in sorted(bugs):
    # Load the baseline test failures
    failing_tests_baseline = set(
        _read_stripped_lines("scripts/test_baselines/%s_%s_failing_tests" % (pid, bid))
    )

    for target in targets:
        for tool in tools.keys():
            # One record per (bug, target, tool) combination:
            # - The first 4 keys are the independent variables identifying the combination
            # - "executed" is True when the tool's executed-tests file exists
            # - "failing_in_executed" is True when the baseline failing tests are all in the executed tests
            # - "failing_same" is True when the reported failing tests equal the baseline failing tests
            # - "failing_less" is True when the reported failing tests are a strict subset of the baseline
            # - "failing_more" is True when a reported failing test is not in the baseline
            # - "failing_empty" is True when no failing test was reported
            # - "fl_empty" is True when the suspiciousness file has no lines
            # Every flag except "executed" stays None when there is no execution information.
            result = {"pid" : pid, "bid" : bid, "target" : target, "tool" : tool,
                      "executed" : None, "failing_in_executed" : None,
                      "failing_less" : None, "failing_more" : None,
                      "failing_same" : None, "failing_empty" : None,
                      "fl_empty" : None
                     }

            execution_path = Path("results_faultlocalization", target, "%s_executed_tests_%s%s.csv" % (tools[tool], pid, bid))
            failing_path = Path("results_faultlocalization", target, "%s_failing_tests_%s%s.csv" % (tools[tool], pid, bid))
            fl_path = Path("results_faultlocalization", target, "%s_suspicious_%s%s.csv" % (tools[tool], pid, bid))
            if not execution_path.exists():
                result["executed"] = False
            else:
                executed_tests = set(_read_stripped_lines(execution_path))
                failing_tests = set(_read_stripped_lines(failing_path))
                # NOTE(review): every line counts here, including blank or header
                # lines if the files contain any - confirm the file format.
                fl_results = _read_stripped_lines(fl_path)

                result["executed"] = True
                result["failing_in_executed"] = failing_tests_baseline.issubset(executed_tests)
                result["failing_same"] = failing_tests == failing_tests_baseline
                result["failing_less"] = failing_tests.issubset(failing_tests_baseline) and not result["failing_same"]
                result["failing_more"] = not failing_tests.issubset(failing_tests_baseline)

                result["failing_empty"] = len(failing_tests) == 0
                result["fl_empty"] = len(fl_results) == 0

            results.append(result)

In [8]:
# Turn the collected records into one table (one row per record) and display it
df = pd.DataFrame(data=results)
df

Unnamed: 0,pid,bid,target,tool,executed,failing_in_executed,failing_less,failing_more,failing_same,failing_empty,fl_empty
0,Compress,1,default,GZoltar,True,True,False,False,True,False,False
1,Compress,1,default,Flacoco,True,True,False,False,True,False,False
2,Compress,1,java7,GZoltar,True,True,False,False,True,False,False
3,Compress,1,java7,Flacoco,True,True,False,False,True,False,False
4,Compress,1,java8,GZoltar,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1905,JacksonCore,13,java8,Flacoco,True,True,False,False,True,False,False
1906,JacksonCore,13,java11,GZoltar,True,True,False,False,True,False,False
1907,JacksonCore,13,java11,Flacoco,True,True,False,False,True,False,False
1908,JacksonCore,13,java17,GZoltar,True,False,True,False,False,True,True


In [9]:
# Print, for every tool, a textual breakdown of how far its runs got
# (executed -> all failing tests executed -> failures reproduced), the same
# numbers as SankeyMATIC flows, and the runs with empty outputs.
for tool in tools:
    print("Analyzing %s..." % tool)
    print()
    tool_data = df[df["tool"]==tool]

    # Flags other than "executed" are None for runs without execution data,
    # hence the explicit comparisons with True throughout this cell.
    executed_data = tool_data[tool_data["executed"]==True]
    print("%s registered execution data for %s bug instances across the %s targets."
          % (tool, len(executed_data), len(targets)))
    print()

    failing_in_executed_data = executed_data[executed_data["failing_in_executed"]==True]
    print("Out of those, only %s contained all baseline failing tests in the execution set."
         % (len(failing_in_executed_data)))
    print("That means %s, in the cases it executed, did not execute all failing test cases %s times."
          % (tool, len(executed_data) - len(failing_in_executed_data)))
    print()

    failing_less = failing_in_executed_data[failing_in_executed_data["failing_less"]==True]
    failing_more = failing_in_executed_data[failing_in_executed_data["failing_more"]==True]
    print("Even in the cases where %s was able to execute all failing tests" % tool +
          " it was not always able to correctly reproduce failures.")
    print("In %s cases, not all expected failing tests where reported as failing." % len(failing_less))
    print("In %s cases, there where unexpected failing tests reported." % len(failing_more))
    print()

    failing_same = failing_in_executed_data[failing_in_executed_data["failing_same"]==True]
    print("Only in %s cases where failing tests properly reproduced." % len(failing_same))

    print("\n" * 4)

    print("SankeyMATIC:")
    for target in targets:
        print("%s [%s] %s" % (target, len(tool_data[tool_data["target"]==target]), tool))
    print("%s [%s] Executed" % (tool, len(executed_data)))
    print("%s [%s] Failed to Execute" % (tool, len(tool_data) - len(executed_data)))
    print("Executed [%s] All Failing Tests Executed" % (len(failing_in_executed_data)))
    print("Executed [%s] Not All Failing Tests Executed" % (len(executed_data) - len(failing_in_executed_data)))
    # Only the "failing_more" rows actually reported unexpected failures. The
    # "failing_less" rows (expected failures that were not reported) get their
    # own flow, so they are not mislabelled and the flows out of
    # "All Failing Tests Executed" still add up to its total.
    print("All Failing Tests Executed [%s] Unexpected Failing Tests Reported" % (len(failing_more)))
    if len(failing_less) > 0:
        print("All Failing Tests Executed [%s] Expected Failing Tests Not Reported" % (len(failing_less)))
    print("All Failing Tests Executed [%s] Correct Reproduction" % (len(failing_same)))

    print("\n" * 4)

    failing_empty_data = tool_data[tool_data["failing_empty"]==True]
    fl_empty_data = tool_data[tool_data["fl_empty"]==True]
    print("%s reported an empty set of failing tests in %s cases."
          % (tool, len(failing_empty_data)))
    print("%s reported an empty list of suspicious lines in %s cases."
          % (tool, len(fl_empty_data)))
    # Rows present in both selections appear twice after the concat and are all
    # removed by keep=False, leaving the rows where only one of the two outputs is empty.
    print(pd.concat([failing_empty_data, fl_empty_data])
          .drop_duplicates(keep=False))

    print("\n" * 4)

Analyzing GZoltar...

GZoltar registered execution data for 804 bug instances across the 5 targets.

Out of those, only 430 contained all baseline failing tests in the execution set.
That means GZoltar, in the cases it executed, did not execute all failing test cases 374 times.

Even in the cases where GZoltar was able to execute all failing tests it was not always able to correctly reproduce failures.
In 0 cases, not all expected failing tests where reported as failing.
In 162 cases, there where unexpected failing tests reported.

Only in 268 cases where failing tests properly reproduced.





SankeyMATIC:
default [191] GZoltar
java7 [191] GZoltar
java8 [191] GZoltar
java11 [191] GZoltar
java17 [191] GZoltar
GZoltar [804] Executed
GZoltar [151] Failed to Execute
Executed [430] All Failing Tests Executed
Executed [374] Not All Failing Tests Executed
All Failing Tests Executed [162] Unexpected Failing Tests Reported
All Failing Tests Executed [268] Correct Reproduction





GZoltar repo