# Merge PIT mutation to test-xml-file (Flaky Failures)

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import csv
import re
import xml.etree.ElementTree as ET
import filecmp
from pathlib import Path
import shutil
import collections
from itertools import count, groupby
import ast
import warnings
from collections import Counter
from tqdm import tqdm
import json
from lxml import etree
import base64
import unicodedata

## Functions

### General functions

In [2]:
def readXMLFile(filePath):
    """Parse the XML file at ``filePath`` with lxml and return its root element.

    The parser is created with ``recover=True`` and ``strip_cdata=False``; the
    file is opened in binary mode and handed to ``etree.parse``.
    """
    lenient_parser = etree.XMLParser(strip_cdata=False, recover=True)
    with open(filePath, "rb") as handle:
        document = etree.parse(handle, parser=lenient_parser)
    return document.getroot()

def writeXML(xml,output):
    """Write the element ``xml`` to the file ``output + ".xml"``.

    ``output`` is the destination path *without* the ``.xml`` extension; the
    element is wrapped in an ``ET.ElementTree`` and serialized with its
    default settings.
    """
    ET.ElementTree(xml).write(output + ".xml")

def save_dict_to_json(data_dict, file_name):
    """Serialize ``data_dict`` as JSON (2-space indent) into ``file_name``.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open(file_name, 'w') as sink:
        sink.write(json.dumps(data_dict, indent=2))

def cleanTestXML(testXML):
    """Drop every ``<mutant>`` element and reset the ``<mutants>`` attributes.

    Called before merging so that mutants from a previous merge are not
    duplicated. The tree is modified in place and also returned. Raises
    ``IndexError`` when the root has no direct ``<mutants>`` child.
    """
    stale_mutants = testXML.findall(".//mutant")
    for node in stale_mutants:
        node.getparent().remove(node)

    mutants_container = testXML.findall("./mutants")[0]
    mutants_container.attrib.clear()
    return testXML

def getFilesByEndsWith(fileDir,ends):
    """Return the paths of all files under ``fileDir`` whose name ends with ``ends``.

    The directory is walked recursively with ``os.walk``; each hit is returned
    as ``os.path.join(directory, file_name)``.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(fileDir)
        for name in names
        if name.endswith(ends)
    ]

def getFilesByStartsWith(fileDir,starts):
    """Return the paths of all files under ``fileDir`` whose name starts with ``starts``.

    The directory is walked recursively with ``os.walk``; each hit is returned
    as ``os.path.join(directory, file_name)``.
    """
    matches = []
    for folder, _subdirs, names in os.walk(fileDir):
        matches.extend(
            os.path.join(folder, name) for name in names if name.startswith(starts)
        )
    return matches
    
def removeTestDirTag(testXML):
    """Remove every ``<test_dir>`` element from ``testXML`` and return the tree.

    The tree is modified in place. The matches are collected into a list
    before anything is removed, so the tree is never mutated while the lazy
    ``iter()`` traversal is still running.
    """
    for child in list(testXML.iter('test_dir')):
        child.getparent().remove(child)
    return testXML

def read_json_file(file_path):
    """Load and return the JSON content of ``file_path``.

    When the file does not exist, an error line is printed and ``None`` is
    returned instead of raising.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    with handle:
        return json.load(handle)
        


### Merge functions

In [3]:

def get_full_exception_and_stacktrace(message):
    """Split a raw failure ``message`` into its text part and its stack frames.

    Lines that start with ``"\\tat "`` or ``"at "`` are treated as stack
    frames and returned newline-terminated. All other non-blank lines are
    joined with single spaces into the exception text, which is stripped of
    surrounding whitespace.

    Returns:
        ``(exception_text, stacktrace)`` as two strings.
    """
    message_parts = []
    frame_parts = []
    for text in message.split("\n"):
        if not text.strip():
            continue  # blank lines contribute to neither part
        if text.startswith(("\tat ", "at ")):
            frame_parts.append(text + "\n")
        else:
            message_parts.append(text + " ")
    return "".join(message_parts).strip(), "".join(frame_parts)

def remove_control_characters(s):
    """Return ``s`` without characters whose Unicode category starts with ``C``.

    That covers the "Other" categories (control, format, ...), so tabs and
    newlines are removed as well.
    """
    kept = filter(lambda ch: unicodedata.category(ch)[0] != "C", s)
    return "".join(kept)

def remove_cdata_words(line):
    """Strip CDATA markers from ``line`` (converted with ``str``).

    Two patterns are removed in order: ``[CDATA[word]]`` fragments, then
    ``//<![CDATA[`` openers (with an optional single whitespace after ``//``).
    """
    without_inline = re.sub(r'(\[CDATA\[(\w*|\w*)\]\])', '', str(line))
    return re.sub(r'(\/\/(\s?)<!\[CDATA\[)', '', str(without_inline))


def getExceptionWithMessageAndStacktraces(exception):
    """Decode a base64 ``exception`` payload into ``(exception_text, stacktrace)``.

    Each decoded line is UTF-8 decoded, cleaned of CDATA markers and control
    characters, and newline-terminated. The cleaned lines are joined with a
    tab, so every line after the first starts with ``"\\t"``, then split by
    ``get_full_exception_and_stacktrace``.
    """
    decoded = base64.b64decode(exception)
    cleaned_lines = [
        remove_control_characters(remove_cdata_words(raw.decode('utf-8'))) + "\n"
        for raw in decoded.splitlines()
    ]
    return get_full_exception_and_stacktrace('\t'.join(cleaned_lines))

def appendMutant(xmlFile,status,totalKills,perException,mutant_name,mutantId,full_exception,stackTrace):
    """Append one ``<mutant>`` entry under the ``<mutants>`` element of ``xmlFile``.

    The entry has three children: ``mutant_name`` (text ``mutant_name``, with
    the id, the fixed source ``PIT20runs``, status and kill counts as
    attributes), ``mutant_exception`` (``full_exception`` without tabs) and
    ``mutant_stackTrace`` (``stackTrace`` prefixed by a newline).
    The tree is modified in place and returned.
    """
    mutant = etree.SubElement(xmlFile.find('mutants'), "mutant")

    name_attributes = {
        'mutant_id': str(mutantId),
        'source': 'PIT20runs',
        'status': str(status),
        'numberOfKills': str(totalKills),
        'killedByThisException': str(perException),
    }
    name_node = etree.SubElement(mutant, "mutant_name", name_attributes)
    name_node.text = mutant_name

    exception_text = re.sub('\t','',full_exception)
    etree.SubElement(mutant, "mutant_exception").text = exception_text

    trace_text = "\n"+stackTrace
    etree.SubElement(mutant, "mutant_stackTrace").text = trace_text
    return xmlFile

def mutationStat(xmlFile,mutant,mutantsDetails):
    """Store mutation counts as attributes on the ``<mutants>`` element of ``xmlFile``.

    ``Total`` is the number of direct ``<mutation>`` children in the report at
    path ``mutant``; ``Killed`` and ``Flaky`` count the keys of
    ``mutantsDetails`` ending in ``KILLED`` and ``FLAKY`` respectively.
    The tree is modified in place and returned.
    """
    report_root = readXMLFile(mutant)
    total = str(len(report_root.findall('./mutation')))
    xmlFile.find('mutants').attrib['Total'] = total

    for attribute, suffix in (('Killed', 'KILLED'), ('Flaky', 'FLAKY')):
        matching = sum(1 for key in mutantsDetails.keys() if key.endswith(suffix))
        xmlFile.find('mutants').attrib[attribute] = str(matching)
    return xmlFile

def collectKilledMutants(pits,indx):
    """Collect the killing exceptions of killed/flaky mutants across PIT reports.

    Args:
        pits: paths of the PIT report files for one test.
        indx: positions (child indices under the report root) to inspect.

    Returns:
        A dict keyed by ``"<index>|KILLED"`` when the mutant is ``KILLED`` in
        every report, or ``"<index>|FLAKY"`` when the reports contain both
        ``KILLED`` and ``SURVIVED`` or any ``FLAKY`` status. Each value is the
        list of ``killingException`` texts found at that index in all reports.
        Indices with any other status combination are left out.
    """
    reports = [readXMLFile(path) for path in pits]

    collected = {}
    for position in indx:
        statuses = [report[position].attrib['status'] for report in reports]

        if all(s == "KILLED" for s in statuses):
            label = 'KILLED'
        elif ('KILLED' in statuses and 'SURVIVED' in statuses) or 'FLAKY' in statuses:
            label = 'FLAKY'
        else:
            continue  # neither consistently killed nor flaky

        encoded_exceptions = []
        for report in reports:
            encoded_exceptions.extend(
                node.text for node in report[position].iter('killingException')
            )
        collected[str(position) + '|' + label] = encoded_exceptions

    return collected


def mergedMutants(test,killedMutants,test_name):
    """Append one ``<mutant>`` entry per distinct killing exception to ``test``.

    Args:
        test: XML tree of the test file (modified in place and returned).
        killedMutants: dict mapping ``"<mutantId>|<status>"`` to the list of
            base64-encoded killing exceptions (see ``collectKilledMutants``).
        test_name: name written as the text of each ``mutant_name``.

    Distinct exceptions are emitted in first-seen order, so the output no
    longer depends on set ordering (which varies between runs for strings).
    Occurrences are counted once with ``Counter`` instead of calling
    ``list.count`` for every element.
    """
    for key, exceptions in killedMutants.items():
        key_fields = key.split('|')
        occurrences = Counter(exceptions)
        for encoded, killsByThis in occurrences.items():
            exceptionWithMessage, stackTrace = getExceptionWithMessageAndStacktraces(encoded)
            test = appendMutant(test, key_fields[1], len(exceptions), killsByThis,
                                test_name, key_fields[0], exceptionWithMessage, stackTrace)
    return test


### Summarize XML functions

In [16]:
def createSummaryXMLPlainFile():
    """Build the empty skeleton of a summary document.

    Returns:
        ``(root, tests_xml, mutants_xml)`` where ``root`` is a ``<root>``
        element whose two children are ``<tests>`` and ``<mutants>``.
    """
    root = ET.Element("root")
    tests_xml, mutants_xml = [ET.SubElement(root, tag) for tag in ("tests", "mutants")]
    return root, tests_xml, mutants_xml

def add_xml_block(tests_xml,block,test_name,freq,node_id,status,source,exception_type,target_lines,project_name):
    if(block == "test"):        
        test = ET.SubElement(tests_xml, "test")
    else:
        test = ET.SubElement(tests_xml, "mutant")
    ET.SubElement(test, block+"_name", {'source': source,'id': str(node_id),'status': status, 'frequency': str(freq), 'project': str(project_name)}).text = test_name    
    
    test_exception = ET.SubElement(test, block+"_exception")    
    test_exception.text = exception_type
    
    test_stacktracke = ET.SubElement(test, block+"_stackTrace")
    for line in target_lines:
        ET.SubElement(test_stacktracke, "line").text = line
            
    return tests_xml

def get_main_exception_type(failure):
    """Return the simple (unqualified) exception class name from ``failure``.

    Everything from the first ``":"`` onwards is dropped, then only the part
    after the last ``"."`` of what remains is kept. Strings with neither
    character are returned unchanged.
    """
    qualified_name = failure.split(':', 1)[0]
    return qualified_name.rsplit('.', 1)[-1]

#%%
def get_stacktrace(full_stackTrace,test_name):
    """Cut a stack trace down to the frames relevant for ``test_name``.

    Only the text before the first ``"Caused by:"`` is used. It is split on
    ``"at "``; newlines, tabs and surrounding whitespace are removed from each
    piece and empty pieces are dropped. The cut point is chosen by the first
    rule that applies:

    1. ``"test_name"``: a frame starts with ``test_name + "("``; keep frames
       up to and including the first such frame.
    2. ``"Native_Method"``: a frame after the first ends with
       ``".invoke0(Native Method)"``; keep frames before the first frame with
       that suffix.
    3. ``"Unknown_Source"``: same with ``".invoke(Unknown Source)"``.
    4. ``"top-5-lines"``: keep frames up to and including the last frame that
       starts with the test's class name, or through index 5 when no frame
       matches.

    Returns:
        ``(kept_frames, rule_label, total_frame_count)``.
    """
    primary_trace = full_stackTrace.partition("Caused by:")[0]

    frames = []
    for piece in primary_trace.split("at "):
        cleaned = piece.replace("\n", "").replace("\t", "").strip()
        if cleaned:
            frames.append(cleaned)
    total = len(frames)

    # Rule 1: the frame of the test method itself.
    for position, frame in enumerate(frames):
        if frame.startswith(test_name + "("):
            return frames[:position + 1], "test_name", total

    # Rules 2 and 3: reflection frames. The presence check skips the first
    # frame, but the cut index is searched over all frames.
    reflection_rules = (
        (".invoke0(Native Method)", "Native_Method"),
        (".invoke(Unknown Source)", "Unknown_Source"),
    )
    for suffix, label in reflection_rules:
        if any(frame.endswith(suffix) for frame in frames[1:]):
            cut = next(i for i, frame in enumerate(frames) if frame.endswith(suffix))
            return frames[:cut], label, total

    # Rule 4: last frame inside the test class, else a fixed cut at index 5.
    cut = 5
    for position, frame in enumerate(frames):
        if frame.startswith(test_name.rsplit(".", 1)[0]):
            cut = position
    return frames[:cut + 1], "top-5-lines", total

def fixStactTraces(lines):
    """Split stack-trace entries that contain several frames glued by ``")at"``.

    An entry such as ``"a(A.java:1)atb(B.java:2)"`` becomes two entries,
    ``"a(A.java:1)"`` and ``"b(B.java:2)"``. Entries without ``")at"`` are
    kept unchanged. Every glued frame is preserved; previously only the
    first two were kept and any further ones were dropped.

    Returns:
        A new list; ``lines`` is not modified.
    """
    fixed = []
    for line in lines:
        *glued, last = line.split(')at')
        # The separator consumed the closing parenthesis of each leading frame.
        fixed.extend(part + ')' for part in glued)
        fixed.append(last)
    return fixed

def getResultFromTestBlock(xmlroot):
    # keys are: id|freq|project|source|status
    # values  : exception|#|StackTrace                    
    testId = 0        
    testResultPerTestName = {}
    for test in xmlroot.iter('test'):
        if (type(test.find('test_exception').text) is str and type(test.find('test_stackTrace').text) is str):
            testId = testId + 1
            key = str(testId)+'|'+str(test.find("test_name").attrib["frequency"])+'|'+str(test.find("test_name").attrib["project"])+'|'+str(test.find("test_name").attrib["source"])+'|'+str(test.find("test_name").attrib["status"])                        
            # exceptions part .. 
            exception_type = get_main_exception_type(test.find('test_exception').text.lstrip().rstrip())
            # stackTrace Part 
            target_lines,split_line_by, total_stacktrace_lines = get_stacktrace(((test.find('test_stackTrace').text).lstrip()).rstrip(),test.find("test_name").text)

            # Condition ( to lines in one line ' split by ')at')
            target_lines = fixStactTraces(target_lines)
            testResultPerTestName[key]=exception_type+"|#|"+'|'.join(target_lines)
    
    return testResultPerTestName

def findUniqueFailures(testResultPerTestName,tests_xml, test_name):
    uniqueFailures = set([v for k,v in testResultPerTestName.items()])
    for failure in uniqueFailures:
        testResultByFailure = {k:v for k,v in testResultPerTestName.items() if v == failure}
        totalFreq = sum([int(k.split("|")[1]) for k,v in testResultByFailure.items()])
        testId = list(testResultByFailure.keys())[0].split("|")[0]
        project = list(testResultByFailure.keys())[0].split("|")[2]
        source = list(testResultByFailure.keys())[0].split("|")[3]
        status = list(testResultByFailure.keys())[0].split("|")[4]
        tests_xml = add_xml_block(tests_xml,"test",test_name,totalFreq,testId,status,source,failure.split("|#|")[0],failure.split("|#|")[1].split('|'),project)
    return tests_xml


def getResultFromMutationBlock(xmlroot):
    """Summarize every ``<mutant>`` entry of ``xmlroot`` into a dict.

    Returns:
        A dict where
        key   = ``"<n>|##|<mutant_id>|##|<source>|#|<status>|#|<numberOfKills>|#|<killedByThisException>"``
                (``n`` is a 1-based counter; the rest are attributes of
                ``mutant_name``), and
        value = ``"<exception type>|#|<frame>|<frame>|..."``.
    """
    results = {}
    for position, mutant in enumerate(xmlroot.iter('mutant'), start=1):
        attributes = mutant.find('mutant_name').attrib
        run_details = '|#|'.join([
            attributes['source'],
            attributes['status'],
            attributes['numberOfKills'],
            str(attributes['killedByThisException']),
        ]) if False else None
        key = (str(position) + '|##|' + attributes['mutant_id'] + '|##|'
               + attributes['source'] + '|#|' + attributes['status'] + '|#|'
               + attributes['numberOfKills'] + '|#|'
               + str(attributes['killedByThisException']))

        exception_type = get_main_exception_type(mutant.find('mutant_exception').text.strip())
        target_lines, _rule, _total = get_stacktrace(
            mutant.find('mutant_stackTrace').text.strip(), mutant.find("mutant_name").text
        )
        results[key] = exception_type + "|#|" + '|'.join(target_lines)
    return results

def add_xml_block_for_mutants(mutantPartXML,mutantAttrib,test_name,exception,stackTrace):
    """Append one ``<mutant>`` entry to ``mutantPartXML`` and return it.

    The entry holds ``mutant_name`` (text ``test_name``, attributes
    ``mutantAttrib``), ``mutant_exception`` (text ``exception``) and
    ``mutant_stackTrace`` with one ``<line>`` per item of ``stackTrace``.
    """
    entry = ET.SubElement(mutantPartXML, "mutant")

    name_node = ET.SubElement(entry, "mutant_name", mutantAttrib)
    name_node.text = test_name

    ET.SubElement(entry, "mutant_exception").text = exception

    trace_node = ET.SubElement(entry, "mutant_stackTrace")
    for frame in stackTrace:
        frame_node = ET.SubElement(trace_node, "line")
        frame_node.text = frame

    return mutantPartXML

def findUniqueMutations(mutationResultPerTestName,mutantPartXML, test_name):
    """Add one ``<mutant>`` block per mutant id that has a single failure signature.

    Keys of ``mutationResultPerTestName`` follow the format built by
    ``getResultFromMutationBlock``. A mutant id is kept only when all of its
    entries share the same value (exception type + frames); ids with
    differing values are ignored. Attributes are taken from the first entry
    of the id, and nothing is added when the first frame is empty.

    Returns:
        ``mutantPartXML`` with the new blocks appended, in ascending id order.
    """
    attribute_names = ['mutant_id','source','status','numberOfKills','killedByThisException']
    distinct_ids = sorted({int(key.split('|##|')[1]) for key in mutationResultPerTestName.keys()})

    for mutantId in distinct_ids:
        id_marker = '|##|' + str(mutantId) + '|##|'
        entries = [(key, value) for key, value in mutationResultPerTestName.items() if id_marker in key]

        # Exactly one distinct signature is required (covers the single-entry case).
        if len({value for _key, value in entries}) != 1:
            continue

        first_key, first_value = entries[0]
        attribute_values = [str(mutantId)] + first_key.split('|##|')[2].split('|#|')
        mutantAttrib = dict(zip(attribute_names, attribute_values))

        value_fields = first_value.split('|#|')
        exception = value_fields[0]
        stackTraceLines = value_fields[1].split('|')
        if stackTraceLines[0] != "":
            mutantPartXML = add_xml_block_for_mutants(mutantPartXML,mutantAttrib,test_name,exception,stackTraceLines)

    return mutantPartXML

def addPITdetails(summaryXML,total):
    """Set the ``Total``, ``Killed`` and ``Flaky`` attributes on ``<mutants>``.

    ``Total`` is ``total`` as given; ``Killed`` and ``Flaky`` count the
    ``<mutant>`` entries of ``summaryXML`` whose ``mutant_name`` status is
    ``KILLED`` or ``FLAKY``. The tree is modified in place and returned.
    """
    summaryXML.findall('./mutants')[0].set('Total', str(total))
    for attribute, wanted_status in (('Killed', 'KILLED'), ('Flaky', 'FLAKY')):
        mutants_node = summaryXML.findall('./mutants')[0]
        statuses = [m.find('mutant_name').attrib['status'] for m in summaryXML.findall('.//mutant')]
        mutants_node.set(attribute, str(statuses.count(wanted_status)))
    return summaryXML


def processMutations(mutationsData,test_name):
    """Remove surplus ``KILLED`` entries from ``mutationsData`` in place.

    Keys follow the format built by ``getResultFromMutationBlock``:
    ``"<n>|##|<mutant_id>|##|<source>|#|<status>|#|<numberOfKills>|#|<killedByThisException>"``.
    Only keys containing ``KILLED`` with ``numberOfKills`` above 20 are
    considered:

    * ``numberOfKills == 40``: the first entry of each mutant id is kept and
      later entries of the same id are removed.
    * ``numberOfKills == 32`` and ``killedByThisException == 12``: removed
      (the "ninja exceptions" case).

    NOTE(review): the thresholds 20/40/32/12 are dataset-specific constants;
    presumably they relate to the 20 PIT runs per test - confirm before reuse.

    Args:
        mutationsData: dict to filter; it is mutated and returned.
        test_name: unused, kept for call compatibility.
    """
    seenKilledIds = set()
    removedKeys = set()
    for key in mutationsData:
        fields = key.split('|#|')
        numberOfKills = int(fields[2])
        if numberOfKills <= 20 or 'KILLED' not in key:
            continue
        if numberOfKills == 40:
            mutantId = key.split('|##|')[1]
            if mutantId in seenKilledIds:
                removedKeys.add(key)
            else:
                seenKilledIds.add(mutantId)
        elif numberOfKills == 32 and int(fields[3]) == 12:
            # This is the case of ninja exceptions ...
            removedKeys.add(key)

    for key in removedKeys:
        del mutationsData[key]
    return mutationsData

### Features Process Functions

In [5]:
def gettTestInfo(xmlFile):
    testName = xmlFile.findall('.//test')[0].find('test_name').text
    className = testName.rsplit('.',1)[0]
    projectName = xmlFile.findall('.//test')[0].find('test_name').attrib['project']
    return testName,className,projectName

def readClassNames(classNamesDir):
    """Load test and CUT class names for every project under ``classNamesDir``.

    Files named exactly ``Test.txt`` or ``CUT.txt`` are read; each line
    (stripped) becomes one row. The project is the first path component
    below ``classNamesDir``.

    Rows are accumulated in a list and the frame is built once:
    ``DataFrame.append`` was removed in pandas 2.0 and copied the whole frame
    on every call.

    NOTE(review): paths are split on ``'/'``, so this assumes POSIX-style
    separators and a ``classNamesDir`` without a trailing slash.

    Returns:
        ``pd.DataFrame`` with columns ``Project``, ``ClassNameType``
        (``"Test"`` or ``"CUT"``) and ``ClassName``.
    """
    columns = ['Project','ClassNameType','ClassName']
    rows = []
    for classType in ['Test','CUT']:
        for file in getFilesByEndsWith(classNamesDir,classType+'.txt'):
            # The suffix search also matches e.g. "FooTest.txt"; keep exact names only.
            if not file.endswith('/'+classType+'.txt'):
                continue
            project = file.split(classNamesDir+'/')[1].split('/')[0]
            with open(file) as f:
                rows.extend([project, classType, line.strip()] for line in f.readlines())
    return pd.DataFrame(rows, columns=columns)

def analyzeStackTraceLines(test,testName,className,projectName,allClassNames):
    """Flag which kinds of frames occur in the ``<line>`` elements of ``test``.

    The project is looked up from ``allClassNames`` via ``className`` (the
    ``projectName`` argument is not used). Each line is classified by the
    first matching prefix, in this order: the test name, the test class,
    another test class of the project, JUnit (``org.junit.`` or
    ``junit.framework``), a CUT class of the project.

    Returns:
        ``(testNameVal, classNameVal, otherTestVal, jUnitVal, CUTlines)``,
        each ``1`` if at least one line fell into that category, else ``0``.

    Raises:
        IndexError: if ``className`` is not present in ``allClassNames``.
    """
    owning_project = allClassNames.loc[allClassNames['ClassName'] == className, 'Project'].iloc[0]
    project_rows = allClassNames[allClassNames['Project']==owning_project]
    testClassNames = project_rows[project_rows['ClassNameType']=='Test']['ClassName'].unique()
    cutClassNames = project_rows[project_rows['ClassNameType']=='CUT']['ClassName'].unique()

    testNameVal = classNameVal = otherTestVal = jUnitVal = CUTlines = 0
    for line in test.iter('line'):
        frame = line.text
        if frame.startswith(testName):
            testNameVal = 1
        elif frame.startswith(className):
            classNameVal = 1
        elif any(frame.startswith(name) for name in testClassNames):
            otherTestVal = 1
        elif frame.startswith('org.junit.') or frame.startswith('junit.framework'):
            jUnitVal = 1
        elif any(frame.startswith(name) for name in cutClassNames):
            CUTlines = 1

    return testNameVal,classNameVal,otherTestVal,jUnitVal,CUTlines

def readTxtFile(file):
    """Return the lines of the text file ``file``, each stripped of whitespace."""
    with open(file) as handle:
        return [entry.strip() for entry in handle.readlines()]

def _strip_generic_and_inner_suffix(name):
    """Cut a trailing ``<...`` and/or ``$...`` part off the last segment of ``name``."""
    if '<' in name.rsplit('.',1)[-1]:
        name = name.rsplit('<',1)[0]
    if '$' in name.rsplit('.',1)[-1]:
        name = name.rsplit('$',1)[0]
    return name


def getAllClassesNames(classesNames,testClassesNames,projectName):
    """Build the class-name table for one project.

    Generic parameters (``<...``) and the last inner-class suffix (``$...``)
    are stripped from the simple name of every class.

    Fixes over the previous version:
    * test class names are now checked against themselves; the old loop
      tested the leftover ``className`` of the CUT loop, which raised
      ``NameError`` for an empty CUT list and otherwise applied the wrong
      condition;
    * names without a package (no ``"."``) no longer raise ``IndexError``;
    * rows are collected and the frame is built once, because
      ``DataFrame.append`` was removed in pandas 2.0.

    Args:
        classesNames: iterable of CUT class names.
        testClassesNames: iterable of test class names.
        projectName: value of the ``Project`` column for every row.

    Returns:
        ``pd.DataFrame`` with columns ``Project``, ``ClassNameType``
        (``"CUT"`` rows first, then ``"Test"`` rows) and ``ClassName``.
    """
    rows = [[projectName, "CUT", _strip_generic_and_inner_suffix(className)]
            for className in classesNames]
    rows.extend([projectName, "Test", _strip_generic_and_inner_suffix(testClassName)]
                for testClassName in testClassesNames)
    return pd.DataFrame(rows, columns=['Project','ClassNameType','ClassName'])

def findDuplicateFailures(summaryFiles):
    for testFile in tqdm (summaryFiles,desc="Completed..."):
        if (os.path.getsize(testFile) >0):
            testXMLfile = readXMLFile(testFile)
            if (len(testXMLfile.findall('.//test'))>0):
                uniqueFailures = {}
                for test in testXMLfile.findall('.//mutant'):
                    status = test.find('mutant_name').attrib['status']
                    mutantId = test.find('mutant_name').attrib['mutant_id']
                    testException =  test.find('mutant_exception').text
                    stackTraceLines =  [line.text for line in test.iter('line')]
                    if (status+'#'+testException+'#'+'|'.join(stackTraceLines) not in uniqueFailures.values()):
                        uniqueFailures[int(mutantId)] = status+'#'+testException+'#'+'|'.join(stackTraceLines)
                
                FeaturesPerTest = pd.read_csv(os.path.join(testFile.rsplit('/',1)[0],'FeaturesPerTest.csv'))
                FeaturesPerTest['DuplicateFailure'] = 0
                for index, row in FeaturesPerTest.iterrows():
                    if row['FailureType'] == 'mutant':
                        if (row['FailureId'] not in list(uniqueFailures.keys())):
                            FeaturesPerTest.at[index,'DuplicateFailure']=1

                FeaturesPerTest.to_csv(os.path.join(testFile.rsplit('/',1)[0],'FeaturesPerTest.csv'), index=False)


## Input

In [19]:
# Directory of the 22 project dataset
datasetDir = "Path-to-22-projects/ICST-Final-Dataset"

# Per-test XML files: every .xml under the dataset except the generated
# summaries and the PIT reports themselves.
allXmlFiles = getFilesByEndsWith(datasetDir,'.xml')
testXmlFiles = [t for t in allXmlFiles if not t.split('/')[-1].startswith(('summary-of-', 'pit-report'))]


pitIndexPerTest = 'Path-to/target-index-per-test.json'
targetIndex = read_json_file(pitIndexPerTest)


#### Pair of test-xml-file : PITs reports
allPITsReports = getFilesByStartsWith(datasetDir,'pit-report')

# Group the reports by the exact name of their parent directory. A substring
# test (pit+'/' in path) would also hand a directory the reports of any other
# directory whose name merely ends with the same text.
reportsPerPitDir = {}
for report in allPITsReports:
    reportsPerPitDir.setdefault(report.split('/')[-2], []).append(report)

pitTests = list(reportsPerPitDir)
# Directory names use '#' before the method name; dataset keys use '.' instead.
dataset = {pit.replace('#','.'): reports for pit, reports in reportsPerPitDir.items()}



#### This is for Step 3
parser = etree.XMLParser(strip_cdata=False,recover=True)
classNamesDir = 'Path-to/FlakeFlagger-testCode-CUT-reports'
allClassNames = readClassNames(classNamesDir)
summaryFiles = getFilesByStartsWith(datasetDir,'summary-of-')

## Main

### STEP 0) Which mutation indices of each test's PIT report need to be parsed?

In [13]:
alreadyParsedXML = read_json_file(pitIndexPerTest)
targetMutations = ['KILLED','SURVIVED','FLAKY']

overwrite = False # Change it if you would like to parse the PIT reports again.
if (overwrite):
    alreadyParsedXML = {}
    for k in tqdm(list(dataset.keys())):
        if (k not in alreadyParsedXML.keys()):
            # Only the first report of each test is inspected; the mutation
            # list is fetched once instead of once per index.
            mutantXmlFile = readXMLFile(dataset[k][0])
            mutations = mutantXmlFile.findall('./mutation')
            alreadyParsedXML[k] = [ind for ind, mutation in enumerate(mutations)
                                   if mutation.attrib['status'] in targetMutations]
            # Saved after every test so that progress survives an interruption.
            # NOTE(review): this path differs from pitIndexPerTest, which is the
            # file read above - confirm both point to the same location.
            save_dict_to_json(alreadyParsedXML, 'data-input/target-index-per-test.json')

### STEP 1) Clean each test-xml-file before merging the mutations. The mutants are merged into the same XML file, under its 'mutants' block

In [15]:
alreadyParsedXML = read_json_file(pitIndexPerTest)

for test in tqdm(list(dataset.keys())):
    test_name = [t for t in testXmlFiles if t.endswith(test+'.xml')]
    testXML = [t for t in testXmlFiles if t.endswith(test+'.xml')]
    if (len(test_name)>0 and test in dataset.keys() and test in alreadyParsedXML.keys()): 
        # This IF is because not every pit has test-xml-file (not flaky test) or there is no pit reports

        # read the xml file 
        testXML = readXMLFile(test_name[0])

        # 1) Prepare the xml file before mering the mutations
        testFailuresOnly = cleanTestXML(testXML)

        # 2) Temp: remove test_dir if exist (no longer needed)
        testFailuresOnly = removeTestDirTag(testFailuresOnly)

        # 3) Get killed mutants per test and their exceptions: Dictionary (k = mutantId (index) and V = list of exceptions)
        killedMutants = collectKilledMutants(dataset[test],alreadyParsedXML[test])

        # 4) Merge mutants to the XML file .. 
        mergeXML = mergedMutants(testFailuresOnly,killedMutants,test)
        
        # 5) Save mergeFile 
        writeXML(testFailuresOnly,test_name[0].rsplit('.xml',1)[0])
        

100%|██████████| 683/683 [00:00<00:00, 6721.97it/s]


### STEP 2) Summarize each test-xml-file to consider exception type and stacktraces only (as designed by our paper ICST2024)

In [17]:
for test_name in tqdm(list(dataset.keys())):
    testXML = [t for t in testXmlFiles if t.endswith(test_name+'.xml')]
    # Skip tests without a test-xml-file and files that are empty on disk.
    if (len(testXML)==0):
        continue
    if (os.path.getsize(testXML[0])<=0):
        continue

    # 1) read the xml file
    xmlroot = readXMLFile(testXML[0])

    # 2) Create the empty skeleton of the summary-of- file
    summaryXML, testPartXML, mutantPartXML = createSummaryXMLPlainFile()

    # 3) + 4) collect the test failures, then add one block per unique failure
    testResultPerTestName = getResultFromTestBlock(xmlroot)
    testPartXML = findUniqueFailures(testResultPerTestName, testPartXML, test_name)

    # 5) collect the mutant failures and 5a) drop the surplus entries
    #    (processMutations filters the dict in place and returns it)
    mutationResultPerTestNameOriginal = getResultFromMutationBlock(xmlroot)
    mutationResultPerTestName = processMutations(mutationResultPerTestNameOriginal,test_name)

    # 6) add one block per mutant with a single failure signature
    mutantPartXML = findUniqueMutations(mutationResultPerTestName,mutantPartXML, test_name)

    # 7) EXTRA: add pit details (how many mutants / how many are killed);
    #    the total comes from the first PIT report of the test
    pitReportRoot = readXMLFile(dataset[test_name][0])
    summaryXML = addPITdetails(summaryXML,len(pitReportRoot.findall('./mutation')))

    # 8) export the summary next to the source file, prefixed with "summary-of-"
    pathParts = testXML[0].rsplit('/',1)
    outputFile = pathParts[0]+'/summary-of-'+ pathParts[1]
    abstract_tree = ET.ElementTree(summaryXML)
    abstract_tree.write(outputFile)

100%|██████████| 683/683 [02:26<00:00,  4.66it/s]


### STEP 3) Collect the features (Table 1 in the paper) from each failure

In [None]:
# The summary files are written by STEP 2, so the list is collected again
# here; a list gathered before STEP 2 ran would miss them on a fresh dataset.
summaryFiles = getFilesByStartsWith(datasetDir,'summary-of-')

# Class names known from the class-name reports (looked up once, not per file).
knownClassNames = set(allClassNames['ClassName'].unique())

featureColumns = ['Project','Test','FailureType','FailureId','FailureStatus','FailureException',
                  'TestNameInStackTrace','ClassNameInStackTrace','otherTestClassInStackTrace',
                  'JunitInStackTrace','CUTinStackTrace']

for testFile in tqdm(summaryFiles):
    if (os.path.getsize(testFile) <= 0):
        continue
    testXMLfile = readXMLFile(testFile)
    if (len(testXMLfile.findall('.//test')) == 0):
        continue

    testName,className,projectName = gettTestInfo(testXMLfile)
    # Tests whose class is not in the class-name reports cannot be analyzed.
    if (className not in knownClassNames):
        continue

    perTest = []
    for failure in ['test','mutant']:
        for test in testXMLfile.findall('.//'+failure):
            nameNode = test.find(failure+'_name')
            status = nameNode.attrib['status']
            testException = test.find(failure+'_exception').text
            # Test failures are identified by 'id', mutants by 'mutant_id'.
            rowId = nameNode.attrib['id'] if failure == 'test' else nameNode.attrib['mutant_id']

            testNameExist,classNameExist,otherTestNameExist,junitLinesExist,cutLinesExist = analyzeStackTraceLines(test,testName,className,projectName,allClassNames)

            # consider all flaky failures: a test failure is repeated once per
            # observed occurrence ('frequency'); a mutant yields a single row
            maxFreq = 1
            if (failure == "test"):
                maxFreq = int(nameNode.attrib['frequency'])

            perFailure = [projectName, testName, failure, rowId, status, testException,
                          testNameExist, classNameExist, otherTestNameExist,
                          junitLinesExist, cutLinesExist]
            perTest.extend(list(perFailure) for _ in range(maxFreq))

    perTestdf = pd.DataFrame(perTest, columns=featureColumns)

    # This step adds a label telling whether the test has mutants or not
    perTestdf['HasMutants'] = 1 if len(testXMLfile.findall('.//mutant'))>0 else 0

    perTestdf.to_csv(testFile.rsplit('/',1)[0]+'/FeaturesPerTest.csv', index=False)