# TF-IDF

## Functions

In [1]:
import pandas as pd
import numpy as np
import statistics
import operator
from collections import Counter
from pathlib import Path
import xml.etree.ElementTree as ET
import filecmp
from lxml import etree
import subprocess
import os
import sklearn as sk
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score, precision_score,recall_score, confusion_matrix,precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
from sklearn.tree import export_text
import math
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm
from sklearn import tree
from functools import reduce
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [2]:
def getFilesByStartsWith(fileDir,starts):
    """Recursively collect paths of files whose name starts with ``starts``.

    Args:
        fileDir: Root directory handed to ``os.walk``.
        starts: Prefix tested with ``str.startswith`` against each file's
            base name (not against its full path).

    Returns:
        A list of paths, each one the walked directory joined with a matching
        file name, in the order ``os.walk`` reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(fileDir)
        for name in names
        if name.startswith(starts)
    ]

def readXMLFile(filePath):
    """Parse the XML file at ``filePath`` with lxml and return its root element.

    The parser is created with ``strip_cdata=False`` and ``recover=True``:
    CDATA sections are kept as such, and lxml tries to parse through broken
    markup instead of raising on it.

    Args:
        filePath: Path of the XML file; it is opened in binary mode.

    Returns:
        Whatever ``getroot()`` yields for the parsed tree.
    """
    lenient_parser = etree.XMLParser(strip_cdata=False, recover=True)
    with open(filePath, "rb") as handle:
        document = etree.parse(handle, parser=lenient_parser)
    return document.getroot()

def find_index_starts_with(stacktraces, stoppedLines):
    """Return the position of the first entry that begins with any given prefix.

    Args:
        stacktraces: Sequence of strings, scanned from the start.
        stoppedLines: Iterable of prefixes; an entry matches as soon as it
            starts with one of them.

    Returns:
        The zero-based index of the first matching entry, or ``-1`` when no
        entry matches (including when either argument is empty).
    """
    for position, entry in enumerate(stacktraces):
        for prefix in stoppedLines:
            if entry.startswith(prefix):
                return position
    return -1

def remove_elements_starts_with(stacktraces, removedLines):
    """Return the entries that do not begin with any of the given prefixes.

    Args:
        stacktraces: Iterable of strings to filter.
        removedLines: Iterable of prefixes; an entry starting with any of
            them is left out.

    Returns:
        A new list with the surviving entries in their original order.
    """
    kept = []
    for entry in stacktraces:
        if any(entry.startswith(prefix) for prefix in removedLines):
            continue
        kept.append(entry)
    return kept

def processStackTraceLines(lines):
    """Strip tabs/newlines from stack-trace lines and drop excluded frames.

    Every tab and newline character is removed from each line (anywhere in
    the line, not only at its ends). A cleaned line is then discarded when it
    starts with one of the hard-coded prefixes below.

    Args:
        lines: Iterable of raw stack-trace strings.

    Returns:
        A list of the cleaned lines that were kept, in input order.
    """
    excluded_prefixes = (
        'java.lang.invoke.LambdaForm',
        'sun.reflect.GeneratedMethodAccessor',
        'sun.reflect.GeneratedConstructorAccessor',
        'com.sun.proxy.',
    )
    cleaned = (raw.replace('\t', '').replace('\n', '') for raw in lines)
    return [text for text in cleaned if not text.startswith(excluded_prefixes)]

def getSummaryFilesByProjectName(files):
    """Map each usable summary file to the project named by its first test.

    A file is skipped when its size on disk is zero or when its XML contains
    no ``test`` element. For the remaining files the project is read from the
    ``project`` attribute of the ``test_name`` child of the first ``test``
    element.

    Args:
        files: Iterable of summary XML file paths.

    Returns:
        A dict ``{file path: project name}`` for the files that were kept.
    """
    projects_by_file = {}
    for summary_path in files:
        if os.path.getsize(summary_path) <= 0:
            continue
        test_nodes = readXMLFile(summary_path).findall('.//test')
        if not test_nodes:
            continue
        projects_by_file[summary_path] = test_nodes[0].find('test_name').attrib['project']
    return projects_by_file

def getStackTraceTokens(listOfStackTraceLines):
    """Split stack-trace frames into a flat list of tokens.

    For a frame such as ``pkg.Cls.method(Cls.java:42)``:

    * the text before the last ``(`` is split on ``.`` and each piece becomes
      a token;
    * the text between that ``(`` and the next ``)`` is the source location.
      If it contains ``.java:`` it becomes ONE token of the form
      ``"Cls java 42"`` (file name, the word ``java``, line number);
      otherwise it is split on spaces and each piece becomes a token.

    A line with no ``(`` at all has no source location; it contributes only
    its dot-separated pieces. (Indexing the missing part used to raise
    ``IndexError`` and abort the whole run.)

    Args:
        listOfStackTraceLines: Iterable of stack-trace line strings.

    Returns:
        A list of string tokens in frame order.
    """
    tokens = []
    for line in listOfStackTraceLines:
        qualifiedName, paren, remainder = line.rpartition('(')
        if not paren:
            # No "(...)" suffix on this line: keep what can be tokenized.
            tokens.extend(line.split('.'))
            continue

        tokens.extend(qualifiedName.split('.'))
        location = remainder.split(')')[0]
        if '.java:' in location:
            # Keep the file extension marker and the line number in the token.
            locationParts = location.split('.java:')
            tokens.append(locationParts[0] + ' java ' + locationParts[1])
        else:
            tokens.extend(location.split(' '))
    return tokens

def collectTokens(testFiles,project):
    """Build one space-joined token string per failure in the summary files.

    Every ``test`` and ``mutant`` element of every file is visited; mutants
    whose status is ``FLAKY`` are skipped.

    Key   = project, test name, node type (``test``/``mutant``), failure id,
            failure frequency and failure status, joined with ``'|#|'``.
            The frequency of a mutant is always ``'1'``.
    Value = the stack-trace tokens followed by the exception text split on
            spaces, all joined with single spaces.

    Args:
        testFiles: Mapping keyed by summary XML file path; only the keys are
            read.
        project: Project name, used as the first field of every key.

    Returns:
        A dict as described above. A failure whose key was already seen
        overwrites the earlier entry.
    """
    # Frames dropped from every trace (non-deterministic lines).
    nondeterministic_prefixes = ['java.lang.invoke.LambdaForm','sun.reflect.GeneratedMethodAccessor','sun.reflect.GeneratedConstructorAccessor','com.sun.proxy.']
    # Frames at which a trace is cut when it never mentions the test itself.
    cut_prefixes = ['junit.framework.TestCase.runBare(','sun.reflect.NativeMethodAccessorImpl.invoke0(','org.junit.rules.ExternalResource$1.evaluate(']

    tokens_by_failure = {}
    for summary_path in testFiles:
        root = readXMLFile(summary_path)
        for node_type in ('test', 'mutant'):
            for failure in root.findall('.//' + node_type):
                name_node = failure.find(node_type + '_name')
                if node_type == 'test':
                    failure_id = name_node.attrib['id']
                    frequency = name_node.attrib['frequency']
                else:
                    failure_id = name_node.attrib['mutant_id']
                    frequency = '1'

                test_name = name_node.text.replace(" ", "").replace("\t", "").replace("\n", "")
                status = name_node.attrib['status']
                if status == 'FLAKY' and node_type == 'mutant':
                    continue

                exception_tokens = failure.find(node_type + '_exception').text.split(' ')
                frames = [
                    frame.text.replace(" ", "").replace("\t", "").replace("\n", "")
                    for frame in failure.iter('line')
                ]

                # Update: June 2023. If no frame starts with the test name or
                # its class name, cut the trace at the first cut-prefix frame.
                # A match at position 0 leaves the trace untouched.
                class_name = test_name.rsplit('.', 1)[0]
                mentions_test = any(
                    frame.startswith((test_name, class_name)) for frame in frames
                )
                if not mentions_test:
                    cut_at = find_index_starts_with(frames, cut_prefixes)
                    if cut_at > 0:
                        frames = frames[:cut_at]

                # Remove non-deterministic lines (Jon).
                kept_frames = remove_elements_starts_with(frames, nondeterministic_prefixes)
                trace_tokens = getStackTraceTokens(kept_frames)

                key = '|#|'.join([project, test_name, node_type, failure_id, frequency, status])
                tokens_by_failure[key] = ' '.join(trace_tokens + exception_tokens)

    return tokens_by_failure


def TfidfFailuresVectorizer(failures):
    """Return the TF-IDF matrix of ``failures`` as a DataFrame.

    The vectorizer is limited to ``max_features=300`` terms.

    Args:
        failures: Iterable of documents (strings) to fit and transform.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        vocabulary term, holding the dense TF-IDF weights.
    """
    vectorizer = TfidfVectorizer(max_features=300)
    weights = vectorizer.fit_transform(failures)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        token_names = vectorizer.get_feature_names_out()
    else:
        token_names = vectorizer.get_feature_names()

    return pd.DataFrame(weights.toarray(), columns=token_names)

def replicate_row(row):
    """Expand one row into ``repeat_count`` identical rows.

    Args:
        row: A ``pandas.Series`` that has a ``repeat_count`` entry holding
            the number of copies wanted.

    Returns:
        A DataFrame with ``repeat_count`` rows, each a copy of ``row``
        without its ``repeat_count`` entry; the remaining entries become the
        columns.
    """
    copies = row['repeat_count']
    payload = row.drop('repeat_count')
    side_by_side = pd.concat([payload] * copies, axis=1)
    return side_by_side.T


def generateUniqueId(data,uniqueColumns):
    """Add a ``FailurePrimaryKey`` column built from the given columns.

    For every row the values of ``uniqueColumns`` are converted with ``str``
    and joined with ``'|#|'``.

    Keys are assigned by row position in a single column write. Writing them
    one at a time with ``data.loc[index, ...]`` would address rows by index
    label, so rows sharing a label would all end up with the key of the last
    such row.

    Args:
        data: DataFrame to extend; it is modified in place.
        uniqueColumns: Column names whose values make up the key, in order.

    Returns:
        The same ``data`` object, with the ``FailurePrimaryKey`` column set.
    """
    # The string placeholder is created before iterating, as before: with it
    # in place each row yielded by iterrows() is an object Series, so values
    # are stringified as stored rather than after numeric upcasting.
    data['FailurePrimaryKey'] = "NA"
    primary_keys = [
        '|#|'.join(str(row[col]) for col in uniqueColumns)
        for _, row in data.iterrows()
    ]
    data['FailurePrimaryKey'] = primary_keys
    return data


def get_scores(tn, fp, fn, tp):
    """Compute accuracy, F1, precision and recall from confusion-matrix counts.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.

    Returns:
        A tuple ``(accuracy, F1, Precision, Recall)``. Precision, recall and
        F1 are reported as ``0`` when there are no true positives. Accuracy
        is likewise reported as ``0`` when all four counts are zero, instead
        of dividing by zero.
    """
    total = tn + fp + fn + tp
    accuracy = (tp + tn) / total if total else 0

    if tp == 0:
        # With no true positives the ratios are either 0 or undefined (0/0).
        return accuracy, 0, 0, 0

    Precision = tp / (tp + fp)
    Recall = tp / (tp + fn)
    F1 = 2 * ((Precision * Recall) / (Precision + Recall))
    return accuracy, F1, Precision, Recall

def PredictFailure(data,data_target,project,targetClass, *, n_splits=10, random_state=None):
    """Cross-validate a TF-IDF + SVC classifier and return pooled scores.

    For each stratified fold a ``TfidfVectorizer`` is fitted on the training
    rows' ``Tokens`` text only, an ``SVC`` with default settings is trained
    on it, and the fold's confusion matrix is added to running totals.
    Precision, recall and F1 are computed once from the summed counts.

    Args:
        data: DataFrame with a ``Tokens`` text column. A column named
            ``index``, if present, is dropped first.
        data_target: Series of class labels aligned with ``data`` by
            position.
        project: Project name, copied into the result row.
        targetClass: ``"FLAKY"`` scores ``FLAKY`` as the positive class
            against ``KILLED``; any other value scores ``test`` as the
            positive class against ``mutant``.
        n_splits: Number of folds (default 10, the previous fixed value).
        random_state: Seed for the fold shuffling. The default ``None``
            keeps the previous behaviour (a different split on every run);
            pass an int to make the result reproducible.

    Returns:
        ``[project, total, TP, FN, FP, TN, precision %, recall %, F1 %]``
        where ``total`` is the sum of the four counts.
    """
    if 'index' in data.columns:
        data = data.drop(['index'], axis=1)

    # Negative label first, positive second, so that ravel() on the 2x2
    # matrix yields tn, fp, fn, tp.
    labels = ['KILLED', 'FLAKY'] if targetClass == "FLAKY" else ['mutant', 'test']

    # The type of k-fold is StratifiedKFold to ensure each fold has flaky failures.
    fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    TN = FP = FN = TP = 0
    for train_index, test_index in fold.split(data, data_target):
        x_train, x_test = data.iloc[train_index], data.iloc[test_index]
        y_train, y_test = data_target.iloc[train_index], data_target.iloc[test_index]

        tfidf = TfidfVectorizer()
        x_trained = tfidf.fit_transform(x_train['Tokens'])
        x_tested = tfidf.transform(x_test['Tokens'])

        clf = SVC()
        clf.fit(x_trained, y_train)
        y_pred = clf.predict(x_tested)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=labels).ravel()
        TN += tn
        FP += fp
        FN += fn
        TP += tp

    _accuracy, F1, Precision, Recall = get_scores(TN, FP, FN, TP)
    return [project, TN + FP + FN + TP, TP, FN, FP, TN, Precision * 100, Recall * 100, F1 * 100]

def getFilesByEndsWith(fileDir,ends):
    """Recursively collect paths of files whose name ends with ``ends``.

    Args:
        fileDir: Root directory handed to ``os.walk``.
        ends: Suffix tested with ``str.endswith`` against each file's base
            name.

    Returns:
        A list of paths, each one the walked directory joined with a matching
        file name, in the order ``os.walk`` reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(fileDir)
        for name in names
        if name.endswith(ends)
    ]

## Inputs

In [3]:
output = 'Result'
DatasetDir = 'Path-to-dataset' # change this to the path of our dataset
parser = etree.XMLParser(strip_cdata=False,recover=True)


# A feature CSV without any data row marks its folder as one to ignore: the
# summary file ending in "<folder name>.xml" is left out below.
csvFiles = getFilesByEndsWith(DatasetDir,'FeaturesPerTest.csv') # use FeaturesPerTestAll.csv to count all failures ..
ignoreTests = []
for c in csvFiles:
    c_data = pd.read_csv(c)
    if (len(c_data)<1):
        # Name of the folder that holds the CSV. os.path is used instead of
        # splitting on '/' because the paths come from os.path.join and use
        # the platform's separator.
        ignoreTests.append(os.path.basename(os.path.dirname(c))+'.xml')


allSummaryFiles = getFilesByStartsWith(DatasetDir,'summary-of-')
allSummaryFilesUpdated = [item for item in allSummaryFiles if not item.endswith(tuple(ignoreTests))]


# {summary file path: project name}
testsByProjectNames = getSummaryFilesByProjectName(allSummaryFilesUpdated)

## Main

In [4]:
# A project is evaluated only when it has strictly more than this many FLAKY rows.
minFlaky = 10
tfidfResult = []
# sorted(): a set has no stable order, so sort the project names to get the
# result rows in the same order on every run.
for project in sorted(set(testsByProjectNames.values())):
    # get failures per projects ..
    failuresPErProject = {k:v for k,v in testsByProjectNames.items() if v == project}

    tokensPerProjectsUpdated = collectTokens(failuresPErProject,project)

    # Each key is six '|#|'-separated fields; the token string is the 7th column.
    TakensDF = []
    for k,v in tokensPerProjectsUpdated.items():
        perFailure = k.split('|#|')
        perFailure.append(v)
        TakensDF.append(perFailure)

    tfidfPerProjectNoDuplicates = pd.DataFrame(TakensDF,columns=['Project','Test','FailureType','FailureId','FailureFreq','FailureStatus','Tokens'])

    flakyTests = tfidfPerProjectNoDuplicates[tfidfPerProjectNoDuplicates['FailureType']=='test']['Test'].unique().tolist()
    mutants = tfidfPerProjectNoDuplicates[tfidfPerProjectNoDuplicates['FailureType']=='mutant']['Test'].unique().tolist()
    # Despite its name, this list holds the tests that appear BOTH as a
    # 'test' failure and as a 'mutant' failure; only those rows are kept.
    mutantTestNames = set(mutants)
    testWithNoMutants = [e for e in flakyTests if e in mutantTestNames]
    tfidfPerProjectNoDuplicatesWithMutants = tfidfPerProjectNoDuplicates[tfidfPerProjectNoDuplicates['Test'].isin(testWithNoMutants)]

    # Repeat every row FailureFreq times. The column holds strings (it was
    # parsed out of the key), so convert it to int explicitly.
    repeatCounts = tfidfPerProjectNoDuplicatesWithMutants['FailureFreq'].astype(int)
    tfidfPerProject = tfidfPerProjectNoDuplicatesWithMutants.loc[tfidfPerProjectNoDuplicatesWithMutants.index.repeat(repeatCounts)].reset_index(drop=True)

    if (len(tfidfPerProject[tfidfPerProject['FailureStatus']=='FLAKY'])>minFlaky):
        tfidfProjectResult = PredictFailure(tfidfPerProject[['Tokens']],tfidfPerProject['FailureStatus'],project,'FLAKY')
        tfidfResult.append(tfidfProjectResult)

tfidfResult_DF = pd.DataFrame(tfidfResult,columns=['Project','Total','TP','FN','FP','TN','P','R','F1'])

# to_csv does not create missing folders; make sure the output folder exists.
os.makedirs(output, exist_ok=True)
tfidfResult_DF.to_csv(os.path.join(output, 'TFIDF.csv'), index=False)