# Data Evaluation

The purpose of this notebook is to streamline the process of manually evaluating the extracted test case data from notebook 1.

To do this, we sample test cases from the data (using a fixed seed for reproducibility) and display the information collected about each sampled test case, including the relevant lines of source code, for manual evaluation.

## Imports

In [1]:
import pandas as pd
import os

## Setup

In [2]:
# Set library root folder.
#
# Each library root can be overridden with an environment variable so the
# notebook also runs on machines where the checkout lives elsewhere; the
# defaults are the original local paths. `os.path.join(root, "")` appends a
# trailing path separator only when it is missing, so code that concatenates
# the root with a relative file path keeps working for overridden roots too.

# For TensorFlow
library_root_tensorflow = os.path.join(
    os.environ.get(
        "DLL_TENSORFLOW_ROOT",
        "A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/Tensorflow/tensorflow-master/tensorflow/python/",
    ),
    "",
)
tests_root_tensorflow = "kernel_tests"
save_data_to_tensorflow = "extracted_data/tensorflow_evaluation_data.csv"

# For Pytorch
library_root_pytorch = os.path.join(
    os.environ.get(
        "DLL_PYTORCH_ROOT",
        "A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/PyTorch/pytorch-master/",
    ),
    "",
)
tests_root_pytorch = "test"
save_data_to_pytorch = "extracted_data/pytorch_evaluation_data.csv"


## Import the data

In [21]:
# Load the extracted TensorFlow test case data and preview the first rows.
df_tensorflow = pd.read_csv("extracted_data/tensorflow_data.csv")
df_tensorflow.head(n=10)

Unnamed: 0,File_Path,Line_Number,Found_in_Function,Function_Definition_Line_Number,Assert_Statement_Type,Oracle_Argument_ Position,Differential_Function_Line_Number,Differential_Test_Function
0,kernel_tests\aggregate_ops_test.py,69,testAddN,59,assertAllClose,1,66,np.sum
1,kernel_tests\aggregate_ops_test.py,83,testUnknownShapes,72,assertAllClose,1,80,np.sum
2,kernel_tests\aggregate_ops_test.py,83,testUnknownShapes,72,assertAllClose,2,79,sess.run
3,kernel_tests\array_ops_test.py,172,CheckVersusNumpy,150,assertAllClose,1,154,np.random.rand
4,kernel_tests\array_ops_test.py,172,CheckVersusNumpy,150,assertAllClose,2,164,array_ops.boolean_mask
5,kernel_tests\array_ops_test.py,217,testEmptyInput2D,210,assertAllClose,1,212,astype
6,kernel_tests\array_ops_test.py,217,testEmptyInput2D,210,assertAllClose,2,214,array_ops.boolean_mask
7,kernel_tests\array_ops_test.py,226,testEmptyInput1D,219,assertAllClose,1,221,astype
8,kernel_tests\array_ops_test.py,226,testEmptyInput1D,219,assertAllClose,2,223,array_ops.boolean_mask
9,kernel_tests\array_ops_test.py,1666,testAxis,1637,assertAllClose,2,1647,self._scale_per_slice


In [23]:
# Load the extracted PyTorch test case data and preview the first rows.
df_pytorch = pd.read_csv("extracted_data/pytorch_data.csv")
df_pytorch.head(n=10)

Unnamed: 0,File_Path,Line_Number,Found_in_Function,Function_Definition_Line_Number,Assert_Statement_Type,Oracle_Argument_ Position,Differential_Function_Line_Number,Differential_Test_Function
0,test\test_ao_sparse.py,98,test_sparse_qlinear,32,assert_array_almost_equal,1,96,dense_qlinear_dynamic
1,test\test_ao_sparse.py,98,test_sparse_qlinear,32,assert_array_almost_equal,2,95,sparse_qlinear_dynamic
2,test\test_ao_sparse.py,103,test_sparse_qlinear,32,assert_array_almost_equal,1,101,dense_qlinear
3,test\test_ao_sparse.py,103,test_sparse_qlinear,32,assert_array_almost_equal,2,100,sparse_qlinear
4,test\test_autograd.py,1329,test_set_grad_coroutines_benign_exceptions,1295,assertLess,1,1328,coro.throw
5,test\test_autograd.py,1329,test_set_grad_coroutines_benign_exceptions,1295,assertLess,1,1339,coro.throw
6,test\test_autograd.py,1340,test_set_grad_coroutines_benign_exceptions,1295,assertLess,1,1328,coro.throw
7,test\test_autograd.py,1340,test_set_grad_coroutines_benign_exceptions,1295,assertLess,1,1339,coro.throw
8,test\test_autograd.py,2790,test_var_mean_differentiable,2778,allclose,2,2785,input2.mean
9,test\test_autograd.py,8972,test_cat_r_to_c,8965,gradcheck,2,8967,torch.randn


## Analyze coverage

To track the progress of our test case extraction, we display statistics about how many cases are still unsupported. A case counts as unsupported if its `Differential_Test_Function` column either contains an "UNSUPPORTED ..." statement or is empty (an empty cell is read by pandas as `NaN`).

In [24]:
def _select_not_covered(cases):
    """Return the rows whose extracted function is missing (NaN) or marked UNSUPPORTED."""
    extracted = cases['Differential_Test_Function']
    return cases[extracted.isna() | extracted.str.contains('UNSUPPORTED', na=False)]


# TensorFlow: number of uncovered cases and a breakdown by reason
not_covered_tensorflow = _select_not_covered(df_tensorflow)
print(f"TensorFlow:\t{len(not_covered_tensorflow)} cases not covered.")
print(not_covered_tensorflow['Differential_Test_Function'].value_counts(dropna=False))

# PyTorch: same statistics
not_covered_pytorch = _select_not_covered(df_pytorch)
print(f"\nPyTorch:\t{len(not_covered_pytorch)} cases not covered.")
print(not_covered_pytorch['Differential_Test_Function'].value_counts(dropna=False))

TensorFlow:	47 cases not covered.
UNSUPPORTED Name (named variable or defined function: entropy)                   5
UNSUPPORTED Unary Operation                                                      4
UNSUPPORTED Name (named variable or defined function: len)                       4
UNSUPPORTED Name (named variable or defined function: _IotaInitializer)          2
UNSUPPORTED Name (named variable or defined function: densified)                 2
UNSUPPORTED Name (named variable or defined function: run_all_reduce_1device)    2
UNSUPPORTED Name (named variable or defined function: sorted)                    2
UNSUPPORTED Name (named variable or defined function: SortEigenValues)           2
NaN                                                                              2
UNSUPPORTED Name (named variable or defined function: x)                         2
UNSUPPORTED Name (named variable or defined function: draw)                      2
UNSUPPORTED Name (named variable or defined function:

In [17]:
# Show the uncovered TensorFlow cases whose reason is "UNSUPPORTED Constant".
# To inspect the cases without any extracted function instead, filter with
# `.isna()` on the same column (for either library).
is_unsupported_constant = not_covered_tensorflow['Differential_Test_Function'].str.contains(
    'UNSUPPORTED Constant', na=False
)
not_covered_tensorflow[is_unsupported_constant]

Unnamed: 0,File_Path,Line_Number,Found_in_Function,Function_Definition_Line_Number,Assert_Statement_Type,Oracle_Argument_ Position,Differential_Function_Line_Number,Differential_Test_Function
26,kernel_tests\atrous_convolution_test.py,265,_test_gradient,254,assertLess,2,264,UNSUPPORTED Constant
45,kernel_tests\betainc_op_test.py,189,testBetaIncGrads,175,assertLess,2,176,UNSUPPORTED Constant
48,kernel_tests\betainc_op_test.py,198,testBetaIncGrads,175,assertLess,2,176,UNSUPPORTED Constant
57,kernel_tests\candidate_sampler_ops_test.py,132,testSeed,112,assertLessEqual,1,126,UNSUPPORTED Constant
148,kernel_tests\conv1d_transpose_test.py,90,testConv1DTransposeSame,63,assertAllClose,1,85,UNSUPPORTED Constant
...,...,...,...,...,...,...,...,...
1048,kernel_tests\distributions\uniform_test.py,56,testUniformRange,52,assertAllClose,1,53,UNSUPPORTED Constant
1049,kernel_tests\distributions\uniform_test.py,57,testUniformRange,52,assertAllClose,1,54,UNSUPPORTED Constant
1149,kernel_tests\random\random_binomial_test.py,75,testMoments,51,assertAllLess,2,60,UNSUPPORTED Constant
1153,kernel_tests\random\random_gamma_test.py,100,_testMoments,72,assertAllLess,2,82,UNSUPPORTED Constant


# Tool for manual evaluation

This tool is meant to help with quickly evaluating test cases from the dataset. For each test case, it prints all information collected about the case, including the oracle argument position and the extracted function name, as well as the code inside the function where the test case was defined. Then the evaluator is asked for an evaluation of the test case via input. This evaluation is then stored alongside the test case in the data.

Evaluation keys:  
y: Test case correctly identified  
n: Test case is not differential testing  
?: Allows for the entry of a comment. This is meant for situations where the current case is differential testing, but the differential testing function was not extracted correctly (or some other data is incorrect).  

In [18]:
class EvaluationAutomator:
    def __init__(self, df, library_root, save_data_to):
        """Initialize the evaluation automator.
        
        df: Dataframe to evaluate.
        library_root: The root folder of the DL library
        save_data_to: Relative location to load/save the evaluation data
        """
        self.df = df
        self.save_data_to = save_data_to
        self.library_root = library_root
        
        # try importing evaluation data if it already exists
        if os.path.isfile(self.save_data_to): 
            self.eval_df = pd.read_csv(self.save_data_to)
            print("Evaluation data opened.")
        
        # otherwise initialize evaluation df and add new column for the evaluation result
        else:
            self.eval_df = df.copy()
            todo_list = ["TODO"] * len(self.eval_df.index)
            self.eval_df.insert(len(df.columns), 'Evaluation', todo_list)
            self.eval_df.to_csv(self.save_data_to)
            print("New evaluation data created.")
            
    def getEvalData(self):
        """Returns the data frame containing the evaluation data."""
        return self.eval_df
    
    def evaluate(self, index):
        """Present the data entry at the given index for evaluation."""
        
        # present the data entry
        print(self.df.iloc[index])
        print("\n")
        
        # check if it has already been evaluated
        if self.eval_df.at[index, 'Evaluation'] != "TODO":
            print("Already evaluated! Previous evaluation: " + self.eval_df.at[index, 'Evaluation'])
            if input("Re-evaluate? (y / n) ") != "y":
                return
            
        
        # print the relevant source code lines:
        
        # get source file of current test case and open it as an array of lines
        source = open(self.library_root + self.df.iloc[index]['File_Path']).readlines()

        # set beginning and end line number for the code section to display
        beginning_line_no = self.df.iloc[index]['Function_Definition_Line_Number']
        end_line_no = self.df.iloc[index]['Line_Number']

        # print these lines
        for line in range(beginning_line_no, end_line_no+1):
            print(str(line) + ": " + source[line-1])
        
        # ask for a decision from the evaluator:
        decision_bool = True
        while decision_bool:
            decision = input("Correctly identified? (y / n / ?): ")
            
            if decision in ["y", "n"]:
                decision_bool = False

            elif decision == "?":
                decision = input("Please comment on this case: ")
                decision_bool = False
                
            else:
                print("Error. Please specify y/n/?")
                decision_bool = True
                
        # write the decision to the evaluation data
        self.eval_df.at[index, 'Evaluation'] = decision
        self.eval_df.to_csv(self.save_data_to, index=False)

# Create one automator per library; each one opens its existing evaluation
# file or creates a new one.
# TensorFlow
evalAutomator_tensorflow = EvaluationAutomator(
    df=df_tensorflow,
    library_root=library_root_tensorflow,
    save_data_to=save_data_to_tensorflow,
)

# PyTorch
evalAutomator_pytorch = EvaluationAutomator(
    df=df_pytorch,
    library_root=library_root_pytorch,
    save_data_to=save_data_to_pytorch,
)

# Try the evaluation on one particular TensorFlow case
evalAutomator_tensorflow.evaluate(index=218)

Evaluation data opened.
Evaluation data opened.
File_Path                            kernel_tests\cwise_ops_test.py
Line_Number                                                    1313
Found_in_Function                                     testBroadcast
Function_Definition_Line_Number                                1298
Assert_Statement_Type                                assertAllClose
Oracle_Argument_ Position                                         1
Differential_Function_Line_Number                              1310
Differential_Test_Function                               np.polyval
Name: 218, dtype: object


Already evaluated! Previous evaluation: Same code under test, with different parameters
Re-evaluate? (y / n) n


# Guide for evaluation

For each test case, please try to check the following facts:

- Is the test case a differential test case? 
- Was the correct argument identified? (Check whether the argument at `Oracle_Argument_ Position` is indeed the oracle)
- Is the extracted function one of the relevant internal or differential functions?

If the answer to all three questions is yes, then this case was most likely correctly identified (`y`).

## Sampling cases for evaluation

Set a seed and the number of cases you would like to evaluate, as well as the data to evaluate by setting the `evalAutomator` used:

In [19]:
# Automator to use for the evaluation session below.
evalAutomator = evalAutomator_tensorflow

# Number of cases to sample and the seed that makes the sample reproducible.
NUM_CASES = 10
RANDOM_SEED = 42 + 1

In [20]:
# Sample cases from the data of the selected automator, so that switching
# `evalAutomator` also switches the data the sample is drawn from.
# The sample size is capped at the number of available cases, because
# sampling more rows than exist without replacement raises an error.
cases_to_sample_from = evalAutomator.df
sampled_cases = cases_to_sample_from.sample(
    n=min(NUM_CASES, len(cases_to_sample_from)),
    random_state=RANDOM_SEED,
)

# iterate over each sampled case (by index label) and evaluate;
# the progress counter starts at 1 so the last case reads "N / N"
for sample_counter, i in enumerate(sampled_cases.index, start=1):
    print("\nCase " + str(i) + " (" + str(sample_counter) + " / " + str(len(sampled_cases)) + ")\n")
    evalAutomator.evaluate(i)


Case 36 (0 / 10)

File_Path                            kernel_tests\basic_gpu_test.py
Line_Number                                                     198
Found_in_Function                                      testGradient
Function_Definition_Line_Number                                 179
Assert_Statement_Type                                    assertLess
Oracle_Argument_ Position                                         1
Differential_Function_Line_Number                               196
Differential_Test_Function            gradient_checker_v2.max_error
Name: 36, dtype: object


179:   def testGradient(self):

180:     x1 = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(

181:         [1, 3, 2])

182:     x2 = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(

183:         [1, 3, 2])

184: 

185:     def div_x1(x1):

186:       return math_ops.truediv(x1, x2) * math_ops.cast(1.1, dtype=x1.dtype)

187: 

188:     def div_x2(x2):

189:    

KeyboardInterrupt: Interrupted by user

## Analyse evaluations

Now we can gain summary statistics about the performance of our function extraction.

In [None]:
eval_data = evalAutomator.getEvalData()

evaluation_counts = eval_data.Evaluation.value_counts()

print(evaluation_counts)

# A label that occurs in no row is absent from value_counts(), so look the
# counts up with a default of 0 instead of indexing (which raises KeyError).
todo_count = int(evaluation_counts.get('TODO', 0))
yes_count = int(evaluation_counts.get('y', 0))
no_count = int(evaluation_counts.get('n', 0))

total_cases_evaluated = len(eval_data) - todo_count

# Everything evaluated that is neither "y" nor "n" is a free-text comment ("?").
comment_count = total_cases_evaluated - yes_count - no_count

if total_cases_evaluated == 0:
    # Percentages are undefined without any evaluated case.
    print("\nNo cases have been evaluated yet.")
else:
    print("\nn: " + str(round((no_count / total_cases_evaluated)*100)) + " %")

    print("y: " + str(round((yes_count / total_cases_evaluated)*100)) + " %")

    print("?: " + str(round((comment_count / total_cases_evaluated)*100)) + " %")

## Show the extracted functions

Here we can gain a glimpse into the functions that were extracted:


In [None]:

# Filter out all unsupported rows
extracted_functions_df = eval_data[~eval_data['Differential_Test_Function'].str.contains('UNSUPPORTED', na=False)]

# Split off the NumPy functions. Match `np.` / `numpy.` only as a module
# prefix: a plain substring test for 'np' also matches unrelated names such
# as "input2.mean".
is_numpy = extracted_functions_df['Differential_Test_Function'].str.contains(r'\b(?:np|numpy)\.', regex=True, na=False)
numpy_functions_df = extracted_functions_df[is_numpy]
extracted_functions_df = extracted_functions_df[~is_numpy]

# Split off the functions whose name contains 'stats' from the remaining rows.
is_stats = extracted_functions_df['Differential_Test_Function'].str.contains('stats', na=False)
stats_functions_df = extracted_functions_df[is_stats]
extracted_functions_df = extracted_functions_df[~is_stats]

# Remaining extracted functions (neither NumPy nor stats)
extracted_functions = extracted_functions_df.Differential_Test_Function.unique()
print(extracted_functions)

# NumPy functions
extracted_functions_numpy = numpy_functions_df.Differential_Test_Function.unique()
print(extracted_functions_numpy)

# stats functions
extracted_functions_stats = stats_functions_df.Differential_Test_Function.unique()
print(extracted_functions_stats)