In [None]:
import glob
import json
import pandas as pd
import re

In [None]:
def add_parameters_as_string_in_df(df):
    """Render each row's ``parameters`` entry as one comma-separated string.

    Args:
        df: Mapping-like object (a DataFrame in this notebook) whose
            ``'parameters'`` entry yields one iterable per row.

    Returns:
        A list with one string per row, in row order. Every element of a
        row's iterable is converted with ``str`` and the pieces are joined
        with ``','``; an empty iterable gives an empty string.
    """
    joined_rows = []
    for parameter_values in df['parameters']:
        joined_rows.append(','.join(str(value) for value in parameter_values))
    return joined_rows

In [None]:
def analyze_data():
    """Print a per-method comparison of production and test invocation parameters.

    For every path in the module-level ``prod_files`` list, the function:

    1. loads the production JSON and the JSON at the same path with
       ``"object-data-prod"`` replaced by ``"object-data-test"``;
    2. discards test rows whose ``calledByInvokingTest`` value equals ``False``;
    3. adds a ``parametersAsString`` column to both frames;
    4. prints the parameter strings seen only in production, only in tests,
       in both, and in either.

    A method is skipped when no test rows remain after step 2, or when
    production contributes no parameter string that tests lack.

    Returns:
        None. All results are written to standard output.
    """
    # we care about methods that are invoked in production
    for prod_data_file in prod_files:
        print("[INFO] Analyzing file", prod_data_file)
        # the data for the same method sits at the mirrored path in the test folder
        test_data_file = prod_data_file.replace("object-data-prod", "object-data-test")
        prod_data = pd.read_json(prod_data_file)
        test_data = pd.read_json(test_data_file)
        print("[INFO] Invocations in production execution:", len(prod_data))
        print("[INFO] Invocations in test execution:", len(test_data))

        # keep only the test-execution invocations that are called by test methods
        print("[INFO] Removing invocations from test executions that are not made directly by an invoking test")
        rows_not_from_test = test_data.loc[test_data["calledByInvokingTest"] == False].index
        test_data = test_data.drop(index=rows_not_from_test)
        print("[INFO] Updated number of invocations in test execution:", len(test_data))

        if not len(test_data):
            print("[INFO] No invocations made directly from invoking tests, SKIPPING THIS METHOD")
            print("=====================================================================")
            continue

        # convert each parameter list to a string before the set operations below
        prod_data['parametersAsString'] = add_parameters_as_string_in_df(prod_data)
        test_data['parametersAsString'] = add_parameters_as_string_in_df(test_data)
        prod_parameters = set(prod_data["parametersAsString"])
        test_parameters = set(test_data["parametersAsString"])

        # only in prod
        only_prod = list(prod_parameters - test_parameters)
        print("[INFO] Parameters only in production, but not in test executions:", only_prod)
        print("=====================================================================")
        if not only_prod:
            print("[INFO] No distinct parameters in production, SKIPPING THIS METHOD")
            print("=====================================================================")
            continue

        # only in test
        only_test = list(test_parameters - prod_parameters)
        print("[INFO] Parameters only in test, but not in production executions:", only_test)
        print("=====================================================================")

        # intersection
        intersection = list(prod_parameters & test_parameters)
        print("[INFO] Parameters common to both production and test executions:", intersection)
        print("=====================================================================")

        # union
        union = list(prod_parameters | test_parameters)
        print("[INFO] Union of test and production parameters:", union)
        print("[INFO] Size of union:", len(union))
        print("=================================================================================================")

In [None]:
def sanitize_file_to_json(data_file):
    """Rewrite a file of one-JSON-object-per-line records as a JSON array, in place.

    Every line except the last that ends in ``}`` followed by a newline gets
    a comma after the ``}``. ``"[\\n"`` is put before the first line and
    ``"]\\n"`` after the last one. A file whose first line already starts
    with ``[`` is reported as sanitized and left untouched, so calling this
    repeatedly is safe. An empty file (zero records) is rewritten as an empty
    array instead of failing on the missing first line.

    Args:
        data_file: Path of the file to rewrite.

    Returns:
        None. The file at ``data_file`` is overwritten.
    """
    # read once; the same content serves the check and the rewrite
    with open(data_file, 'r') as f:
        lines = f.readlines()

    # nothing to do if file already sanitized
    if lines and lines[0].startswith("["):
        print("[INFO]", data_file, "is already sanitized")
        return

    if lines:
        # add comma to all but last line
        lines[:-1] = [re.sub(r"}\n", "},\n", line) for line in lines[:-1]]
        # add closing bracket after last line
        lines[-1] += "]\n"
        # add opening bracket before first line
        lines[0] = "[\n" + lines[0]
    else:
        # no records at all: there is no first/last line to decorate
        lines = ["[\n", "]\n"]

    with open(data_file, 'w') as f:
        f.writelines(lines)

In [None]:
def get_data_files_and_sanitize():
    """Report how many data files were found and sanitize each of them.

    Reads the module-level ``prod_files`` and ``test_files`` lists, prints
    their sizes, then passes every path to ``sanitize_file_to_json``: all
    production files first, followed by all test files.

    Returns:
        None.
    """
    print("[INFO] Found", len(prod_files), "file(s) with production data")
    print("[INFO] Found", len(test_files), "file(s) with test data")

    # production files are handled before test files
    for data_file in [*prod_files, *test_files]:
        sanitize_file_to_json(data_file)

    print("=================================================================================================")

In [None]:
# Paths of the JSON dumps consumed by get_data_files_and_sanitize() and
# analyze_data(), which read these two names from module scope.
# glob gives no ordering guarantee, so the lists are sorted to keep the
# processing and report order identical from run to run.
prod_files = sorted(glob.glob("/tmp/proze-object-data-prod/*.json"))
test_files = sorted(glob.glob("/tmp/proze-object-data-test/*.json"))

get_data_files_and_sanitize()
analyze_data()