# Process data format to make it compatible with OpenAI

In [2]:
import os
import json

In [3]:
def jsonlines_load(fname: str):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        fname: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default, matching the encoding used by the other file accesses here.
    with open(fname, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. an extra newline at the
        # end of the file); json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

## Convert API docs and API code into JSON Lines format

In [6]:
# Directory containing the .txt files (one API doc per file)
directory = "api_docs"

# Collected records, one per .txt file:
# {"function_name": <file name without extension>, "api_doc": <file content>}
data = []

# os.listdir returns entries in arbitrary order, so sort the names to get the
# same record order (and therefore the same JSONL output) on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".txt"):
        function_name = os.path.splitext(filename)[0]  # File name without the .txt extension
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
            api_doc = f.read().strip()  # Whole file, surrounding whitespace removed
        data.append({"function_name": function_name, "api_doc": api_doc})

# Disabled: write the data to a JSONL file, each record on its own line
# with open("../our_proptest_data/api_docs.jsonl", "w", encoding="utf-8") as json_file:
#     for item in data:
#         json_file.write(json.dumps(item) + "\n")

In [7]:
# Directory containing the .py files (one API implementation per file)
directory = "api_codes"

# Collected records, one per .py file:
# {"function_name": <file name without extension>, "api_code": <file content>}
data = []

# os.listdir returns entries in arbitrary order, so sort the names to get the
# same record order (and therefore the same JSONL output) on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".py"):
        function_name = os.path.splitext(filename)[0]  # File name without the .py extension
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
            api_code = f.read().strip()  # Whole file, surrounding whitespace removed
        data.append({"function_name": function_name, "api_code": api_code})

# Disabled: write the data to a JSONL file, each record on its own line
# with open("../our_proptest_data/api_codes.jsonl", "w", encoding="utf-8") as json_file:
#     for item in data:
#         json_file.write(json.dumps(item) + "\n")

In [8]:
# Load the per-function doc records and code records.
api_doc_data = jsonlines_load("api_docs.jsonl")
api_code_data = jsonlines_load("api_codes.jsonl")

# Merge both record lists into one dict per function, keyed by function name.
# Doc records are applied first and code records second, so on a key clash
# the value from the code record wins.
api_doc_code_data = {}
for record in api_doc_data + api_code_data:
    merged = api_doc_code_data.setdefault(record["function_name"], {})
    merged.update(record)

# Only the merged records are needed from here on; drop the name index.
api_doc_code_data = list(api_doc_code_data.values())

# Disabled: write the merged records as JSONL, one record per line
# with open("api_doc_code.jsonl", "w", encoding="utf-8") as json_file:
#     for item in api_doc_code_data:
#         json_file.write(json.dumps(item)+"\n")

In [11]:
# api_doc_code_data = jsonlines_load("api_doc_code.jsonl")
# api_code_data = jsonlines_load("api_codes.jsonl")
# api_doc_data = jsonlines_load("api_docs.jsonl")

# for i in range(len(api_doc_code_data)):
#     for j in range(len(api_doc_data)):
#         if api_doc_code_data[i]['function_name'] == api_doc_data[j]['function_name']:
#             assert api_doc_code_data[i]['api_doc'] == api_doc_data[j]['api_doc']

#     for j in range(len(api_code_data)):
#         if api_doc_code_data[i]['function_name'] == api_code_data[j]['function_name']:
#             assert api_doc_code_data[i]['api_code'] == api_code_data[j]['api_code']
    

## Convert json format into txt format (property)

In [13]:
with open('output_jsonl/property/property_0_30_10-30-20-29.jsonl', 'r') as f:
    output_dir = 'properties'
    for raw_line in f:
        # Each line is one JSON record with 'function_name' and 'properties'.
        record = json.loads(raw_line.strip())
        function_name = record['function_name']
        # Only the first entry of the 'properties' list is used.
        properties = record['properties'][0]

        # Destination path: <output_dir>/<function_name>.txt
        txt_file_path = os.path.join(output_dir, function_name + '.txt')

        # Disabled: write the property text to its .txt file
        # with open(txt_file_path, 'w') as txt_file:
        #     txt_file.write(properties)

## Write PBT entries from the JSON Lines file into .py files (pbt)

In [14]:
with open('output_jsonl/pbt/pbt_0_30_10-30-20-37.jsonl', 'r') as f:
    output_dir = 'proptest'
    for raw_line in f:
        # Each line is one JSON record with 'function_name' and a 'pbt' list.
        record = json.loads(raw_line.strip())
        function_name = record['function_name']
        pbt_sources = record['pbt']

        # One sub-directory per function; created even while writing is disabled.
        function_dir = os.path.join(output_dir, function_name)
        os.makedirs(function_dir, exist_ok=True)

        # Files are numbered from 1: pbt_1.py, pbt_2.py, ...
        for number, source in enumerate(pbt_sources, start=1):
            # Remove a leading '```python' fence line, if present.
            opening_fence = '```python\n'
            if source.startswith(opening_fence):
                source = source[len(opening_fence):].strip()
            # Remove a trailing '```' fence, if present.
            closing_fence = '```'
            if source.endswith(closing_fence):
                source = source[:-len(closing_fence)].strip()

            txt_file_path = os.path.join(function_dir, f'pbt_{number}.py')

            # Disabled: write the cleaned test source to its .py file
            # with open(txt_file_path, 'w') as txt_file:
            #     txt_file.write(source)

## Write mutants in json format into .py file

In [21]:
# Read as UTF-8 explicitly (not the locale default), matching the other
# UTF-8 file accesses in this file.
with open('../our_proptest_data/output_jsonl/mutants/statistics_variance_10-30-18-08.jsonl', 'r', encoding='utf-8') as f:
    output_dir = '../our_proptest_data/mutants'
    # Mutant files are numbered by input line, starting at 1; the counter runs
    # across the whole file and is not reset per function.
    for i, line in enumerate(f, start=1):
        data = json.loads(line.strip())
        function_name = data['function_name']
        # Only the first entry of the 'mutants' list is written out.
        mutant_data = data['mutants'][0]

        # One sub-directory per function.
        function_dir = os.path.join(output_dir, function_name)
        os.makedirs(function_dir, exist_ok=True)

        # Remove a leading '```python' fence line and a trailing '```' fence,
        # if present, so that only the code itself is written.
        if mutant_data.startswith('```python\n'):
            mutant_data = mutant_data[len('```python\n'):].strip()
        if mutant_data.endswith('```'):
            mutant_data = mutant_data[:-len('```')].strip()
        txt_file_name = f'mutant_{i}.py'
        txt_file_path = os.path.join(function_dir, txt_file_name)

        # Python source files are UTF-8 by default, so write the generated
        # module as UTF-8 regardless of the platform's locale encoding.
        with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(mutant_data)

## Generate file for mutant generation in json format

In [20]:
# Function under test and its natural-language properties.
function_name = "statistics_variance"
test_property_1 = "1. The output variance should always be a non-negative number, as variance cannot be negative due to the squaring of differences from the mean."
test_property_2 = "2. If the input data consists of identical values, the output variance should be zero, indicating no variability in the data."
properties = [test_property_1, test_property_2]

# Property-based test sources (kept as text), listed in the same order as
# the properties above.
pbt_1 = (
    "@given(st.lists(st.floats(min_value=-1e6, max_value=1e6), min_size=2))\n"
    "def test_variance_non_negative_property(data):\n"
    "    result = statistics.variance(data)\n"
    "    assert result >= 0"
)
pbt_2 = (
    "@given(st.lists(st.floats(min_value=-1e6, max_value=1e6), min_size=2))\n"
    "def test_variance_identical_values_property(data):\n"
    "    if len(set(data)) == 1:  # All values are identical\n"
    "        result = statistics.variance(data)\n"
    "        assert result == 0"
)
pbts = [pbt_1, pbt_2]

# Reference documentation text for the function; surrounding whitespace is
# stripped from the literal.
api_doc = """statistics.variance(data, xbar=None)\nReturn the sample variance of data, an iterable of at least two real-valued numbers. Variance, or second moment about the mean, is a measure of the variability (spread or dispersion) of data. A large variance indicates that the data is spread out; a small variance indicates it is clustered closely around the mean.\n\nIf the optional second argument xbar is given, it should be the mean of data. If it is missing or None (the default), the mean is automatically calculated.\n\nUse this function when your data is a sample from a population. To calculate the variance from the entire population, see pvariance().\n\nRaises StatisticsError if data has fewer than two values.\n\nExamples:\n\n>>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]\n>>> variance(data)\n1.3720238095238095\nIf you have already calculated the mean of your data, you can pass it as the optional second argument xbar to avoid recalculation:\n\n>>> m = mean(data)\n>>> variance(data, m)\n1.3720238095238095\nThis function does not attempt to verify that you have passed the actual mean as xbar. Using arbitrary values for xbar can lead to invalid or impossible results.\n\nDecimal and Fraction values are supported:\n\n>>> from decimal import Decimal as D\n>>> variance([D(\"27.5\"), D(\"30.25\"), D(\"30.25\"), D(\"34.5\"), D(\"41.75\")])\nDecimal('31.01875')\n\n>>> from fractions import Fraction as F\n>>> variance([F(1, 6), F(1, 2), F(5, 3)])\nFraction(67, 108)\nNote This is the sample variance s\u00b2 with Bessel\u2019s correction, also known as variance with N-1 degrees of freedom. Provided that the data points are representative (e.g. independent and identically distributed), the result should be an unbiased estimate of the true population variance.\nIf you somehow know the actual population mean \u03bc you should pass it to the pvariance() function as the mu parameter to get the variance of a sample.
""".strip()

# Single record bundling the function name, properties, tests and doc text.
to_save_test_property = {
    "function_name": function_name,
    "properties": properties,
    "pbt": pbts,
    "api_doc": api_doc,
}

# Destination path: <output_dir>/<function_name>.jsonl
output_dir = '../our_proptest_data/sound_valid'
test_file_name = function_name + ".jsonl"
test_file_path = os.path.join(output_dir, test_file_name)

# Disabled: write the record as a single JSONL line
# with open(test_file_path, "w", encoding="utf-8") as json_file:
#     json_file.write(json.dumps(to_save_test_property) + "\n")