# Process data format to make it compatible with OpenAI

In [1]:
import os
import json

In [6]:
def jsonlines_load(fname: str) -> list:
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        fname: Path of the file to read. Every non-blank line must contain
            exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(fname, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # rather than letting json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]

## Convert api docs into json format

In [6]:
# Directory containing one <function_name>.txt documentation file per API
directory = "api_docs"

# One record per documentation file: {"function_name": ..., "api_doc": ...}
data = []

# os.listdir() returns entries in arbitrary order; sort them so the records
# (and any JSONL file written from them) come out in a reproducible order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    # Only the final extension is removed, so dotted names are preserved.
    function_name = os.path.splitext(filename)[0]
    with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
        api_doc = f.read().strip()  # drop leading/trailing whitespace
    data.append({"function_name": function_name, "api_doc": api_doc})

# Write the data to a JSON Lines file, one JSON object per line (disabled)
# with open("../our_proptest_data/api_docs.jsonl", "w", encoding="utf-8") as json_file:
#     for item in data:
#         json_file.write(json.dumps(item) + "\n")

In [7]:
# Directory containing one <function_name>.py source file per API
directory = "api_codes"

# One record per source file: {"function_name": ..., "api_code": ...}
data = []

# os.listdir() returns entries in arbitrary order; sort them so the records
# (and any JSONL file written from them) come out in a reproducible order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".py"):
        continue
    # Only the final extension (.py) is removed, so dotted names are preserved.
    function_name = os.path.splitext(filename)[0]
    with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
        api_code = f.read().strip()  # drop leading/trailing whitespace
    data.append({"function_name": function_name, "api_code": api_code})

# Write the data to a JSON Lines file, one JSON object per line (disabled)
# with open("../our_proptest_data/api_codes.jsonl", "w", encoding="utf-8") as json_file:
#     for item in data:
#         json_file.write(json.dumps(item) + "\n")

In [8]:
# Load the per-function documentation and source-code records.
api_doc_data = jsonlines_load("api_docs.jsonl")
api_code_data = jsonlines_load("api_codes.jsonl")

# Merge both record lists into one dict per function, keyed by function name.
# Doc records are processed first, so functions appear in the order of
# api_docs.jsonl, followed by functions that only occur in api_codes.jsonl.
merged_by_name = {}
for records in (api_doc_data, api_code_data):
    for record in records:
        merged_by_name.setdefault(record["function_name"], {}).update(record)

# Downstream code expects a list of merged records rather than a mapping.
api_doc_code_data = list(merged_by_name.values())

# Write the merged records as JSON Lines (disabled)
# with open(f"api_doc_code.jsonl", "w", encoding="utf-8") as json_file:
#     for item in api_doc_code_data:
#         json_file.write(json.dumps(item)+"\n")

In [11]:
# api_doc_code_data = jsonlines_load("api_doc_code.jsonl")
# api_code_data = jsonlines_load("api_codes.jsonl")
# api_doc_data = jsonlines_load("api_docs.jsonl")

# for i in range(len(api_doc_code_data)):
#     for j in range(len(api_doc_data)):
#         if api_doc_code_data[i]['function_name'] == api_doc_data[j]['function_name']:
#             assert api_doc_code_data[i]['api_doc'] == api_doc_data[j]['api_doc']

#     for j in range(len(api_code_data)):
#         if api_doc_code_data[i]['function_name'] == api_code_data[j]['function_name']:
#             assert api_doc_code_data[i]['api_code'] == api_code_data[j]['api_code']
    

## Convert json format into txt format (property)

In [None]:
# Destination folder for the per-function property text files
output_dir = 'code_only/properties'

# Every line of the JSONL file is one record carrying a 'function_name' and a
# 'properties' list; only the first entry of that list is picked up.
with open('code_only/output_jsonl/property/property_0_30_11-07-19-32.jsonl', 'r') as jsonl_in:
    for raw_line in jsonl_in:
        record = json.loads(raw_line.strip())
        function_name = record['function_name']
        properties = record['properties'][0]

        # Target file: <output_dir>/<function_name>.txt
        txt_file_path = os.path.join(output_dir, function_name + '.txt')

        # Writing is currently disabled:
        # with open(txt_file_path, 'w') as txt_file:
        #     txt_file.write(properties)

## Write json file into .py (pbt)

In [None]:
# Root folder; each function gets its own sub-folder of pbt_<n>.py files
output_dir = 'code_only/proptest'

with open('code_only/output_jsonl/pbt/pbt_0_30_11-07-19-48.jsonl', 'r') as jsonl_in:
    for raw_line in jsonl_in:
        record = json.loads(raw_line.strip())
        function_name = record['function_name']
        pbt = record['pbt']

        function_dir = os.path.join(output_dir, function_name)
        os.makedirs(function_dir, exist_ok=True)

        # Files are numbered from 1 in the order the tests appear in the record.
        for test_no, pbt_data in enumerate(pbt, start=1):
            # Remove a surrounding markdown code fence, if present.
            opening_fence = '```python\n'
            closing_fence = '```'
            if pbt_data.startswith(opening_fence):
                pbt_data = pbt_data[len(opening_fence):].strip()
            if pbt_data.endswith(closing_fence):
                pbt_data = pbt_data[:-len(closing_fence)].strip()

            txt_file_path = os.path.join(function_dir, f'pbt_{test_no}.py')

            # Writing is currently disabled:
            # with open(txt_file_path, 'w') as txt_file:
            #     txt_file.write(pbt_data)

## Write mutants in json format into .py file

In [None]:
# single file
with open('mutants/html.escape.jsonl', 'r') as f:
    output_dir = 'our_proptest_data/mutants'
    i = 1
    for line in f:
        data = json.loads(line.strip())
        function_name = data['function_name']
        mutant_data = data['mutants'][0]
        
        function_dir = os.path.join(output_dir, function_name)
        os.makedirs(function_dir, exist_ok=True)

        if mutant_data.startswith('```python\n'):
            mutant_data = mutant_data[len('```python\n'):].strip()
        if mutant_data.endswith('```'):
            mutant_data = mutant_data[:-len('```')].strip()
        txt_file_name = f'mutant_{i}.py'
        txt_file_path = os.path.join(function_dir, txt_file_name)

        # with open(txt_file_path, 'w') as txt_file:
        #     txt_file.write(mutant_data)
        
        i += 1

In [None]:
# Root folder; each function gets its own sub-folder of mutant_<n>.py files
output_dir = "doc_only/mutants/"

for filename in os.listdir("doc_only/output_jsonl/mutants"):
    if not filename.endswith(".jsonl"):
        continue

    data = jsonlines_load("doc_only/output_jsonl/mutants/" + filename)
    # Mutants are numbered from 1 by their position within this file.
    # NOTE(review): numbering restarts for every .jsonl file - confirm that two
    # files never contain records for the same function_name.
    for mutant_no, record in enumerate(data, start=1):
        function_name = record['function_name']
        # Only the first entry of the 'mutants' list is used.
        mutant_data = record['mutants'][0]

        function_dir = os.path.join(output_dir, function_name)
        os.makedirs(function_dir, exist_ok=True)

        # Remove a surrounding markdown code fence, if present.
        opening_fence = '```python\n'
        closing_fence = '```'
        if mutant_data.startswith(opening_fence):
            mutant_data = mutant_data[len(opening_fence):].strip()
        if mutant_data.endswith(closing_fence):
            mutant_data = mutant_data[:-len(closing_fence)].strip()

        txt_file_path = os.path.join(function_dir, f'mutant_{mutant_no}.py')

        # Writing is currently disabled:
        # with open(txt_file_path, 'w') as txt_file:
        #     txt_file.write(mutant_data)

## Generate file for mutant generation in json format

A demo showing the file format. It would be better to convert the data into JSON format automatically.

In [36]:
# Fully qualified name of the function this demo record describes
function_name = "statistics.pstdev"

# Natural-language properties, one string per property
properties = [
    """1. The output of `pstdev` should always be non-negative, as standard deviation cannot be negative.""",
    """2. If the input data set is empty, `pstdev` should raise a `StatisticsError`, indicating that at least one data point is required.""",
    """3. If the input data contains only one data point, the output of `pstdev` should be zero, since there is no variation in a single value.""",
    """4. The output of `pstdev` should be consistent with the output of `pstdev` when the input data is the same, regardless of the order of the data points.""",
    # Disabled: this entry talks about the median rather than pstdev.
    # """5. The median is invariant under the order of the input data; that is, sorting the data before finding the median should yield the same result as finding the median directly from the unsorted data.""",
]

# Property-based test source snippets; pbt_<k> corresponds to property k.
# Each literal is stripped so the stored text has no surrounding blank lines.
pbt_1 = """
@given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
def test_statistics_pstdev_non_negative_property(data):
    result = pstdev(data)
    assert result >= 0
""".strip()

pbt_2 = """
@given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=0))
def test_statistics_pstdev_empty_input_property(data):
    if len(data) == 0:
        try:
            pstdev(data)
            assert False, "Expected StatisticsError for empty input"
        except StatisticsError:
            pass
""".strip()

pbt_3 = """
@given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
def test_statistics_pstdev_single_value_property(data):
    result = pstdev([data[0]])
    assert result == 0
""".strip()

pbt_4 = """
@given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
def test_statistics_pstdev_order_invariance_property(data):
    result1 = pstdev(data)
    result2 = pstdev(data[::-1])
    assert result1 == result2
""".strip()

# Empty placeholder; it is not included in the saved record.
pbt_5 = """
""".strip()

pbts = [pbt_1, pbt_2, pbt_3, pbt_4]  # pbt_5 intentionally left out

# Documentation text stored alongside the properties and tests
api_doc = """statistics.pstdev(data, mu=None)
Return the population standard deviation (the square root of the population variance). See pvariance() for arguments and other details.

>>>
pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
0.986893273527251
""".strip()

# The record that is serialized; key order determines the order in the file.
to_save_test_property = dict(
    function_name=function_name,
    properties=properties,
    pbt=pbts,
    api_doc=api_doc,
)

# The record goes to <output_dir>/<function_name>.jsonl
output_dir = 'sound_valid'
os.makedirs(output_dir, exist_ok=True)

test_file_name = f"{function_name}.jsonl"
test_file_path = os.path.join(output_dir, test_file_name)

# The file holds exactly one line: the JSON-encoded record.
serialized_record = json.dumps(to_save_test_property)
with open(test_file_path, "w", encoding="utf-8") as json_file:
    json_file.write(serialized_record + "\n")