# Tests4Py Benchmark: MarkUp
This notebook handles benchmark tests for MarkUp in the Tests4Py framework.


In [1]:
# Suppress logging for the notebook; uncomment the last line to disable Avicenna logs
import logging

# This will disable all logging messages
logging.disable(logging.CRITICAL)

### Build Program from Repository

In [2]:
from tests4py.api.logging import deactivate
deactivate()

from debugging_benchmark.tests4py_benchmark.repository import MarkUpBenchmarkRepository

# Initialize the benchmark repository and select the first program
repository = MarkUpBenchmarkRepository()
programs = repository.build()
program = programs[0]  # Assuming there is only one MarkUp Subject; we use markup_1

Cloning into '/tmp/markup_1'...
[33m  DEPRECATION: Legacy editable install of markup==0.0.0 from file:///tmp/markup_1 (setup.py develop) is deprecated. pip 25.0 will enforce this behaviour change. A possible replacement is to add a pyproject.toml or enable --use-pep517, and use setuptools >= 64. If the resulting installation is not behaving as expected, try using --config-settings editable_mode=compat. Please consult the setuptools documentation for more information. Discussion can be found at https://github.com/pypa/pip/issues/11457[0m[33m
[0mCloning into '/tmp/markup_2'...
[33m  DEPRECATION: Legacy editable install of markup==0.0.0 from file:///tmp/markup_2 (setup.py develop) is deprecated. pip 25.0 will enforce this behaviour change. A possible replacement is to add a pyproject.toml or enable --use-pep517, and use setuptools >= 64. If the resulting installation is not behaving as expected, try using --config-settings editable_mode=compat. Please consult the setuptools documenta

### Initialize Avicenna

Initialize the `Avicenna` diagnostic system with specific parameters including minimum recall.

In [3]:
# Create an Avicenna instance with configurations for diagnosis
from avicenna.avicenna import Avicenna

# Convert program to dictionary format for Avicenna initialization
param = program.to_dict()

# Initialize Avicenna with a minimum recall configuration
avicenna = Avicenna(
    **param,
    min_recall=0.7,
)

### Diagnosis Execution and Explanation

In [4]:
# Perform the diagnosis using Avicenna and store the results
from typing import Tuple
from isla.language import Formula

try:
    diagnosis: Tuple[Formula, float, float] = avicenna.explain()
    print("Diagnosis complete.")
except Exception as e:
    print(f"Error during diagnosis: {e}")

printing test inputs
{Input(DerivationTree('<start>', (DerivationTree('<structure>', (DerivationTree('<string>', (DerivationTree('<str>', (DerivationTree('<chars>', (DerivationTree('<char>', (DerivationTree('d', (), id=3396),), id=3397), DerivationTree('<chars>', (DerivationTree('<char>', (DerivationTree('O', (), id=3393),), id=3394),), id=3395)), id=3398),), id=3399),), id=3400),), id=3401),), id=3402), <OracleResult.PASSING: 'PASSING'>), Input(DerivationTree('<start>', (DerivationTree('<structure>', (DerivationTree('<string>', (DerivationTree('<str>', (DerivationTree('<chars>', (DerivationTree('<char>', (DerivationTree('g', (), id=3450),), id=3451), DerivationTree('<chars>', (DerivationTree('<char>', (DerivationTree('K', (), id=3447),), id=3448),), id=3449)), id=3452),), id=3453),), id=3454), DerivationTree('<html>', (DerivationTree('<open>', (DerivationTree('<LPAR>', (DerivationTree('<', (), id=3443),), id=3444), DerivationTree('<string>', (DerivationTree('', (), id=3441),), id=3442

In [None]:
from isla.language import ISLaUnparser

print(f"Avicenna determined the following constraints to describe the failure circumstances:\n")

print(ISLaUnparser(diagnosis[0]).unparse())
print(f"Avicenna calculated a precision of {diagnosis[1]*100:.2f}% and a recall of {diagnosis[2]*100:.2f}%", end="\n\n")

Avicenna determined the following constraints to describe the failure circumstances:

exists <char> elem in start:
  (= elem "\"")
Avicenna calculated a precision of 88.55% and a recall of 100.00%



The constraint: 

```
exists <char> elem in start:
    (= elem "\"")
```

means that the error is predicted when there is a double quote character (`"`) in the evaluated string or text elements. The diagnosis suggests that the presence of a double quote might be triggering errors, likely due to how these characters are handled or expected within the markup context.

## Evaluation

### Predictor

Generate test inputs using a grammar-based fuzzer, and classify these inputs as passing or failing based on the learned constraints.

In [None]:
from debugging_framework.fuzzingbook.fuzzer import GrammarFuzzer
from debugging_framework.input.input import Input, OracleResult

def generate_inputs(grammar, num_inputs=1000):
    fuzzer = GrammarFuzzer(grammar)
    evaluation_data_set = set()

    while len(evaluation_data_set) < num_inputs:
        tree = fuzzer.fuzz()
        evaluation_data_set.add(Input.from_str(grammar=grammar, input_string=tree))

    return evaluation_data_set

def classify_inputs(program, evaluation_data_set):
    oracle = program.get_oracle()
    failing, passing = set(), set()

    for inp in evaluation_data_set:
        oracle_result, exception = oracle(inp)
        if oracle_result == OracleResult.FAILING:
            failing.add(inp)
        elif oracle_result == OracleResult.PASSING:
            passing.add(inp)

    return passing, failing

In [None]:
grammar = program.get_grammar()
evaluation_data_set = generate_inputs(grammar)
passing, failing = classify_inputs(program, evaluation_data_set)

print(f"Generated {len(evaluation_data_set)} unique inputs for evaluation.")
print(f"Generated {len(passing)} passing inputs for evaluation!")
print(f"Generated {len(failing)} failing inputs for evaluation!")

Generated 1000 unique inputs for evaluation.
Generated 961 passing inputs for evaluation!
Generated 39 failing inputs for evaluation!


Calculate and display the precision and recall for the diagnostic results based on the test evaluations.

In [None]:
from isla.evaluator import evaluate

# Calculate Precision and Recall
tp = sum(bool(evaluate(diagnosis[0], inp.tree, grammar)) for inp in failing)
fn = len(failing) - tp
fp = sum(bool(evaluate(diagnosis[0], inp.tree, grammar)) for inp in passing)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% " +
      f"a Recall of {recall*100:.2f}%")

The Diagnosis achieved a Precision of 84.62% a Recall of 56.41%


### Producer

#### Generating more Failing Inputs from Diagnosis

In [None]:
formula = """
exists <char> elem in start:
    (= elem "\\"")
"""

In [None]:
from isla.solver import ISLaSolver

solver = ISLaSolver(
    grammar,
    formula=formula,
    enable_optimized_z3_queries=False
)

In [None]:
failing_inputs = []
for _ in range(100):
    try:
        inp = solver.solve()
        failing_inputs.append(inp)
        # print(str(inp).ljust(30), oracle(inp))
    except StopIteration:
        continue

In [None]:
from typing import List

oracle = program.get_oracle()

producer_failing: List[bool] = []
for inp in failing_inputs:
    oracle_result, exception = oracle(inp)
    producer_failing.append(
        oracle_result.is_failing()
    )

In [None]:
print(f"Generated {len(failing_inputs)} inputs which are expected to be failing. ({sum(not(inp) for inp in producer_failing)} inputs are passing)")

Generated 100 inputs which are expected to be failing. (0 inputs are passing)


#### Generating Passing Inputs by Negating Constraint

In [None]:
# Negated Constraint
formula = """
not(exists <char> elem in start:
    (= elem "\\""))
"""

In [None]:
from isla.solver import ISLaSolver

solver = ISLaSolver(
    grammar,
    formula=formula,
    enable_optimized_z3_queries=False
)

In [None]:
passing_inputs = []
for _ in range(100):
    try:
        inp = solver.solve()
        passing_inputs.append(inp)
        # print(str(inp).ljust(30), oracle(inp))
    except StopIteration:
        continue

In [None]:
oracle = program.get_oracle()

producer_passing: List[bool] = []
for inp in passing_inputs:
    oracle_result, exception = oracle(inp)
    producer_passing.append(
        oracle_result.is_failing()
    )

In [None]:
print(f"Generated {len(passing_inputs)} inputs which are expected to be passing. ({sum(producer_passing)} inputs are failing)")

Generated 11 inputs which are expected to be passing. (1 inputs are failing)


In [None]:
from isla.evaluator import evaluate

# Calculate Precision and Recall
tp = sum(inp for inp in producer_failing)
fn = len(producer_failing) - tp
fp = sum(inp for inp in producer_passing)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"Producer Evaluation:")
print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% " +
      f"a Recall of {recall*100:.2f}%")

Producer Evaluation:
The Diagnosis achieved a Precision of 99.01% a Recall of 100.00%
