In [1]:
# %% DeepSeek-V3 generator (drop-in replacement for FLAMES/CodeLlama)
# Uses your env/vars if already set; no extra "runner" logic; just call generate(masked_context)
import os, re, time
from typing import Optional
try:
    from openai import OpenAI
except Exception as e:
    raise RuntimeError("Install the OpenAI SDK first: pip install openai") from e

# --- pick up your existing settings, or fall back to sensible defaults
DEEPSEEK_API_KEY = "..."


DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", globals().get("DEEPSEEK_BASE_URL", "https://api.deepseek.com"))
DEEPSEEK_FIM_URL  = os.getenv("DEEPSEEK_FIM_URL",  globals().get("DEEPSEEK_FIM_URL",  "https://api.deepseek.com/beta"))
DEEPSEEK_MODEL    = os.getenv("DEEPSEEK_MODEL",    globals().get("DEEPSEEK_MODEL",    "deepseek-chat"))  # non-reasoning

chat_client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
fim_client  = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_FIM_URL)

# --- predicate extraction (don’t cut at ')' inside the condition)
_PRED_RE = re.compile(r"require\s*\(\s*(.*?)\s*\)\s*;?\s*$")
def _extract_predicate(text: str) -> Optional[str]:
    if not text: return None
    line = re.sub(r"^`+|`+$", "", text.strip()).strip()
    m = _PRED_RE.search(line)
    if m: return m.group(1).strip() 
    line = line.rstrip(";").strip()
    if line.lower().startswith("require(") and line.endswith(")"):
        inner = line[len("require("):-1].strip()
        return inner or None
    return line or None

def _split_mask(s: str):
    if not isinstance(s, str): return (None, None)
    for tok in ["<FILL_ME>", "<MASK>", "<HOLE>", "§MASK§"]:
        i = s.find(tok)
        if i != -1: return s[:i], s[i+len(tok):]
    m = re.search(r"<\s*FILL\s*_?\s*ME\s*>", s, flags=re.I)
    if m: a,b = m.span(); return s[:a], s[b:]
    return (None, None)

def _backoff(attempt: int): time.sleep(min(0.5*(2**attempt), 10.0))

# --- FIM (prefix/suffix) first; suffix anchors the completion
def _ds_fim(prefix: str, suffix: str, *, max_tokens=128, temperature=0.8, top_p=0.95) -> str:
    for a in range(6):
        try:
            r = fim_client.completions.create(
                model=DEEPSEEK_MODEL,
                prompt=prefix,
                suffix=suffix,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            out = (r.choices[0].text or "").strip().splitlines()[0].strip()
            return _extract_predicate(out) or out
        except Exception:
            if a >= 5: raise
            _backoff(a)
    raise RuntimeError("DeepSeek FIM failed after retries.")

# --- chat fallback (no security hinting; same as your RQ1 style)
_DS_SYSTEM = (
    "You are an expert Solidity assistant. When asked to fill a missing predicate, "
    "return EITHER a single line 'require(<predicate>);' OR just the predicate expression. No explanations."
)
_FEW_SHOT = [
    {"role":"user", "content": "Fill the missing predicate in: require(<FILL_ME>); Context: function transfer(address to, uint value) external { require(<FILL_ME>); balances[msg.sender]-=value; balances[to]+=value; }"},
    {"role":"assistant", "content": "balances[msg.sender] >= value"},
    {"role":"user", "content": "Fill the missing predicate in: require(<FILL_ME>); Context: function setOwner(address newOwner) external { require(<FILL_ME>); owner = newOwner; }"},
    {"role":"assistant", "content": "msg.sender == owner"},
]
def _ds_chat(prompt: str, *, max_tokens=64, temperature=0.8, top_p=0.95) -> str:
    for a in range(6):
        try:
            r = chat_client.chat.completions.create(
                model=DEEPSEEK_MODEL,
                messages=[{"role":"system","content":_DS_SYSTEM}, *_FEW_SHOT, {"role":"user","content":prompt}],
                temperature=temperature, top_p=top_p, max_tokens=max_tokens,
                stop=[");", "\n"],  # stop right after first predicate line
            )
            msg = r.choices[0].message.content
            return _extract_predicate(msg) or (msg or "").strip()
        except Exception:
            if a >= 5: raise
            _backoff(a)
    raise RuntimeError("DeepSeek chat failed after retries.")

# --- the generator your loop should call
def deepseek_generate(masked_context: str, extra_hint: str = "") -> str:
    prefix, suffix = _split_mask(masked_context or "")
    if prefix is not None:
        return _ds_fim(prefix, suffix)
    # minimal fallback prompt — DO NOT hint about security
    prompt = f"Fill the missing predicate in: require(<FILL_ME>); Context: {masked_context}\n{extra_hint or ''}"
    return _ds_chat(prompt)

# --- make it the active generator for your existing loop
GENERATOR = deepseek_generate
generate  = deepseek_generate
CURRENT_MODEL_NAME = "DeepSeek-V3"  # optional: for your logging
print("DeepSeek-V3 generator is active: call generate(masked_context) in your existing loop.")


RuntimeError: Install the OpenAI SDK first: pip install openai

In [2]:
# %% Synthesize a require(...) for a given contract + line number using DeepSeek
import re

def synthesize_require_statement(
    contract_text: str,
    line_no: int,
    injection: str = "V",   # "V" (vulnerable line), "alpha" (function entry), "omega" (function exit)
    indent_fallback: str = "    ",
    extra_hint: str | None = None,
    return_predicate_only: bool = False,
) -> str:
    """
    contract_text : full Solidity source (string)
    line_no       : 1-based line number reference inside the source file
    injection     : "V"  -> insert before the given line
                    "alpha" -> insert as first statement in the containing function
                    "omega" -> insert just before the closing brace of the containing function
    returns       : 'require(<predicate>);' (or just '<predicate>' if return_predicate_only=True)
    """
    if not isinstance(contract_text, str):
        raise ValueError("contract_text must be a string")

    lines = contract_text.splitlines()
    if line_no < 1 or line_no > max(1, len(lines)):
        raise ValueError(f"line_no {line_no} out of range (1..{len(lines)})")

    # --- small helpers --------------------------------------------------------
    def _indent_of(s: str) -> str:
        m = re.match(r"^(\s*)", s)
        return m.group(1) if m else ""

    def _find_function_start_line(idx0: int) -> int | None:
        # scan upwards (0-based) to find a line that declares a function
        for i in range(idx0, -1, -1):
            if re.search(r"^\s*function\b", lines[i]):
                return i
        return None

    def _find_open_brace_line(start_i: int) -> int | None:
        # from function decl line forward, find first line that has '{'
        for j in range(start_i, len(lines)):
            if "{" in lines[j]:
                return j
        return None

    def _find_function_end_line(open_line: int) -> int | None:
        # naive brace counting until we close what opened at open_line
        depth = 0
        # include braces on the open_line
        for j in range(open_line, len(lines)):
            for ch in lines[j]:
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    if depth == 0:
                        return j
        return None

    def _build_masked_context(insert_at_line: int, indent_src: str) -> str:
        masked = indent_src + "require(<FILL_ME>);"
        new_lines = lines[:insert_at_line] + [masked] + lines[insert_at_line:]
        return "\n".join(new_lines)

    # --- decide insertion point ----------------------------------------------
    idx0 = line_no - 1  # 0-based
    if injection == "V":
        insert_at = idx0
        indent = _indent_of(lines[idx0]) if idx0 < len(lines) else indent_fallback

    elif injection in ("alpha", "ωalpha", "pre", "entry"):
        fn_decl = _find_function_start_line(idx0)
        if fn_decl is None:
            # fallback: behave like V
            insert_at = idx0
            indent = _indent_of(lines[idx0])
        else:
            open_line = _find_open_brace_line(fn_decl)
            if open_line is None:
                # fallback: insert just after function decl
                insert_at = min(fn_decl + 1, len(lines))
                indent = _indent_of(lines[fn_decl]) + indent_fallback
            else:
                insert_at = min(open_line + 1, len(lines))  # first statement line after '{'
                indent = _indent_of(lines[open_line]) + indent_fallback

    elif injection in ("omega", "post", "exit"):
        fn_decl = _find_function_start_line(idx0)
        if fn_decl is None:
            # fallback: behave like V
            insert_at = idx0
            indent = _indent_of(lines[idx0])
        else:
            open_line = _find_open_brace_line(fn_decl)
            if open_line is None:
                insert_at = min(fn_decl + 1, len(lines))
                indent = _indent_of(lines[fn_decl]) + indent_fallback
            else:
                end_line = _find_function_end_line(open_line)
                if end_line is None:
                    # fallback: end of file
                    insert_at = len(lines)
                    indent = indent_fallback
                else:
                    insert_at = max(end_line, 0)
                    # insert *before* the closing '}' line, with the same indent
                    indent = _indent_of(lines[end_line])

    else:
        raise ValueError(f"Unknown injection='{injection}'. Use 'V', 'alpha', or 'omega'.")

    print(f"Inserting at line {insert_at+1}")
    masked_context = _build_masked_context(insert_at, indent or indent_fallback)

    # --- call your DeepSeek generator (must exist in your notebook) ----------
    try:
        predicate = deepseek_generate(masked_context, extra_hint or "")
    except NameError as e:
        raise RuntimeError(
            "deepseek_generate is not defined. "
            "Run your DeepSeek setup cell first (the one that defines deepseek_generate)."
        ) from e

    predicate = (predicate or "").strip()
    if return_predicate_only:
        return predicate
    return f"require({predicate});"



TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [None]:
# whole contract text
src = open("test.sol").read()

# Generate a guard *at the vulnerable line* (before that line):
print(synthesize_require_statement(src, line_no=53, injection="V"))

# Generate a guard as a *pre-condition* at the function entry that contains line 53:
print(synthesize_require_statement(src, line_no=53, injection="alpha"))

# Generate a guard as a *post-condition* before the function's closing brace:
print(synthesize_require_statement(src, line_no=53, injection="omega"))


## The rest, we just modify

In [None]:
import validation_library as veri   
from peft import PeftConfig, PeftModel
from transformers import LlamaForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os  
import importlib
import gc
from dotenv import load_dotenv
from collections import defaultdict

importlib.reload(veri)

counter_greater_than_200 = 0

contract_lines = []


contracts = veri.get_files("smartbugs-curated/0.4.x/contracts/dataset")

for contract_path, contract_name in contracts:
    contract, line = veri.find_occurrences(contract_path, "// <yes> <report>")
    print(contract_name, line)
    
    if contract.count('\n') > 200:
        counter_greater_than_200 += 1 
    else:
        contract = veri.replace_lines_with_string(contract, line, '')
        contract_lines.append((contract_name, contract, line))

print(contract_lines)


In [None]:
load_dotenv()
token = os.getenv("HF_TOKEN")
#print(token)

all_contracts = []
mapping = []  

#VL

for idx, (contract_name, contract, lines) in enumerate(contract_lines):
   
    for line in lines:
        prompt_with_fill = veri.replace_lines_with_string(contract, [line], 'require(<FILL_ME>);') ##ask if I should generate again or not
        all_contracts.append(prompt_with_fill)
        mapping.append((contract_name, contract, line)) 

In [None]:
print(all_contracts[0])

In [None]:
results_20 = []
counter = 0
for contract in all_contracts:   
    # find at which line the <FILL_ME> token exists, return that line number (0 - ... for line numbers) and replace that line with blank newline
    def find_fill_me_line(contract):
        lines = contract.split("\n")
        for i, line in enumerate(lines):
            if "<FILL_ME>" in line:
                lines[i] = ""
                return i, "\n".join(lines)
        return -1, contract
    line_no, contract = find_fill_me_line(contract)
    #print(f"Found <FILL_ME> token at line {line_no}")
    inv = synthesize_require_statement(contract, line_no=line_no, injection="V") 
    counter += 1
    print(f'c: {counter}')
    print(f'for line: {line_no}, invariant is ``{inv}``')
    print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
    results_20.append(inv)


In [None]:
len(results_20)

In [None]:
results_20 += [''] * (len(all_contracts) - len(results_20))
print(len(all_contracts))
replaced_contracts_VL = []
contracts_with_results = defaultdict(list)



for generated, (contract_name, contract, line) in zip(results_20, mapping):
    pre, post = veri.find_function_bounds(contract, line)
    contracts_with_results[contract_name].append((contract, {
        "VL": (line,f'require({generated});'),
        "pre": (pre+1,""),
        "post": (post,"")
    }))
    
veri.print_json_report("reports/aggregated/DeepSeek/contract_VL.json", contracts_with_results)

In [None]:
from tqdm import tqdm

all_contracts = []
mapping = []  
results_20 = []    

#pre
line_no = []
for contract_name, entries in tqdm(contracts_with_results.items(), desc="Processing contracts"):
    for contract, annotations in entries:
        VL, VL_require = annotations["VL"]
        line_no.append(VL)
        line, _ = annotations["pre"]
        contract_with_VL = veri.replace_lines_with_string(contract, [VL], VL_require)

        inv = synthesize_require_statement(contract_with_VL, line_no=VL, injection="alpha") 
        results_20.append(inv)
        all_contracts.append((contract_with_VL))
        mapping.append((contract_name, contract, line))

In [None]:
#print(all_contracts[0])

In [None]:
results_20 += [''] * (len(all_contracts) - len(results_20))
flat_entries = []
for contract_name, entries in contracts_with_results.items():
    for entry in entries:
        flat_entries.append((contract_name, entry))

for (generated, (contract_name, contract, line)), (_, (existing_contract, annotations)) in zip(zip(results_20, mapping), flat_entries):
    line, _ = annotations["pre"]
    annotations["pre"] = (line, f'require({generated});')

veri.print_json_report("reports/aggregated/DeepSeek/contract_PV.json", contracts_with_results)
    

In [None]:
from tqdm import tqdm

all_contracts = []
mapping = []  
results_20 = []    

#post
line_no = []
for contract_name, entries in tqdm(contracts_with_results.items(), desc="Processing contracts"):
    for contract, annotations in entries:
        VL, VL_require = annotations["VL"]
        line_no.append(VL)
        line, _ = annotations["post"]
        contract_with_VL = veri.replace_lines_with_string(contract, [VL], VL_require)

        inv = synthesize_require_statement(contract_with_VL, line_no=VL, injection="omega") 
        results_20.append(inv)
        all_contracts.append((contract_with_VL))
        mapping.append((contract_name, contract, line))

In [None]:
#print(all_contracts[0])

In [None]:
results_20 += [''] * (len(all_contracts) - len(results_20))
flat_entries = []
for contract_name, entries in contracts_with_results.items():
    for entry in entries:
        flat_entries.append((contract_name, entry))

for (generated, (contract_name, contract, line)), (_, (existing_contract, annotations)) in zip(zip(results_20, mapping), flat_entries):
    annotations["post"] = (line, f'require({generated});')

veri.print_json_report("reports/aggregated/DeepSeek/contracts_PVP.json", contracts_with_results)
    

In [1]:
def rearrange_latex_table(latex_data):
    """
    Rearranges a LaTeX table by moving the last seven numerical columns
    to the beginning of the number sequence for each row.

    Args:
        latex_data (str): A string containing the rows of the LaTeX table.

    Returns:
        str: The rearranged LaTeX table as a string.
    """
    rearranged_lines = []
    # Split the input data into individual lines for processing.
    lines = latex_data.strip().split('\\\\')

    for line in lines:
        # Sanitize the line by removing leading/trailing whitespace.
        line = line.strip()
        if not line:
            continue

        # We split the line at the first ampersand to separate the
        # descriptive part from the numerical data.
        try:
            description_part, numbers_part = line.split('&', 1)
        except ValueError:
            # Skip lines that don't contain an ampersand.
            rearranged_lines.append(line + ' \\\\')
            continue

        # Clean up the description part and ensure it ends with an ampersand.
        description_part = description_part.strip() + ' &'

        # Split the numerical part into a list of numbers.
        # Each number is stripped of whitespace.
        numbers = [num.strip() for num in numbers_part.split('&')]

        # Check if there are enough columns to perform the operation.
        if len(numbers) >= 7:
            # Isolate the last seven columns.
            last_seven_columns = numbers[-7:]
            # Isolate the remaining columns at the beginning.
            first_columns = numbers[:-7]
            # Create the new, rearranged list of numbers.
            rearranged_numbers = last_seven_columns + first_columns
        else:
            # If there are fewer than 7 numbers, no change is made.
            rearranged_numbers = numbers

        # Join the rearranged numbers back into a single string,
        # separated by ampersands for the LaTeX format.
        new_numbers_part = ' & '.join(rearranged_numbers)

        # Reconstruct the full line and add it to our list of results.
        rearranged_lines.append(f"{description_part} {new_numbers_part} \\\\")

    # Join all the processed lines back together with newlines.
    return '\n'.join(rearranged_lines)

# The original LaTeX table data provided in the prompt.
original_table = """
Access control (16)     & 6 & 5 & 5 & 6 & 4 & 6 & 6 & 6 & 7 & 7 & 7 & 7 & 7 & 7 & 7 & 4 & 3 & 4 & 3 & 3 & 3 \\
Arithmetic (20)         & 7 & 4 & 7 & 6 & 5 & 5 & 6 & 6 & 3 & 7 & 6 & 5 & 6 & 4 & 6 & 6 & 1 & 8 & 2 & 4 & 4 \\
Bad randomness (8)      & 1 & 2 & 3 & 3 & 2 & 4 & 4 & 0 & 1 & 1 & 1 & 1 & 1 & 1 & 0 & 0 & 1 & 0 & 1 & 1 & 1 \\
Denial of service (4)   & 1 & 1 & 0 & 1 & 0 & 0 & 0 & 1 & 2 & 0 & 2 & 1 & 1 & 1 & 1 & 1 & 0 & 1 & 1 & 1 & 1 \\
Front running (6)       & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
Other (2)               & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
Reentrancy (26)         & 0 & 1 & 4 & 2 & 6 & 7 & 6 & 0 & 7 & 2 & 7 & 2 & 3 & 3 & 5 & 1 & 0 & 5 & 4 & 0 & 4 \\
Time manipulation (4)   & 1 & 0 & 1 & 1 & 1 & 0 & 0 & 0 & 1 & 2 & 1 & 1 & 2 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
Unchecked LLC (22)      & 9 & 0 & 0 & 10& 3 & 0 & 3 & 7 & 0 & 0 & 6 & 4 & 0 & 4 & 6 & 4 & 4 & 10& 7 & 4 & 7 \\
"""

# Call the function with the original table data.
new_table = rearrange_latex_table(original_table)

# Print the resulting new table to the console.
print("--- Original Table ---")
print(original_table.strip())
print("\n--- Rearranged Table ---")
print(new_table)


--- Original Table ---
Access control (16)     & 6 & 5 & 5 & 6 & 4 & 6 & 6 & 6 & 7 & 7 & 7 & 7 & 7 & 7 & 7 & 4 & 3 & 4 & 3 & 3 & 3 \
Arithmetic (20)         & 7 & 4 & 7 & 6 & 5 & 5 & 6 & 6 & 3 & 7 & 6 & 5 & 6 & 4 & 6 & 6 & 1 & 8 & 2 & 4 & 4 \
Bad randomness (8)      & 1 & 2 & 3 & 3 & 2 & 4 & 4 & 0 & 1 & 1 & 1 & 1 & 1 & 1 & 0 & 0 & 1 & 0 & 1 & 1 & 1 \
Denial of service (4)   & 1 & 1 & 0 & 1 & 0 & 0 & 0 & 1 & 2 & 0 & 2 & 1 & 1 & 1 & 1 & 1 & 0 & 1 & 1 & 1 & 1 \
Front running (6)       & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \
Other (2)               & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \
Reentrancy (26)         & 0 & 1 & 4 & 2 & 6 & 7 & 6 & 0 & 7 & 2 & 7 & 2 & 3 & 3 & 5 & 1 & 0 & 5 & 4 & 0 & 4 \
Time manipulation (4)   & 1 & 0 & 1 & 1 & 1 & 0 & 0 & 0 & 1 & 2 & 1 & 1 & 2 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 \
Unchecked LLC (22)      & 9 & 0 & 0 & 10& 3 & 0 & 3 & 7 & 0 & 0 & 6 & 4 & 0 & 4 & 6 & 4 & 4 & 10&