This Jupyter notebook shows the gathering and preprocessing of Solidity files for the subsequent fine-tuning of a Large Language Model (LLM). A comprehensive preprocessing pipeline consisting of seven steps was created:


1. Step "Cleaning" - Removes unnecessary parts such as comments or blank lines from each Solidity file.

2. Step "Formatting" - Converts the code in each file so that the final model only generates code in a correct format. 

3. Step "Slither Analysis" - Checks for vulnerabilities in each Solidity file of the dataset. The files are sorted by the vulnerabilities they contain, and vulnerability annotations are added to the line or construct in which the vulnerability was detected.

4. Step "Splitting" - Splits each Solidity file according to the definitions (i.e. contract, interface or library) it contains. The required imports are added to the split files accordingly.

5. Step "Similarity Check" - Checks for duplicate and very similar contracts and removes them accordingly.

6. Step "Solhint Fixes" - Fixes some of the best practice issues detected by Solhint in the Solidity files.

7. Step "Token Insertion" - Inserts special tokens into each remaining Solidity file that mark the end of a sequence, secure code or fill-in-the-middle (FIM) code.

All required libraries

In [None]:
import re                     # For String manipulation and pattern detection inside Strings
import subprocess             # For running console commands inside python (Prettier, Slither, Solhint)
import os                     # For plenty of operating system commands
import glob                   # For finding files via wildcard patterns
import shutil                 # For moving and copying files
import random                 # For random choices/sampling
# NOTE(review): the wildcard import below also exports the function random.random() under the
# name "random", which rebinds the module name imported one line above. After this line,
# "random.<anything>" no longer refers to the module - call the imported functions directly
# (e.g. choice(...), randint(...)) or drop one of the two imports.
from random import *
import json                   # For working with JSON-files
import csv                    # For working with CSV-files
import requests               # For working with APIs
import pandas as pd           # Data science library
from concurrent.futures import ThreadPoolExecutor      # For multithreading slow steps (such as slither analysis)

## Data Gathering

GitHub

In [None]:
# save_path = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Uncleaned_Datasets/Uncleaned_Dataset_GitHub_06.11.24"

# pragma_0_8_pattern = r"pragma\s+solidity\s+[\^<>=]*\s*0\.8\.[0-9]+(?:\s*<\s*0\.9\.0)?;"

# k = 870
# for i in range(5000, 60001, 200):
#   for j in range(0, 200):
#     sol_file = (requests.get(f"https://scr.ide.tuhh.de/api/contracts?language=Solidity&pragma=0.8.0&size=1000..10000&limit=200&skip={i}").json()['data'][j]['versions'][0]['content'])
#     if "import" in sol_file or "assembly" in sol_file:
#       pass
#     elif re.search(pragma_0_8_pattern, sol_file) and "contract" in sol_file:
#       with open(os.path.join(save_path, f"solidity_code_{k}.sol"), "w", encoding='utf-8') as file:
#         file.write(sol_file)
#       k += 1

Etherscan

In [None]:
# data_19_11 = pd.read_csv("D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Uncleaned_Datasets/Uncleaned_Dataset_Etherscan_19.11.24/verified_etherscan_contracts_(19.11.24).csv", usecols=[1])

# api_key = os.environ["ETHERSCAN_API_KEY"]   # Read the key from the environment; never commit API keys to the notebook

# contractAddress_19_11 = data_19_11.iloc[1:, 0]

# save_path_19_11 = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Uncleaned_Datasets/Uncleaned_Dataset_Etherscan_19.11.24"

# j = 1
# for i in range(1, 5001):
#     sol_file = requests.get(f"https://api.etherscan.io/api?module=contract&action=getsourcecode&address={contractAddress_19_11[i]}&apikey={api_key}").json()["result"][0]["SourceCode"]
#     if sol_file[0] == "{" or sol_file[0] == "#":
#         pass
#     else:
#         with open(os.path.join(save_path_19_11, f"solidity_code_{j}.sol"), "w", encoding='utf-8') as file:
#             file.write(sol_file)
#         j += 1

DISL

In [None]:
# from datasets import load_dataset

# dataset = load_dataset("ASSERT-KTH/DISL", "decompose", split="test")

# pragma_0_8_pattern = r"pragma\s+solidity\s+[\^<>=]*\s*0\.8\.[0-9]+(?:\s*<\s*0\.9\.0)?;"
# save_path = "/content/drive/MyDrive/Colab_Notebooks_2/DISL_Contracts/"

# k = 1
# for i in range(0, len(dataset["train"])):
#   sol_file = dataset["train"][i]['source_code']
#   if "import" in sol_file:
#     pass
#   elif re.search(pragma_0_8_pattern, sol_file) and "contract" in sol_file:
#     with open(os.path.join(save_path, f"solidity_code_{k}.sol"), "w", encoding='utf-8') as file:
#       file.write(sol_file)
#     k += 1

## Data Preprocessing

Step 1: Cleaning

In [2]:
# Counts how many contracts were rejected (i.e., not written to the output folder) during this step
count = 0

# Matches string literals and comments in a single left-to-right pass over the source.
# Because the scanner consumes whichever construct starts first, a "//" or "/*" inside a
# string literal is never mistaken for a comment, and a quote or "//" inside a comment is
# never mistaken for a string or a nested comment.
_STRING_OR_COMMENT_PATTERN = re.compile(
    r'"(?:\\.|[^"\\\n])*"'        # double-quoted string literal (kept)
    r"|'(?:\\.|[^'\\\n])*'"       # single-quoted string literal (kept)
    r"|(//[^\n]*|/\*.*?\*/)",     # group 1: line comment or block comment (removed)
    re.DOTALL,                    # lets block comments span several lines
)


# Removes all line and block comments from the code while leaving string literals untouched
def _strip_comments(code):
    return _STRING_OR_COMMENT_PATTERN.sub(
        lambda match: "" if match.group(1) is not None else match.group(0),
        code,
    )


# Cleans the contract, i.e., unnecessary comments are removed and so on.
def clean_contract(path_r, path_w):
    """Clean one Solidity file and write the result to the folder ``path_w``.

    The file is only kept if it contains a ``pragma solidity`` directive for 0.8.x and
    no directive for a version <= 0.7.x. For kept files the function

    * normalizes ``//SPDX-License-Identifier:`` to ``// SPDX-License-Identifier:``,
    * removes ``pragma experimental`` directives,
    * replaces every solidity pragma by a single ``pragma solidity ^0.8.0;``,
    * removes all line and block comments (string literals are left untouched),
    * puts the first SPDX line found (or ``UNLICENSE`` if none) at the top of the file.

    Rejected files are not written; the global ``count`` is incremented and a message
    is printed instead.

    Args:
        path_r: Path of the Solidity file to read.
        path_w: Existing folder the cleaned file is written to (same file name).

    Returns:
        None.
    """
    global count

    with open(path_r, "r", encoding='utf-8') as file:    # utf-8 encoding is necessary as some contracts have special characters in their comments
        cleaned_code = file.read()

    file_name = os.path.basename(path_r)

    pragma_0_8_pattern = r"pragma\s+solidity\s+[\^<>=]*\s*0\.8\.[0-9]+(?:\s*<\s*0\.9\.0)?;"
    pragma_0_7_below_pattern = r"pragma\s+solidity\s+[\^<>=]*\s*[0-9]+\.[0-7]+\.[0-9]+(?:\s*<\s*0\.[0-9]+\.[0-9])?;"

    # Sometimes there are different pragmas in one file, only files that exclusively use 0.8.x are kept
    if not re.search(pragma_0_8_pattern, cleaned_code) or re.search(pragma_0_7_below_pattern, cleaned_code):
        count += 1
        print(f"Deleted {file_name} | Deleted files: {count}")
        return

    # Inserts a space between // and the SPDX-License-Identifier. In some contracts this is not the case
    cleaned_code = re.sub(r'//SPDX-License-Identifier:', '// SPDX-License-Identifier:', cleaned_code)

    # Searches for the "// SPDX-License-Identifier" line in the contract (no matter if at the beginning or below some other comments) and stores it,
    # because the comment removal below deletes it as well
    spdx_pattern = r"^// SPDX-License-Identifier: .*$"
    spdx_matches = re.findall(spdx_pattern, cleaned_code, re.MULTILINE)

    # Optional: removes import statements
    # cleaned_code = re.sub(r"import [^{};]*\;", "", cleaned_code)

    # Removes pragma directives except for the solidity pragma
    cleaned_code = re.sub(r"pragma\s+experimental.*?;\n", "", cleaned_code)

    # Normalizes the solidity pragma to a standard version
    cleaned_code = re.sub(r"pragma\s+solidity\s+[\^<>=]*\s*[0-9]+\.[0-9]+\.[0-9]+(?:\s*<\s*[0-9]+\.[0-9]+\.[0-9])?;", "", cleaned_code)
    cleaned_code = "pragma solidity ^0.8.0;" + "\n" + cleaned_code

    # Removes single-line comments and comment blocks in one pass, but only if they are not
    # inside strings "" or ''. Doing both at once matters: stripping "//..." first would cut
    # the closing "*/" off a comment such as "/* see https://example.com */", and the block
    # comment removal would then delete real code up to the next "*/".
    cleaned_code = _strip_comments(cleaned_code)

    # Inserts the "// SPDX-License-Identifier" with corresponding license back into the contract
    if spdx_matches:
        # In some contracts are more than one SPDX line, therefore use only the first in the matching list
        spdx_line = spdx_matches[0]
        if not cleaned_code.startswith(spdx_line):
            cleaned_code = spdx_line + "\n" + cleaned_code
    else:
        # Sometimes no License-Identifier is given, hence UNLICENSE is added
        cleaned_code = "// SPDX-License-Identifier: UNLICENSE" + "\n" + cleaned_code

    output_file = os.path.join(path_w, file_name)
    with open(output_file, "w", encoding='utf-8') as file:
        file.write(cleaned_code)

# Heuristic used to decide whether the "//" on a line should be left alone because the line
# (probably) contains a string literal
def is_in_string(line):
    """Return True if the line should be treated as containing a string.

    This is a pure counting heuristic, not a parser. It returns True only if all of
    the following hold:

    * the number of "//" occurrences on the line is odd,
    * the line, with all whitespace removed, does not start with "/",
    * the number of double quotes or the number of single quotes leaves a
      remainder of 2 when divided by 3 (e.g. 2, 5, 8 quotes).
    """
    # An even number of "//" markers never counts as "in a string"
    odd_comment_markers = line.count("//") % 2 == 1

    # Whitespace is dropped so that indented comment lines are recognized by startswith()
    compact_line = "".join(line.split())

    if not odd_comment_markers or compact_line.startswith('/'):
        return False

    # Either quote style may satisfy the quote-count condition
    return any(line.count(quote) % 3 == 2 for quote in ('"', "'"))


In [19]:
# Folder with the raw Solidity files that should be cleaned
src_folder = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset"

# Only regular files are cleaned, sub-folders are skipped
solidity_files = list(filter(lambda name: os.path.isfile(os.path.join(src_folder, name)), os.listdir(src_folder)))

# Cleans every file and stores the result under the same file name in the target folder
for file in solidity_files:
    clean_contract(
        f"{src_folder}/{file}",
        "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Without_Imports-Pragmas-SPDX",
    )

# for i in range(1, 4572):
#     clean_contract(f"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Uncleaned_Datasets/Uncleaned_Dataset_Etherscan_25.11.24/solidity_code_{i}.sol",
#                    f"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Cleaned_Datasets/Etherscan_25.11.24")

Step 2: Formatting

In [337]:
# Folder whose Solidity files are formatted in place; also used as working directory for Prettier
folder_path = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Test3"

# Formats a solidity smart contract using Prettier and the corresponding plugin "prettier-plugin-solidity"
def format_contract(file_name):
    """Format ``file_name`` (relative to ``folder_path``) in place with Prettier.

    If Prettier exits with a non-zero status, the file is deleted from ``folder_path``
    (if it exists there) and the error is printed.

    Args:
        file_name: Name of the file inside ``folder_path``.
    """
    # With "subprocess" one can execute console commands in python
    prettier_command = [
        "C:/Users/Fabian Hensel/AppData/Roaming/npm/prettier.cmd",
        "--write",
        "--plugin=prettier-plugin-solidity",
        file_name,
    ]
    try:
        # check=True turns a non-zero exit code of Prettier into a CalledProcessError
        subprocess.run(prettier_command, check=True, cwd=folder_path)
    except subprocess.CalledProcessError as e:
        # Files that Prettier cannot format are removed from the dataset
        broken_file = os.path.join(folder_path, file_name)
        if not os.path.isfile(broken_file):
            print(f"{file_name} not found.")
        else:
            os.remove(broken_file)
            print(f"{file_name} deleted!")
        print(f"Error formatting {file_name}: {e}")


# Runs the prettier formatting parallel
def run_prettier_parallel(path_r):
    """Format every entry of the folder ``path_r`` with ``format_contract`` in parallel.

    One task per directory entry is submitted to a thread pool with ``os.cpu_count()``
    workers. Errors that ``format_contract`` does not handle itself (for example a
    missing Prettier executable) are reported per file instead of being silently
    discarded together with the unused results.

    NOTE(review): ``format_contract`` resolves file names relative to the global
    ``folder_path``, so ``path_r`` is expected to be that same folder.

    Args:
        path_r: Folder whose entries are passed to ``format_contract``.
    """
    contract_files = os.listdir(path_r)

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        submitted = [(name, executor.submit(format_contract, name)) for name in contract_files]

    # Leaving the "with" block waits for all tasks, so every future is finished here.
    # An exception raised inside a worker is stored in its future and would otherwise never surface.
    for name, future in submitted:
        error = future.exception()
        if error is not None:
            print(f"Unexpected error while formatting {name}: {error!r}")

    print("Parallel formatting completed!")

In [None]:
# Formats all files of folder_path in place (files that Prettier rejects are deleted by format_contract)
run_prettier_parallel(folder_path)

Step 3: Slither Analysis

In [None]:
# Counts how many contracts were removed during this step
count = 0

# Selects solc 0.8.28 via solc-select (the version is pinned here, it is not "the newest" automatically);
# "--always-install" presumably installs that version first if it is missing - see the solc-select docs
subprocess.run(["solc-select", "use", "--always-install", "0.8.28"])

# Folder with the cleaned contracts that have not been analyzed yet
unchecked_contracts_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Cleaned_Datasets/DISL_27.11.24/contracts_8" 
# Splits the path into components
path_parts = os.path.normpath(unchecked_contracts_dir).split(os.sep)
# Gets the last THREE components (despite the variable name); all three are used to strip the
# file location "(<a>/<b>/<c>/...)" from the Slither descriptions in insert_comments_with_position
last_two_folders = path_parts[-3:]

# Target folders, one per vulnerability severity
low_vulnerability_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/Low" 
medium_vulnerability_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/Medium"
high_vulnerability_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/High"

# Target folders for contracts without vulnerabilities
# NOTE(review): presumably "Secure" holds contracts without any finding and "Opt_Issue" holds secure
# contracts that still have optimization findings - confirm against the code that uses these folders
secure_optimized_contracts_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/Secure"
secure_contracts_dir = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/Opt_Issue"
# Maps each Slither detector name (the "check" reported by Slither) to a short description of the
# issue and a recommendation how to avoid it. The texts are inserted into the annotation comments
# of the contracts, e.g. as fallback when the description reported by Slither itself is too long.
slither_vulnerabilities_and_optimizations = {
    "abiencoderv2-array": {
        "description": "'solc' versions '0.4.7-0.5.9' contain a compiler bug leading to incorrect ABI encoder usage.",
        "recommendation": "Use a compiler >= '0.5.10'."
    },

    "arbitrary-send-erc20": {
        "description": "'msg.sender' is not used as 'from' in 'transferFrom'.",
        "recommendation": "Use 'msg.sender' as 'from' in 'transferFrom'."
    },

    "array-by-reference": {
        "description": "Arrays passed to a function that expects reference to a storage array.",
        "recommendation": "Ensure the correct usage of 'memory' and 'storage' in the function parameters. Make all the locations explicit."
    },

    "encode-packed-collision": {
        "description": "Collision due to dynamic type usages in 'abi.encodePacked'.",
        "recommendation": "Do not use more than one dynamic type in 'abi.encodePacked()' (see the Solidity documentation). Use 'abi.encode()', preferably."
    },

    "incorrect-shift": {
        "description": "Values in the shift operation are reversed.",
        "recommendation": "Swap the order of parameters."
    },

    "multiple-constructors": {
        "description": "Multiple constructor definitions in the same contract (using new and old schemes).",
        "recommendation": "Only declare one constructor, preferably using the new scheme 'constructor(...)' instead of 'function <contractName>(...)'."
    },

    "name-reused": {
        "description": "Codebase has two contracts with similar names, the compilation artifacts will not contain one of the contracts with the duplicate name.",
        "recommendation": "Rename the contract."
    },

    "protected-vars": {
        "description": "Unprotected variable that is marked as protected.",
        "recommendation": "Add access controls to the vulnerable function."
    },

    "public-mappings-nested": {
        "description": "Prior to Solidity 0.5, a public mapping with nested structures returned incorrect values.",
        "recommendation": "Do not use public mapping with nested structures."
    },

    "rtlo": {
        "description": "An attacker can manipulate the logic of the contract by using a right-to-left-override character '(U+202E)'.",
        "recommendation": "Special control characters must not be allowed."
    },

    "shadowing-state": {
        "description": "Shadowed state variables. 'owner' of 'BaseContract' is never assigned and the modifier 'isOwner' does not work.",
        "recommendation": "Remove the state variable shadowing."
    },

    "suicidal": {
        "description": "Unprotected call to a function executing 'selfdestruct'/'suicide'.",
        "recommendation": "Protect access to all sensitive functions."
    },

    "uninitialized-state": {
        "description": "Uninitialized state variables",
        "recommendation": "Initialize all the variables. If a variable is meant to be initialized to zero, explicitly set it to zero to improve code readability."
    },

    "uninitialized-storage": {
        "description": "An uninitialized storage variable will act as a reference to the first state variable, and can override a critical variable.",
        "recommendation": "Initialize all storage variables."
    },

    "unprotected-upgrade": {
        "description": "Logic contract that can be destructed.",
        "recommendation": "Add a constructor to ensure 'initialize' cannot be called on the logic contract."
    },

    "arbitrary-send-erc20-permit": {
        "description": "'msg.sender' is not used as 'from' in 'transferFrom' and permit is used.",
        "recommendation": "Ensure that the underlying ERC20 token correctly implements a permit function."
    },

    "arbitrary-send-eth": {
        "description": "Unprotected call to a function sending Ether to an arbitrary address.",
        "recommendation": "Ensure that an arbitrary user cannot withdraw unauthorized funds."
    },

    "controlled-array-length": {
        "description": "Direct assignment of an array's length.",
        "recommendation": "Do not allow array lengths to be set directly; instead, add values as needed. Otherwise, thoroughly review the contract to ensure a user-controlled variable cannot reach an array length assignment."
    },

    "controlled-delegatecall": {
        "description": "'delegatecall' or 'callcode' to an address controlled by the user.",
        "recommendation": "Avoid using 'delegatecall'. Use only trusted destinations."
    },

    "delegatecall-loop": {
        "description": "Use of 'delegatecall' inside a loop in a payable function.",
        "recommendation": "Carefully check that the function called by 'delegatecall' is not payable/doesn't use 'msg.value'."
    },

    "incorrect-exp": {
        "description": "Use of bitwise 'xor ^' instead of exponential '**'.",
        "recommendation": "Use the correct operator '**' for exponentiation."
    },

    "incorrect-return": {
        "description": "'return' in an assembly block halts unexpectedly the execution.",
        "recommendation": "Use the 'leave' statement."
    },

    "msg-value-loop": {
        "description": "Use of 'msg.value' inside a loop.",
        "recommendation": "Provide an explicit array of amounts alongside the receivers array, and check that the sum of all amounts matches 'msg.value'."
    },

    "reentrancy-eth": {
        "description": "Reentrancy bug (see 'https://github.com/crytic/not-so-smart-contracts/tree/master/reentrancy'). Do not report reentrancies that don't involve Ether.",
        "recommendation": "Apply the check-effects-interactions pattern (see 'https://docs.soliditylang.org/en/v0.4.21/security-considerations.html#re-entrancy')."
    },

    "return-leave": {
        "description": "A 'return' is used where a 'leave' should be used.",
        "recommendation": "Use the 'leave' statement."
    },

    "storage-array": {
        "description": "'solc' versions '0.4.7-0.5.9' contain a compiler bug leading to incorrect values in signed integer arrays.",
        "recommendation": "Use a compiler version >= '0.5.10'."
    },

    "unchecked-transfer": {
        "description": "The return value of an external 'transfer'/'transferFrom' call is not checked.",
        "recommendation": "Use 'SafeERC20', or ensure that the 'transfer'/'transferFrom' return value is checked."
    },

    "weak-prng": {
        "description": "Weak PRNG (Pseudo Random Number Generator) due to a modulo on 'block.timestamp', 'now' or 'blockhash'. These can be influenced by miners to some extent.",
        "recommendation": "Do not use 'block.timestamp', 'now' or 'blockhash' as a source of randomness."
    },

    "codex": {
        "description": "Use codex (see 'https://openai.com/index/openai-codex/') to find vulnerabilities.",
        "recommendation": "Review codex's message."
    },

    "domain-separator-collision": {
        "description": "An ERC20 token has a function whose signature collides with EIP-2612's 'DOMAIN_SEPARATOR()', causing unanticipated behavior for contracts using 'permit' functionality.",
        "recommendation": "Remove or rename the function that collides with 'DOMAIN_SEPARATOR()'."
    },

    "enum-conversion": {
        "description": "Out-of-range 'enum' conversion ('solc' < '0.4.5').",
        "recommendation": "Use a recent compiler version. If 'solc' < '0.4.5' is required, check the 'enum' conversion range."
    },

    "erc20-interface": {
        "description": "Incorrect return values for 'ERC20' functions. A contract compiled with Solidity > '0.4.22' interacting with these functions will fail to execute them, as the return value is missing.",
        "recommendation": "Set the appropriate return values and types for the defined 'ERC20' functions."
    },

    "erc721-interface": {
        "description": "Incorrect return values for 'ERC721' functions. A contract compiled with solidity > '0.4.22' interacting with these functions will fail to execute them, as the return value is missing.",
        "recommendation": "Set the appropriate return values and types for the defined 'ERC721' functions."
    },

    "incorrect-equality": {
        "description": "Use of strict equalities that can be easily manipulated by an attacker.",
        "recommendation": "Don't use strict equality to determine if an account has enough Ether or tokens."
    },

    "locked-ether": {
        "description": "Contract with a 'payable' function, but without a withdrawal capacity.",
        "recommendation": "Remove the 'payable' attribute or add a withdraw function."
    },

    "mapping-deletion": {
        "description": "A deletion in a structure containing a mapping will not delete the mapping (see the Solidity documentation). The remaining data may be used to compromise the contract.",
        "recommendation": "Use a lock mechanism instead of a deletion to disable structure containing a mapping."
    },

    "shadowing-abstract": {
        "description": "State variables shadowed from abstract contracts.",
        "recommendation": "Remove the state variable shadowing."
    },

    "tautological-compare": {
        "description": "A variable compared to itself is probably an error as it will always return 'true' for '==', '>=', '<=' and always 'false' for '<', '>' and '!='.",
        "recommendation": "Remove comparison or compare to different value."
    },

    "tautology": {
        "description": "Expressions that are tautologies or contradictions.",
        "recommendation": "Fix the incorrect comparison by changing the value type or the comparison."
    },

    "write-after-write": {
        "description": "Variables that are written but never read and written again.",
        "recommendation": "Fix or remove the writes."
    },

    "boolean-cst": {
        "description": "Misuse of a Boolean constant.",
        "recommendation": "Verify and simplify the condition."
    },

    "constant-function-asm": {
        "description": "Functions declared as constant/pure/view using assembly code. A call to an incorrectly labeled function may trap a contract compiled with Solidity 0.5.",
        "recommendation": "Ensure the attributes of contracts compiled prior to Solidity 0.5.0 are correct."
    },

    "constant-function-state": {
        "description": "Functions declared as constant/pure/view change the state. A call to an incorrectly labeled function may trap a contract compiled with Solidity 0.5.",
        "recommendation": "Ensure that attributes of contracts compiled prior to Solidity 0.5.0 are correct."
    },

    "divide-before-multiply": {
        "description": "Solidity's integer division truncates. Thus, performing division before multiplication can lead to precision loss.",
        "recommendation": "Consider ordering multiplication before division."
    },

    "out-of-order-retryable": {
        "description": "Out-of-order retryable transactions.",
        "recommendation": "Do not rely on the order or successful execution of retryable tickets."
    },

    "reentrancy-no-eth": {
        "description": "Reentrancy bug (see 'https://github.com/crytic/not-so-smart-contracts/tree/master/reentrancy'). Do not report reentrancies that involve Ether.",
        "recommendation": "Apply the check-effects-interactions pattern (see 'https://docs.soliditylang.org/en/v0.4.21/security-considerations.html#re-entrancy')."
    },

    "reused-constructor": {
        "description": "The same base constructor is called with arguments from two different locations in the same inheritance hierarchy.",
        "recommendation": "Remove the duplicate constructor call."
    },

    "tx-origin": {
        "description": "'tx.origin'-based protection can be abused by a malicious contract if a legitimate user interacts with the malicious contract.",
        "recommendation": "Do not use 'tx.origin' for authorization."
    },

    "unchecked-lowlevel": {
        "description": "The return value of a low-level call is not checked.",
        "recommendation": "Ensure that the return value of a low-level call is checked or logged."
    },

    "unchecked-send": {
        "description": "The return value of a 'send' is not checked.",
        "recommendation": "Ensure that the return value of 'send' is checked or logged."
    },

    "uninitialized-local": {
        "description": "Uninitialized local variables.",
        "recommendation": "Initialize all the variables. If a variable is meant to be initialized to zero, explicitly set it to zero to improve code readability."
    },

    "unused-return": {
        "description": "The return value of an external call is not stored in a local or state variable.",
        "recommendation": "Ensure that all the return values of the function calls are used."
    },

    "incorrect-modifier": {
        "description": "If a modifier does not execute '_' or revert, the execution of the function will return the default value, which can be misleading for the caller.",
        "recommendation": "All the paths in a modifier must execute '_' or revert."
    },

    "shadowing-builtin": {
        "description": "Shadowing built-in symbols using local variables, state variables, functions, modifiers, or events.",
        "recommendation": "Rename the local variables, state variables, functions, modifiers, and events that shadow a builtin symbol."
    },

    "shadowing-local": {
        "description": "Shadowing using local variables.",
        "recommendation": "Rename the local variables that shadow another component."
    },

    "uninitialized-fptr-cst": {
        "description": "'solc' versions '0.4.5-0.4.26' and '0.5.0-0.5.8' contain a compiler bug leading to unexpected behavior when calling uninitialized function pointers in constructors.",
        "recommendation": "Initialize function pointers before calling. Avoid function pointers if possible."
    },

    "variable-scope": {
        "description": "Usage of a variable before the declaration is stepped over.",
        "recommendation": "Move all variable declarations prior to any usage of the variable, and ensure that reaching a variable declaration does not depend on some conditional if it is used unconditionally."
    },

    "void-cst": {
        "description": "Call to a constructor that is not implemented.",
        "recommendation": "Remove the constructor call."
    },

    "calls-loop": {
        "description": "Calls inside a loop might lead to a denial-of-service attack.",
        "recommendation": "Favor pull over push strategy for external calls."
    },

    "events-access": {
        "description": "Missing events for critical access control parameters.",
        "recommendation": "Emit an event for critical parameter changes."
    },

    "events-maths": {
        "description": "Missing events for critical arithmetic parameters.",
        "recommendation": "Emit an event for critical parameter changes."
    },

    "incorrect-unary": {
        "description": "Unary expressions such as 'x=+1' probably typos.",
        "recommendation": "Remove the unary expression."
    },

    "missing-zero-check": {
        "description": "Missing zero address validation.",
        "recommendation": "Check that the address is not zero."
    },

    "reentrancy-benign": {
        "description": "Reentrancy bug (see 'https://github.com/crytic/not-so-smart-contracts/tree/master/reentrancy'). Only report reentrancy that acts as a double call.",
        "recommendation": "Apply the check-effects-interactions pattern (see 'https://docs.soliditylang.org/en/v0.4.21/security-considerations.html#re-entrancy')."
    },

    "reentrancy-events": {
        "description": "Reentrancies (see 'https://github.com/crytic/not-so-smart-contracts/tree/master/reentrancy') that allow manipulation of the order or value of events.",
        "recommendation": "Apply the check-effects-interactions pattern (see 'https://docs.soliditylang.org/en/v0.4.21/security-considerations.html#re-entrancy')."
    },

    "return-bomb": {
        "description": "A low level callee may consume all callers gas unexpectedly.",
        "recommendation": "Avoid unlimited implicit decoding of returndata."
    },

    "timestamp": {
        "description": "Dangerous usage of 'block.timestamp'. 'block.timestamp' can be manipulated by miners.",
        "recommendation": "Avoid relying on 'block.timestamp'."
    },

    "cache-array-length": {
        "description": "'for' loops that use 'length' member of some storage array in their loop condition and don't modify it.",
        "recommendation": "Cache the lengths of storage arrays if they are used and not modified in 'for' loops."
    },

    "constable-states": {
        "description": "State variables that are not updated following deployment should be declared 'constant' to save gas.",
        "recommendation": "Add the 'constant' attribute to state variables that never change."
    },

    "external-function": {
        "description": "'public' functions that are never called by the contract should be declared 'external', and its immutable parameters should be located in 'calldata' to save gas.",
        "recommendation": "Use the 'external' attribute for functions never called from the contract, and change the location of immutable parameters to 'calldata' to save gas."
    },

    "immutable-states": {
        "description": "State variables that are not updated following deployment should be declared 'immutable' to save gas.",
        "recommendation": "Add the 'immutable' attribute to state variables that never change or are set only in the constructor."
    },

    "var-read-using-this": {
        "description": "The contract reads its own variable using 'this', adding overhead of an unnecessary STATICCALL.",
        "recommendation": "Read the variable directly from storage instead of calling the contract."
    },
}

# This function annotates the vulnerabilities in the sol code with comments
def insert_comments_with_position(code_lines, issues):
    """Insert Slither findings as '//' comments into the source lines of a contract.

    Args:
        code_lines: List of source lines. The list is modified in place.
        issues: List of issue dicts. The keys read here are 'line', 'parent_line', 'type',
            'severity', 'description', 'id', 'construct', 'parent_construct' and 'parent_name'.
            'line' and 'parent_line' are used as 1-based line numbers.

    Returns:
        The same list object with the comment strings inserted.

    Raises:
        KeyError: If an annotated issue type has no entry in
            'slither_vulnerabilities_and_optimizations'.

    Reads the module-level names 'last_two_folders' (its first three entries build the path
    prefix that is stripped from descriptions) and 'slither_vulnerabilities_and_optimizations'.
    """
    if not issues:
        return code_lines

    vulnerability_severities = ("High", "Medium", "Low")
    # As the code will be splitted later on, issue annotations in libraries or interfaces are unnecessary
    skipped_parents = ("Ownable", "ERC20", "IERC20Metadata", "Context")

    # Removes the "(folder/folder/folder/file#lines)" source references from the Slither description.
    # The folder names are escaped so that they are matched literally.
    folder_prefix = "/".join(re.escape(last_two_folders[k]) for k in range(3))
    path_pattern = re.compile(rf"\s*\({folder_prefix}/.*?\)")

    def indent_level(line_number):
        # Number of 4-space indentation steps of the given (1-based) line; 0 if the line does not exist
        if line_number is None or not 1 <= line_number <= len(code_lines):
            return 0
        line = code_lines[line_number - 1]
        return (len(line) - len(line.lstrip(' '))) // 4

    def vulnerability_headline(issue, text):
        return f"// WARNING Vulnerability ({issue['type']} | severity: {issue['severity']} | ID: {issue['id']}): {text}"

    def optimization_headline(issue, text):
        return f"// WARNING Optimization Issue ({issue['type']} | ID: {issue['id']}): {text}"

    def with_recommendation(indent, headline, issue):
        # The headline is expected to end with a line break (Slither descriptions do, the dictionary fallback adds one)
        recommendation = f"// Recommendation for {issue['id']}: {slither_vulnerabilities_and_optimizations[issue['type']]['recommendation']}"
        return indent + headline + indent + recommendation + "\n"

    # Findings without a line number cannot be placed and are ignored
    located_issues = [issue for issue in issues if issue['line'] is not None]
    issues_sorted = sorted(located_issues, key=lambda x: x['line'], reverse=True)

    # First pass: decide for every issue WHERE (index in the unmodified code) and WHAT to insert
    planned = []
    for issue in issues_sorted:
        if issue['parent_name'] in skipped_parents:
            continue

        # If the slither analysis description is less than 200 characters (see 1) it is preferred before the description of the dictionary
        description = path_pattern.sub("", issue['description'])
        construct = issue['construct']
        parent_construct = issue['parent_construct']
        severity = issue['severity']
        insert_at = issue['line'] - 1
        comment = ""

        if construct in ("function", "contract") or (construct == "variable" and parent_construct == "contract"):
            indent = indent_level(issue['line']) * "\t"
            if severity in vulnerability_severities:
                if len(description) < 200:   # 1
                    headline = vulnerability_headline(issue, description)
                else:
                    headline = vulnerability_headline(issue, slither_vulnerabilities_and_optimizations[issue['type']]['description']) + "\n"
                comment = with_recommendation(indent, headline, issue)
            elif severity == "Optimization":
                comment = with_recommendation(indent, optimization_headline(issue, description), issue)

        elif construct == "variable" and parent_construct == "function":
            # Local variables are annotated at the enclosing function
            insert_at = issue['parent_line'] - 1
            indent = indent_level(issue['parent_line']) * "\t"
            if severity in vulnerability_severities:
                comment = with_recommendation(indent, vulnerability_headline(issue, description), issue)
            elif severity == "Optimization":
                comment = with_recommendation(indent, optimization_headline(issue, description), issue)

        elif construct == "node" or (construct == "variable" and not parent_construct == "contract"):
            # type of issue (e.g. reentrancy-benign) | ID of corresponding issue
            comment = (indent_level(issue['line']) * "\t") + f"// {issue['type']} | ID: {issue['id']}" + "\n"

        if comment and 0 <= insert_at < len(code_lines):
            planned.append((insert_at, comment))

    # Second pass: insert from below to above to avoid line shifts. The order is determined by the
    # effective insertion index (which is the parent line for local variables), not by the issue line.
    # The sort is stable, i.e., issues for the same position keep their relative processing order.
    planned.sort(key=lambda entry: entry[0], reverse=True)
    for insert_at, comment in planned:
        code_lines.insert(insert_at, comment)

    return code_lines

# Analyses any contract and inserts vulnerability information
def analyze_contract(contract_file):
    """Run Slither on one contract, annotate its findings and file the result by severity.

    The contract is read from 'unchecked_contracts_dir', analysed with Slither (informational
    findings excluded), annotated via 'insert_comments_with_position' and written to one of the
    module-level target directories: 'high_vulnerability_dir', 'medium_vulnerability_dir',
    'low_vulnerability_dir', 'secure_contracts_dir' (only optimization findings) or
    'secure_optimized_contracts_dir' (no findings).

    Args:
        contract_file: File name of the contract inside 'unchecked_contracts_dir'.

    Returns:
        None. Errors are printed; any unexpected error additionally increments the global 'count'.
    """
    global count
    output_json = f"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/DISL_Contract_Slither_Analysis/Output_JSON/{contract_file}.json"

    try:
        contract_path = os.path.join(unchecked_contracts_dir, contract_file)

        # Runs the slither analysis and captures the output as json file. Informational issues are excluded.
        # The exit status is deliberately not checked (no check=True); a missing JSON file surfaces below.
        subprocess.run(
            ["slither", contract_path, "--exclude-informational", "--json", output_json],
            capture_output=True,
            text=True
        )

        # Reads the JSON result
        with open(output_json, encoding="utf-8") as file:
            data = json.load(file)       # the json file with the corresponding slither analysis result

        detectors = data.get("results", {}).get("detectors", [])   # empty if slither reported nothing

        # issues are annotated to the corresponding parent of the line where the vulnerability was found,
        # i.e., to the function/contract which contains the statement or to the exact line
        issues = []
        for detector in detectors:
            elements = detector.get("elements", [])
            if not elements:
                continue
            description = detector.get("description", None).replace("\n\t", " ").replace("-", "").replace(":", "")
            issue_id = detector.get("id", None)[-7:]
            for element in elements:
                parent = element.get("type_specific_fields", {}).get("parent", {})
                issues.append({
                    "line": element.get("source_mapping", {}).get("lines", [None])[0],     # start line where vulnerability was found
                    "type": detector.get("check", None),                                   # type of vulnerability (i.e. the slither detector)
                    "severity": detector.get("impact", None),                              # severity of vulnerability
                    "description": description,                                            # description of issue
                    "id": issue_id,                                                        # the id of a detected vulnerability
                    "construct": element.get("type", None),                                # the corresponding construct (contract/function/variable/node)
                    "parent_construct": parent.get("type", None),                          # the construct of the parent
                    "parent_line": parent.get("source_mapping", {}).get("lines", [0])[0],  # the start line of the parent
                    "parent_name": parent.get("name", None)                                # name of the parent
                })

        # Deletes duplicates (i.e. identical issues) and keeps correct order. Uses json.dumps as dicts are
        # not hashable, i.e., cannot be used as elements in sets or as dict keys.
        issues = list({json.dumps(issue): issue for issue in issues}.values())
        severities = [detector.get("impact", None) for detector in detectors]   # a list of the severities of the corresponding vulnerabilities

        # Reads the original code (utf-8, consistent with the other steps of the pipeline)
        with open(contract_path, "r", encoding="utf-8") as file:
            code_lines = file.readlines()

        annotated_code = code_lines
        if issues:
            annotated_code = insert_comments_with_position(code_lines, issues)  # Inserts vulnerability/optimization comments

        # Writes the commented code into an own new file and the corresponding folder (high/medium/low/optimization/secure)
        if "High" in severities:
            annotated_path = os.path.join(high_vulnerability_dir, contract_file)
        elif "Medium" in severities:
            annotated_path = os.path.join(medium_vulnerability_dir, contract_file)
        elif "Low" in severities:
            annotated_path = os.path.join(low_vulnerability_dir, contract_file)
        elif "Optimization" in severities:
            annotated_path = os.path.join(secure_contracts_dir, contract_file)
        else:
            annotated_path = os.path.join(secure_optimized_contracts_dir, contract_file)

        with open(annotated_path, "w", encoding="utf-8") as file:
            file.writelines(annotated_code)

    except subprocess.CalledProcessError as e:
        # Only raised by subprocess.run when check=True is passed; kept for that case
        print(f"Error analyzing {contract_file}: {e.stderr}")
    except Exception as e:
        count += 1
        print(f"Unexpected error with {contract_file}: {e}")
        return None
    finally:
        # The intermediate JSON result is removed in every case, so that failed runs leave no stale files behind
        try:
            os.remove(output_json)
        except OSError:
            pass
    
def run_slither_parallel(path_r):
    """Analyse every entry of a directory with 'analyze_contract' using a thread pool.

    Args:
        path_r: Directory whose entry names are passed to 'analyze_contract' one by one.

    Prints a completion message once all submitted tasks have finished.
    """
    worker_count = os.cpu_count()
    file_names = os.listdir(path_r)

    # Leaving the 'with' block waits until every submitted analysis has finished
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        for file_name in file_names:
            pool.submit(analyze_contract, file_name)

    print("Parallel Analysis completed!")

In [None]:
# Runs the Slither analysis for all entries of the directory with the unchecked contracts
run_slither_parallel(unchecked_contracts_dir)
# 'count' is incremented by analyze_contract for every contract that raised an unexpected error
print(count)

Step 4: Splitting

In [4]:
# A dictionary of common contract imports.
# Maps the name of a contract/interface/library to the import path which is used for it.
# Every name must occur only once: in a dict literal a repeated key silently overwrites the earlier entry.
contract_imports = {
    # @openzeppelin/contracts/access/
    "AccessControl": "@openzeppelin/contracts/access/AccessControl.sol",
    "IAccessControl": "@openzeppelin/contracts/access/IAccessControl.sol",
    "Ownable": "@openzeppelin/contracts/access/Ownable.sol",
    "Ownable2Step": "@openzeppelin/contracts/access/Ownable2Step.sol",

    # @openzeppelin/contracts/finance/
    "VestingWallet": "@openzeppelin/contracts/finance/VestingWallet.sol",
    "VestingWalletCliff": "@openzeppelin/contracts/finance/VestingWalletCliff.sol",

    # @openzeppelin/contracts/governance/
    "Governor": "@openzeppelin/contracts/governance/Governor.sol",
    "IGovernor": "@openzeppelin/contracts/governance/IGovernor.sol",
    "TimelockController": "@openzeppelin/contracts/governance/TimelockController.sol",

    # @openzeppelin/contracts/interfaces/
    "IERC1155": "@openzeppelin/contracts/interfaces/IERC1155.sol",
    "IERC1155MetadataURI": "@openzeppelin/contracts/interfaces/IERC1155MetadataURI.sol",
    "IERC1155Receiver": "@openzeppelin/contracts/interfaces/IERC1155Receiver.sol",
    "IERC1271": "@openzeppelin/contracts/interfaces/IERC1271.sol",
    "IERC1363": "@openzeppelin/contracts/interfaces/IERC1363.sol",
    "IERC1363Receiver": "@openzeppelin/contracts/interfaces/IERC1363Receiver.sol",
    "IERC1363Spender": "@openzeppelin/contracts/interfaces/IERC1363Spender.sol",
    "IERC165": "@openzeppelin/contracts/interfaces/IERC165.sol",
    "IERC1820Implementer": "@openzeppelin/contracts/interfaces/IERC1820Implementer.sol",
    "IERC1820Registry": "@openzeppelin/contracts/interfaces/IERC1820Registry.sol",
    "IERC1967": "@openzeppelin/contracts/interfaces/IERC1967.sol",
    "IERC20": "@openzeppelin/contracts/interfaces/IERC20.sol",
    "IERC20Metadata": "@openzeppelin/contracts/interfaces/IERC20Metadata.sol",
    "IERC20Errors": "@openzeppelin/contracts/interfaces/IERC20Errors.sol",
    "IERC2309": "@openzeppelin/contracts/interfaces/IERC2309.sol",
    "IERC2981": "@openzeppelin/contracts/interfaces/IERC2981.sol",
    "IERC5805": "@openzeppelin/contracts/interfaces/IERC5805.sol",
    "IERC721": "@openzeppelin/contracts/interfaces/IERC721.sol",
    "IERC721Enumerable": "@openzeppelin/contracts/interfaces/IERC721Enumerable.sol",
    "IERC721Metadata": "@openzeppelin/contracts/interfaces/IERC721Metadata.sol",
    "IERC721Receiver": "@openzeppelin/contracts/interfaces/IERC721Receiver.sol",
    "IERC777": "@openzeppelin/contracts/interfaces/IERC777.sol",
    "IERC777Recipient": "@openzeppelin/contracts/interfaces/IERC777Recipient.sol",
    "IERC777Sender": "@openzeppelin/contracts/interfaces/IERC777Sender.sol",

    # @openzeppelin/contracts/metatx/
    "ERC2771Context": "@openzeppelin/contracts/metatx/ERC2771Context.sol",
    "ERC2771Forwarder": "@openzeppelin/contracts/metatx/ERC2771Forwarder.sol",

    # @openzeppelin/contracts/proxy/
    "Clones": "@openzeppelin/contracts/proxy/Clones.sol",
    "Proxy": "@openzeppelin/contracts/proxy/Proxy.sol",
    "BeaconProxy": "@openzeppelin/contracts/proxy/beacon/BeaconProxy.sol",
    "IBeacon": "@openzeppelin/contracts/proxy/beacon/IBeacon.sol",
    "Initializable": "@openzeppelin/contracts/proxy/utils/Initializable.sol",
    "TransparentUpgradeableProxy": "@openzeppelin/contracts/proxy/transparent/TransparentUpgradeableProxy.sol",
    "ProxyAdmin": "@openzeppelin/contracts/proxy/transparent/ProxyAdmin.sol",

    # @openzeppelin/contracts/token/
    "ERC1155": "@openzeppelin/contracts/token/ERC1155/ERC1155.sol",
    "ERC1155Burnable": "@openzeppelin/contracts/token/ERC1155/extensions/ERC1155Burnable.sol",
    "ERC1155Pausable": "@openzeppelin/contracts/token/ERC1155/extensions/ERC1155Pausable.sol",
    "ERC1155Supply": "@openzeppelin/contracts/token/ERC1155/extensions/ERC1155Supply.sol",
    "ERC20": "@openzeppelin/contracts/token/ERC20/ERC20.sol",
    "ERC20Burnable": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Burnable.sol",
    "ERC20Pausable": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Pausable.sol",
    "ERC20Detailed": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Detailed.sol",
    "ERC20Mintable": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Mintable.sol",
    "ERC20Permit": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Permit.sol",
    "ERC20Wrapper": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Wrapper.sol",
    "ERC20FlashMint": "@openzeppelin/contracts/token/ERC20/extensions/ERC20FlashMint.sol",
    "ERC20Capped": "@openzeppelin/contracts/token/ERC20/extensions/ERC20Capped.sol",
    "ERC1363": "@openzeppelin/contracts/token/ERC20/extensions/ERC1363.sol",
    "SafeERC20": "@openzeppelin/contracts/token/ERC20/utils/SafeERC20.sol",
    "ERC721": "@openzeppelin/contracts/token/ERC721/ERC721.sol",
    "ERC721Burnable": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Burnable.sol",
    "ERC721Consecutive": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Consecutive.sol",
    "ERC721Pausable": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Pausable.sol",
    "ERC721Royalty": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Royalty.sol",
    "ERC721Wrapper": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Wrapper.sol",
    "ERC721Votes": "@openzeppelin/contracts/token/ERC721/extensions/ERC721Votes.sol",
    "ERC721Holder": "@openzeppelin/contracts/token/ERC721/utils/ERC721Holder.sol",
    "ERC721Utils": "@openzeppelin/contracts/token/ERC721/utils/ERC721Utils.sol",
    "ERC2981": "@openzeppelin/contracts/token/common/ERC2981.sol",

    # @openzeppelin/contracts/utils/
    "Address": "@openzeppelin/contracts/utils/Address.sol",
    "Arrays": "@openzeppelin/contracts/utils/Arrays.sol",
    "Bytes": "@openzeppelin/contracts/utils/Bytes.sol",
    "Context": "@openzeppelin/contracts/utils/Context.sol",
    "Panic": "@openzeppelin/contracts/utils/Panic.sol",
    "Pausable": "@openzeppelin/contracts/utils/Pausable.sol",
    "ReentrancyGuard": "@openzeppelin/contracts/utils/ReentrancyGuard.sol",
    "ReentrancyGuardTransient": "@openzeppelin/contracts/utils/ReentrancyGuardTransient.sol",
    "Strings": "@openzeppelin/contracts/utils/Strings.sol",
    "Comparators": "@openzeppelin/contracts/utils/Comparators.sol",
    "Counters": "@openzeppelin/contracts/utils/Counters.sol",
    "Errors": "@openzeppelin/contracts/utils/Errors.sol",
    "Nonces": "@openzeppelin/contracts/utils/Nonces.sol",
    "Packing": "@openzeppelin/contracts/utils/Packing.sol",
    "Time": "@openzeppelin/contracts/utils/types/Time.sol",
    "Heap": "@openzeppelin/contracts/utils/structs/Heap.sol",
    "MerkleTree": "@openzeppelin/contracts/utils/structs/MerkleTree.sol",
    "BitMaps": "@openzeppelin/contracts/utils/structs/BitMaps.sol",
    "Checkpoints": "@openzeppelin/contracts/utils/structs/Checkpoints.sol",
    "EnumerableSet": "@openzeppelin/contracts/utils/structs/EnumerableSet.sol",
    "EnumerableMap": "@openzeppelin/contracts/utils/structs/EnumerableMap.sol",
    "Math": "@openzeppelin/contracts/utils/math/Math.sol",
    "SafeMath": "@openzeppelin/contracts/utils/math/SafeMath.sol",
    "SafeCast": "@openzeppelin/contracts/utils/math/SafeCast.sol",
    "SignedMath": "@openzeppelin/contracts/utils/math/SignedMath.sol",
    "ERC165": "@openzeppelin/contracts/utils/introspection/ERC165.sol",
    "ECDSA": "@openzeppelin/contracts/utils/cryptography/ECDSA.sol",
    "Hashes": "@openzeppelin/contracts/utils/cryptography/Hashes.sol",
    "MerkleProof": "@openzeppelin/contracts/utils/cryptography/MerkleProof.sol",
    "RSA": "@openzeppelin/contracts/utils/cryptography/RSA.sol",
    "SignatureChecker": "@openzeppelin/contracts/utils/cryptography/SignatureChecker.sol",

    # Other openzeppelin contracts
    "ERC20PresetMinterPauser": "@openzeppelin/contracts/token/ERC20/presets/ERC20PresetMinterPauser.sol",
    "ERC721PresetMinterPauserAutoId": "@openzeppelin/contracts/token/ERC721/presets/ERC721PresetMinterPauserAutoId.sol",
    "PaymentSplitter": "@openzeppelin/contracts/finance/PaymentSplitter.sol",

    # Chainlink
    "VRFConsumerBase": "@chainlink/contracts/src/v0.8/VRFConsumerBase.sol",
    "AggregatorV3Interface": "@chainlink/contracts/src/v0.8/interfaces/AggregatorV3Interface.sol",
    "LinkTokenInterface": "@chainlink/contracts/src/v0.8/interfaces/LinkTokenInterface.sol",

    # Uniswap V2
    "IUniswapV2Router02": "@uniswap/v2-periphery/contracts/interfaces/IUniswapV2Router02.sol",
    "IUniswapV2Router01": "@uniswap/v2-periphery/contracts/interfaces/IUniswapV2Router01.sol",
    "IUniswapV2Factory": "@uniswap/v2-core/contracts/interfaces/IUniswapV2Factory.sol",
    "IUniswapV2Pair": "@uniswap/v2-core/contracts/interfaces/IUniswapV2Pair.sol",

    # Uniswap V3
    "ISwapRouter": "@uniswap/v3-periphery/contracts/interfaces/ISwapRouter.sol",
    "IQuoter": "@uniswap/v3-periphery/contracts/interfaces/IQuoter.sol",
    "IUniswapV3Factory": "@uniswap/v3-core/contracts/interfaces/IUniswapV3Factory.sol",
    "IUniswapV3Pool": "@uniswap/v3-core/contracts/interfaces/IUniswapV3Pool.sol",

    # Sushiswap (based on Uniswap V2)
    "ISushiRouter": "@sushiswap/core/contracts/uniswapv2/interfaces/IUniswapV2Router02.sol",
    "ISushiFactory": "@sushiswap/core/contracts/uniswapv2/interfaces/IUniswapV2Factory.sol",

    # Aave Protocol (v2 and v3)
    "ILendingPool": "@aave/protocol-v2/contracts/interfaces/ILendingPool.sol",
    "IAToken": "@aave/protocol-v2/contracts/interfaces/IAToken.sol",
    "IProtocolDataProvider": "@aave/protocol-v2/contracts/interfaces/IProtocolDataProvider.sol",
    "ILendingPoolAddressesProvider": "@aave/protocol-v2/contracts/interfaces/ILendingPoolAddressesProvider.sol",
    "IStableDebtToken": "@aave/protocol-v2/contracts/interfaces/IStableDebtToken.sol",
    "IVariableDebtToken": "@aave/protocol-v2/contracts/interfaces/IVariableDebtToken.sol",

    # Compound Protocol
    "CErc20": "@compound-finance/compound-protocol/contracts/CErc20.sol",
    "Comptroller": "@compound-finance/compound-protocol/contracts/Comptroller.sol",
    "PriceOracle": "@compound-finance/compound-protocol/contracts/PriceOracle.sol",

    # Balancer
    "IBalancerPool": "@balancer-labs/v2-pool-utils/contracts/interfaces/IBalancerPool.sol",
    "IVault": "@balancer-labs/v2-vault/contracts/interfaces/IVault.sol",

    # MakerDAO (DAI)
    "IDai": "@makerdao/dss/contracts/Dai.sol",
    "IVat": "@makerdao/dss/contracts/Vat.sol",
    "IJug": "@makerdao/dss/contracts/Jug.sol",
    "IPot": "@makerdao/dss/contracts/Pot.sol",

    # Curve Finance
    "ICurvePool": "@curvefi/contracts/contracts/pool/CurvePool.sol",
    "ICurveFi": "@curvefi/contracts/contracts/ICurveFi.sol",

    # Synthetix
    "ISynthetix": "@synthetixio/contracts/source/interfaces/ISynthetix.sol",
    "IExchanger": "@synthetixio/contracts/source/interfaces/IExchanger.sol",
    "IExchangeRates": "@synthetixio/contracts/source/interfaces/IExchangeRates.sol",

    # Arbitrum (Layer 2 Solution)
    "IInbox": "@arbitrum/nitro/contracts/src/bridge/IInbox.sol",
    "IOutbox": "@arbitrum/nitro/contracts/src/bridge/IOutbox.sol",
    "IBridge": "@arbitrum/nitro/contracts/src/bridge/IBridge.sol",

    # Optimism (Layer 2 Solution)
    "L1CrossDomainMessenger": "@eth-optimism/contracts/L1/messaging/L1CrossDomainMessenger.sol",
    "L2CrossDomainMessenger": "@eth-optimism/contracts/L2/messaging/L2CrossDomainMessenger.sol",

    # ENS (Ethereum Name Service)
    "ENS": "@ensdomains/ens/contracts/ENS.sol",
    "ENSRegistry": "@ensdomains/ens/contracts/ENSRegistry.sol",
    "PublicResolver": "@ensdomains/resolver/contracts/PublicResolver.sol",

    # Moloch DAO
    "IMoloch": "@molochventures/moloch/contracts/interfaces/IMoloch.sol",

    # Gnosis Safe (Multisig)
    "GnosisSafe": "@gnosis.pm/safe-contracts/contracts/GnosisSafe.sol",
    "GnosisSafeProxyFactory": "@gnosis.pm/safe-contracts/contracts/proxies/GnosisSafeProxyFactory.sol",

    # Tornado Cash
    "ITornadoInstance": "@tornadocash/contracts/contracts/interfaces/ITornadoInstance.sol",
    "ITornadoGovernance": "@tornadocash/contracts/contracts/interfaces/ITornadoGovernance.sol",
}

def split_solidity_code(path_r, path_w):
    """Split a Solidity file with several definitions into one file per definition.

    Every contract/interface/library/abstract contract found in the file is written to its own
    file 'solidity_code_<n>.sol' in 'path_w'. The numbering continues after the number of entries
    which are already in 'path_w'. Each output file gets the SPDX identifier and the first pragma
    of the source file (if present), the imports for the other definitions it refers to and the
    code of the definition. Files with less than two definitions are not touched.

    Args:
        path_r: Path of the Solidity file to split (read as utf-8).
        path_w: Output directory. It is created if it does not exist.

    Returns:
        None.

    Reads the module-level dictionary 'contract_imports' to resolve known import paths.
    """
    with open(path_r, "r", encoding="utf-8") as file:
        source_code = file.read()

    # Regex to match diverse definitions (contract, interface, abstract contract and so on)
    pattern = re.compile(
        r"\b(?:contract|interface|library|abstract\s+contract)\s+(\w+)(?:\s+is\s+([^;{]+))?\s*{",
        re.MULTILINE
    )

    # Finds all matches (each match corresponds to a new contract, interface, library, abstract contract or contract .. is ..)
    matches = list(pattern.finditer(source_code))

    # If no matches or only one contract in file do nothing
    if len(matches) < 2:
        return

    # Ensures that the output directory exists BEFORE its entries are counted
    os.makedirs(path_w, exist_ok=True)

    # Gets the length of the directory (this is necessary for the correct naming of the output files)
    directory_length = len(os.listdir(path_w))

    # Gets the pragma solidity statement to carry it to each file (empty if the file has none)
    pragma_pattern = r"pragma\s+solidity\s+[\^<>=]*\s*[0-9]+\.[0-9]+\.[0-9]+;"
    pragma_matches = re.findall(pragma_pattern, source_code, re.MULTILINE)
    pragma_line = pragma_matches[0] if pragma_matches else ""

    # Gets the spdx identifier to carry it to each file. re.search is used so that the identifier
    # is also found if it is not the very first line of the file (empty if the file has none).
    spdx_pattern = r"^// SPDX-License-Identifier: .*$"
    spdx_match = re.search(spdx_pattern, source_code, re.MULTILINE)
    spdx_line = spdx_match.group(0) if spdx_match else ""

    # The pattern for inheritance
    contract_inheritance_pattern = r"contract\s+\w+\s+is\s+([^;{]+)"
    interface_inheritance_pattern = r"interface\s+\w+\s+is\s+([^;{]+)"

    # WARNING Vulnerability pattern, this is required to correctly split comments outside a contract with the contract
    warning_vul_pattern = r"// WARNING Vulnerability .*$\n// Recommendation .*$\ncontract"

    # A list of all contract/interface/library/abstract contract/... names which are in the current contract
    contract_names = [match.group(1) for match in matches]

    # A warning comment which stands directly before the NEXT definition. It is cut from the current
    # block and handed over to the next iteration.
    pending_warning = None

    # Splits the code into blocks and saves them in separate files
    for i, match in enumerate(matches):
        # Takes over the comment found by the previous iteration before a new one is searched for.
        # Otherwise the comment gets lost if two consecutive definitions both carry a warning.
        own_warning, pending_warning = pending_warning, None

        start_pos = match.start()       # Starting position of the current match
        contract_name = match.group(1)  # The name of the contract/interface/library/abstract contract/...

        # Determines the end position: either the next match or the end of the file
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(source_code)

        # Extracts the block of code for the current contract/interface/library
        contract_code = source_code[start_pos:end_pos].strip()

        # Detects comments before the next contract and deletes this comment from the current contract.
        # The 8 additional characters cover the keyword "contract" of the next definition.
        comment_detection_code = source_code[start_pos:end_pos + 8].strip()
        warning_vul_matches = re.findall(warning_vul_pattern, comment_detection_code, re.MULTILINE)
        if warning_vul_matches:
            pending_warning = warning_vul_matches[0].replace("\ncontract", "")
            contract_code = contract_code.replace("\n" + pending_warning, "")

        # Deletes any comments (which are inserted during slither analysis) from the code. This is required for correct imports, as otherwise the dependency detection logic will detect wrong dependencies in comments
        uncommented_code = re.sub(r"//.*", "", contract_code)

        # Searches for dependencies in the contract code besides these which are defined with "is".
        # All ambiguities (definition headers and string literals) are removed from the code first.
        tmp_code = re.sub(r"\b(?:contract|interface|library|abstract\s+contract)\s+(\w+)(?:\s+is\s+([^;{]+))?\s*{|\"(?:\\.|[^\"\\])*\"|\'(?:\\.|[^\'\\])*\'", "", uncommented_code)
        tmp_list = []
        for name in contract_names:
            if name == contract_name:
                continue    # a definition must not import itself
            # Only the standalone name counts (e.g. ERC20, ERC20.method), composed words (e.g. ERC20Example) do not
            if re.search(rf"(?<!\w){name}(?![\w,;])", tmp_code):
                tmp_list.append(name)

        # Searches for inheritances which are defined with "is" and creates the corresponding imports
        inheritance_matches = re.findall(contract_inheritance_pattern, uncommented_code, re.MULTILINE)     # Finds all "(abstract) contract ... is ..." dependencies
        inheritance_matches += re.findall(interface_inheritance_pattern, uncommented_code, re.MULTILINE)   # Finds all "interface ... is ..." dependencies

        contracts = []
        if inheritance_matches:                                                # "Is" dependencies are collected
            contracts += [contract.strip() for contract in inheritance_matches[0].split(',')]
        contracts += tmp_list                                                  # The list with all other dependencies is added
        contracts = list(dict.fromkeys(contracts))                             # Deletes duplicates but keeps the correct order

        # NOTE(review): in Solidity 'import "file" as Name;' binds a module namespace, presumably
        # 'import {Name} from "file";' was intended. Kept as is, since later steps may rely on this format - confirm.
        imports = ""
        for dependency in contracts:
            if dependency in contract_imports:
                imports += f'import "{contract_imports[dependency]}" as {dependency};\n'     # Imports the required dependency from Github
            else:
                imports += f'import "./{dependency}.sol" as {dependency};\n'                 # Imports the required dependency as local file

        # Combines the spdx identifier, pragma, and imports with the contract code
        header = ""
        if spdx_line:
            header += f"{spdx_line}\n\n"
        if pragma_line:
            header += f"{pragma_line}\n"
        if imports:
            header += f"\n{imports}"
        body = f"{own_warning}\n{contract_code}" if own_warning is not None else contract_code
        full_code = f"{header}\n{body}" if header else body

        # Writes each block to a separate file
        output_file = os.path.join(path_w, f"solidity_code_{1+i+directory_length}.sol")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_code)

In [5]:
# Splits every file of the mixed dataset into one file per definition
src_folder = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Mixed_Dataset"

solidity_files = [f for f in os.listdir(src_folder) if os.path.isfile(os.path.join(src_folder, f))]

for file in solidity_files:
    # The input path is derived from 'src_folder', so that listing and reading always use the same folder
    split_solidity_code(f"{src_folder}/{file}",
                        "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Splitted_Dataset/")

Step 5: Jaccard Similarity Check

In [53]:
# Jaccard similarity of two sets: the size of their intersection divided by the size of their union.
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of 'set1' and 'set2'.

    Args:
        set1: A set (its 'union' and 'intersection' methods are used).
        set2: The collection which 'set1' is compared with.

    Returns:
        Size of the intersection divided by the size of the union, or 0 if the union is empty.
    """
    union_size = len(set1.union(set2))
    if union_size <= 0:
        # Both collections are empty, the quotient is not defined
        return 0
    shared_size = len(set1.intersection(set2))
    return shared_size / union_size

# Removes similar contracts according to a threshold.
def remove_similar_contracts(input_dir, output_dir, threshold=0.9):
    """Copy the '.sol' files of a directory and leave out (near-)duplicates.

    Every file is tokenized into its set of word tokens (regex '\\w+'). A file is dropped if its
    Jaccard similarity to any file which comes before it (in sorted file name order) is above
    'threshold'. The remaining files are written to 'output_dir' as 'solidity_code_<k>.sol',
    numbered from 1 in the same order.

    Args:
        input_dir: Directory with the '.sol' files to check (read as utf-8).
        output_dir: Directory for the unique files. It is created if it does not exist.
        threshold: Similarity above which the later file counts as a duplicate.

    Returns:
        None.
    """
    tokenized_contracts = []
    original_contracts = []

    # Reads each contract and tokenizes it. The names are sorted, as the order of os.listdir is not
    # specified, but it decides which of two similar files is kept and how the output is numbered.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".sol"):
            continue
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            contract = file.read()
        tokenized_contracts.append(set(re.findall(r'\w+', contract)))
        original_contracts.append(contract)

    # Keeps track of the duplicates (indices into the lists above)
    duplicates = set()

    # Compares the contracts for similarity with Jaccard similarity
    for i, tokens in enumerate(tokenized_contracts):
        for j in range(i + 1, len(tokenized_contracts)):
            if j in duplicates:
                continue    # already marked, another comparison cannot change the result
            if jaccard_similarity(tokens, tokenized_contracts[j]) > threshold:
                duplicates.add(j)

    # Writes unique contracts to the output directory
    os.makedirs(output_dir, exist_ok=True)
    k = 0
    for i, contract in enumerate(original_contracts):
        if i in duplicates:
            continue
        k += 1
        output_file_path = os.path.join(output_dir, f"solidity_code_{k}.sol")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(contract)

In [54]:
# Driver call: de-duplicates the SpecialToken_Dataset folder into the final fill-in-the-middle dataset folder,
# using a threshold of 0.95 instead of the function's default of 0.9.
remove_similar_contracts("D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/SpecialToken_Dataset/", 
                         "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Fill_in_the_Middle/", 
                         0.95)

Step 6: Solhint fixes

In [None]:
folder_path = "D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset"


def solhint_fixes(file_name):
    """Run ``solhint --fix --noPrompt`` on one file, using ``folder_path`` as working directory.

    According to the original author's note, the auto-fix covers some Solhint
    warnings (avoid-throw, avoid-sha3, no-console, explicit-types,
    private-vars-underscore, payable-fallback, quotes, contract-name-camelcase,
    avoid-suicide).

    Any failure (missing executable, non-zero exit code, ...) is reported with
    ``print`` and otherwise ignored, so one bad file does not stop a batch.
    """
    try:
        command = ["C:/Users/Fabian Hensel/AppData/Roaming/npm/solhint.cmd", "--fix", "--noPrompt", file_name]
        # check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(command, cwd=folder_path, check=True)
    except Exception as error:
        print(f"Error applying fixes to {file_name}: {error}")

def run_solhint_parallel(path_r):
    """Apply ``solhint_fixes`` to every directory entry of ``path_r`` using a thread pool.

    One task per entry is scheduled on a pool with ``os.cpu_count()`` workers.
    Leaving the ``with`` block waits for all tasks; their results are not inspected.
    """
    entries = os.listdir(path_r)
    worker_count = os.cpu_count()
    fix_one = solhint_fixes

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        for entry in entries:
            pool.submit(fix_one, entry)

    print("Parallel fixing completed!")

In [None]:
# Driver call: applies the Solhint auto-fixes to every entry of the folder stored in folder_path.
run_solhint_parallel(folder_path)

Step 7: Special Token Insertion

In [None]:
import re

# Inserts special tokens in front of Solidity constructs to mark specific regions in the code. This should help the llm during fine-tuning to recognize specific patterns.
def insert_special_tokens(path_r, path_w, constructList):
    """Insert marker tokens in front of Solidity constructs and write the result to ``path_w``.

    The file at ``path_r`` is read once. For every name in ``constructList``
    (in the given order) the current text is scanned with a construct-specific
    regex and rebuilt with tokens inserted in front of the matches; each pass
    works on the output of the previous pass.

    In the current state of the code only these tokens are actually written:
    ``[BEG_VUL_CON]`` / ``[BEG_VUL_INT]`` / ``[BEG_VUL_LIB]`` when the file
    contains ``// WARNING Vulnerability`` anywhere, and
    ``<|vulnerable_function|>`` / ``<|vulnerable_constructor|>`` /
    ``[BEG_VUL_ASM]`` when an annotation is detected for that particular
    function, constructor or assembly block. The neutral begin tokens are
    commented out and the end tokens are assigned but never appended.

    Args:
        path_r: Path of the Solidity file to read.
        path_w: Path of the file to write the tokenized code to.
        constructList: Construct names to process; supported are "function",
            "contract", "interface", "constructor", "struct", "modifier",
            "event", "enum", "error", "library", "assembly" and "mapping".
            Any other name prints "No valid construct." and returns without
            writing anything.

    Returns:
        None. Every exception raised inside is caught, printed and swallowed,
        in which case ``path_w`` is not written.
    """

    try:
        with open(path_r, "r", encoding='utf-8') as file:   
            solidity_code = file.read()

        # Removes shadowing-local vulnerability annotations, as these are irrelevant for function extraction
        # shadowing_local_pattern = r"\/\/ WARNING Vulnerability \(shadowing-local.*\n\t\/\/ Recommendation for.*"
        # solidity_code = re.sub(shadowing_local_pattern, '', solidity_code)
        # solidity_code = re.sub('\n\n\t\n', '\n\n', solidity_code)

        # For vulnerability detection in constructs such as contracts, interfaces, and libraries
        # (counted once on the original text, i.e. file-wide and not per construct)
        vul_pattern = r"// WARNING Vulnerability"
        vul_matches = len(re.findall(vul_pattern, solidity_code))

        for construct in constructList:

            # Per-construct state; solidity_code is replaced by the rebuilt text at the end of each pass
            modified_code = []
            last_end = 0 
            functions_code = []

            pattern = ""
            begin_token = ""
            end_token = ""

            # Spans from the SPDX line up to the first '{' after it; the loop keeps the LAST such match.
            # NOTE(review): if no SPDX header is present start_match stays the int 0 and start_match.end()
            # below raises AttributeError, which the outer except turns into "Unexpected error" -- confirm.
            start_pattern = r"// SPDX-License-Identifier: [^{}]*\{"
            start_match = 0
            for match in re.finditer(start_pattern, solidity_code):
                start_match = match

            if construct == "function":
                pattern = r"    function [^{};]*(\{|\;)"                 # Matches the function header until { or ; The indentation is on purpose to avoid to match with wrong function words (e.g. in comments) 
                # begin_token = "\t<|secure_function|>\n"
                end_token = "[END_FUN]"
                begin_vulnerable_token = "<|vulnerable_function|>\n"
                end_vulnerable_token = "[END_VUL_FUN]"
                # Index 0 of the helper's result is the list of function snippets (single-argument extract_constructs)
                functions_code = extract_constructs(solidity_code)[0]

            elif construct == "contract":
                pattern = r"(?m)^(abstract|contract) [^{}]*\{"               # Matches the contract header until {   
                if vul_matches > 0:              
                    begin_token = "[BEG_VUL_CON]\n"
                    end_token = "[END_VUL_CON]"
                else:
                    #begin_token = "[BEG_CON]\n"
                    end_token = "[END_CON]"

            elif construct == "interface":
                pattern = r"interface [^{}]*\{"                         # Matches the interface header until {
                if vul_matches > 0:
                    begin_token = "[BEG_VUL_INT]\n"
                    end_token = "[END_VUL_INT]"
                else:
                    #begin_token = "[BEG_INT]\n"
                    end_token = "[END_INT]"

            elif construct == "constructor":
                pattern = r"    constructor[^{}]*\{"                    # Matches the constructor header until {
                #begin_token = "\t[BEG_CSR]\n"
                end_token = "[END_CSR]"
                begin_vulnerable_token = "<|vulnerable_constructor|>\n"
                end_vulnerable_token = "[END_VUL_CSR]"
                # Index 3 of the helper's result is the list of constructor snippets
                constructors_code = extract_constructs(solidity_code)[3]

            elif construct == "struct":
                pattern = r"    struct[^{}]*\{"                         # Matches the struct header until {
                #begin_token = "\t[BEG_STR]\n"
                end_token = "[END_STR]"

            elif construct == "modifier":
                pattern = r"    modifier [^{};]*(\{|\;)"                 # Matches the modifier header until { or ;
                #begin_token = "\t[BEG_MOD]\n"
                end_token = "[END_MOD]"

            elif construct == "event":                  
                pattern = r"    event [^{};]*\;"                          # Matches the event until ;
                #begin_token = "\t[BEG_EVE]\n"
                end_token = "[END_EVE]"

            elif construct == "enum":
                pattern = r"    enum [^{}]*\{"                          # Matches the enum header until {
                #begin_token = "\t[BEG_ENU]\n"
                end_token = "[END_ENU]"

            elif construct == "error":
                pattern = r"    error [^{};]*\;"                        # Matches an error until ;
                #begin_token = "\t[BEG_ERR]\n"
                end_token = "[END_ERR]"

            elif construct == "library":
                pattern = r"library [^{}]*\{"                           # Matches the library header until {
                if vul_matches > 0:
                    begin_token = "[BEG_VUL_LIB]\n"
                    end_token = "[END_VUL_LIB]"
                else:
                    #begin_token = "[BEG_LIB]\n"
                    end_token = "[END_LIB]"

            elif construct == "assembly": 
                pattern = r"assembly [^{}]*\{"                          # Matches the assembly header until { ; no leading indentation in the pattern, as assembly could have a different number of spaces
                #begin_token = f"[BEG_ASM]"
                end_token = f"[END_ASM]"
                begin_vulnerable_token = f"[BEG_VUL_ASM]"
                end_vulnerable_token = f"[END_VUL_ASM]"
                # Index 2 of the helper's result is the list of assembly snippets
                assemblies_code = extract_constructs(solidity_code)[2]

            elif construct == "mapping":
                pattern = r"    mapping[^{};]*\;"                      # Matches the mapping until ;
                # begin_token = f"[BEG_MAP]"
                end_token = f"[END_MAP]"

            else:
                # Unknown construct name: abort the whole function; nothing is written to path_w
                print("No valid construct.")
                return

            # Line list used only by the assembly branch below to look up the indentation of assembly blocks
            lines = solidity_code.split("\n")

            # Iterates over all construct matches.
            # match is the re.Match object; match2 is what re.findall returns for the same hit:
            # the captured group for patterns with a group (function/modifier: "{" or ";",
            # contract: "abstract" or "contract") and the whole matched text for all other patterns.
            for i, (match, match2) in enumerate(zip(re.finditer(pattern, solidity_code), re.findall(pattern, solidity_code))):
                # Adds the code before the construct to the output
                modified_code.append(solidity_code[last_end:match.start()])

                # Text between the previous construct and this match; its third-last line is inspected for an annotation
                match_lines = solidity_code[last_end-1:match.start()].split("\n")
                final_match = ""

                # Text between the end of the SPDX/start match and this match
                start_match_lines = solidity_code[start_match.end():match.start()].split("\n")
                final_start_match = ""

                # Both lookups are best-effort: too few lines raise IndexError, which is silently ignored
                # and leaves the corresponding variable as "".
                try:
                    if i == 0:
                        final_start_match = start_match_lines[-3]
                except:
                    pass

                try:
                    final_match = match_lines[-3]
                    if i == 0:
                        final_start_match = start_match_lines[-3]
                except:
                    pass

                start_index = match.end() - 1                                   # position of the last matched character ('{' or ';')
                end_index = start_index + 1                                     # first position after it; advanced by the brace scan below

                # For vulnerability detection in constructs such as functions, assemblies, and constructors 
                vul_hit = False
                # NOTE(review): the '|' is unescaped, so this regex is the alternation "//.* " OR " ID:" -- confirm intent
                vul_pattern_2 = r"//.* | ID:"

                # Branch for constructs with a '{ ... }' body.
                # NOTE(review): str(match) is the repr of the Match object, not only the matched text -- confirm intent.
                # NOTE(review): for patterns without a capture group match2 is the whole matched text, so a
                # ';'-terminated declaration containing one of these keywords (e.g. a mapping named "contracts")
                # also takes this branch -- verify.
                if ("{" in match2 or "contract" in match2 or "struct" in match2 or "library" in match2 or "interface" in match2 or "constructor" in match2 or "abstract" in match2 or "enum" in match2) and not "->" in str(match):

                    if "function" in str(match):
                        # The i-th match corresponds to the i-th extracted function snippet
                        function_code = functions_code[i]
                        if re.search(vul_pattern_2, function_code) or re.search(vul_pattern, final_match) or (re.search(vul_pattern, final_start_match) and i == 0):
                            vul_hit = True

                    if "constructor" in str(match):
                        constructor_code = constructors_code[i]
                        if re.search(vul_pattern_2, constructor_code) or re.search(vul_pattern, final_match) or (re.search(vul_pattern, solidity_code[start_match.end():match.start()]) and i == 0):
                            vul_hit = True

                    if "assembly" in str(match):
                        assembly_code = assemblies_code[i]
                        spaces = 0
                        if re.search(vul_pattern_2, assembly_code):
                            vul_hit = True

                        # Takes the indentation of the first remaining line containing "assembly" and
                        # consumes that line so the next assembly match uses the next such line
                        for line in lines:
                            if "assembly" in line:
                                spaces = len(line) - len(line.lstrip(' '))
                                lines.remove(line)
                                break
                        
                        # The token is followed by a newline and the indentation of the assembly line
                        # (begin_token is "" here, so the secure case only adds the newline and indentation)
                        if not vul_hit:
                            modified_code.append(begin_token+"\n"+spaces*' ')
                        else:
                            modified_code.append(begin_vulnerable_token+"\n"+spaces*' ')
                    else: 
                        # Adds the begin token before the construct if no vulnerability was detected, otherwise the vulnerable begin token is added
                        if not vul_hit:
                            modified_code.append(begin_token)
                        else:
                            modified_code.append(begin_vulnerable_token)
            
                    brace_count = 1            # initializes the brace count (the opening '{' at start_index is already counted)

                    # Finds the closing brace of the construct; end_index ends up one past it
                    while brace_count > 0 and end_index < len(solidity_code):
                        if solidity_code[end_index] == '{':
                            brace_count += 1
                        elif solidity_code[end_index] == '}':
                            brace_count -= 1
                        end_index += 1
                    
                    # Adds the construct code (the end tokens below are currently disabled)
                    modified_code.append(solidity_code[match.start():end_index])

                    # if not vul_hit:
                    #     modified_code.append(end_token)
                    # else:
                    #     modified_code.append(end_vulnerable_token)

                elif "->" in str(match):
                    # NOTE(review): the leading text was already appended above and last_end is not advanced
                    # here, so the next iteration appears to append the same span again -- verify.
                    continue
                else:
                    # Branch for matches without a body (terminated by ';'): end_index == match.end() here
                    if "function" in str(match):
                        function_code = functions_code[i]
                        if (re.search(vul_pattern, final_start_match) and i == 0):
                            modified_code.append(begin_vulnerable_token)
                            modified_code.append(solidity_code[match.start():end_index])
                            #modified_code.append(end_vulnerable_token+"\n")
                        else:
                            # modified_code.append(begin_token)
                            modified_code.append(solidity_code[match.start():end_index])
                            #modified_code.append(end_token+"\n")
                    else:
                        # modified_code.append(begin_token)
                        modified_code.append(solidity_code[match.start():end_index])
                        #modified_code.append(end_token+"\n")
                    # NOTE(review): this skips the character following the match; the disabled end-token
                    # appends above used to add a "\n" -- confirm that dropping this character is still intended.
                    end_index += 1

                # Updates last_end to continue parsing after this construct
                last_end = end_index

            # Adds any remaining code after the last construct
            modified_code.append(solidity_code[last_end:])

            solidity_code = ''.join(modified_code)

        with open(path_w, "w", encoding='utf-8') as file:
            file.write(solidity_code)

    except Exception as e:
        # Best-effort: any error (I/O, index, attribute, ...) is reported and the file is skipped
        print(f"Unexpected error: {e}")
        return None
    
# Collects the source text of all functions, modifiers, assemblies, and constructors of a solidity file
def extract_constructs(solidity_code):
    """Return the snippets of four construct kinds found in ``solidity_code``.

    The result is a list of four lists, in this order: functions, modifiers,
    assembly blocks, constructors. Each snippet starts at the matched header
    and runs up to and including the closing brace that balances the header's
    opening brace. Headers terminated by ``;`` yield just the header itself.
    """
    # Order matters: it defines the index of each kind in the returned list
    header_patterns = (
        r"    function [^{};]*(\{|\;)",
        r"    modifier [^{};]*(\{|\;)",
        r"assembly [^{}]*\{",
        r"    constructor[^{}]*\{",
    )
    total_length = len(solidity_code)

    def closing_position(opener_pos):
        # Walks forward from the last header character until the brace depth is zero
        depth = 0
        pos = opener_pos
        while pos < total_length:
            char = solidity_code[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            if depth == 0:
                return pos
            pos += 1
        # Unbalanced braces: fall back to the position right after the header
        return opener_pos + 1

    return [
        [
            solidity_code[hit.start():closing_position(hit.end() - 1) + 1]
            for hit in re.finditer(header_pattern, solidity_code)
        ]
        for header_pattern in header_patterns
    ]


In [None]:
# Driver cell: runs insert_special_tokens on every regular file of the Solhint_Dataset folder and
# writes the result under the same file name into the SpecialToken_Dataset folder.
src_folder = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset"

# Sub-directories are skipped; only plain files are collected.
solidity_files = [f for f in os.listdir(src_folder) if os.path.isfile(os.path.join(src_folder, f))]

for file in solidity_files:
    # The constructs are processed in exactly this order, each pass working on the output of the previous one.
    insert_special_tokens(path_r=f"D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset/{file}",
                          path_w=f"D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/SpecialToken_Dataset/{file}",
                          constructList=["function", "contract", "interface", "constructor", "struct", "modifier", "library", "assembly", "event", "enum", "error", "mapping"])

Optional Step: Construct Extraction, FIM Transformation (Token Insertion), Single-line Conversion

In [None]:
import random
from itertools import groupby

# Extracts the code of the diverse solidity constructs and writes each one as a single-line sample file
# NOTE(review): this definition reuses the name of the single-argument extract_constructs(solidity_code)
# that insert_special_tokens calls; once this cell has been run the earlier function is replaced -- confirm the intended cell order.
def extract_constructs(path_r, path_w):
    """Write every construct found in the file ``path_r`` as its own file into ``path_w``.

    For each pattern (functions, modifiers, mappings, structs, errors, enums,
    events, imports, the SPDX line, using-directives, constructors and the
    pragma/import header) all matches are processed in turn:

    * functions, modifiers and constructors are passed to ``create_fim_samples``;
    * the pragma match is turned into a fill-in-the-middle sample whose hole
      covers import lines, and is skipped when the file has no import;
    * every other construct is taken verbatim.

    The resulting text is flattened to one line (four spaces and tabs become a
    literal backslash-t, newlines a literal backslash-n) and written to
    ``solidity_code_<n>.sol`` where ``n`` continues after the number of entries
    that ``path_w`` contained when the function was called.

    Args:
        path_r: Path of the Solidity file to read.
        path_w: Existing folder that receives the sample files.

    Returns:
        None.
    """

    with open(path_r, "r", encoding="utf-8") as file:
        solidity_code = file.read()

    # The patterns to search for inside the code
    function_pattern = r"    function [^{};]*(\{|\;)"
    modifier_pattern = r"    modifier [^{};]*(\{|\;)"
    mapping_pattern = r"    mapping[^{};]*\;"
    struct_pattern = r"    struct[^{}]*\{"
    error_pattern = r"    error [^{};]*\;"
    enum_pattern = r"    enum [^{}]*\{"
    event_pattern = r"    event [^{};]*\;"
    import_pattern = r"import [^{};]*\;"
    spdx_pattern = r"// SPDX-License-Identifier: .*"
    using_pattern = r"    using [^{};]*\;"
    constructor_pattern = r"    constructor[^{}]*\{"
    # Spans from "pragma " up to the first '{' that follows, i.e. everything before the first body opens
    pragma_pattern = r"pragma [^{}]*\{"
    # statement_pattern = r"    (string|uint256|int256|bool|address|bytes).*? = [^{}();]*?;"
    
    patterns = [function_pattern, 
                modifier_pattern, 
                mapping_pattern, 
                struct_pattern, 
                error_pattern, 
                enum_pattern, 
                event_pattern, 
                import_pattern, 
                spdx_pattern, 
                using_pattern,
                constructor_pattern,
                pragma_pattern]

    # To numerate the sol_files correct (the folder is counted once; i counts the files written by this call)
    directory_length = len(os.listdir(path_w))
    i = 1

    # Iterates over the given patterns
    for pattern in patterns:

        # Iterates over the construct matches
        for match in re.finditer(pattern, solidity_code):
            start_index = match.end() - 1                                   
            end_index = start_index + 1

            # Identifies where a construct such as a function ends with help of the brackets.
            # If the last matched character is not '{' the count is 0 immediately and end_index == start_index.
            brace_count = 0
            for index in range(start_index, len(solidity_code)):
                if solidity_code[index] == "{":
                    brace_count += 1
                elif solidity_code[index] == "}":
                    brace_count -= 1
                    
                if brace_count == 0:
                    end_index = index
                    break
            
            if pattern == function_pattern:
                function_header = solidity_code[match.start():match.end()]         # function header gets extracted
                function_body = solidity_code[match.end():end_index+1]             # function body gets extracted
                code = create_fim_samples(function_header, function_body, '<|secure_function|>\n', '<|vulnerable_function|>\n', upper_token_limit=35)       # fim is applied

            elif pattern == modifier_pattern:
                modifier_header = solidity_code[match.start():match.end()]
                modifier_body = solidity_code[match.end():end_index+1]
                # Modifiers get no secure/vulnerable marker (both tokens are empty strings)
                code = create_fim_samples(modifier_header, modifier_body, '', '', upper_token_limit=25)

            elif pattern == constructor_pattern: 
                constructor_header = solidity_code[match.start():match.end()]
                constructor_body = solidity_code[match.end():end_index+1]
                code = create_fim_samples(constructor_header, constructor_body, '<|secure_constructor|>\n', '<|vulnerable_constructor|>\n', upper_token_limit=25)

            elif pattern == pragma_pattern: 
                # Only files with at least one import produce an import fill-in-the-middle sample
                if re.search(import_pattern, solidity_code):
                    # Only the matched header text is used here; the brace scan result is ignored
                    pragma_body = solidity_code[match.start():match.end()]
                    contract_vul_pattern = r"\/\/ WARNING Vulnerability.*\n\/\/ Recommendation for.*"
                    unclear_dependency_pattern = r"\(.*\)"                                                         # to not count wrong commas in dependency definitions such as ERC721("Friendly Fractals N", "FRFRACN")
                    pragma_body = re.sub(contract_vul_pattern, '', pragma_body)                                    # removes the vulnerability (if there) annotations for contracts (not required here)
                    pragma_body = re.sub('\n\n\n', '\n\n', pragma_body)
                    lines = pragma_body.strip().split('\n')
                    flatten = pragma_body.replace("    ", "\t").replace("\n", "\\n").replace("\t", "\\t").strip()  # flattens the contract definition to one line, to also detect commas in multiline definitions
                    flatten = re.sub(unclear_dependency_pattern, '', flatten)
                    comma_count = flatten.count(',')                                                               # counts the commas in the contracts defintion to detect the dependencies
                    mid_start = 2                                                                                  # mid start is alway 2 as the format is clear (assumes pragma line + one blank line before the imports -- TODO confirm)
                    all_imports_end = max_consecutive_count(lines, import_pattern) + mid_start                     # all_imports is mid start + all consecutive counted imports
                    mid_end = comma_count + 1 + mid_start                                                          # mid end is mid start plus the comma_counts of the dependencies in the contract definition + 1
                    useless_lines = lines[mid_end:all_imports_end]
                    # NOTE(review): list.remove deletes the FIRST element equal to line, which is the sliced
                    # element only if no equal line occurs earlier in the list -- verify.
                    for line in useless_lines:                                                                     # the useless lines are removed
                        lines.remove(line)
                    prefix = '\n'.join(lines[:mid_start])
                    middle = '\n'.join(lines[mid_start:mid_end])
                    suffix = '\n'.join(lines[mid_end:])

                    # FIM layout: prefix, hole marker, suffix, then the removed middle part after the end marker
                    code = '<|fim_begin|>' + prefix + '\n' + '<|fim_hole|>' + '\n' + suffix + '<|fim_end|>' + middle
                else:
                    continue

            else:
                # All remaining constructs are written unchanged
                code = solidity_code[match.start():end_index+1]

            # Converts the sample to a single line with literal \t and \n escape sequences
            code = code.replace("    ", "\t").replace("\n", "\\n").replace("\t", "\\t").strip()
            output_file = os.path.join(path_w, f"solidity_code_{i+directory_length}.sol")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(code)
            i += 1

# Helper for the fim realization of the imports: determines how many import lines directly follow each other.
def max_consecutive_count(no_lines, import_pattern):
    """Return the length of the longest run of consecutive items matching ``import_pattern``.

    Args:
        no_lines: Iterable of strings (lines) to inspect in order.
        import_pattern: Regex passed to ``re.search`` for every item.

    Returns:
        The longest number of directly adjacent matching items, 0 if none match.
    """
    longest_run = 0
    run_length = 0

    for line in no_lines:
        # A non-matching line ends the current run
        run_length = run_length + 1 if re.search(import_pattern, line) else 0
        if run_length > longest_run:
            longest_run = run_length

    return longest_run

# Realizes fill in the middle for functions, i.e. the function is splitted into prefix, middle, and suffix and the middle part is moved to the end. Additionally, fim tokens are inserted.
def create_fim_samples(header, body, secure_token, vulnerable_token, upper_token_limit=35, fim_ratio=0.4):
    """Build a (fill-in-the-middle) training sample from a construct header and body.

    The body is split into lines. If it has fewer than 3 or more than
    ``upper_token_limit`` non-blank lines the construct is returned unchanged
    (apart from the leading token); otherwise a randomly placed block of lines
    is cut out and appended after ``<|fim_end|>``.

    The sample is prefixed with ``vulnerable_token`` when the body contains a
    vulnerability annotation, i.e. a `` | ID:...`` marker, and with
    ``secure_token`` otherwise. Only ``body`` is inspected; annotations placed
    above the header are not visible to this function.

    Args:
        header: Construct header, up to and including its opening brace.
        body: Text following the header, including the closing brace.
        secure_token: Prefix for constructs without annotation.
        vulnerable_token: Prefix for constructs with annotation.
        upper_token_limit: Maximum number of non-blank body lines for which
            fill-in-the-middle is applied.
        fim_ratio: Approximate share of the non-blank lines moved into the hole.

    Returns:
        The sample as a multi-line string.
    """
    # Replaces the inner construct vulnerability pattern by 'vul-type vulnerability'. The number of
    # replacements tells whether the body is annotated. (The previous check used a regex with an
    # alternative that matches every string, so the secure token could never be chosen.)
    body, annotation_count = re.subn(r' \| ID:.*', ' vulnerability', body)
    status_token = vulnerable_token if annotation_count > 0 else secure_token

    lines = body.strip().split('\n')              # splits the body into single lines
    lines[0] = '\n        ' + lines[0]            # adds to the first line the corresponding newline and spaces, as these are otherwise removed
    lines = [x for x, _ in groupby(lines)]        # removes consecutive duplicates, i.e. ['a', 'a', 'b', 'a', 'a'] becomes ['a', 'b', 'a']

    unchanged_lines = lines                       # for too short or too long bodies the blank lines stay, i.e. the body is returned unchanged

    removed_indices = [i for i, x in enumerate(lines) if x == '']                # tracks the indices of the blank lines
    lines = [x for x in lines if x != '']                                        # the fim_hole token should not be used for an empty line

    # Re-inserts the blank lines after fim is applied, such that the format stays correct
    def restore_indices(sublist, sublist_start_idx):
        for index in removed_indices:
            if sublist_start_idx <= index < sublist_start_idx + len(sublist):
                sublist.insert(index - sublist_start_idx, '')                    # inserts '' at the recorded index
        return sublist

    if len(lines) < 3 or len(lines) > upper_token_limit:                         # very short and very long bodies are not split
        return status_token + header + '\n'.join(unchanged_lines)

    mid_start = random.randint(0, len(lines) - 3)                                          # random start of the middle part, e.g. for 5 lines a value between 0 and (inclusive) 2
    mid_end = min(mid_start + max(1, int(len(lines) * fim_ratio)), len(lines) - 1)         # end of the middle part; at least one line is moved and at least one line stays in the suffix

    prefix = '\n'.join(restore_indices(lines[:mid_start], 0))
    middle = '\n'.join(restore_indices(lines[mid_start:mid_end], mid_start))
    suffix = '\n'.join(restore_indices(lines[mid_end:], mid_end))

    # Layout: token, prefix, hole marker, suffix, end marker, removed middle part
    return status_token + '<|fim_begin|>' + header + prefix + '\n' + '<|fim_hole|>' + '\n' + suffix + '<|fim_end|>' + middle

    # code = code.replace("    ", "\t").replace("\n", "\\n").replace("\t", "\\t").strip()

In [52]:
# Driver cell: runs the two-argument extract_constructs on every regular file of the Solhint_Dataset folder;
# all samples are written into the SpecialToken_Dataset folder.
src_folder = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset"

# Sub-directories are skipped; only plain files are collected.
solidity_files = [f for f in os.listdir(src_folder) if os.path.isfile(os.path.join(src_folder, f))]

for file in solidity_files:
    extract_constructs(path_r=f"D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Solhint_Dataset/{file}",
                       path_w=f"D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/SpecialToken_Dataset")

Helper Functions during Preprocessing

In [None]:
# Renumbers all .sol files of a folder to solidity_code_1.sol ... solidity_code_N.sol (in sorted name order).
folder_path = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Fill_in_the_Middle/Test"

base_name = "solidity_code_"
file_extension = ".sol"
temp_suffix = ".renaming_tmp"

# Gets a sorted list of all .sol files in the directory
files = sorted([f for f in os.listdir(folder_path) if f.endswith(file_extension)])

# Phase 1: moves every file to a temporary name. Renaming straight to the final name can hit a
# file that has not been processed yet (e.g. solidity_code_10.sol -> solidity_code_2.sol), which
# silently overwrites it on POSIX and raises FileExistsError on Windows.
for filename in files:
    os.rename(os.path.join(folder_path, filename),
              os.path.join(folder_path, filename + temp_suffix))

# Phase 2: gives every file its final, consecutive name
for index, filename in enumerate(files, start=1):
    # Constructs the new file name
    new_name = f"{base_name}{index}{file_extension}"

    # Defines the full file paths
    old_file = os.path.join(folder_path, filename + temp_suffix)
    new_file = os.path.join(folder_path, new_name)

    # Renames the file
    os.rename(old_file, new_file)

print("Files renamed successfully.")

In [None]:
import random

# Transfers a number of randomly chosen files from one folder to another folder
def randomly_move_files(src_folder, dst_folder, num_files, move=True):
    """Move or copy ``num_files`` randomly selected files from ``src_folder`` to ``dst_folder``.

    Only regular files directly inside ``src_folder`` are candidates. If fewer
    candidates than ``num_files`` exist, a message is printed and nothing is
    transferred.

    Args:
        src_folder: Folder to pick files from.
        dst_folder: Folder receiving the files (same file names).
        num_files: Number of files to transfer.
        move: Move the files when true, copy them otherwise.

    Returns:
        None.
    """
    candidates = []
    for entry in os.listdir(src_folder):
        if os.path.isfile(os.path.join(src_folder, entry)):
            candidates.append(entry)

    available = len(candidates)
    if available < num_files:
        print(f"Not enough files in the source folder. Found {available} files, but need {num_files}.")
        return

    # The transfer routine is the same for every selected file
    transfer = shutil.move if move else shutil.copy
    for chosen in random.sample(candidates, num_files):
        transfer(os.path.join(src_folder, chosen), os.path.join(dst_folder, chosen))

    operation = "moved" if move else "copied"
    print(f"Successfully {operation} {num_files} files from {src_folder} to {dst_folder}.")

# Driver statements: moves 18118 randomly chosen files from the Test3 folder into the Train folder of the final dataset.
source_folder = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Test3"
destination_folder = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Fill_in_the_Middle/Train"
number_of_files = 18118

# move=True removes the selected files from the source folder
randomly_move_files(source_folder, destination_folder, number_of_files, move=True)

In [55]:
# Copies the files of a folder in random order into another folder, numbering them consecutively
def shuffle_and_rename_files(path_r, path_w):
    """Copy every regular file of ``path_r`` to ``path_w`` under a shuffled, consecutive name.

    The files are shuffled with ``random.shuffle`` and copied (the originals
    stay in place) as ``solidity_code_1.sol``, ``solidity_code_2.sol``, ...
    One line is printed per copied file.
    """
    entries = list(filter(lambda name: os.path.isfile(os.path.join(path_r, name)), os.listdir(path_r)))

    random.shuffle(entries)

    position = 0
    for source_name in entries:
        position += 1
        target_name = f"solidity_code_{position}.sol"
        shutil.copy(os.path.join(path_r, source_name), os.path.join(path_w, target_name))
        print(f"Renamed: {source_name} -> {target_name}")

In [None]:
# Driver statements: copies the files of the final fill-in-the-middle dataset folder in shuffled order into the Test3 folder.
old_path = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Fill_in_the_Middle"
new_path = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Test3"
shuffle_and_rename_files(old_path, new_path)

In [None]:
def to_one_line(path_r):
    """Rewrite the file at ``path_r`` in place as a single line of text.

    Every run of four spaces is treated as one tab; afterwards each real newline
    and tab is replaced by its two-character escape sequence (backslash + ``n`` /
    backslash + ``t``), so the result contains no line breaks.
    """
    with open(path_r, "r", encoding="utf-8") as source:
        original_text = source.read()

    # Four consecutive spaces count as one indentation level
    tabbed_text = original_text.replace("    ", "\t")

    # Swaps real newlines and tabs for their literal escape sequences in one pass
    escape_table = str.maketrans({"\n": "\\n", "\t": "\\t"})
    escaped_text = tabbed_text.translate(escape_table)

    # The strip happens after escaping, so it only trims whitespace that survived the
    # escaping (e.g. stray spaces at either end); an escaped trailing newline is kept
    one_line = escaped_text.strip()

    with open(path_r, "w", encoding="utf-8") as target:
        target.write(one_line)

# Converts one specific file of the Test2 folder to the single-line representation (in place)
path_r = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Test2/solidity_code_2696.sol"
to_one_line(path_r)

In [None]:
# Converts every file of the Only-Constructs dataset to the single-line representation (in place)
src_folder = "D:/Uni_Ausbildung_Schule/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Final_Datasets/Dataset_Only_Constructs"

solidity_files = [f for f in os.listdir(src_folder) if os.path.isfile(os.path.join(src_folder, f))]

for file in solidity_files:
    # The path is derived from src_folder (instead of repeating the folder literal) so that
    # the listing above and the conversion below can never point at different folders
    to_one_line(path_r=f"{src_folder}/{file}")

In [None]:
# Empties the dataset folders by deleting every *.sol file in them (disabled; uncomment the lines below to run it)
# folder_path = r"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Splitted_Dataset"
# # folder_path2 = r"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Cleaned_Dataset"
# folder_path3 = r"D:/TUHH_Computer_Science_Master/Forschungsprojekt/Preprocessing/Similarity_Dataset"

# file_pattern = "*.sol"

# files = glob.glob(os.path.join(folder_path, file_pattern))
# #files2 = glob.glob(os.path.join(folder_path2, file_pattern))
# files3 = glob.glob(os.path.join(folder_path3, file_pattern))

# for file in files:
#     os.remove(file)
#     #print(f"{file} deleted!")

# # for file in files2:
# #     os.remove(file)
# #     #print(f"{file} deleted!")

# for file in files3:
#     os.remove(file)
#     #print(f"{file} deleted!")

In [None]:
# Resolves some of the shadowing-local vulnerabilities from the low-vulnerability folder by renaming the owner identifier to accountOwner
def rename_owner_in_functions(file_path, output_path):
    """Rename ``owner`` to ``accountOwner`` in functions annotated as shadowing-local.

    A match starts at a ``// WARNING Vulnerability`` comment that mentions
    ``shadowing-local``, continues over the following ``function`` signature and
    ends at the first ``}`` after the opening brace. Inside each match all ``//``
    comments are removed (the line break of a removed comment stays) and every
    whole-word ``owner`` is replaced by ``accountOwner``. Text outside the matches
    is written unchanged.

    NOTE(review): because the match ends at the first ``}``, occurrences of
    ``owner`` that follow a nested block inside the same function body are not
    renamed - verify the output for functions containing inner blocks.

    Args:
        file_path: Solidity file to read.
        output_path: File the updated code is written to (may equal ``file_path``).
    """
    # Explicit UTF-8 keeps reading/writing independent of the platform's locale
    # encoding and consistent with the other file helpers of this notebook
    with open(file_path, 'r', encoding="utf-8") as f:
        code = f.read()

    # Annotation comment, the function signature below it and the body up to the first '}'.
    # re.S lets '.' cross line breaks so the pattern can span several lines.
    function_pattern = re.compile(
        r"(// WARNING Vulnerability.*?shadowing-local.*?function\s+\w+\(.*?\)\s+.*?{.*?})",
        flags=re.S,
    )

    # Callback function to process each match
    def rename_owner(match):
        annotated_function = match.group(1)

        # Without re.S '.' stops at the line break, so only the comment text is dropped
        without_comments = re.sub(r"//.*", "", annotated_function)

        # Renames 'owner' to 'accountOwner' (whole words only)
        return re.sub(r"\bowner\b", "accountOwner", without_comments)

    # Applies the regex and renaming
    updated_code = function_pattern.sub(rename_owner, code)

    with open(output_path, 'w', encoding="utf-8") as f:
        f.write(updated_code)
    print(f"Updated code written to {output_path}")


In [None]:
# Adds the missing immutable and constant keywords for some statements (repairs optimization issues)
def add_keyword(file_path, output_file):
    """Insert ``immutable`` / ``constant`` after the visibility of annotated declarations.

    A line containing ``// Recommendation for ...: Add the 'immutable'`` (or
    ``'constant'``) marks the line directly below it. If that following line
    contains a visibility keyword (public, private, internal, external) and does
    not already contain the recommended keyword, the keyword is inserted after the
    visibility keyword. All other lines, including the recommendation comments
    themselves, are copied unchanged. Empty and single-line files are copied as-is.

    Args:
        file_path: Solidity file to read.
        output_file: File the result is written to.
    """
    # Explicit UTF-8 keeps the result independent of the platform's locale encoding
    with open(file_path, 'r', encoding="utf-8") as file:
        lines = file.readlines()

    # The comment patterns of an optimization issue
    immutable_hint = re.compile(r"// Recommendation for.*: Add the 'immutable'")
    constant_hint = re.compile(r"// Recommendation for.*: Add the 'constant'")

    # Detects a declaration with explicit visibility / locates the insertion point
    visibility_pattern = re.compile(r"\b(public|private|internal|external)\b")
    insertion_pattern = re.compile(r"((public|private|internal|external)\s+)")

    modified_lines = []
    # Keyword requested by the previous line's recommendation comment (None if there was none)
    pending_keyword = None

    for line in lines:
        if (
            pending_keyword is not None
            and visibility_pattern.search(line)
            and pending_keyword not in line
        ):
            # \g<1> re-inserts the matched visibility keyword together with its whitespace
            modified_lines.append(insertion_pattern.sub(rf"\g<1>{pending_keyword} ", line))
        else:
            modified_lines.append(line)

        # The hint is looked up in the original line and only affects the next line;
        # 'immutable' takes precedence when a comment matches both patterns
        if immutable_hint.search(line):
            pending_keyword = "immutable"
        elif constant_hint.search(line):
            pending_keyword = "constant"
        else:
            pending_keyword = None

    # Write the modified lines to the output file
    with open(output_file, 'w', encoding="utf-8") as file:
        file.writelines(modified_lines)
