# DFIR-Metric

## Module III - NIST Computer Forensics Tool Testing Program (CFTT) Forensic String Search

### Authors: Bilel Cherif, Tamas Bisztray, Richard A. Dubniczky, Aaesha Aldahmani, Saeed Alshehhi and Norbert Tihanyi


In [40]:
# STEP 0: Install packages and download the NIST String-Search dataset
from tqdm import tqdm
import subprocess, sys, os, requests


# Install pip packages
requirements_txt = """
spire-doc
pytsk3
pandas
requests
"""
pip_packages = requirements_txt.split()

print("[*] Installing Python packages...")
failed_packages = []
for package in tqdm(pip_packages, desc="[*] Pip Packages"):
    completed = subprocess.run([sys.executable, "-m", "pip", "install", package, "--upgrade"],
                               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    # pip's output is hidden, so the exit status is the only failure signal.
    if completed.returncode != 0:
        failed_packages.append(package)
if failed_packages:
    print(f"[!] Python packages installation FAILED for: {', '.join(failed_packages)}")
else:
    print("[*] Python packages installation: DONE")

# Download NIST dataset from the official repo
url = "https://cfreds-archive.nist.gov/StringSearching/string-search-federated-testing-data-set-version-1-1-revised-september-27-2019.zip"
local_zip = "string_search_dataset.zip"

# Skip download if already present
if not os.path.exists(local_zip):
    # Download to a side file and rename only on success, so an interrupted
    # transfer is never mistaken for a complete archive on the next run.
    partial_zip = local_zip + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as the ZIP.
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        with open(partial_zip, "wb") as f, tqdm(
                desc="[*] Downloading ZIP",
                total=total,
                unit="iB",
                unit_scale=True,
        ) as bar:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
                    bar.update(len(chunk))
    os.replace(partial_zip, local_zip)
    print("[+] Download : DONE")
else:
    print(f"[*] {local_zip} already exists: skipping download")

# Extract the dataset
extract_dir = "string_search_dataset"
if not os.path.isdir(extract_dir):
    print("[*] Extracting...")
    import shutil
    import zipfile
    # Extract next to the final location and rename afterwards, so a failed
    # extraction does not leave a half-filled directory that later runs skip.
    partial_dir = extract_dir + ".partial"
    shutil.rmtree(partial_dir, ignore_errors=True)
    with zipfile.ZipFile(local_zip, "r") as z:
        z.extractall(partial_dir)
    os.replace(partial_dir, extract_dir)
    print("[+] Extraction: DONE")
else:
    print(f"[*] {extract_dir}/ already exists: skipping extraction")

print("[*] All steps: DONE")


[*] Installing Python packages...


[*] Pip Packages: 100%|██████████| 4/4 [00:13<00:00,  3.46s/it]

[*] Python packages installation: DONE
[*] string_search_dataset.zip already exists: skipping download
[*] string_search_dataset/ already exists: skipping extraction
[*] All steps: DONE





In [45]:
# STEP 1: Generate DFIR-Metric-NSS.json from the NIST images (ss-win-07-25-18.dd and ss-unix-07-25-18.dd)
import pytsk3
import sys
from spire.doc import *
from spire.doc.common import *
import re
import json
import pandas as pd

class StringSearchBench():
    def __init__(self):
        self.NIST_dataset_path = "/content/string_search_dataset/string-search-federated-testing-data-set-version-1-1-revised-september-27-2019"
        self.win_partions_dict = {"the first windows data partion": 34, "the second windows data partion": 1953124, "the third windows data partion": 2929686}
        self.unix_partions_dict = {"the linux filesystem": 978944, "the first HFS+": 2048, "the second HFS+": 1955840}
        self.win_partions = ["the first windows data partion", "the second windows data partion", "the third windows data partion"]
        self.unix_partitions = ["the linux filesystem", "the first HFS+", "the second HFS+"]
        self.emails = ["iron.man@marvel.com", "potus@capitol.gov", "berlin@deutchland.net", "kgb@moscow.red.square.ru"]
        self.phones = ["\(901\)555-1111", " 301.555-9009", "800-555-1122", "202.555.3270"]
        self.extensions = {"text": ".txt", "web": ".html", "word": ".doc and .docx", "all": ".txt, .html, .doc and .docx"}
        self.words = ["Wolf", "thunderbird", "DireWolf" ,"garçon",  "الكسكس", "فلافل", b"\x70\x61\x6e\x64\x61", "shotgun", "flintlock", "rifle", "revolver", "longbow", "crossbow", "fox", "tiger"]
        self.words_condition = [["shotgun", "flintlock", "rifle"], ["peroxide", "nitroglycerin"], ["revolver", "longbow", "crossbow"],["panda", "fox"], ["fox", "tiger"], ["panda", "tiger", "fox"]]
        self.pattern = ["email", "Phone Number", "Social Security Number"]
        self.operators=["or", "and"]
        self.case_sensitivity = [True, False]
        self.whole_word = [True, False]
        self.file_state = ["deleted and non deleted", "deleted", "non deleted"]
        self.win_img = self.NIST_dataset_path + "/copy-to-test-computer/ss-win-07-25-18.dd"
        self.unix_img = self.NIST_dataset_path + "/copy-to-test-computer/ss-unix-07-25-18.dd"

        self.unix_mmls = """
The mmls commant output for ss-unix-07-25-18.dd is:
mmls ss-unix-07-25-18.dd
GUID Partition Table (EFI)
Offset Sector: 0
Units are in 512-byte sectors
Slot      Start        End          Length       Description
000:  Meta      0000000000   0000000000   0000000001   Safety Table
001:  -------   0000000000   0000002047   0000002048   Unallocated
002:  Meta      0000000001   0000000001   0000000001   GPT Header
003:  Meta      0000000002   0000000033   0000000032   Partition Table
004:  000       0000002048   0000978610   0000976563   OS X Hierarchical File System Plus (HFS+) partition
005:  -------   0000978611   0000978943   0000000333   Unallocated
006:  001       0000978944   0001955506   0000976563   Linux filesystem data
007:  -------   0001955507   0001955839   0000000333   Unallocated
008:  002       0001955840   0002932402   0000976563   OS X Hierarchical File System Plus (HFS+) partition
009:  -------   0002932403   0002932735   0000000333   Unallocated
010:  003       0002932736   0004097671   0001164936
011:  -------   0004097672   0004097711   0000000040   Unallocated
        """

        self.win_mmls = """
The mmls commant output for ss-win-07-25-18.dd is:
mmls ss-win-07-25-18.dd
GUID Partition Table (EFI)
Offset Sector: 0
Units are in 512-byte sectors

    Slot      Start        End          Length       Description
000:  Meta      0000000000   0000000000   0000000001   Safety Table
001:  -------   0000000000   0000000033   0000000034   Unallocated
002:  Meta      0000000001   0000000001   0000000001   GPT Header
003:  Meta      0000000002   0000000033   0000000032   Partition Table
004:  000       0000000034   0000976561   0000976528   Basic data partition
005:  001       0000976562   0001953123   0000976562   Basic data partition
006:  002       0001953124   0002929685   0000976562   Basic data partition
007:  003       0002929686   0003906247   0000976562   Basic data partition
008:  -------   0003906248   0004097711   0000191464   Unallocated
        """

        self.email_prompt = """
The following is the output of mmls on a disk image. Write a script to find all {} files that contain the email {} in {}.
The output should be a list sorted by inode in the format: <xml>["inode:filename", "inode:filename", ...]</xml>.
Deleted files should have filenames starting with DELETED, and non-deleted files should start with LIVE.
For example: <xml>["122150:DELETED-test-email.txt", "122152:LIVE-test-email.txt"]</xml>.
{}
"""


        self.phone_prompt = """
The following is the output of mmls on a disk image. Write a script to find all {} files that contain the phone number {} in {}.
The output should be a list sorted by inode in the format: <xml>["inode:filename", "inode:filename", ...]</xml>.
Deleted files should have filenames starting with DELETED, and non-deleted files should start with LIVE.
For example: <xml>"130735:DELETED-phone-examplefile.txt", "122152:LIVE-phone-file2.txt"</xml>.
{}
"""

        self.text_prompt = """
The following is the output of mmls on a disk image. Write a script to find all the {} files that contain the word/words {} in {}.
The output should be a list sorted by inode in the format: <xml>["inode:filename", "inode:filename", ...]</xml>.
Deleted files should have filenames starting with DELETED, and non-deleted files should start with LIVE.
For example: <xml>"122150:DELETED-example-ascii.txt", "122152:LIVE-example-fat-ascii.txt"</xml>.
{}

        """
        self.extension_count_prompt = """
The following is the output of mmls on a disk image. Write a script to count all the {} files with the extension(s) {} in {}.
The output format should comply with this format: <count>
Count is the number of identified files
The deleted files start with DELETED the non deleted files start with LIVE
{}
        """
        self.extension_count_prompt = """
The following is the output of mmls on a disk image. Write a script to count all {} files with the extension(s) {} in {}.
The output should be a single value in the format: <xml>count</xml>, representing the number of identified files.
Deleted files should have filenames starting with DELETED, and non-deleted files should start with LIVE. For example: <xml>211</xml>
        {}
        """


    def matches(self, text, keywords, match_whole_words=False, case_sensitive=False, logic_operator="none"):
        # Choose flags
        flags = 0 if case_sensitive else re.IGNORECASE

        # Prepare final match pattern
        if isinstance(keywords, list):
            if logic_operator == "or":
                # Match any of the words (whole words if match_whole_words is True)
                word_pattern = '\\b(?:' + '|'.join(re.escape(word) for word in keywords) + ')\\b' if match_whole_words else '|'.join(re.escape(word) for word in keywords)
                pattern = word_pattern
            elif logic_operator == "and":
                # Match all the words (whole words if match_whole_words is True)
                word_pattern = '\\b(?:' + '|'.join(re.escape(word) for word in keywords) + ')\\b' if match_whole_words else '|'.join(re.escape(word) for word in keywords)
                pattern = '^.*' + '.*'.join(f'(?=.*\\b{re.escape(word)}\\b)' for word in keywords) + '.*$'
            else:
                # Default to "none", where any of the words are matched
                word_pattern = '\\b(?:' + '|'.join(re.escape(word) for word in keywords) + ')\\b' if match_whole_words else '|'.join(re.escape(word) for word in keywords)
                pattern = word_pattern
        else:
            # If keywords is a single string, use it as is
            pattern = keywords

        # Perform regex search
        if re.search(pattern.encode(), text, flags=flags):
            return True
        else:
            return False


    def detect_utf16(self, file_data):
        ENCODING_UTF_16_BE = 'utf_16_be'
        ENCODING_UTF_16_LE = 'utf_16_le'

        # http://unicode.org/faq/utf_bom.html#BOM
        BOM_UTF_16_BE = b'\xfe\xff'
        BOM_UTF_16_LE = b'\xff\xfe'

        BYTE_EOL = (13,10) # \r\n

        UTF_16_NULL_PERCENT_POSITIVE = 0.7
        UTF_16_NULL_PERCENT_NEGATIVE = 0.1
        null_byte_odd,null_byte_even = 0,0
        eol_odd,eol_even = 0,0
        if file_data[:2] == BOM_UTF_16_BE:
            return ENCODING_UTF_16_BE

        elif file_data[:2] == BOM_UTF_16_LE:
            return ENCODING_UTF_16_LE

        else:
            odd_byte = None
            for file_byte in file_data:
                # build pairs of bytes
                if (odd_byte is None):
                    odd_byte = file_byte
                    continue

                # look for odd/even null byte and check other byte for EOL
                if (odd_byte == 0):
                    null_byte_odd += 1
                    if (file_byte in BYTE_EOL):
                        eol_even += 1

                elif (file_byte == 0):
                    null_byte_even += 1
                    if (odd_byte in BYTE_EOL):
                        eol_odd += 1

                odd_byte = None

            # attempt detection based on line endings
            if ((not eol_odd) and eol_even):
                return ENCODING_UTF_16_BE

            if (eol_odd and (not eol_even)):
                return ENCODING_UTF_16_LE

            # can't detect on line endings - evaluate ratio of null bytes in odd/even positions
            # this will give an indication of how much ASCII (1-127) level text is present
            data_size_half = (len(file_data) / 2)
            threshold_positive = int(data_size_half * UTF_16_NULL_PERCENT_POSITIVE)
            threshold_negative = int(data_size_half * UTF_16_NULL_PERCENT_NEGATIVE)

            # must have enough file data to have value ([threshold_positive] must be non-zero)
            if (threshold_positive):
                if ((null_byte_odd > threshold_positive) and (null_byte_even < threshold_negative)):
                    return ENCODING_UTF_16_LE

                if ((null_byte_odd < threshold_negative) and (null_byte_even > threshold_positive)):
                    return ENCODING_UTF_16_BE

            # not UTF-16 - or insufficient data to determine with confidence
            return False

    def search_in_image(self, image_path, hex_or_ascii, start_sector=34, sector_size=512, _match_whole_words = True, _case_sensitive = False, _logic_operator = "or", extension = "all", mode="search"):
        """Search or count the files of one partition of a raw disk image.

        Args:
            image_path: Path of the disk image.
            hex_or_ascii: Search target: a list of words, a string, or bytes
                (decoded to a string). Handed to ``matches``.
            start_sector: First sector of the partition to open.
            sector_size: Sector size in bytes; the file-system offset is
                ``start_sector * sector_size``.
            _match_whole_words, _case_sensitive, _logic_operator: Forwarded
                to ``matches``.
            extension: ``"word"`` (.doc/.docx), ``"web"`` (.html), ``"text"``
                (.txt) or ``"all"``. In search mode ``"all"`` examines every
                file; in count mode it counts the four extensions above, as
                the count prompt words it. Any other value selects nothing.
            mode: ``"search"`` or ``"count"``.

        Returns:
            In search mode, a list of ``"inode:filename#status"`` strings
            sorted by inode, with status ``"deleted"`` for unallocated
            entries and ``"allocated"`` otherwise. In count mode, the number
            of selected files as a decimal string.

        Raises:
            ValueError: If ``mode`` is neither ``"search"`` nor ``"count"``.
        """
        import os
        import sys
        import tempfile

        if mode not in ("search", "count"):
            raise ValueError(f"unsupported mode: {mode!r}")

        word_suffixes = (".doc", ".docx")
        suffixes_by_extension = {
            "all": (".txt", ".html") + word_suffixes,
            "word": word_suffixes,
            "web": (".html",),
            "text": (".txt",),
        }

        keywords = hex_or_ascii.decode() if isinstance(hex_or_ascii, bytes) else hex_or_ascii

        # Open image file
        img_info = pytsk3.Img_Info(image_path)
        fs_info = pytsk3.FS_Info(img_info, offset=start_sector * sector_size)

        def iter_files(directory):
            """Yield (name, meta, entry) for each non-directory entry, recursively."""
            for entry in directory:
                if not hasattr(entry, "info") or not hasattr(entry.info, "name"):
                    continue
                name = entry.info.name.name.decode("utf-8", errors="ignore")
                if name in (".", ".."):
                    continue
                meta = entry.info.meta
                if meta is None:
                    continue
                if meta.type == pytsk3.TSK_FS_META_TYPE_DIR:
                    # Best effort: a directory that cannot be opened or
                    # listed is skipped, as before.
                    try:
                        yield from iter_files(entry.as_directory())
                    except Exception:
                        continue
                else:
                    yield name, meta, entry

        def is_selected(name):
            """Apply the extension filter for the current mode."""
            if mode == "search" and extension == "all":
                return True
            return name.endswith(suffixes_by_extension.get(extension, ()))

        def word_document_text(name, raw):
            """Extract the text of a Word file via a private temporary file."""
            # The on-image filename is not reused for the temp path: it comes
            # from the evidence and the old code wrote and read the file at
            # two different locations.
            suffix = ".docx" if name.endswith(".docx") else ".doc"
            handle, temp_path = tempfile.mkstemp(suffix=suffix)
            try:
                with os.fdopen(handle, "wb") as temp_file:
                    temp_file.write(raw)
                document = Document()
                try:
                    document.LoadFromFile(temp_path)
                    return document.GetText().encode()
                finally:
                    document.Close()
            finally:
                os.remove(temp_path)

        def searchable_bytes(name, meta, entry):
            """Return the file content in the form that ``matches`` expects."""
            raw = entry.read_random(0, meta.size)
            if name.endswith(word_suffixes):
                return word_document_text(name, raw)
            if name.endswith(".txt"):
                # UTF-16 text is re-encoded as UTF-8 so the UTF-8 encoded
                # search pattern can match it (now also for extension="text").
                encoding = self.detect_utf16(raw)
                if encoding:
                    return raw.decode(encoding, errors="replace").encode("utf-8")
            return raw

        count = 0
        hits = []
        for name, meta, entry in iter_files(fs_info.open_dir("/")):
            # Include both allocated and unallocated files
            is_allocated = bool(meta.flags & pytsk3.TSK_FS_META_FLAG_ALLOC)
            is_unallocated = bool(meta.flags & pytsk3.TSK_FS_META_FLAG_UNALLOC)
            if meta.size <= 0 or not (is_allocated or is_unallocated):
                continue
            if not is_selected(name):
                continue

            if mode == "count":
                count += 1
                continue

            # Skip only the file that cannot be read or converted; previously
            # such a failure inside a sub-directory silently dropped every
            # remaining file of that directory.
            try:
                content = searchable_bytes(name, meta, entry)
            except Exception as error:
                print(f"[!] Skipping {meta.addr}:{name}: {error}", file=sys.stderr)
                continue

            if self.matches(content, keywords, match_whole_words = _match_whole_words, case_sensitive = _case_sensitive, logic_operator = _logic_operator):
                alloc_status = "deleted" if is_unallocated else "allocated"
                hits.append((meta.addr, f"{meta.addr}:{name}#{alloc_status}"))

        if mode == "count":
            return f"{count}"

        # The prompts ask for a list sorted by inode.
        hits.sort(key=lambda hit: hit[0])
        return [label for _, label in hits]

    def gen_prompt(self, kind, mmls, partition,target, state, extension, operator="all"):
        if kind == "email":
            prompt = self.email_prompt.format(state, target, partition, mmls)
        elif kind == "phone":
            prompt = self.phone_prompt.format(state, target, partition, mmls)
        elif kind == "string":
            prompt = self.text_prompt.format(state, target, partition, mmls)
        elif kind == "condition":
            if operator == "or":
                temp_target = " or ".join(target)
            elif operator == "and":
                temp_target = " and ".join(target)
            prompt = self.text_prompt.format(state, temp_target, partition, mmls)
        elif kind == "count":
            prompt = self.extension_count_prompt.format(state, extension, partition, mmls)
        return prompt

    def filter_state(self, state, baseline):
        temp_base = []
        if len(baseline) > 0:
            if state == "deleted":
                for base in baseline:
                    _base = base.split("#")
                    if _base[-1] == "deleted":
                        temp_base.append(_base[0])
            elif state == "non deleted":
                for base in baseline:
                    _base = base.split("#")
                    if _base[-1] != "deleted":
                        temp_base.append(_base[0])
            else:
                for base in baseline:
                    _base = base.split("#")
                    temp_base.append(_base[0])

        else:
            temp_base = baseline

        return temp_base

    def generate_baseline(self, os, __mode, target = "wolf",operator = "or", extension = "all", case_sensitivity=False, whole_word=False, _start_sector = 34):
        if os == "win":
            return self.search_in_image(self.win_img, target, start_sector=_start_sector, sector_size=512, _match_whole_words = whole_word, _case_sensitive = case_sensitivity, _logic_operator = operator, extension = extension, mode = __mode)
        elif os == "unix":
            return self.search_in_image(self.unix_img, target, start_sector=_start_sector, sector_size=512, _match_whole_words = whole_word, _case_sensitive = case_sensitivity, _logic_operator = operator, extension = extension, mode= __mode)

    def win_gen(self, kind, state, target, partition, case_sensitivity, whole_word, extension, operator, _mode):
        elements = {}
        if _mode == "search":
            question = self.gen_prompt(kind, self.win_mmls, partition, target, state, extension, operator = operator)
            baseline = self.generate_baseline("win", _mode, target=target, _start_sector=self.win_partions_dict[partition], case_sensitivity=case_sensitivity, whole_word=whole_word, extension=extension, operator=operator)
            baseline = self.filter_state(state, baseline)
            elements["offset"] = f"{self.win_partions_dict[partition]}"
            elements["image"] = "ss-win-07-25-18.dd"
            elements["target"] = f"{target}"
            elements["extension"] = f"{extension}"
        elif _mode == "count":
            question = self.gen_prompt(kind, self.win_mmls, partition, target, state, self.extensions[extension])
            baseline = self.generate_baseline("win", _mode, target="none", _start_sector=self.win_partions_dict[partition], case_sensitivity=case_sensitivity, whole_word=whole_word, extension=extension, operator=operator)
            elements["offset"] = f"{self.win_partions_dict[partition]}"
            elements["image"] = "ss-win-07-25-18.dd"
            elements["target"] = f"{self.extensions[extension]}"
            elements["extension"] = "count"
        return {"question": question, "answer": baseline, "target": target, "eval_elements": elements}

    def unix_gen(self, kind, state, target, partition, case_sensitivity, whole_word, extension, operator, _mode):
        elements = {}
        if _mode == "search":
            question = self.gen_prompt(kind, self.unix_mmls, partition, target, state, extension, operator=operator)
            baseline = self.generate_baseline("unix", _mode, target=target, _start_sector=self.unix_partions_dict[partition], case_sensitivity=case_sensitivity, whole_word=whole_word, extension=extension, operator=operator)
            baseline = self.filter_state(state, baseline)
            elements["offset"] = f"{self.unix_partions_dict[partition]}"
            elements["image"] = "ss-unix-07-25-18.dd"
            elements["extension"] = f"{target}"
            elements["target"] = f"{extension}"
        elif _mode == "count":
            question = self.gen_prompt(kind, self.unix_mmls, partition, target, state, self.extensions[extension])
            baseline = self.generate_baseline("unix", _mode, target="none", _start_sector=self.unix_partions_dict[partition], case_sensitivity=case_sensitivity, whole_word=whole_word, extension=extension, operator=operator)
            elements["offset"] = f"{self.unix_partions_dict[partition]}"
            elements["image"] = "ss-unix-07-25-18.dd"
            elements["extension"] = f"{self.extensions[extension]}"
            elements["target"] = "count"
        return {"question": question, "answer": baseline, "target": target, "eval_elements": elements}

    def generate_bench(self):
        output = []

        # Generate for emails
        for partition in self.win_partions:
            for state in self.file_state:
                for email in self.emails:
                    temp = self.win_gen("email", state, email, partition, False, False, "all", "or", "search")
                    output.append(temp)

        for partition in self.unix_partitions:
            for state in self.file_state:
                for email in self.emails:
                    temp = self.unix_gen("email", state, email, partition, False, False, "all", "or", "search")
                    output.append(temp)

        # Generate for phone numbers
        for partition in self.win_partions:
            for state in self.file_state:
                for phone in self.phones:
                    temp = self.win_gen("phone", state, phone, partition, False, False, "all", "or", "search")
                    output.append(temp)

        for partition in self.unix_partitions:
            for phone in self.phones:
                for state in self.file_state:
                    temp = self.unix_gen("phone", state, phone, partition, False, False, "all", "or", "search")
                    output.append(temp)
        # Generate for words
        for partition in self.win_partions:
            for state in self.file_state:
                for word in self.words:
                    temp = self.win_gen("string", state, word, partition, False, False, "all", "or", "search")
                    output.append(temp)

        for partition in self.unix_partitions:
            for state in self.file_state:
                for word in self.words:
                    temp = self.unix_gen("string", state, word, partition, False, False, "all", "or", "search")
                    output.append(temp)

        # Generate word with conditions
        # Unfinished
        partition = self.win_partions[0]
        for word in self.words_condition:
            for state in self.file_state:
                for operator in self.operators:
                    temp = self.win_gen("condition", state, word, partition, False, False, "all", operator, "search")
                    output.append(temp)

        partition = self.unix_partitions[0]
        for word in self.words_condition:
            for state in self.file_state:
                for operator in self.operators:
                    temp = self.unix_gen("condition", state, word, partition, False, False, "all", operator, "search")
                    output.append(temp)

        # count
        for partition in self.win_partions:
            for extension in self.extensions:
                temp = self.win_gen("count", self.file_state[0], extension, partition, False, False, extension, "or", "count")
                output.append(temp)

        for partition in self.unix_partitions:
            for extension in self.extensions:
                temp = self.unix_gen("count", self.file_state[0], extension, partition, False, False, extension, "or", "count")
                output.append(temp)
        return output
    def export_bench(self, output_file):
        raw_data = self.generate_bench()

        formatted_data = {
            "dataset": "DFIR-Metric Module III (NIST Computer Forensics Tool Testing Program (CFTT) Forensic String Search)",
            "authors": "Bilel Cherif, Aaesha Aldahmani, Saeed Alshehhi, Tamas Bisztray, Richard A. Dubniczky, and Norbert Tihanyi",
            "sources": "https://github.com/DFIR-Metric",
            "number_of_questions": len(raw_data),
            "questions": [
                {
                    "question": item["question"],
                    "answer": item["answer"]
                }
                for item in raw_data
            ]
        }

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(formatted_data, f, indent=4, ensure_ascii=False)

# Generate the NIST string-search benchmark and write it as JSON.
# The relative file name is resolved against the current working directory.
output = "DFIR-Metric-NSS.json"
engine = StringSearchBench()
engine.export_bench(output)
