# kBench Factory

Prerequisites:

In [None]:
from KBDr.kclient import *
from pathlib import Path
import os, json

# Paths in this notebook are built relative to the current working directory,
# so refuse to continue unless that directory's path contains '/notebooks'.
assert '/notebooks' in os.getcwd()
# Location of the repository map, resolved one level above the working directory.
repository_map_path = os.path.join(
    os.getcwd(),
    '..',
    'kclient/repositories/map.json',
)

Configure the following settings before running the rest of the notebook:

In [None]:
# Bug category to work with. It is forwarded to `crawl_extid` and
# `SyzbotPopulator`, and it selects `commit_from` when the benchmark is built
# ('open' -> 'crash', otherwise 'parent').
bug_type: Literal['fixed', 'open'] = 'fixed'
# Passed to `SyzbotCrawler` in the open-bug crawl (Option 1b).
max_reported_days = 1
# Client handed to `kBench.build`.
# NOTE(review): `timeout` is presumably in seconds — confirm against kGymAsyncClient.
client = kGymAsyncClient(
    base_url='https://kgym-oss-api.kalorona.com',
    timeout=30,
    max_connections=5
)
# Name embedded in the output file names written under ./benchmarks/.
BENCH_NAME = 'kb'

## Curate Dataset

### Option 1: Use `SyzbotCrawler`

#### Option 1a: Fixed Bugs

In [None]:
syzbot_crawler = SyzbotCrawler()
extids = await syzbot_crawler.crawl_fixed_table()
raw_dataset = SyzbotDataset(root=[await syzbot_crawler.crawl_extid(bug_type, extid) for extid in extids])

#### Option 1b: Open Bugs

In [None]:
syzbot_crawler = SyzbotCrawler(max_reported_days=max_reported_days)
extids = await syzbot_crawler.crawl_open_table()
print(extids)
raw_dataset = SyzbotDataset(root=[await syzbot_crawler.crawl_extid(bug_type, extid) for extid in extids])

### Option 2: Load existing raw Syzbot data

In [None]:
# Option 2: rebuild the raw dataset from a directory of JSON files that were
# crawled earlier, one bug per file.
raw_syzbot_dir = Path(input('Type in the raw syzbot directory:'))
raw_dataset = SyzbotDataset(
    root=[
        SyzbotData.model_validate_json(entry.read_text())
        # Sorted so the dataset order is reproducible across runs and machines.
        for entry in sorted(raw_syzbot_dir.iterdir())
        # Skip subdirectories (e.g. `.ipynb_checkpoints`); calling read_text()
        # on a directory raises instead of yielding JSON.
        if entry.is_file()
    ]
)

### Custom Filtering

In [None]:
def custom_filter(x: SyzbotData) -> bool:
    """Return True to keep bug `x` in the raw dataset.

    Edit this predicate to drop unwanted bugs; as written it keeps every bug.
    """
    return True

# Rebuild the raw dataset from the bugs accepted by the predicate.
raw_dataset = SyzbotDataset(
    root=[bug for bug in raw_dataset.root if custom_filter(bug)]
)

### Populate dataset

Before running this step, use `kclient/repositories/populate.py` to populate the Linux repositories that Syzbot may use.

In [None]:
repository_map = json.loads(Path(repository_map_path).read_text())
populator = SyzbotPopulator(bug_type=bug_type, repository_map=repository_map)
dataset = await populator.populate_batch(raw_dataset)

In [None]:
# Persist the populated dataset produced by `populate_batch`. Writing
# `raw_dataset` here would discard the population step, because nothing else
# saves `dataset` before the preliminary evaluation runs.
with open(f'./benchmarks/dataset-{BENCH_NAME}.json', 'w') as fp:
    fp.write(dataset.model_dump_json())

### Custom Filtering (after population)

In [None]:
def custom_filter(x: SyzbotData) -> bool:
    """Return True to keep bug `x` in the populated dataset.

    Edit this predicate to drop unwanted bugs; as written it keeps every bug.
    """
    return True

# Filter `dataset`, the object later passed to `kBench.build`. Filtering
# `raw_dataset` at this point has no effect, since it is not read again
# after population.
# NOTE(review): assumes `populate_batch` returns a SyzbotDataset — confirm.
dataset = SyzbotDataset(
    root=list(filter(custom_filter, dataset.root))
)

## Bypass: Use existing dataset

Pull it from HuggingFace:

In [None]:
# Load a ready-made dataset instead of crawling and populating one.
# NOTE(review): the second argument ('kb') is presumably the dataset
# configuration/subset name — confirm against `SyzbotDataset.from_hf`.
dataset = SyzbotDataset.from_hf('chenxi-kalorona-huang/kbench', 'kb')

In [None]:
# Remove every '-kdump' occurrence from each bug's userspace image string;
# bugs without an image are left untouched.
for bug in dataset.root:
    image = bug.userspaceImage
    if not image:
        continue
    bug.userspaceImage = image.replace('-kdump', '')

## Run preliminary evaluation to keep good bugs (can take 1-2 days)

In [None]:
bench = await kBench.build(
    client,
    dataset,
    commit_from='crash' if bug_type == 'open' else 'parent',
    ninstance=5,
    machine_type='qemu:2-8192'
)

# Save the dataset held by the benchmark (this overwrites any earlier
# dataset file with the same name).
dataset_path = f'./benchmarks/dataset-{BENCH_NAME}.json'
with open(dataset_path, 'w') as dataset_file:
    dataset_file.write(bench.dataset.model_dump_json())

# Save the benchmark object itself next to it.
bench_path = f'./benchmarks/kBench-{BENCH_NAME}.json'
with open(bench_path, 'w') as bench_file:
    bench_file.write(bench.model_dump_json())