# Build training dataset (DVFS 1.5GHz, 1.8GHz, 2.25GHz)

This notebook extracts loop-only profiling samples from `apps/<benchmark>/` and writes two labeled CSVs: one using the energy-based labels and one using the EDP-based labels.
Each label is one of the DVFS guidance groups: HighFreq, MedFreq, or LowFreq.


In [1]:
from pathlib import Path
from typing import Dict, Iterable, List, Sequence
import csv

# Metric names used as feature columns, in output order. Every name must be
# present in a run's header and in each extracted row for that row to be kept.
FEATURE_FIELDS = [
    'CPI', 'Math_Intensity', 'Stall_Ratio',
    'System_BW_Proxy', 'Branch_MPKI',
    'GFLOPS_Approx', 'Clock_Ratio',
]

# LABELS = {
#     'HighFreq': {
#         'atomic_fight',
#         'branch_mispredict',
#         'dgemm',
#         'fft_mix',
#         'icache_thrash',
#         'tree_walk',
#     },
#     'MedFreq': {
#         'l3_stencil',
#         'spmv',
#         'stream',
#     },
#     'LowFreq': {
#         'io_write',
#         'mpi_bandwidth',
#         'mpi_barrier',
#         'pointer_chase',
#     },
# }

# Replace these two label sets with the outputs from process_apps.ipynb
# Energy-based grouping: label name -> set of app (directory) names.
LABELS_ENERGY = {
    'HighFreq': {
        'atomic_fight',
        'branch_mispredict',
        'dgemm',
        'fft_mix',
        'io_write',
        'l3_stencil',
    },
    'MedFreq': {'icache_thrash', 'pointer_chase'},
    'LowFreq': {'mpi_bandwidth', 'mpi_barrier', 'spmv', 'stream'},
}

# EDP-based grouping: label name -> set of app (directory) names.
LABELS_EDP = {
    'HighFreq': {
        'atomic_fight',
        'branch_mispredict',
        'dgemm',
        'fft_mix',
        'icache_thrash',
        'io_write',
        'l3_stencil',
        'mpi_barrier',
    },
    'MedFreq': {
        'pointer_chase',
    },
    'LowFreq': {'mpi_bandwidth', 'spmv', 'stream'},
}


def parse_header(stdout_path: Path) -> Sequence[str]:
    """Return the column names found on the LIKWID header line of a stdout file.

    A header line is any line starting with ``# GID|``. Its leading ``#``
    characters and surrounding whitespace are dropped, the rest is split on
    ``|``, and everything after the first three fields is returned. Header
    lines with nothing after the first three fields are ignored and the scan
    continues.

    Raises:
        RuntimeError: if no header line with at least one trailing field exists.
    """
    with stdout_path.open() as stream:
        for text in stream:
            if not text.startswith('# GID|'):
                continue
            names = text.lstrip('#').strip().split('|')[3:]
            if names:
                return names
    raise RuntimeError(f'No LIKWID header found in {stdout_path}')


def find_label(app: str, label_map: Dict[str, set[str]]) -> str:
    """Return the first label in ``label_map`` whose app set contains ``app``.

    Labels are tried in the mapping's iteration order.

    Raises:
        KeyError: if ``app`` is not a member of any label's set.
    """
    not_found = object()
    hit = next(
        (name for name, members in label_map.items() if app in members),
        not_found,
    )
    if hit is not_found:
        raise KeyError(f'No label mapping for app {app}')
    return hit


def iter_profile_rows(prof_path: Path) -> Iterable[List[str]]:
    """Yield the comma-split data lines that lie inside the first loop region.

    Lines are stripped before inspection and blank lines are ignored.
    Yielding starts after a line beginning with ``LOOP_START_REL`` and the
    scan stops at the first line beginning with ``LOOP_END_REL`` (even if no
    start marker was seen). Inside the region, lines beginning with ``#`` are
    skipped.
    """
    inside = False
    with prof_path.open() as stream:
        for text in map(str.strip, stream):
            if not text:
                continue
            if text.startswith('LOOP_START_REL'):
                inside = True
            elif text.startswith('LOOP_END_REL'):
                # Only the first loop region is of interest; stop reading.
                return
            elif inside and not text.startswith('#'):
                yield text.split(',')


def extract_metrics(row_parts: Sequence[str], header: Sequence[str]) -> Dict[str, float]:
    """Average each metric of one profile row across its CPUs.

    ``row_parts[1]`` is read as the metric count and ``row_parts[2]`` as the
    CPU count; ``row_parts[3:]`` are parsed as floats. The first float is
    discarded and the rest is consumed as consecutive runs of ``cpu_count``
    values, one run per name in ``header[1:]``.

    Returns:
        Mapping from metric name to the mean of its run. Metrics whose run is
        empty (CPU count of zero) are left out.

    Raises:
        ValueError: if the metric count disagrees with ``header[1:]`` or the
            number of floats is not ``1 + metrics * cpus`` (also raised by
            ``int``/``float`` on unparsable fields).
    """
    n_metrics = int(row_parts[1])
    n_cpus = int(row_parts[2])
    numbers = [float(token) for token in row_parts[3:]]

    names = list(header[1:])
    if len(names) != n_metrics:
        raise ValueError(
            f'Header mismatch: expected {len(names)} metrics, got {n_metrics}'
        )

    required = 1 + n_metrics * n_cpus
    if required != len(numbers):
        raise ValueError(
            f'Expected {required} numeric entries, got {len(numbers)}'
        )

    # Drop the leading value; what remains is grouped metric by metric.
    payload = numbers[1:]
    averages: Dict[str, float] = {}
    for position, name in enumerate(names):
        chunk = payload[position * n_cpus:position * n_cpus + n_cpus]
        if chunk:
            averages[name] = float(sum(chunk) / len(chunk))
    return averages


def find_apps_dir(start: Path) -> Path:
    """Locate an ``apps`` directory at ``start`` or one of its nearest ancestors.

    ``start`` is resolved first; then the resolved directory and up to five of
    its parents (stopping early at the filesystem root) are checked for an
    ``apps`` subdirectory. The first match is returned.

    Raises:
        FileNotFoundError: if none of the checked directories contains ``apps``.
    """
    here = start.resolve()
    for folder in [here, *here.parents][:6]:
        apps = folder / 'apps'
        if apps.is_dir():
            return apps
    raise FileNotFoundError('No apps directory found. Run from repo root or set apps_dir manually.')


In [2]:
# Configuration
# Input location: the `apps` directory found from the current working directory.
apps_dir = find_apps_dir(Path.cwd())
# Frequency and group tags that appear in the `<app>_<group>_<freq>` input file names.
selected_freqs = ['1.5GHz', '1.8GHz', '2.25GHz']
group = 'dvfs'
# Output CSVs, one per label set.
energy_out_path = Path('csv') / 'training_dataset_dvfs_energy_labels.csv'
edp_out_path = Path('csv') / 'training_dataset_dvfs_edp_labels.csv'


In [3]:
# Build datasets
# CSV column order: the label first, then every feature.
columns = ['label', *FEATURE_FIELDS]


def build_dataset(label_map: Dict[str, set[str]], out_path: Path):
    """Write one labeled CSV of loop-region samples and report what was skipped.

    Uses the module-level ``apps_dir``, ``selected_freqs``, ``group``,
    ``columns`` and ``FEATURE_FIELDS``. Each subdirectory of ``apps_dir`` (in
    sorted order) that has a label in ``label_map`` is processed for every
    selected frequency, reading ``<app>_<group>_<freq>.out`` for the header
    and ``<app>_<group>_<freq>.prof`` for the rows. Directories without a
    label are ignored silently.

    A row is dropped when it lacks any feature or when three or more of its
    features are exactly zero.

    Args:
        label_map: label name -> set of app directory names.
        out_path: destination CSV; parent directories are created as needed.

    Returns:
        ``(rows_written, skipped)`` where ``skipped`` lists ``'<app>:<freq>'``
        for each run whose files were missing or whose header lacked a feature.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped: List[str] = []

    with out_path.open('w', newline='') as sink:
        out = csv.DictWriter(sink, fieldnames=columns)
        out.writeheader()

        for bench_dir in sorted(apps_dir.iterdir()):
            if not bench_dir.is_dir():
                continue
            name = bench_dir.name
            try:
                tag = find_label(name, label_map)
            except KeyError:
                # Apps without a label are not part of this dataset.
                continue

            for freq in selected_freqs:
                run_id = f'{name}:{freq}'
                stem = f'{name}_{group}_{freq}'
                out_file = bench_dir / f'{stem}.out'
                prof_file = bench_dir / f'{stem}.prof'
                if not (out_file.exists() and prof_file.exists()):
                    skipped.append(run_id)
                    continue

                header = parse_header(out_file)
                if not all(feature in header for feature in FEATURE_FIELDS):
                    skipped.append(run_id)
                    continue

                for parts in iter_profile_rows(prof_file):
                    metrics = extract_metrics(parts, header)
                    if not all(feature in metrics for feature in FEATURE_FIELDS):
                        continue

                    # Discard samples where three or more features are exactly zero.
                    zeros = sum(metrics[feature] == 0.0 for feature in FEATURE_FIELDS)
                    if zeros >= 3:
                        continue

                    record = {'label': tag}
                    record.update((feature, metrics[feature]) for feature in FEATURE_FIELDS)
                    out.writerow(record)
                    written += 1

    return written, skipped

# Generate both CSVs: same samples, different label sets.
energy_rows, energy_skipped = build_dataset(
    label_map=LABELS_ENERGY,
    out_path=energy_out_path,
)
edp_rows, edp_skipped = build_dataset(
    label_map=LABELS_EDP,
    out_path=edp_out_path,
)

# Cell result: row count and destination for each dataset.
(energy_rows, energy_out_path, edp_rows, edp_out_path)


(21525,
 PosixPath('csv/training_dataset_dvfs_energy_labels.csv'),
 21525,
 PosixPath('csv/training_dataset_dvfs_edp_labels.csv'))

In [4]:
# Apps/frequencies skipped due to missing files or fields
# sorted(set(energy_skipped) | set(edp_skipped))
