# Merge Solver Results and Identify Best Algorithms

This notebook loads the run records from `quantum_solver_runs.json` and `solver_runs.json`, merges them, and identifies, for each instance, which algorithm(s) achieved the highest `best_value`.

**Outputs:**
- `merged_best_by_instance.csv`
- `merged_best_by_instance.json`

In [1]:
import json
import math
import csv
from collections import defaultdict
from pathlib import Path

In [2]:
# Every file this notebook reads or writes lives under this single directory.
RESULTS_DIR = Path("experiment_results")

# Solver run logs. EXACT_PATH is defined for completeness; none of the cells
# shown in this notebook read it.
QUANTUM_PATH = RESULTS_DIR.joinpath("quantum_solver_runs.json")
CLASSICAL_PATH = RESULTS_DIR.joinpath("solver_runs.json")
EXACT_PATH = RESULTS_DIR.joinpath("exact_solver_runs.json")

# Per-instance "best algorithm" tables produced by the merge step.
OUT_CSV = RESULTS_DIR.joinpath("merged_best_by_instance.csv")
OUT_JSON = RESULTS_DIR.joinpath("merged_best_by_instance.json")


def load_json(path):
    """Read the UTF-8 encoded JSON document at ``path`` and return the decoded object.

    Nothing is caught here: a missing file or malformed JSON raises to the caller.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload

In [3]:
def safe_value(val):
    """Coerce ``val`` to a finite ``float``, or return ``None`` when that is not possible.

    ``None``, anything ``float()`` rejects, NaN and +/-infinity all yield ``None``.
    """
    if val is None:
        return None
    try:
        number = float(val)
        # isfinite() is False for NaN as well as for +/-inf
        return number if math.isfinite(number) else None
    except Exception:
        return None

def collect_runs(*lists_of_runs):
    """Group run records from any number of run lists by the instance they belong to.

    The instance key is the record's ``"instance_file"`` or, failing that,
    ``"instance"``; records with neither (or a falsy value) are dropped.

    Returns a ``defaultdict(list)`` mapping instance key -> list of normalised
    records. Each record keeps the raw ``best_value`` under ``"best_value_raw"``
    and its ``safe_value``-cleaned form under ``"best_value"``.
    """
    # Fields copied verbatim from the source record, in output order.
    copied_fields = (
        "runtime",
        "seed",
        "run_index",
        "parameters",
        "instance_n",
        "instance_dist",
        "instance_cap_ratio_str",
        "instance_seed",
    )

    grouped = defaultdict(list)
    for run in (entry for batch in lists_of_runs for entry in batch):
        instance_key = run.get("instance_file") or run.get("instance")
        if instance_key:
            raw_best = run.get("best_value")
            record = {
                "method": run.get("method"),
                "best_value": safe_value(raw_best),
                "best_value_raw": raw_best,
            }
            record.update((field, run.get(field)) for field in copied_fields)
            grouped[instance_key].append(record)
    return grouped

def _top_best_value(records):
    """Return the largest non-``None`` ``best_value`` in ``records``, or ``None`` if there is none."""
    return max(
        (rec["best_value"] for rec in records if rec["best_value"] is not None),
        default=None,
    )


def _is_quantum_method(method):
    """Apply the name-based rule that decides whether a method counts as quantum."""
    label = method or ""
    return (
        "Quantum" in label
        or "quantum" in label
        or method in ("QAOA", "VQE", "QuantumAnnealing")
    )


def pick_best_for_each_instance(runs_by_instance):
    """Summarise, per instance, the highest ``best_value`` and the runs that reached it.

    ``runs_by_instance`` maps an instance key to a list of run records (as built
    by ``collect_runs``). For every instance the summary holds:

    * ``best_value_overall`` - maximum ``best_value`` over all runs (``None`` if no
      run has a value);
    * ``winners`` - one entry per run whose ``best_value`` equals that maximum
      (so a method can appear several times);
    * ``best_quantum`` / ``best_classical`` - the maximum within each method family;
    * instance metadata taken from the first run, and ``num_runs``.

    Returns the list of summaries sorted by ``instance_file``.
    """
    winner_fields = ("method", "best_value", "runtime", "seed", "run_index", "parameters")

    summaries = []
    for instance_name, instance_runs in runs_by_instance.items():
        overall_best = _top_best_value(instance_runs)

        if overall_best is None:
            top_runs = []
        else:
            top_runs = [
                {field: run[field] for field in winner_fields}
                for run in instance_runs
                if run["best_value"] is not None and run["best_value"] == overall_best
            ]

        # Split the runs into the two method families.
        quantum_runs, classical_runs = [], []
        for run in instance_runs:
            family = quantum_runs if _is_quantum_method(run.get("method", "")) else classical_runs
            family.append(run)

        first_run = instance_runs[0]
        summaries.append({
            "instance_file": instance_name,
            "instance_n": first_run.get("instance_n"),
            "instance_dist": first_run.get("instance_dist"),
            "instance_cap_ratio_str": first_run.get("instance_cap_ratio_str"),
            "best_value_overall": overall_best,
            "winners": top_runs,
            "best_quantum": _top_best_value(quantum_runs),
            "best_classical": _top_best_value(classical_runs),
            "num_runs": len(instance_runs),
        })

    summaries.sort(key=lambda entry: entry["instance_file"])
    return summaries

def write_csv(results, path):
    """Write one CSV row per instance summary to ``path`` (UTF-8, with a header row).

    ``winner_methods`` is the sorted, de-duplicated set of winning method names
    joined with ``;``; ``num_winners`` counts winning runs, not distinct methods.
    """
    columns = (
        "instance_file", "instance_n", "instance_dist", "instance_cap_ratio_str",
        "best_value_overall", "num_winners", "winner_methods",
        "best_quantum", "best_classical", "num_runs",
    )

    def as_row(entry):
        champions = entry["winners"]
        if champions:
            methods = ";".join(sorted({champ["method"] for champ in champions}))
        else:
            methods = ""
        return [
            entry["instance_file"],
            entry.get("instance_n"),
            entry.get("instance_dist"),
            entry.get("instance_cap_ratio_str"),
            entry.get("best_value_overall"),
            len(champions),
            methods,
            entry.get("best_quantum"),
            entry.get("best_classical"),
            entry.get("num_runs"),
        ]

    with open(path, "w", newline="", encoding="utf-8") as sink:
        out = csv.writer(sink)
        out.writerow(columns)
        out.writerows(as_row(entry) for entry in results)

def write_json(results, path):
    """Serialise ``results`` to ``path`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(results, sink, **dump_options)

In [4]:
# Driver for the merge step: load both run logs, group the runs by instance,
# pick the best value per instance, and write the CSV and JSON summaries.
q = load_json(QUANTUM_PATH)
c = load_json(CLASSICAL_PATH)
print(f"Loaded {len(q)} quantum runs and {len(c)} classical runs.")

# Quantum and classical records are pooled; each record's own "method" field
# is what later decides its family.
runs_by_instance = collect_runs(q, c)
print(f"Found {len(runs_by_instance)} unique instances.")

results = pick_best_for_each_instance(runs_by_instance)
write_csv(results, OUT_CSV)
write_json(results, OUT_JSON)

print(f"\nResults saved to:\n - {OUT_CSV.resolve()}\n - {OUT_JSON.resolve()}")

# One line per instance; a method is listed once for every run of it that
# reached the best value, so repeated names are expected.
print("\nPreview:")
for r in results:
    print(f"- {r['instance_file']}: best={r['best_value_overall']} by {[w['method'] for w in r['winners']]}")

Loaded 81 quantum runs and 324 classical runs.
Found 27 unique instances.

Results saved to:
 - C:\Users\abhay\Desktop\Projects\COMA_IIITR\experiment_results\merged_best_by_instance.csv
 - C:\Users\abhay\Desktop\Projects\COMA_IIITR\experiment_results\merged_best_by_instance.json

Preview:
- knapsack_n100_seed20251020.json: best=25554.0 by ['ParticleSwarmOptimization', 'ParticleSwarmOptimization', 'ParticleSwarmOptimization']
- knapsack_n100_seed20251023.json: best=39931.0 by ['Greedy_Ratio', 'ParticleSwarmOptimization']
- knapsack_n100_seed20251026.json: best=45613.0 by ['Greedy_Ratio', 'ParticleSwarmOptimization', 'ParticleSwarmOptimization', 'ParticleSwarmOptimization']
- knapsack_n100_seed20251120.json: best=18691.0 by ['ParticleSwarmOptimization']
- knapsack_n100_seed20251123.json: best=33418.0 by ['GeneticAlgorithm', 'ParticleSwarmOptimization', 'ParticleSwarmOptimization', 'ParticleSwarmOptimization']
- knapsack_n100_seed20251126.json: best=43669.0 by ['ParticleSwarmOptimization'

In [5]:
import json, os, re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Directory that receives the per-instance PNG plots; created if it does not exist.
OUT_DIR = RESULTS_DIR / "plots"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Intended objective direction: True forces "higher is better", False forces
# "lower is better", None is meant to auto-detect.
# NOTE(review): no cell visible in this notebook reads FORCE_MAXIMIZE or
# USE_PERCENT_GAP - confirm they are still needed.
FORCE_MAXIMIZE = None

# Use percent gap if True, otherwise absolute gap.
# (This constant used to be assigned twice in this cell; one definition is kept.)
USE_PERCENT_GAP = True

# Figure styling shared by the plotting cell.
FIGSIZE = (14, 6)
DPI = 150
FONT_TITLE = 14
FONT_AXIS = 12
FONT_ANN = 10
BAR_HEIGHT = 0.6

In [7]:
# ----- helper loaders -----
def safe_load_json(p: Path):
    """Best-effort JSON loader: return the parsed content of ``p``, or ``None`` on failure.

    The file is decoded as UTF-8, matching how this notebook writes its JSON
    outputs. Relying on the platform default encoding instead can make a valid
    file with non-ASCII text fail to decode and silently come back as ``None``.

    The broad ``except`` is deliberate: callers treat ``None`` as "file missing
    or unusable" and fall back to an empty list.
    """
    try:
        with open(p, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None

def flatten_runs(obj_list, tag):
    """Normalise raw run records into flat row dicts suitable for a DataFrame.

    Parameters
    ----------
    obj_list : list of dict, or a falsy value
        Raw run records. A falsy value (``None``, ``[]``) yields ``[]``.
    tag : str
        Stored unchanged in every row under ``"source"``.

    Returns
    -------
    list of dict
        One row per record with keys ``instance_file``, ``method``,
        ``best_value``, ``capacity_in_run``, ``runtime``, ``raw`` (the original
        record) and ``source``. Several alternative key spellings are accepted
        for each field; a missing method becomes ``"unknown"``.
    """
    rows = []
    if not obj_list:
        return rows

    # capacity candidates in the run record (sometimes stored); first key present wins
    capacity_fields = ("capacity", "used_capacity", "used_weight", "total_weight", "weight",
                       "capacity_used", "knapsack_capacity", "capacity_value", "used_capacity_value")
    runtime_fields = ("runtime", "time", "duration")

    for rec in obj_list:
        inst = rec.get("instance_file") or rec.get("instance") or rec.get("problem") or rec.get("inst")
        method = rec.get("method") or rec.get("solver") or rec.get("algorithm") or rec.get("solver_name") or rec.get("algo")

        # objective value: first key that is present, even if its value is None
        if "best_value" in rec:
            best_value = rec.get("best_value")
        elif "value" in rec:
            best_value = rec.get("value")
        else:
            best_value = rec.get("objective")

        cap = next((rec.get(k) for k in capacity_fields if k in rec), None)

        # A plain `a or b or c` chain would discard a genuine runtime of 0 as falsy
        # and fall through to the next key, so zero is accepted explicitly.
        runtime = None
        for key in runtime_fields:
            value = rec.get(key)
            if value or value == 0:
                runtime = value
                break

        rows.append({
            "instance_file": inst,
            "method": method or "unknown",
            "best_value": best_value,
            "capacity_in_run": cap,
            "runtime": runtime,
            "raw": rec,
            "source": tag
        })
    return rows

In [8]:
# ========== Load data ==========
# Each loader returns None on failure, so a missing file simply becomes an empty list.
merged = safe_load_json(OUT_JSON) or []
classical = safe_load_json(CLASSICAL_PATH) or []
quantum = safe_load_json(QUANTUM_PATH) or []

# Fallback: if none of the expected files could be loaded, scan RESULTS_DIR for JSON
# files and classify each one by the keys of its first record.
if not (merged or classical or quantum):
    # sorted() keeps the classical/quantum assignment independent of directory listing order
    for p in sorted(RESULTS_DIR.glob("*.json")):
        j = safe_load_json(p)
        # only lists of dict records can be classified; anything else is ignored
        if isinstance(j, list) and j and isinstance(j[0], dict):
            keys = set(j[0].keys())
            if keys & {"method","solver","algorithm","best_value"}:
                # first run-like file is treated as classical, the second as quantum
                if not classical: classical = j
                elif not quantum: quantum = j
            elif keys & {"best_value_overall","instance_file","best_overall"}:
                if not merged: merged = j

rows = []
rows += flatten_runs(classical, "classical")
rows += flatten_runs(quantum, "quantum")
df = pd.DataFrame(rows)

if df.empty:
    # report the directory this notebook actually reads from
    raise RuntimeError(f"No run records found. Check your JSON files or paths under {RESULTS_DIR}.")

# normalize instance id (file stem); non-string values are stringified as-is
df['instance_id'] = df['instance_file'].apply(lambda s: Path(s).stem if isinstance(s,str) and s else str(s))

# ----- try to extract capacity constraint per instance -----
def find_capacity_from_merged(inst_stem, merged_list):
    """Look up a capacity-like value for one instance in the merged records.

    Parameters
    ----------
    inst_stem : str
        File stem of the instance (file name without directory or extension).
    merged_list : list or a falsy value
        Merged per-instance records. Entries that are not dicts are skipped.

    Returns
    -------
    The value stored under the first capacity-like key found in the first
    matching record that has one (which may itself be ``None``), or ``None``
    when no matching record carries such a key.
    """
    if not merged_list:
        return None

    # checked in this order; the first key present in a matching record wins
    capacity_keys = ("capacity_constraint", "knapsack_capacity", "capacity_total", "capacity", "cap",
                     "capacity_limit", "W", "total_capacity", "best_capacity_overall")

    for rec in merged_list:
        if not isinstance(rec, dict):
            continue
        # match by filename stem, accepting several spellings of the instance key
        inst_candidate = rec.get("instance_file") or rec.get("instance") or rec.get("problem")
        if not inst_candidate or Path(inst_candidate).stem != inst_stem:
            continue
        for key in capacity_keys:
            if key in rec:
                return rec.get(key)
    return None

In [9]:
def find_capacity_in_instance_file(instance_file_path):
    """Try to read a capacity value out of an instance file, or return ``None``.

    The file is read once and three interpretations are tried in order:

    1. JSON object: the value of the first capacity-like top-level key.
    2. CSV: if the first line mentions a capacity-like token, the first
       non-null scalar value of the first column whose name contains it.
    3. Plain text: the first number following the word "capacity"
       (case-insensitive), returned as ``float``.

    Returns ``None`` when the path does not exist, cannot be read, or none of
    the interpretations yields a value.
    """
    p = Path(instance_file_path)
    if not p.exists():
        return None

    # utf-8-sig tolerates a BOM; errors="replace" keeps a stray undecodable byte
    # from hiding the ASCII key names this function searches for.
    try:
        text = p.read_text(encoding="utf-8-sig", errors="replace")
    except OSError:
        return None

    # 1) JSON
    try:
        j = json.loads(text)
    except (ValueError, RecursionError):
        j = None
    if isinstance(j, dict):
        for key in ("capacity", "knapsack_capacity", "capacity_constraint", "cap", "W",
                    "total_capacity", "capacity_limit"):
            if key in j:
                return j[key]

    # 2) CSV: look for header with capacity-like name
    csv_tokens = ("capacity", "knapsack_capacity", "cap", "capacity_limit", "total_capacity")
    header = text.split("\n", 1)[0].strip().lower()
    matching_tokens = [token for token in csv_tokens if token in header]
    if matching_tokens:
        try:
            # parse once, however many tokens match the header
            dfc = pd.read_csv(p)
            for token in matching_tokens:
                for key in dfc.columns:
                    if token in str(key).lower():
                        val = dfc[key].dropna().unique()
                        if len(val) > 0 and np.isscalar(val[0]):
                            return val[0]
        except Exception:
            # deliberately best-effort: a file that is not really a CSV just
            # falls through to the plain-text scan below
            pass

    # 3) Plain text fallback: scan for a number after the "capacity" word
    m = re.search(r'capacity[^0-9\-]*([0-9]+(?:\.[0-9]+)?)', text, re.IGNORECASE)
    if m:
        return float(m.group(1))
    return None

# Build capacity_map: instance_id -> capacity constraint (None when nothing could be found)
capacity_map = {}
for inst in df['instance_id'].unique():
    # 1) check merged file
    cap = find_capacity_from_merged(inst, merged)
    if cap is not None:
        capacity_map[inst] = cap
        continue
    # 2) check capacity present in any run record (capacity_in_run field)
    cap_candidates = df.loc[df['instance_id']==inst, 'capacity_in_run'].dropna().unique().tolist()
    # take the first candidate that converts to float; the loop variable is not
    # named `c` so it cannot overwrite a notebook-level variable of that name
    for candidate in cap_candidates:
        try:
            capacity_map[inst] = float(candidate)
            break
        except (TypeError, ValueError, OverflowError):
            # a string path or nested value: not a usable number, try the next one
            continue
    if inst in capacity_map:
        continue
    # 3) try to open referenced instance file(s) for that instance (if full path available in rows)
    inst_rows = df[df['instance_id']==inst]
    found = None
    for ref in inst_rows['instance_file'].dropna().unique():
        # try the raw path, then the path relative to RESULTS_DIR, then just its file name there
        candidates = [Path(ref), RESULTS_DIR / Path(ref), RESULTS_DIR / Path(ref).name]
        for cand in candidates:
            if cand.exists():
                found = find_capacity_in_instance_file(cand.resolve())
                if found is not None:
                    capacity_map[inst] = found
                    break
        if inst in capacity_map:
            break
    # if not found, record the instance as missing (the fallback below may still fill it)
    capacity_map.setdefault(inst, None)

# 4) final fallback: use maximum achieved capacity among runs for that instance (as a lower-quality substitute)
# NOTE(review): step 2 already accepts any float()-convertible capacity_in_run value,
# so this pass looks like it rarely changes anything - confirm whether it is still needed.
for inst in df['instance_id'].unique():
    if capacity_map.get(inst) is None:
        vals = pd.to_numeric(df.loc[df['instance_id']==inst, 'capacity_in_run'], errors='coerce').dropna()
        if not vals.empty:
            capacity_map[inst] = vals.max()
        # best_value_overall from the merged file is an objective, not a capacity,
        # so it is intentionally not used as a substitute here

# attach capacity constraint into dataframe
df['capacity_constraint'] = df['instance_id'].map(capacity_map)
# capacity used by this run, when the run record carries one
df['capacity_used'] = pd.to_numeric(df['capacity_in_run'], errors='coerce')
# otherwise fall back to best_value.
# NOTE(review): this fallback is applied whether or not a capacity constraint was found,
# so for those rows 'capacity_used' holds the objective value rather than a capacity -
# confirm this mixing is intended.
mask_missing = df['capacity_used'].isna() & df['best_value'].notna()
df.loc[mask_missing, 'capacity_used'] = pd.to_numeric(df.loc[mask_missing, 'best_value'], errors='coerce')

# if capacity_used still NA, leave as NaN
# compute best achieved per instance (use capacity_used if present)
df['capacity_used'] = pd.to_numeric(df['capacity_used'], errors='coerce')
best_by_instance = df.groupby('instance_id')['capacity_used'].max().rename('best_capacity_achieved')
# assigning through map() (rather than df.join) overwrites the column on a re-run
# instead of failing because the column already exists
df['best_capacity_achieved'] = df['instance_id'].map(best_by_instance)

# compute percent of capacity used vs constraint (if constraint present)
def percent_of_constraint(row):
    """Return ``best_capacity_achieved`` as a percentage of ``capacity_constraint``.

    ``row`` is anything indexable by those two keys. The result is NaN when the
    constraint is missing (``None``/NaN) or zero.
    """
    limit = row['capacity_constraint']
    constraint_usable = not pd.isna(limit) and limit != 0
    if constraint_usable:
        achieved = row['best_capacity_achieved']
        return float(achieved) / float(limit) * 100.0
    return np.nan

# One row per instance: both columns are constant within an instance, so the
# first value of each group is taken.
per_instance_columns = {
    'capacity_constraint': ('capacity_constraint', 'first'),
    'best_capacity_achieved': ('best_capacity_achieved', 'first'),
}
summary = df.groupby(['instance_id']).agg(**per_instance_columns).reset_index()
summary['pct_of_capacity'] = summary.apply(percent_of_constraint, axis=1)

In [10]:
# Persist the per-instance capacity summary next to the other result files.
OUT_SUM = RESULTS_DIR.joinpath("per_instance_capacity_summary.csv")
OUT_SUM.parent.mkdir(exist_ok=True, parents=True)
summary.to_csv(OUT_SUM, index=False)

In [11]:
# ----- plotting: horizontal bar with capacity constraint marker -----
# One PNG per instance: a bar per run (sorted by capacity_used, largest first),
# annotated with its gap to the best run and its runtime, plus a dashed line at
# the capacity constraint when one is known.
saved = []
for inst, sub in df.groupby('instance_id'):
    # keep only rows with a numeric capacity_used
    sub2 = sub[sub['capacity_used'].notna()].copy()
    if sub2.empty:
        # if no capacity_used values, skip instance but keep in summary
        print(f"[WARN] instance {inst}: no capacity_used numeric values found; skipping plot.")
        continue
    sub_sorted = sub2.sort_values('capacity_used', ascending=False).reset_index(drop=True)
    labels = [f"{str(m)} [{s}]" for m,s in zip(sub_sorted['method'], sub_sorted['source'])]
    values = sub_sorted['capacity_used'].astype(float).values
    # coerce so a missing or non-numeric runtime becomes NaN instead of breaking the format below
    runtimes = pd.to_numeric(sub_sorted['runtime'], errors='coerce').values
    cap_cons = sub_sorted['capacity_constraint'].dropna().unique()
    cap_val = float(cap_cons[0]) if len(cap_cons)>0 else None

    fig, ax = plt.subplots(figsize=FIGSIZE, dpi=DPI)
    y_pos = np.arange(len(values))
    ax.barh(y_pos, values, height=BAR_HEIGHT)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels, fontsize=FONT_AXIS)
    ax.invert_yaxis()
    ax.set_xlabel("Capacity used (higher = better)", fontsize=FONT_AXIS)
    ax.set_title(f"Instance {inst} — capacity used vs capacity constraint", fontsize=FONT_TITLE)

    # annotate each bar with gap to best (percent) and runtime if present
    best_val = values.max()
    # horizontal offset of the annotation, computed once per figure
    text_offset = max(1e-9, 0.01*best_val)
    for i,(val,rt) in enumerate(zip(values, runtimes)):
        # gap relative to best_val; defined as 0 when the best value itself is 0
        if best_val != 0:
            gap_pct = (best_val - val) / best_val * 100.0
        else:
            gap_pct = 0.0
        ann_gap = f"{gap_pct:.2f}% gap"
        ann_rt = f"{rt:.3g}s" if not pd.isna(rt) else ""
        ann = ann_gap + (f" · {ann_rt}" if ann_rt else "")
        ax.text(val + text_offset, i, ann, va='center', fontsize=FONT_ANN)

    # capacity constraint marker and annotation
    if cap_val is not None and not pd.isna(cap_val):
        ax.axvline(cap_val, color='k', linestyle='--', linewidth=1)
        # annotate best achieved vs capacity as percent
        pct = (best_val / cap_val * 100.0) if cap_val!=0 else np.nan
        ax.text(cap_val, -0.5, f"capacity constraint = {cap_val}\nbest uses {pct:.2f}% of capacity" if not np.isnan(pct) else f"capacity constraint = {cap_val}",
                va='bottom', ha='center', fontsize=FONT_ANN, bbox=dict(facecolor='white', alpha=0.6))
    else:
        ax.text(0.98, 0.02, "capacity constraint: NOT FOUND", transform=ax.transAxes, fontsize=FONT_ANN,
                ha='right', va='bottom', bbox=dict(facecolor='white', alpha=0.7))

    # lay out this figure explicitly rather than whatever pyplot considers current
    fig.tight_layout()
    # file name: instance id with anything outside [0-9A-Za-z_-] replaced by "_"
    fname = OUT_DIR / f"instance_{re.sub(r'[^0-9A-Za-z_-]','_',inst)}_capacity.png"
    fig.savefig(fname, bbox_inches='tight')
    # close each figure so memory does not grow with the number of instances
    plt.close(fig)
    saved.append(str(fname))

print(f"Saved {len(saved)} capacity plots to {OUT_DIR}")
print("Saved capacity summary CSV to:", OUT_SUM)

Saved 27 capacity plots to experiment_results\plots
Saved capacity summary CSV to: experiment_results\per_instance_capacity_summary.csv
