
# Bitonic Sort — Caliper/Thicket Analysis (Google Colab)

This notebook reads `.cali` files organized in your Google Drive at:

```
/content/drive/MyDrive/Caliper/
  ├── 1_perc_perturbed/
  ├── random/
  ├── reverse_sorted/
  └── sorted/
```

Each folder contains `.cali` files named like `N-P.cali` (e.g., `4194304-64.cali`),
where `N` is number of elements and `P` is number of MPI ranks (processes).

**Outputs**
- Builds a runs table with metadata (N, P, input_type) and metrics for `main`, `comp`, and `comm`.
- Plots Experiments 1, 2 (incl. speedup), 3, and 5 with all input types on the same axes.
- Uses Thicket/Hatchet readers; if some stats (e.g., variance) are missing, plots continue with what’s available.


In [10]:

# If running on Colab, install dependencies.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    !pip -q install "hatchet" "thicket" "caliper-reader" pandas matplotlib
else:
    print("Not in Colab; ensure hatchet, thicket, caliper-reader, pandas, matplotlib are installed.")


In [15]:

import os, re
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import pandas as pd
import matplotlib.pyplot as plt

from hatchet import GraphFrame
try:
    from thicket import thicket
    HAVE_THICKET = True
except Exception as e:
    HAVE_THICKET = False
    print("Thicket not available:", e)

# Mount Google Drive if on Colab
try:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive', force_remount=False)
except Exception as e:
    print("If you're not on Colab, ignore the mount step. Error:", e)

ROOT = Path('/content/drive/MyDrive/Caliper')
FOLDERS = {
    'sorted': 'Sorted',
    'random': 'Random',
    'reverse_sorted': 'ReverseSorted',
    '1_perc_perturbed': '1_perc_perturbed',
}

def pow2_label(n: int) -> str:
    if n and (n & (n - 1)) == 0:
        k = n.bit_length() - 1
        return f"2^{k}"
    return str(n)


Thicket not available: cannot import name 'thicket' from 'thicket' (/usr/local/lib/python3.12/dist-packages/thicket/__init__.py)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:

def pick_col(cols, *need):
    for c in cols:
        parts = c if isinstance(c, tuple) else (c,)
        s = " ".join([str(x) for x in parts]).lower()
        if all(n in s for n in need):
            return c
    return None

def parse_np_from_filename(path: Path) -> Tuple[Optional[int], Optional[int]]:
    m = re.match(r"(\d+)-(\d+)\.cali$", path.name)
    if not m:
        return None, None
    return int(m.group(1)), int(m.group(2))

def extract_metrics_from_cali(path: Path) -> Dict:
    rec = {
        "file": str(path),
        "N": None, "P": None, "input_type": None,
        "main_avg": None, "main_min": None, "main_max": None, "main_var": None,
        "comp_avg": None, "comp_min": None, "comp_max": None, "comp_var": None,
        "comm_avg": None, "comm_min": None, "comm_max": None, "comm_var": None,
    }
    Nf, Pf = parse_np_from_filename(path)
    if Nf: rec["N"] = Nf
    if Pf: rec["P"] = Pf

    t = None
    if HAVE_THICKET:
        try:
            t = Thicket.from_caliper(str(path))
        except Exception:
            t = None

    if t is not None:
        df = t.dataframe
        meta = getattr(t, "metadata", None)
        if meta is not None:
            for k in ["input_size", "num_procs", "input_type"]:
                if k in meta.columns and len(meta[k].dropna()) > 0:
                    val = meta[k].dropna().iloc[0]
                    if k == "input_size" and not rec["N"]:
                        try: rec["N"] = int(val)
                        except: pass
                    elif k == "num_procs" and not rec["P"]:
                        try: rec["P"] = int(val)
                        except: pass
                    elif k == "input_type" and not rec["input_type"]:
                        rec["input_type"] = str(val)

        name_col = None
        if "name" in df.columns:
            name_col = "name"
        elif ("name","") in df.columns:
            name_col = ("name","")
        else:
            if "name" in df.index.names:
                df = df.reset_index()
            if "name" in df.columns:
                name_col = "name"

        cand_avg = pick_col(df.columns, "avg", "time")
        cand_min = pick_col(df.columns, "min", "time")
        cand_max = pick_col(df.columns, "max", "time")
        cand_var = pick_col(df.columns, "var", "time")
        if cand_avg is None:
            cand_avg = pick_col(df.columns, "time", "(inc)") or pick_col(df.columns, "time")

        def value_for(node_name: str, col):
            if col is None or name_col is None:
                return None
            try:
                sub = df[df[name_col].astype(str).str.fullmatch(node_name, na=False)]
                if len(sub) == 0:
                    sub = df[df[name_col].astype(str).str.startswith(node_name)]
                if len(sub) == 0:
                    return None
                v = sub.iloc[0][col]
                return float(v)
            except Exception:
                return None

        for node, prefix in (("main","main"), ("comp_large","comp"), ("comp","comp"), ("comm","comm")):
            if prefix == "comp" and rec["comp_avg"] is not None:
                continue
            rec[f"{prefix}_avg"] = rec[f"{prefix}_avg"] or value_for(node, cand_avg)
            rec[f"{prefix}_min"] = rec[f"{prefix}_min"] or value_for(node, cand_min)
            rec[f"{prefix}_max"] = rec[f"{prefix}_max"] or value_for(node, cand_max)
            rec[f"{prefix}_var"] = rec[f"{prefix}_var"] or value_for(node, cand_var)
        return rec

    # Fallback: Hatchet
    try:
        gf = GraphFrame.from_caliper(str(path))
        gdf = gf.dataframe.reset_index()
        avg_col = pick_col(gdf.columns, "time", "(inc)") or pick_col(gdf.columns, "time")
        name_col = "name" if "name" in gdf.columns else None

        def hv(node):
            if name_col is None or avg_col is None:
                return None
            sub = gdf[gdf[name_col].astype(str) == node]
            if len(sub) == 0:
                sub = gdf[gdf[name_col].astype(str).str.startswith(node)]
            if len(sub) == 0:
                return None
            try:
                return float(sub.iloc[0][avg_col])
            except Exception:
                return None

        rec["main_avg"] = hv("main")
        rec["comp_avg"] = hv("comp_large") or hv("comp")
        rec["comm_avg"] = hv("comm")
        return rec
    except Exception as e:
        print(f"[WARN] Could not read {path.name}: {e}")
        return rec


In [13]:

records = []
for folder, itype in FOLDERS.items():
    d = ROOT / folder
    if not d.exists():
        print(f"[NOTE] Missing folder: {d} (skipping)")
        continue
    for path in sorted(d.glob("*.cali")):
        rec = extract_metrics_from_cali(path)
        if not rec.get("input_type"):
            rec["input_type"] = itype
        records.append(rec)

runs = pd.DataFrame.from_records(records)
for c in ["N","P",
          "main_avg","main_min","main_max","main_var",
          "comp_avg","comp_min","comp_max","comp_var",
          "comm_avg","comm_min","comm_max","comm_var"]:
    if c in runs.columns:
        runs[c] = pd.to_numeric(runs[c], errors="coerce")
runs["N_label"] = runs["N"].apply(lambda x: pow2_label(int(x)) if pd.notna(x) else None)

print("Loaded runs:", len(runs))
runs.head(10)


[WARN] Could not read 1048576-1024.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-128.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-16.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-2.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-256.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-32.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-4.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-512.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-64.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 1048576-8.cali: from_caliper() needs cali-query to query .cali file
[WARN] Could not read 16777216-1024.cali: from_caliper() needs cali-query to query .cali

Unnamed: 0,file,N,P,input_type,main_avg,main_min,main_max,main_var,comp_avg,comp_min,comp_max,comp_var,comm_avg,comm_min,comm_max,comm_var,N_label
0,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,1024,Sorted,,,,,,,,,,,,,2^20
1,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,128,Sorted,,,,,,,,,,,,,2^20
2,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,16,Sorted,,,,,,,,,,,,,2^20
3,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,2,Sorted,,,,,,,,,,,,,2^20
4,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,256,Sorted,,,,,,,,,,,,,2^20
5,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,32,Sorted,,,,,,,,,,,,,2^20
6,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,4,Sorted,,,,,,,,,,,,,2^20
7,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,512,Sorted,,,,,,,,,,,,,2^20
8,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,64,Sorted,,,,,,,,,,,,,2^20
9,/content/drive/MyDrive/Caliper/sorted/1048576-...,1048576,8,Sorted,,,,,,,,,,,,,2^20


In [14]:

def ensure_metric_exists(df: pd.DataFrame, col: str) -> bool:
    if col not in df.columns or df[col].dropna().empty:
        print(f"[WARN] No data for metric column '{col}'. Skipping plot.")
        return False
    return True

def plot_exp1(runs: pd.DataFrame, metric_col="main_avg"):
    df = runs[runs["P"] == 64].copy()
    if not ensure_metric_exists(df, metric_col): return
    df = df.dropna(subset=["N", metric_col, "input_type"])
    plt.figure()
    for itype, sub in sorted(df.groupby("input_type")):
        sub = sub.sort_values("N")
        plt.plot(sub["N"], sub[metric_col], marker='o', label=itype)
    plt.xscale("log", base=2)
    plt.xlabel("Elements N")
    plt.ylabel(metric_col)
    plt.title(f"Experiment 1 — {metric_col} vs N (P=64)")
    plt.legend(); plt.grid(True); plt.show()

def plot_exp2_strong(runs: pd.DataFrame, N_fixed=2**22, metric_col="main_avg"):
    df = runs[runs["N"] == N_fixed].copy()
    if not ensure_metric_exists(df, metric_col): return
    df = df.dropna(subset=["P", metric_col, "input_type"])
    plt.figure()
    for itype, sub in sorted(df.groupby("input_type")):
        sub = sub.sort_values("P")
        plt.plot(sub["P"], sub[metric_col], marker='o', label=itype)
    plt.xscale("log", base=2)
    plt.xlabel("Processes P")
    plt.ylabel(metric_col)
    plt.title(f"Experiment 2 (Strong) — {metric_col} vs P (N={pow2_label(N_fixed)})")
    plt.legend(); plt.grid(True); plt.show()

    if metric_col == "main_avg":
        plt.figure()
        for itype, sub in sorted(df.groupby("input_type")):
            sub = sub.sort_values("P")
            if len(sub) == 0 or pd.isna(sub.iloc[0]["main_avg"]) or sub.iloc[0]["main_avg"] == 0:
                continue
            base = sub.iloc[0]["main_avg"]
            speedup = base / sub["main_avg"]
            plt.plot(sub["P"], speedup, marker='o', label=itype)
        plt.xscale("log", base=2)
        plt.xlabel("Processes P")
        plt.ylabel("Speedup (T_base / T_P)")
        plt.title(f"Experiment 2 — Speedup (main_avg) vs P (N={pow2_label(N_fixed)})")
        plt.legend(); plt.grid(True); plt.show()

def plot_exp3_weak(runs: pd.DataFrame, metric_col="main_avg"):
    pairs = [(2**16,16),(2**18,32),(2**20,64),(2**22,128),(2**24,256),(2**26,512),(2**28,1024)]
    sel = None
    for (N,P) in pairs:
        chunk = runs[(runs["N"] == N) & (runs["P"] == P)]
        sel = chunk if sel is None else pd.concat([sel, chunk], ignore_index=True)
    if sel is None or sel.empty:
        print("[WARN] No weak-scaling pairs found."); return
    if not ensure_metric_exists(sel, metric_col): return
    plt.figure()
    for itype, sub in sorted(sel.groupby("input_type")):
        sub = sub.sort_values("N")
        plt.plot(sub["N"], sub[metric_col], marker='o', label=itype)
    plt.xscale("log", base=2)
    plt.xlabel("Elements N (paired with P)")
    plt.ylabel(metric_col)
    plt.title(f"Experiment 3 (Weak) — {metric_col} vs N (paired P)")
    plt.legend(); plt.grid(True); plt.show()

def plot_exp5_vary_init(runs: pd.DataFrame, N_fixed=2**22, P_fixed=64, metric_col="main_avg"):
    df = runs[(runs["N"] == N_fixed) & (runs["P"] == P_fixed)].copy()
    if df.empty:
        print(f"[WARN] No runs for N={N_fixed}, P={P_fixed}."); return
    if not ensure_metric_exists(df, metric_col): return
    df = df.dropna(subset=[metric_col, "input_type"])
    order = ["Sorted", "1_perc_perturbed", "Random", "ReverseSorted"]
    df["input_type"] = pd.Categorical(df["input_type"], categories=order, ordered=True)
    df = df.sort_values("input_type")
    plt.figure()
    plt.bar(df["input_type"].astype(str), df[metric_col])
    plt.xlabel("Input type"); plt.ylabel(metric_col)
    plt.title(f"Experiment 5 — {metric_col} (N={pow2_label(N_fixed)}, P={P_fixed})")
    plt.grid(True, axis='y'); plt.show()

# Generate all figures for main/comp/comm
METRICS = ["main_avg", "comp_avg", "comm_avg"]

for m in METRICS:
    plot_exp1(runs, metric_col=m)

for m in METRICS:
    plot_exp2_strong(runs, N_fixed=2**22, metric_col=m)

for m in METRICS:
    plot_exp3_weak(runs, metric_col=m)

for m in METRICS:
    plot_exp5_vary_init(runs, N_fixed=2**22, P_fixed=64, metric_col=m)

# Diagnostics
present_counts = {c: runs[c].notna().sum() for c in ["main_min","main_max","main_var",
                                                     "comp_min","comp_max","comp_var",
                                                     "comm_min","comm_max","comm_var"] if c in runs}
print("Counts of runs with aggregate stats present:")
for k,v in present_counts.items():
    print(f"  {k}: {v}")


[WARN] No data for metric column 'main_avg'. Skipping plot.
[WARN] No data for metric column 'comp_avg'. Skipping plot.
[WARN] No data for metric column 'comm_avg'. Skipping plot.
[WARN] No data for metric column 'main_avg'. Skipping plot.
[WARN] No data for metric column 'comp_avg'. Skipping plot.
[WARN] No data for metric column 'comm_avg'. Skipping plot.
[WARN] No data for metric column 'main_avg'. Skipping plot.
[WARN] No data for metric column 'comp_avg'. Skipping plot.
[WARN] No data for metric column 'comm_avg'. Skipping plot.
[WARN] No data for metric column 'main_avg'. Skipping plot.
[WARN] No data for metric column 'comp_avg'. Skipping plot.
[WARN] No data for metric column 'comm_avg'. Skipping plot.
Counts of runs with aggregate stats present:
  main_min: 0
  main_max: 0
  main_var: 0
  comp_min: 0
  comp_max: 0
  comp_var: 0
  comm_min: 0
  comm_max: 0
  comm_var: 0
