In [None]:
import polars as pl
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

# ── Directories ───────────────────────────────
QUERY_DIR = Path("Data/Queries")
PLOT_DIR = Path("Plots")
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# Cumulative-share cut points (fractions of the total observation count).
quartiles = [0.25, 0.5, 0.75, 1.0]

# ── Loop over taxonomic levels ────────────────
# For each level: read the per-taxon counts, accumulate them in ascending
# order of `observed`, report how many taxa fall in each quarter of the
# cumulative total, and save the cumulative curve as a PNG.
for level in ["family", "genus", "species"]:
    dist_file = QUERY_DIR / f"{level}_distribution.csv"
    plot_file = PLOT_DIR / f"{level}_cumulative.png"

    df = pl.read_csv(dist_file).sort("observed")
    df = df.with_columns(pl.cum_sum("observed").alias("cumulative_observed"))
    df = df.with_row_index("rank", offset=1)

    # Quartile
    cumulative = df["cumulative_observed"].to_numpy()  # hoisted: loop-invariant
    total_obs = df["cumulative_observed"][-1]
    quartile_thresholds = [total_obs * q for q in quartiles]

    # boundaries[k] = number of rows whose cumulative count is strictly below
    # threshold k (side="left"), i.e. the rows that belong to bands 0..k.
    boundaries = [
        int(np.searchsorted(cumulative, threshold, side="left"))
        for threshold in quartile_thresholds
    ]
    # The last band must run to the end of the table. With side="left" the
    # 100% threshold lands ON the final row (its cumulative value equals the
    # total), which would leave the most-observed taxon uncounted.
    boundaries[-1] = len(cumulative)
    quartile_counts = np.diff([0, *boundaries]).tolist()

    print(f" 0–25% : {quartile_counts[0]} {level}")
    print(f" 25–50% : {quartile_counts[1]} {level}")
    print(f" 50–75% : {quartile_counts[2]} {level}")
    print(f" 75–100%: {quartile_counts[3]} {level}")

    # Plot (explicit figure/axes so the figure can be closed by handle)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df["rank"], df["cumulative_observed"], linewidth=2)
    ax.set_title(f"Cumulative {level.capitalize()} Observations by Index", fontsize=14)
    ax.set_xlabel(f"{level.capitalize()} (rank order)", fontsize=12)
    ax.set_ylabel("Cumulative Observations", fontsize=12)

    # mark quartile thresholds with a dashed line and a right-aligned label
    label_x = df["rank"][-1] * 0.98
    for q, val in zip(quartiles, quartile_thresholds):
        ax.axhline(y=val, color="gray", linestyle="--", alpha=0.4)
        ax.text(label_x, val, f"{int(q*100)}%", va="center",
                ha="right", fontsize=9, color="gray")

    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(plot_file, dpi=300)
    plt.close(fig)  # release the figure; one is created per level

    print(f"✅ Plot saved → {plot_file}")


In [None]:
import polars as pl
from pathlib import Path
import numpy as np

# ── Directories ───────────────────────────────
QUERY_DIR = Path("Data/Queries")

# ── Process all distribution CSVs ─────────────
# For each level: accumulate `observed` in ascending order, express the running
# total as a fraction of the grand total, label each row with the quartile
# (1-4) of that cumulative fraction, and write the result next to the input.
for level in ["family", "genus", "species"]:
    dist_file = QUERY_DIR / f"{level}_distribution.csv"
    out_file = QUERY_DIR / f"{level}_distribution_quartiled.csv"

    # Load data sorted by observed. The taxon column (named after the level in
    # the distribution CSV) is a tie-break so rows with equal counts always get
    # the same order, and therefore the same quartile, on every run.
    df = pl.read_csv(dist_file).sort(["observed", level])

    # Compute cumulative and proportion of total
    df = df.with_columns(pl.cum_sum("observed").alias("cumulative_observed"))
    if df.is_empty():
        raise ValueError(f"{dist_file} contains no rows; cannot assign quartiles")
    total_obs = df["cumulative_observed"][-1]
    if total_obs == 0:
        # A zero total would turn every fraction into NaN and silently push
        # every row into quartile 4.
        raise ValueError(f"{dist_file} has zero total observations")
    df = df.with_columns(
        (pl.col("cumulative_observed") / total_obs).alias("cum_fraction")
    )

    # Assign quartiles based on cumulative fraction (vectorized):
    #   < 0.25 → 1, < 0.5 → 2, < 0.75 → 3, otherwise 4.
    cum_fraction = pl.col("cum_fraction")
    df = df.with_columns(
        pl.when(cum_fraction < 0.25).then(pl.lit(1))
        .when(cum_fraction < 0.5).then(pl.lit(2))
        .when(cum_fraction < 0.75).then(pl.lit(3))
        .otherwise(pl.lit(4))
        .cast(pl.Int8)
        .alias("quartile")
    )

    # Save to new CSV
    df.write_csv(out_file)

    print(f"✅ {level.capitalize()} quartiles assigned → {out_file}")


In [None]:
import polars as pl
from pathlib import Path

# ─── Directories ───────────────────────────────────────
MASTER_CSV = Path("Data/CSV/master.csv")
QUERY_DIR = Path("Data/Queries")
QUERY_DIR.mkdir(parents=True, exist_ok=True)

# ─── Load master dataset ────────────────────────────────
df = pl.read_csv(MASTER_CSV)

# ─── Generate observation + distribution files per level ─
# NOTE: the cells that read `{level}_distribution.csv` depend on the files
# written here, so this cell has to run before them on a fresh checkout.
for level in ["family", "genus", "species"]:
    # One row per distinct value of `level`, with its row count as `observed`.
    counts = (
        df.group_by(level)
          .len()
          .rename({"len": "observed"})
    )

    observation_file = QUERY_DIR / f"{level}_observation.csv"
    distribution_file = QUERY_DIR / f"{level}_distribution.csv"

    # Sorted by ID (observation ordering)
    counts.sort(level).write_csv(observation_file)

    # Sorted by frequency (distribution ordering). group_by does not guarantee
    # a group order, so ties in `observed` are broken by the taxon column to
    # keep the file identical from run to run.
    counts.sort(["observed", level]).write_csv(distribution_file)

    print(f"✅ Saved → {observation_file}")
    print(f"✅ Saved → {distribution_file}")


## Pie chart / histogram


In [None]:
# pie_and_bins.py
from pathlib import Path
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch

# ---- config ----
DATA_PATH = Path("master.parquet")  # parquet or csv; change as needed
FAMILY_COL = "family"               # integer 1..60

# ---- load ----
if DATA_PATH.suffix.lower() == ".parquet":
    df = pl.read_parquet(DATA_PATH, use_statistics=False)
else:
    df = pl.read_csv(DATA_PATH)

# keep only 1..60
df = df.filter((pl.col(FAMILY_COL) >= 1) & (pl.col(FAMILY_COL) <= 60))

# ---- binning ----
# segment: 0=>1-20, 1=>21-40, 2=>41-60
seg = ((pl.col(FAMILY_COL) - 1) // 20).alias("seg")
# sub-bin (4 per segment, width=5), 0..3
sub = (((pl.col(FAMILY_COL) - 1) % 20) // 5).alias("sub")

# row count per (segment, sub-bin) combination that actually occurs in the data
binned = (
    df.select(seg, sub)
      .group_by(["seg", "sub"])
      .agg(pl.len().alias("n"))
)

# Full 3x4 grid of (seg, sub) with zero counts for combinations that are
# absent from the data.
grid_counts = (
    pl.DataFrame(
        {"seg": np.repeat([0,1,2], 4), "sub": np.tile([0,1,2,3], 3)}
    ).join(binned, on=["seg", "sub"], how="left")
     .fill_null(0)
)

# total per seg — aggregated from the zero-filled grid so that all three
# segments are always present (a segment with no rows gets N == 0). Taking the
# totals from `binned` would drop empty segments and leave fewer than three
# ratios for the three pie labels / wedges.
seg_tot = grid_counts.group_by("seg").agg(pl.col("n").sum().alias("N")).sort("seg")
N_total = int(seg_tot["N"].sum())
if N_total == 0:
    raise ValueError(
        f"No rows with {FAMILY_COL!r} in 1..60 found in {DATA_PATH}; nothing to plot"
    )

# ratios for pie (three wedges)
overall_ratios = (seg_tot["N"] / N_total).to_list()  # [r0, r1, r2]
pie_labels = ["Families 1–20", "Families 21–40", "Families 41–60"]
explode = [0.08, 0.08, 0.08]

# sub-bin ratios per seg (stacked bar parts): shape (3 segs, 4 sub-bins)
# r_part is each sub-bin's share of the overall total, so the four parts of a
# segment add up to that segment's pie ratio.
full = (
    grid_counts
     .join(seg_tot, on="seg", how="left")
     .with_columns((pl.col("n") / N_total).alias("r_part"))
     .sort(["seg", "sub"])
)

# reshape to list-of-lists for stacking
parts = [full.filter(pl.col("seg")==s)["r_part"].to_list() for s in (0,1,2)]

# ---- plotting ----
# Left axes: pie of the three family segments. Right axes: one stacked bar per
# segment, broken down into its four 5-wide sub-bins.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(10.5, 5.5))
fig.subplots_adjust(wspace=0.05)

# pie
angle = -180 * overall_ratios[0]  # split first wedge across x-axis (style from template)
wedges, *_ = ax_pie.pie(
    overall_ratios,
    autopct='%1.1f%%',
    startangle=angle,
    labels=pie_labels,
    explode=explode
)
ax_pie.set_aspect('equal')

# stacked bars (one bar per segment; each bar sums to its pie ratio)
width = 0.25
xs = [-width*1.5, 0.0, width*1.5]  # positions for seg 0,1,2
sub_labels = ["1–5","6–10","11–15","16–20"]  # reused per segment (relative)
colors = [f"C{i}" for i in range(4)]  # one fixed colour per sub-bin position

bar_tops = []
for i, (x, stack) in enumerate(zip(xs, parts)):
    bottom = 0.0
    for j, h in enumerate(stack):
        # The colour is tied to the sub-bin index so the legend (built from the
        # first bar only) is valid for every bar; without an explicit colour
        # each piece would take the next colour of the default cycle.
        bc = ax_bar.bar(
            x, h, width,
            bottom=bottom,
            color=colors[j],
            label=sub_labels[j] if i == 0 else None,
            alpha=0.35 + 0.15*j,
        )
        # label inside each block: h is already a fraction of the overall total
        if h > 0:
            ax_bar.bar_label(bc, labels=[f"{h*100:.1f}%"], label_type='center', padding=0)
        bottom += h
    bar_tops.append(bottom)

ax_bar.set_xlim(min(xs)-width*2.0, max(xs)+width*2.0)
ax_bar.set_ylim(0, max(bar_tops)*1.05 if bar_tops else 1)
ax_bar.set_title("Within-segment distributions (5-wide bins)")
ax_bar.set_xticks(xs, ["1–20", "21–40", "41–60"])
ax_bar.legend(title="Sub-bins", loc="upper right", frameon=False)
# mirror the template’s clean look; note this also hides the tick labels set above
ax_bar.axis('off')

# ---- connect wedges to bars ----
# helper to attach two lines per wedge (top and bottom of its bar)
def connect_wedge_to_bar(wedge, x, bar_height, axA=ax_bar, axB=ax_pie, lw=3.0):
    """Draw two black connector lines from a pie wedge to the left edge of a bar.

    Args:
        wedge: pie wedge providing ``theta1``, ``theta2``, ``center`` and ``r``.
        x: x position of the bar's centre, in ``axA`` data coordinates.
        bar_height: y value of the top of the bar, in ``axA`` data coordinates.
        axA: axes holding the bar; the connectors are added to this axes.
        axB: axes holding the pie.
        lw: line width of the connectors.

    The wedge edge at ``theta2`` is joined to the bar's top-left corner and the
    edge at ``theta1`` to its bottom-left corner (y = 0). Uses the module-level
    bar ``width`` to locate the bar's left edge.
    """
    centre_x, centre_y = wedge.center[0], wedge.center[1]
    radius = wedge.r
    bar_left = x - width/2

    # (wedge edge angle in degrees, y on the bar): top connector first, then bottom
    endpoints = ((wedge.theta2, bar_height), (wedge.theta1, 0.0))
    for edge_deg, bar_y in endpoints:
        edge_rad = np.deg2rad(edge_deg)
        rim_point = (
            radius * np.cos(edge_rad) + centre_x,
            radius * np.sin(edge_rad) + centre_y,
        )
        link = ConnectionPatch(xyA=(bar_left, bar_y), coordsA=axA.transData,
                               xyB=rim_point, coordsB=axB.transData)
        link.set_color([0, 0, 0])
        link.set_linewidth(lw)
        axA.add_artist(link)

# Pair every wedge with the bar at the same position (wedge k -> bar k),
# then draw the connector lines for each pair.
wedge_bar_pairs = [(wedges[k], xs[k], bar_tops[k]) for k in range(3)]
for seg_wedge, bar_x, bar_top in wedge_bar_pairs:
    connect_wedge_to_bar(seg_wedge, bar_x, bar_top)

plt.tight_layout()
plt.show()
