In [None]:
import numpy as np
import pandas as pd

# Randomly thin out the rows whose COL value lies inside [LOW, HIGH]:
# a KEEP_FRAC share of those rows is kept (sampled without replacement),
# every row outside the range is kept untouched, and the result is written
# to OUT_PATH.

# --- knobs you can edit ---
IN_PATH   = "merged_output.csv"     # input file
OUT_PATH  = "merged_output_cut.csv" # output file
COL       = "LogGFP"                # column to filter on
LOW, HIGH = 1.70, 2.15              # range to trim (inclusive)
KEEP_FRAC = 0.60                    # fraction of in-range rows to keep (0.60 -> keep 60%, drop 40%)
SEED      = 123                     # reproducible randomness
# ---------------------------

# Fail early with a readable message; otherwise an out-of-range fraction only
# surfaces as an error inside rng.choice (sample larger than the population
# or a negative size).
if not 0.0 <= KEEP_FRAC <= 1.0:
    raise ValueError(f"KEEP_FRAC must be within [0, 1], got {KEEP_FRAC}")

rng = np.random.default_rng(SEED)

# Load
df = pd.read_csv(IN_PATH)

# Make a clean numeric series for the target column
# (unparseable entries and +/-inf become NaN, so they never fall in the range)
x = pd.to_numeric(df[COL], errors="coerce").replace([np.inf, -np.inf], np.nan)

# Mask for rows inside the target range (inclusive)
mask_range = x.between(LOW, HIGH, inclusive="both")

# Rows to keep inside the range: sample without replacement
idx_in = df.index[mask_range]
n_in = len(idx_in)
n_keep = int(round(n_in * KEEP_FRAC))

if n_in == 0:
    print(f"No rows in [{LOW}, {HIGH}] — nothing to cut.")
    trimmed = df.copy()
else:
    keep_idx_in = rng.choice(idx_in, size=n_keep, replace=False)
    # Build final keep mask: keep all outside the range + sampled inside
    keep_mask = (~mask_range) | df.index.isin(keep_idx_in)
    trimmed = df.loc[keep_mask].copy()

# Report
print(f"Total rows: {len(df)}")
print(f"Rows in [{LOW}, {HIGH}]: {n_in}  -> keeping {n_keep} ({KEEP_FRAC*100:.0f}%)")
print(f"New total rows: {len(trimmed)}")

# Save
trimmed.to_csv(OUT_PATH, index=False)
print(f"Wrote: {OUT_PATH}")


Total rows: 177582
Rows in [1.7, 2.15]: 146364  -> keeping 87818 (60%)
New total rows: 119036
Wrote: merged_output_cut.csv


Filter in bins

Trying a simpler approach

Now cutting the middle and duplicating the extremes

With more weight given to the extremes

Compare the different contributions

In [7]:
import numpy as np
import pandas as pd

# ===================== knobs =====================
IN_PATH   = "merged_output.csv"             # input CSV
OUT_PATH  = "merged_output_balanced_2.csv"  # output CSV
COL       = "LogGFP"                        # numeric column the bins are built on
LOWER     = 1.2                             # rows with COL below this value are dropped
NBINS     = 40                              # number of equal-width bins from LOWER to the data maximum

SEED      = 123
rng = np.random.default_rng(SEED)

# Per-bin scale factors (0-based bin index):
#   factor > 1.0  -> oversample with replacement to target size
#   factor = 1.0  -> keep as-is
#   factor < 1.0  -> downsample WITHOUT replacement to target size
# Bins that are not listed default to a factor of 1.0.
# NOTE: every key must appear exactly once. A dict literal silently keeps only
# the LAST value of a repeated key; bin 19 used to be listed twice (0.37 and
# 0.50), and 0.50 is the value that was actually in effect.
SCALE_MAP = {
    # low tail: oversample
    0: 10.0,
    1: 9.0,
    2: 4.0,
    3: 2.0,
    4: 1.3,
    5: 1.3,   # oversample bin 5 to 130%
    # keep as-is
    6: 1.0,
    7: 1.0,
    8: 1.0,
    9: 1.0,
    # dense middle: downsample
    10: 0.67,
    11: 0.52,
    12: 0.42,
    13: 0.30,
    14: 0.20,
    15: 0.30,
    16: 0.32,
    17: 0.35,
    18: 0.36,
    19: 0.50,
    20: 1.0,
    # high tail: oversample
    21: 1.7,
    22: 1.7,
    23: 4.5,
    24: 2.0,
    25: 1.3,
    26: 1.1,
    27: 2.2,
    28: 2.0,
    29: 3.2,
    30: 3.9,
    31: 4,
    32: 15,
    33: 6.0,
    34: 7,
    35: 10,
    36: 17,
    37: 19,
    38: 39,
    39: 27,
}

# Optional: range trimming BEFORE bin scaling (list of tuples)
# Each tuple: (low, high, keep_frac). Example keeps 70% in [1.75, 2.10].
RANGE_TRIMS = [
    # (1.75, 2.10, 0.70),
]

# =================================================

# Rebalance the distribution of COL: rows are assigned to NBINS equal-width
# bins between LOWER and the data maximum, then each bin is resampled to
# round(count * SCALE_MAP[bin]) rows (missing bins default to factor 1.0).
# The result goes to OUT_PATH and the bin edges to bin_edges.npy.

# --- load & filter to LOWER like in your pipeline ---
df = pd.read_csv(IN_PATH)
x = pd.to_numeric(df[COL], errors="coerce").replace([np.inf, -np.inf], np.nan)
valid = x.notna() & (x >= LOWER)
df = df.loc[valid].copy()
# Keep the coerced numeric values in the frame. If the CSV column was read as
# text (e.g. because of a few unparseable cells), the range test, max() and
# pd.cut below would otherwise operate on strings.
df[COL] = x.loc[valid]
if df.empty:
    raise ValueError(f"No finite rows with {COL} >= {LOWER} in {IN_PATH}")

# --- optional range-based trims (no leakage; removes rows only) ---
for (low, high, keep_frac) in RANGE_TRIMS:
    mask = df[COL].between(low, high, inclusive="both")
    idx_in = df.index[mask]
    n_in = len(idx_in)
    k = int(round(n_in * keep_frac))
    # Only act when the trim really removes rows (k == n_in means "keep all").
    if n_in and 0 <= k < n_in:
        keep_idx = rng.choice(idx_in, size=k, replace=False)
        keep_mask = (~mask) | df.index.isin(keep_idx)
        df = df.loc[keep_mask].copy()
        print(f"Trimmed [{low}, {high}] to {keep_frac:.0%}: kept {k}/{n_in}")

# --- fixed edges (LOWER..max), with nextafter padding like before ---
xmax  = float(df[COL].max())
edges = np.linspace(LOWER, xmax, NBINS + 1)
# Nudge the outer edges outward by one float step so that values exactly equal
# to LOWER or to the maximum always land inside a bin.
edges[0]  = np.nextafter(edges[0],  -np.inf)
edges[-1] = np.nextafter(edges[-1],  np.inf)

# --- assign bins with right-inclusive semantics (pd.cut) ---
b = pd.cut(df[COL], bins=edges, right=True, include_lowest=True, labels=False)
if b.isna().any():
    # Defensive: drop rows that did not fall into any bin.
    df = df.loc[b.notna()].copy()
    b  = b.loc[b.notna()]
df["_bin"] = b.astype(int)

# --- diagnostics: original counts ---
orig_counts = df["_bin"].value_counts().sort_index()
print("Original per-bin counts:", orig_counts.to_dict())

# Point out factors that cannot have any effect (typo in a bin index, or a bin
# that simply holds no rows) instead of ignoring them silently.
unused_bins = sorted(set(SCALE_MAP) - set(orig_counts.index))
if unused_bins:
    print(f"Note: SCALE_MAP entries for bins {unused_bins} match no rows and are ignored.")

# --- per-bin scaling (supports downsampling when factor < 1) ---
scaled_parts = []
targets_report = []

for bval, block in df.groupby("_bin"):
    factor = float(SCALE_MAP.get(bval, 1.0))
    cur = len(block)
    target = int(round(cur * factor))
    targets_report.append((bval, cur, factor, target))

    if target <= 0:
        # drop bin entirely if requested
        continue
    if target == cur:
        scaled_parts.append(block)
        continue
    # target > cur: oversample WITH replacement
    # target < cur: downsample WITHOUT replacement
    idx = rng.choice(block.index, size=target, replace=target > cur)
    scaled_parts.append(block.loc[idx])

if not scaled_parts:
    raise ValueError("Every bin was scaled down to zero rows; nothing left to write.")

balanced = pd.concat(scaled_parts, ignore_index=True)

# --- verify with same edges & show counts ---
final_bins = pd.cut(balanced[COL], bins=edges, right=True, include_lowest=True, labels=False)
balanced["_bin"] = final_bins.astype(int)

bal_counts = balanced["_bin"].value_counts().sort_index()
print("Balanced per-bin counts:", bal_counts.to_dict())

print("\nTargets per bin (after scaling):")
for bval, cur, factor, target in sorted(targets_report):
    print(f"  bin {bval:02d}: cur={cur}, factor={factor:.3g}, target={target}")

# --- save outputs ---
balanced.drop(columns=["_bin"]).to_csv(OUT_PATH, index=False)
np.save("bin_edges.npy", edges)
print(f"\nWrote {OUT_PATH} and bin_edges.npy")


Original per-bin counts: {0: 41, 1: 68, 2: 109, 3: 206, 4: 473, 5: 849, 6: 1259, 7: 1860, 8: 2521, 9: 3628, 10: 5150, 11: 7174, 12: 10046, 13: 15408, 14: 26406, 15: 19690, 16: 17572, 17: 17412, 18: 17405, 19: 12357, 20: 5500, 21: 2442, 22: 1183, 23: 915, 24: 1115, 25: 1342, 26: 1581, 27: 776, 28: 877, 29: 853, 30: 746, 31: 292, 32: 37, 33: 43, 34: 33, 35: 25, 36: 9, 37: 8, 38: 4, 39: 6}
Balanced per-bin counts: {0: 410, 1: 612, 2: 436, 3: 412, 4: 615, 5: 1104, 6: 1259, 7: 1860, 8: 2521, 9: 3628, 10: 3450, 11: 3730, 12: 4219, 13: 4622, 14: 5281, 15: 5907, 16: 5623, 17: 6094, 18: 6266, 19: 6178, 20: 5500, 21: 4151, 22: 2011, 23: 4118, 24: 2230, 25: 1745, 26: 1739, 27: 1707, 28: 1754, 29: 2730, 30: 2909, 31: 1168, 32: 555, 33: 258, 34: 231, 35: 250, 36: 153, 37: 152, 38: 156, 39: 162}

Targets per bin (after scaling):
  bin 00: cur=41, factor=10, target=410
  bin 01: cur=68, factor=9, target=612
  bin 02: cur=109, factor=4, target=436
  bin 03: cur=206, factor=2, target=412
  bin 04: cur=

In [30]:
# --- One-file distribution plot (uses same binning rules as your balancer) ---
import os, time
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

# ==== CONFIG: point to the file you want to visualize ====
# NOTE(review): the balancer cell in this notebook writes
# "merged_output_balanced_2.csv" -- confirm CSV_PATH names the file you intend.
CSV_PATH  = "merged_output_balanced.csv"   # <- your current file
COL       = "LogGFP"                       # numeric column to histogram
LOWER     = 1.2                            # rows with COL below this are dropped before plotting
NBINS     = 20                             # bin count used when edges are built from the data
# Saved edges are only a candidate: step 2 below discards them and rebuilds
# from the data unless they pass its sanity check, which (among other things)
# requires exactly NBINS + 1 edges.
EDGES_NPY = "bin_edges.npy"                # will use if present
SAVE_PNG  = False                          # set True to also save a PNG
# =========================================================

def make_edges_from_file(values, lower, nbins):
    """Return ``nbins + 1`` evenly spaced bin edges from ``lower`` to ``values.max()``.

    The first edge is moved one floating-point step down and the last edge one
    step up, so that values equal to ``lower`` or to the maximum fall strictly
    inside the outer bins.

    Parameters
    ----------
    values : object exposing ``.max()`` (NumPy array or pandas Series)
        Data whose maximum defines the upper end of the binning range.
    lower : float
        Lower end of the binning range.
    nbins : int
        Number of bins.

    Returns
    -------
    numpy.ndarray
        Float array holding the edges.
    """
    upper = float(values.max())
    grid = np.linspace(start=lower, stop=upper, num=nbins + 1)
    # widen the outermost edges by a single ULP each
    grid[0] = np.nextafter(grid[0], -np.inf)
    grid[-1] = np.nextafter(grid[-1], np.inf)
    return grid

# Histogram of COL for one CSV file, binned with the same rules as the
# balancer (right-inclusive bins, lowest value included).

# 1) Load & filter to LOWER exactly like the balancer
df = pd.read_csv(CSV_PATH)
x  = pd.to_numeric(df[COL], errors="coerce").replace([np.inf, -np.inf], np.nan)
valid = x.notna() & (x >= LOWER)
df = df.loc[valid].copy()
# Work on the coerced values so a column that was read as text still bins correctly.
df[COL] = x.loc[valid].astype(float)
xv = df[COL].values
assert len(xv) > 0, f"No rows >= {LOWER} in {CSV_PATH}"

# 2) Load edges if available; otherwise build them the same way
edges = None
if os.path.exists(EDGES_NPY):
    loaded = np.load(EDGES_NPY)
    # Saved edges are usable only if they have the requested number of bins and
    # cover the whole plotted domain [LOWER, max]. The comparison is exact on
    # purpose: edges written by the balancer start one float step BELOW its
    # lower bound, and any value above the last edge would get no bin at all.
    edges_ok = (
        len(loaded) == NBINS + 1
        and loaded[0] <= LOWER
        and loaded[-1] >= xv.max()
    )
    if edges_ok:
        edges = loaded
if edges is None:
    edges = make_edges_from_file(xv, LOWER, NBINS)

nb = len(edges) - 1

# 3) Assign bins with the SAME rule: right-inclusive, include_lowest
b = pd.cut(df[COL], bins=edges, right=True, include_lowest=True, labels=False)
if b.isna().any():
    raise ValueError("Some values fall outside the bin edges; cannot count them.")
b = b.astype(int)

# 4) Counts and labels
counts = b.value_counts().sort_index()
# ensure every bin is present
counts = counts.reindex(range(nb), fill_value=0)

labels  = [f"[{edges[i]:.4g}, {edges[i+1]:.4g}]" for i in range(nb)]
widths  = edges[1:] - edges[:-1]
centers = (edges[1:] + edges[:-1]) / 2.0

# 5) Plot bars at the true bin positions/widths
fig, ax = plt.subplots(figsize=(8, 4.5))
ax.bar(centers, counts.values, width=widths*0.9, alpha=0.8, edgecolor="black", linewidth=0.4)
ax.set_xlabel(COL)
ax.set_ylabel("Count")
ax.set_title(f"Distribution of {os.path.basename(CSV_PATH)}\n({nb} fixed bins from {edges[0]:.3g}, right-inclusive)")
fig.tight_layout()

# 6) Optional: save PNG
# Save through the figure handle and BEFORE plt.show(): once the figure has
# been shown, plt.gcf() may hand back a new, empty figure, which is what ended
# up in the PNG previously.
if SAVE_PNG:
    out_name = f"dist_{nb}bins_from{float(edges[0]):.2f}_{int(time.time())}.png"
    fig.savefig(out_name, dpi=200, bbox_inches="tight")
    print("Saved:", os.path.abspath(out_name))

plt.show()

# 7) Also print a neat count table
counts_df = pd.DataFrame({"Bin": labels, "Count": counts.values})
print(counts_df.to_string(index=False))


           Bin  Count
  [1.2, 1.295]    654
[1.295, 1.389]   1260
[1.389, 1.484]   3966
[1.484, 1.579]   6238
[1.579, 1.674]   7994
[1.674, 1.768]   7394
[1.768, 1.863]  15272
[1.863, 1.958]  36877
[1.958, 2.052]  34984
[2.052, 2.147]  59524
[2.147, 2.242]  15884
[2.242, 2.337]   6294
[2.337, 2.431]  12285
[2.431, 2.526]  16499
[2.526, 2.621]  13840
[2.621, 2.715]   1038
 [2.715, 2.81]     80
 [2.81, 2.905]     58
    [2.905, 3]     17
    [3, 3.094]     10


  plt.show()
