In [27]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import re
from pathlib import Path
from scipy import stats

import sys, os
# --- bring in the styler
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))
from md_styler import MDStyler
sty = MDStyler().apply()
NMOLS = 200


## Discover data

In [2]:
data_path = Path('../data')
all_files = glob.glob(str(data_path / '*.csv'))
experiment_names = set()
parameter_names = set()

for file in all_files:
    # Extract experiment name and parameter name from filename
    filename = os.path.basename(file)
    # Find the position of the last underscore
    last_underscore_pos = filename.rfind('_')
    if last_underscore_pos > 0:
        # Extract experiment name (everything before last underscore)
        exp_name = filename[:last_underscore_pos]
        #print(filename, exp_name, last_underscore_pos)
        # Extract parameter name (between last underscore and .csv)
        param_name = filename[last_underscore_pos+1:].replace('.csv', '')
        
        experiment_names.add(exp_name)
        parameter_names.add(param_name)

print(f"Found {len(experiment_names)} experiments: {sorted(experiment_names)}")
print(f"Found {len(parameter_names)} parameters: {sorted(parameter_names)}")


Found 20 experiments: ['motor2m_frac-0', 'motor2m_frac-10', 'motor2m_frac2-0', 'motor2m_frac2-10', 'motor2m_frac2-90', 'motor2m_frac3-0', 'motor2m_frac3-10', 'motor2m_frac3-90', 'motor2m_frac4-90', 'motor2m_squeeze10_frac-0', 'motor2m_squeeze10_frac-10', 'motor2m_squeeze10_frac-50', 'motor2m_squeeze10_frac-90', 'motor2m_squeeze10_frac2-50', 'motor2m_squeeze12_frac-0_2', 'motor2m_squeeze12_frac-0_3', 'motor2m_squeeze14_frac-0_2', 'rotate1', 'test_nr2', 'test_nr3']
Found 11 parameters: ['RG', 'box-z', 'g-r', 'radius', 'sasa', 'torsion', 'torsion-alpha', 'torsion-phi', 'torsion-theta', 'voronota-area', 'voronota-volume']


Check for outliers

In [None]:

# Plot radius over time for each experiment
plt.figure(figsize=(12, 8))

for exp_name in sorted(experiment_names):
    try:
        # Load radius data
        radius_file = data_path / f"{exp_name}_radius.csv"
        if not radius_file.exists():
            print(f"No radius data for {exp_name}, skipping...")
            continue
            
        df = pd.read_csv(radius_file)
        
        # Assuming first column is time and second is radius
        time_col = df.columns[0]
        radius_col = df.columns[1]
        print(exp_name, df[radius_col][0])
        
        plt.plot(df[time_col], df[radius_col], label=exp_name)
    except Exception as e:
        print(f"Error processing {exp_name}: {e}")

plt.xlabel('Time (ps)')
plt.ylabel('Radius (Å)')
plt.title('Radius over time for all experiments')
plt.legend(loc='best', bbox_to_anchor=(1.05, 1), borderaxespad=0.)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### Define which to exclude

'motor2m_frac-0' seems to be from other experiment and has some weird molecules

In [23]:
experiments_to_exclude = ['motor2m_frac-0', 'motor2m_frac2-0', 'rotate1'] 

In [24]:
valid_experiments = sorted(list(experiment_names - set(experiments_to_exclude)))
print(f"Analyzing {len(valid_experiments)} experiments: {valid_experiments}")

# Define utility function to get experiment ratio (molecule A vs B)
def get_experiment_squeeze(exp_name, data_path=Path('../data')):
    """
    Calculate ratio of molecule B (resid 1,2) vs molecule A (resid 3,4) for an experiment
    Returns the ratio as a float between 0 and 1 (fraction of molecule B)
    """
    # Try to find any local parameter file for this experiment
    z_file = data_path / f"{exp_name}_box-z.csv"
    try:
        df = pd.read_csv(z_file)
        start= df.iloc[0,1]
        if start < 100:
            return 15
        if start < 115:
            return 12
        return 9
    except:
        return np.nan

def get_experiment_ratio(exp_name, data_path=Path('../data')):
    """
    Calculate ratio of molecule B (resid 1,2) vs molecule A (resid 3,4) for an experiment
    Returns the ratio as a float between 0 and 1 (fraction of molecule B)
    """
    # Try to find any local parameter file for this experiment
    local_files = glob.glob(str(data_path / f"{exp_name}_RG.csv"))
    
    for file in local_files:
        df = pd.read_csv(file)
        if 'resid' in df.columns:  # Check if it's a local parameter file
            resids = df['resid']
            # Count molecules of type A (resid 1,2) and type B (resid 3,4)
            count_A = sum(1 for r in resids if r in [1, 2])
            count_B = sum(1 for r in resids if r in [3, 4])
            total = count_A + count_B
            
            if total > 0:
                return int(count_B*100/total)
    
    print(f"Warning: Could not determine ratio for {exp_name}")
    return None

# Get ratio for each experiment
experiment_ratios = {exp: get_experiment_ratio(exp) for exp in valid_experiments}
experiment_squeeze = {exp: get_experiment_squeeze(exp) for exp in valid_experiments}
print("Experiment ratios (fraction of molecule B):")
for exp, ratio in experiment_ratios.items():
    if ratio is not None:
        print(f"  {exp}: {ratio}")
for exp, s in experiment_squeeze.items():
    if ratio is not None:
        print(f"  {exp}: {s}")

Analyzing 17 experiments: ['motor2m_frac-10', 'motor2m_frac2-10', 'motor2m_frac2-90', 'motor2m_frac3-0', 'motor2m_frac3-10', 'motor2m_frac3-90', 'motor2m_frac4-90', 'motor2m_squeeze10_frac-0', 'motor2m_squeeze10_frac-10', 'motor2m_squeeze10_frac-50', 'motor2m_squeeze10_frac-90', 'motor2m_squeeze10_frac2-50', 'motor2m_squeeze12_frac-0_2', 'motor2m_squeeze12_frac-0_3', 'motor2m_squeeze14_frac-0_2', 'test_nr2', 'test_nr3']
Experiment ratios (fraction of molecule B):
  motor2m_frac-10: 10
  motor2m_frac2-10: 10
  motor2m_frac2-90: 90
  motor2m_frac3-0: 0
  motor2m_frac3-10: 10
  motor2m_frac3-90: 90
  motor2m_frac4-90: 90
  motor2m_squeeze10_frac-0: 0
  motor2m_squeeze10_frac-10: 10
  motor2m_squeeze10_frac-50: 90
  motor2m_squeeze10_frac-90: 90
  motor2m_squeeze12_frac-0_2: 0
  motor2m_squeeze12_frac-0_3: 0
  motor2m_squeeze14_frac-0_2: 0
  test_nr2: 0
  test_nr3: 0
  motor2m_frac-10: 9
  motor2m_frac2-10: 9
  motor2m_frac2-90: 9
  motor2m_frac3-0: 9
  motor2m_frac3-10: 9
  motor2m_frac3-

## Data Management

In [88]:
def concat_global(data_path=Path('../data')):
    """
    Concatenate a global parameter across all valid experiments.
    
    Args:
        param_name: Name of the global parameter
        data_path: Path to data directory
        
    Returns:
        DataFrame with columns: time, parameter, experiment, ratio
    """
    all_data = []
    
    for exp_name in valid_experiments:
        # Handle special parameters

        # Load box_z data and convert to packing
        file_path = data_path / f"{exp_name}_box-z.csv"
        if not file_path.exists():
            continue
            
        df = pd.read_csv(file_path)
        df.columns = ['time', 'box-z']
        df['packing'] = NMOLS / (df['box-z'] / 10) # Box in Å to packing in mols/nm
        df = df[['time', 'packing']]  # Only keep time and packing
        file_path = data_path / f"{exp_name}_sasa.csv"
        if not file_path.exists():
            continue
        df2 = pd.read_csv(file_path)
        df2.columns = ['time', 'sasa']
        df['sasa'] = df2['sasa'] / NMOLS # sasa per motor
        file_path = data_path / f"{exp_name}_radius.csv"
        if not file_path.exists():
            continue
        df3 = pd.read_csv(file_path)
        df3.columns = ['time', 'radius']
        df['radius'] = df3['radius'] / 10 # Make to nm     
            


        # Add experiment name
        df['experiment'] = exp_name
        
        # Get ratio information
        df['ratioB'] = get_experiment_ratio(exp_name)
        df['squeeze'] = get_experiment_squeeze(exp_name)
            
        all_data.append(df)
    
    if not all_data:
        return pd.DataFrame()  # Return empty DataFrame if no data found
        
    return pd.concat(all_data, ignore_index=True)

def concat_all(data_path=Path('../data'), return_missing: bool = False):
    """
    Concatenate all parameters across all valid experiments.
    
    Creates a summary DataFrame with averages for the last 50% of time frames,
    separated by molecule type (A and B).
    
    Also detects experiments with partial data (some but not all parameters present)
    and prints a warning. If return_missing=True, also returns a DataFrame report.

    Args:
        data_path: Path to data directory
        return_missing: If True, return (result_df, missing_report_df)

    Returns:
        DataFrame with columns: experiment, ratioB, molecule, squeeze, <params...>, std-<local params...>
        If return_missing=True: (result_df, missing_report_df)
    """
    all_results = []
    missing_rows = []

    # Build expected parameter set (note: 'box-z' becomes 'packing' after transform)
    expected_params = set()
    for p in parameter_names:
        expected_params.add('packing' if p == 'box-z' else p)
    for exp_name in valid_experiments:
        ratio = get_experiment_ratio(exp_name)
        squeeze = get_experiment_squeeze(exp_name)

        result_A = {'experiment': exp_name, 'ratioB': ratio, 'molecule': 'A', 'squeeze': squeeze}
        result_B = {'experiment': exp_name, 'ratioB': ratio, 'molecule': 'B', 'squeeze': squeeze}

        found_params = set()  # track successfully computed param names for this experiment

        for param_name in parameter_names:
            is_global = False
            is_torsion = False

            file_path = data_path / f"{exp_name}_{param_name}.csv"
            if not file_path.exists():
                # Missing raw file for this parameter in this experiment
                continue

            df = pd.read_csv(file_path)
            is_global = 'resid' not in df.columns
            is_torsion = param_name.startswith('torsion')

            # ----- special transforms / normalization -----
            if param_name == 'box-z':
                # Convert to packing (motors / nm) from box-z in Å
                df.columns = ['time', 'packing']
                df['packing'] = NMOLS / (df['packing'] / 10.0)
                param_name = 'packing'
                is_global = True

            elif param_name == 'sasa':
                df.columns = ['time', 'sasa']
                df['sasa'] = df['sasa'] / NMOLS
                is_global = True

            elif param_name == 'radius':
                df.columns = ['time', 'radius']
                df['radius'] = df['radius'] / 10
                df['diameter'] = 2*df['radius']
                # If it were Å, we would convert by /10. Keep comment to avoid future confusion.
                is_global = True

            elif is_torsion and 'resid' in df.columns:
                # Mirror torsions for resid 2 and 4: ONLY angle/time columns, not 'resid'
                angle_cols = [c for c in df.columns if c != 'resid']
                mask = df['resid'].isin([2, 4])
                if mask.sum() < 2:
                    print(f"Warning: torsion mirror check failed or sparse for {exp_name}_{param_name}")
                df.loc[mask, angle_cols] = df.loc[mask, angle_cols] * -1

            # ----- last 50% of frames & aggregation -----
            if is_global:
                total_frames = len(df)
                if total_frames < 2:
                    # too short to split — skip but record partial
                    continue
                last_half_start = total_frames // 2
                df = df.iloc[last_half_start:]

                # standardize 2-col format: ['time', param_name]
                if len(df.columns) >= 2:
                    df.columns = ['time', param_name] + list(df.columns[2:])

                mean_value = df[param_name].mean()

                result_A[param_name] = mean_value
                result_B[param_name] = mean_value
                found_params.add(param_name)

            else:
                if 'resid' not in df.columns:
                    print(f"Error; resid not in {exp_name}_{param_name}")
                    continue

                # Rename columns to standardized format - first column is resid, rest are timesteps
                time_columns = list(range(len(df.columns) - 1))
                df.columns = ['resid'] + time_columns

                # Get last 50% of time columns
                time_cols = [col for col in df.columns if col != 'resid']
                if len(time_cols) < 2:
                    continue
                last_half_cols = time_cols[len(time_cols)//2:]

                # Map resids to molecule types (A: 1,2; B: 3,4)
                df['type'] = df['resid'].map({1: 'A', 2: 'A', 3: 'B', 4: 'B'})

                # A
                df_A = df[df['type'] == 'A']
                if not df_A.empty:
                    A_time_avg = df_A[last_half_cols].mean(axis=1)
                    result_A[param_name] = A_time_avg.mean()
                    result_A[f'std-{param_name}'] = A_time_avg.std()
                    found_params.add(param_name)

                # B
                df_B = df[df['type'] == 'B']
                if not df_B.empty:
                    B_time_avg = df_B[last_half_cols].mean(axis=1)
                    result_B[param_name] = B_time_avg.mean()
                    result_B[f'std-{param_name}'] = B_time_avg.std()
                    found_params.add(param_name)

        # Record results
        if len(result_A) > 3:
            all_results.append(result_A)
        if len(result_B) > 3:
            all_results.append(result_B)

        # ----- missing data detection for this experiment -----
        missing = sorted(expected_params - found_params)
        present = sorted(found_params)
        completeness = (len(found_params) / max(1, len(expected_params)))

        if missing and present:
            print(f"[WARNING] Partial data for {exp_name} (squeeze={squeeze}, ratioB={ratio}): "
                  f"missing {len(missing)}/{len(expected_params)} → {missing}")

        missing_rows.append({
            'experiment': exp_name,
            'squeeze': squeeze,
            'ratioB': ratio,
            'present_n': len(present),
            'expected_n': len(expected_params),
            'completeness': completeness,
            'present_params': present,
            'missing_params': missing,
        })

    # Final tables
    result_df = pd.DataFrame(all_results)
    missing_report_df = pd.DataFrame(missing_rows).sort_values(['squeeze', 'ratioB', 'experiment']).reset_index(drop=True)

    if return_missing:
        return result_df, missing_report_df
    return result_df

print("Testing concat_global with 'radius':")
global_df = concat_global()
print(global_df.head())
print("\nTesting concat_all:")
all_df, missing = concat_all(return_missing=True)
print(all_df.head())


Testing concat_global with 'radius':
     time    packing      sasa    radius       experiment  ratioB  squeeze
0     0.0  16.260718  2.367655  2.640704  motor2m_frac-10      10        9
1  1000.0  16.695486  2.230225  2.611278  motor2m_frac-10      10        9
2  2000.0  16.919118  2.190415  2.611726  motor2m_frac-10      10        9
3  3000.0  16.977865  2.128425  2.610530  motor2m_frac-10      10        9
4  4000.0  17.047296  2.123235  2.619862  motor2m_frac-10      10        9

Testing concat_all:
         experiment  ratioB molecule  squeeze  torsion-phi  std-torsion-phi  \
0   motor2m_frac-10      10        A        9   -19.180691         5.542503   
1   motor2m_frac-10      10        B        9   121.972910        91.875310   
2  motor2m_frac2-10      10        A        9   -19.145314         6.573715   
3  motor2m_frac2-10      10        B        9   151.525840         3.488721   
4  motor2m_frac2-90      90        A        9   -20.457817         5.489535   

   voronota-volum

##  Analysis

In [99]:
import numpy as np
import pandas as pd

# constants
NMOLS = 200
Z_CI = 1.96

def summarize_parameters(all_df):
    """
    Compute mean and 95% CI per (squeeze, ratioB) for global parameters and
    per (squeeze, ratioB, molecule) for local parameters (A/B),
    using replicate-level summaries in all_df.
    """
    global_params = ['radius', 'sasa', 'packing', 'diameter']
    local_params = [c for c in all_df.columns
                    if not c.startswith('std-')
                    and c not in global_params
                    and c not in ['experiment','ratioB','molecule','squeeze']]

        # --- compute derived density per experiment ---
    all_df = all_df.copy()
    all_df['density'] = all_df['packing']/ (np.pi * all_df['radius']**2)
    global_params.append('density')

    results = []

    # ---------- GLOBAL PARAMETERS ----------
    for p in global_params:
        if p not in all_df.columns:
            continue
        df = all_df[['squeeze', 'ratioB', p]].dropna()
        if df.empty:
            continue
        grouped = df.groupby(['squeeze', 'ratioB'])[p]
        res = grouped.agg(['mean', 'std', 'count']).reset_index()
        res['count'] = res['count'] / 2 # We have both molA and molB which are accounted for here
        res['sem'] = res['std'] / np.sqrt(res['count'])
        res['CI'] = Z_CI * res['sem']
        res['parameter'] = p
        results.append(res[['parameter', 'squeeze', 'ratioB', 'mean', 'count','CI']])

    # ---------- LOCAL PARAMETERS (per squeeze, ratioB, molecule) ----------
    for p in local_params:
        std_col = f'std-{p}'
        if std_col not in all_df.columns:
            continue

        df = all_df[['squeeze','ratioB','molecule',p,std_col]].dropna()
        if df.empty:
            continue

        grouped = df.groupby(['squeeze','ratioB','molecule'])
        tmp = []

        for (sq, rb, mol), g in grouped:
            # determine group molecule count from ratioB (assumed percent)
            nB = int(round((rb/100.0) * NMOLS))
            nA = NMOLS - nB
            n_mols = nB if mol == 'B' else nA

            # if a group has zero molecules (e.g., ratioB=0 and mol=='B'), skip safely
            if n_mols <= 1:
                print(f"[warn] n_mols <= 1 for (squeeze={sq}, ratioB={rb}, molecule={mol}); skipping {p}")
                continue

            k = len(g)  # number of replicate experiments in this cell

            mean = g[p].mean()

            # pooled within-fibre variance across replicates (use correct n per replicate)
            # since n_mols is constant within a (sq, rb, mol) cell, this simplifies,
            # but we keep the general pooled formula for safety:
            vd_pooled_num = ((n_mols - 1) * (g[std_col] ** 2)).sum()
            vd_pooled_den = k * (n_mols - 1)
            vd_pooled = vd_pooled_num / vd_pooled_den if vd_pooled_den > 0 else np.nan

            # variance between replicate means
            vd_between = g[p].var(ddof=1) if k > 1 else 0.0

            # component from within-molecule variance for the mean of a single replicate
            within_component = vd_pooled / n_mols if np.isfinite(vd_pooled) else 0.0

            # conservative SEM for the averaged mean over k replicates:
            # take the dominating component, then divide by k
            sem = np.sqrt(max(within_component, vd_between) / k)
            ci  = Z_CI * sem

            tmp.append(dict(parameter=p, squeeze=sq, ratioB=rb, molecule=mol,
                            mean=mean, CI=ci, count=k, n_mols=n_mols))

        if tmp:
            results.append(pd.DataFrame(tmp))

    summary = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
    return summary


# usage
summary_df = summarize_parameters(all_df)
print(summary_df[summary_df['parameter'] == 'radius'])

  parameter  squeeze  ratioB      mean  count        CI molecule  n_mols
0    radius        9     0.0  2.634849    4.0  0.022224      NaN     NaN
1    radius        9    10.0  2.611403    3.0  0.010393      NaN     NaN
2    radius        9    90.0  2.640702    3.0  0.036610      NaN     NaN
3    radius       12     0.0  2.758679    1.0  0.000000      NaN     NaN
4    radius       12    10.0  2.789449    1.0  0.000000      NaN     NaN
5    radius       12    50.0  2.836411    1.0  0.000000      NaN     NaN
6    radius       12    90.0  2.907397    2.0  0.046250      NaN     NaN
7    radius       15     0.0  2.952352    2.0  0.006302      NaN     NaN


In [100]:
params_order = [
    ("RG", "R$_g$ (nm)"),
    ("voronota-volume", "Volume (nm$^3$/motor)"),
    ("voronota-area", "Area (nm$^2$/motor)"),
    ("radius", "Radius (nm)"),
    ("density", "Density (motor/nm$^3$)"),
    ("sasa", "SASA (nm$^2$/motor)"),
    ("packing", "Axial density (motors/nm)"),
]

for squeeze, sdf in summary_df.groupby("squeeze"):
    print(f"\n### Squeeze = {squeeze}\n")
    for p, label in params_order:
        sub = sdf[sdf["parameter"] == p]
        if sub.empty:
            continue

        # detect if parameter is local (has molecule column)
        is_local = "molecule" in sub.columns and sub["molecule"].notna().any()

        if is_local:
            # molecule A when ratioB = 0, molecule B when ratioB = 90
            row0 = sub[(sub["ratioB"] == 0) & (sub["molecule"] == "A")]
            row90 = sub[(sub["ratioB"] == 90) & (sub["molecule"] == "B")]
        else:
            # global parameters have no molecule column
            row0 = sub[sub["ratioB"] == 0]
            row90 = sub[sub["ratioB"] == 90]

        if row0.empty or row90.empty:
            print(f"{label}: missing data for ratio 0 or 90")
            continue

        mean0, ci0 = row0.iloc[0][["mean", "CI"]]
        mean90, ci90 = row90.iloc[0][["mean", "CI"]]

        print(
            f"{label:<35s} & "
            f"{mean0:5.2f} $\\pm$ {ci0:4.3f} & "
            f"{mean90:5.2f} $\\pm$ {ci90:4.3f} &"
        )



### Squeeze = 9

R$_g$ (nm)                          &  8.68 $\pm$ 0.046 &  8.87 $\pm$ 0.059 &
Volume (nm$^3$/motor)               & 576.99 $\pm$ 2.900 & 573.28 $\pm$ 4.291 &
Area (nm$^2$/motor)                 & 264.95 $\pm$ 6.106 & 268.41 $\pm$ 7.286 &
Radius (nm)                         &  2.63 $\pm$ 0.022 &  2.64 $\pm$ 0.037 &
Density (motor/nm$^3$)              &  0.79 $\pm$ 0.004 &  0.79 $\pm$ 0.015 &
SASA (nm$^2$/motor)                 &  2.06 $\pm$ 0.070 &  2.03 $\pm$ 0.014 &
Axial density (motors/nm)           & 17.13 $\pm$ 0.365 & 17.20 $\pm$ 0.300 &

### Squeeze = 12

R$_g$ (nm)                          &  8.82 $\pm$ 0.087 &  9.00 $\pm$ 0.055 &
Volume (nm$^3$/motor)               & 574.29 $\pm$ 5.234 & 565.36 $\pm$ 17.261 &
Area (nm$^2$/motor)                 & 262.49 $\pm$ 13.105 & 295.68 $\pm$ 36.361 &
Radius (nm)                         &  2.76 $\pm$ 0.000 &  2.91 $\pm$ 0.046 &
Density (motor/nm$^3$)              &  0.80 $\pm$ 0.000 &  0.77 $\pm$ 0.027 &
SASA (nm$^2$/mot

In [102]:

# ---- configure rows: (parameter, label, decimals, transform(mean, CI)->(mean, CI))
ROWS = [
    ("radius",           "Diameter (nm)",                         2, lambda m, c: (2*m, 2*c)),
    ("packing",          "Axial Density (motors/nm)",             2, lambda m, c: (m, c)),
    ("density",          "Volumetric Density (motors/nm$^3$)",    2, lambda m, c: (m, c)),
    ("sasa",             "SASA (nm$^2$/motor)",                   2, lambda m, c: (m, c)),
    ("RG",               "Molecular radius of gyration (\\AA)",   2, lambda m, c: (m, c)),
    ("voronota-volume",  "Molecular core volume (\\AA$^3$)",      0, lambda m, c: (m, c)),
    ("voronota-area",    "Molecular core area (\\AA$^2$)",        0, lambda m, c: (m, c)),
]

SQUEEZES = [9, 12, 15]
SUBCOLS  = ["MMa", "MMb"]                 # MMa=ratioB 0, MMb=ratioB 90
RATIO_MAP = {"MMa": 0, "MMb": 90}

def _fmt(mean, ci, decimals):
    if np.isnan(mean) or np.isnan(ci):
        return "–"
    q = f"{{:.{decimals}f}}"
    return f"{q.format(mean)} $\\pm$ {q.format(ci)}"

def _cell_value(sub_df, param, squeeze, sublabel):
    """Return (mean, CI) for a param/squeeze/subcol, respecting local/global molecule filtering."""
    ratio = RATIO_MAP[sublabel]
    sdf = sub_df[sub_df["parameter"] == param]
    if sdf.empty:
        return np.nan, np.nan

    # detect local vs global per your rule: local has a 'molecule' column with any non-null values
    is_local = ("molecule" in sdf.columns) and sdf["molecule"].notna().any()

    if is_local:
        mol = "A" if ratio == 0 else "B"
        view = sdf[(sdf["squeeze"] == squeeze) & (sdf["ratioB"] == ratio) & (sdf["molecule"] == mol)]
    else:
        view = sdf[(sdf["squeeze"] == squeeze) & (sdf["ratioB"] == ratio)]

    if view.empty:
        return np.nan, np.nan

    # if multiple rows (replicate groups), average defensively
    return view["mean"].mean(), view["CI"].mean()

# ---- compose LaTeX
header_main = " & " + " & ".join([f"\\multicolumn{{2}}{{c}}{{\\textbf{{squeeze {sq}}}}}" for sq in SQUEEZES]) + " \\\\"
header_sub  = " & " + " & ".join([f"\\textbf{{MMa}} & \\textbf{{MMb}}" for _ in SQUEEZES]) + " \\\\"

lines = []
lines.append("\\begin{table*}[tbp]")
lines.append("    \\centering")
lines.append("    \\small")
lines.append("    \\caption{Summary across squeezes and excitation ratios (MMa = ratioB 0\\%, MMb = ratioB 90\\%). Errors are 95\\% confidence intervals across replicate fibres.}")
lines.append("    \\begin{tabular}{|c|" + "c|" * (len(SQUEEZES)*2) + "}")
lines.append("    \\hline")
lines.append("    \\textbf{Measurement} " + header_main)
lines.append("    \\hline")
lines.append("     " + header_sub)
lines.append("    \\hline")

for param, label, decs, transform in ROWS:
    row_cells = []
    for sq in SQUEEZES:
        for sublabel in SUBCOLS:
            # special case: no MMb for squeeze=15
            if sq == 15 and sublabel == "MMb":
                row_cells.append("–")
                continue
            m, c = _cell_value(summary_df, param, sq, sublabel)
            if not (np.isnan(m) or np.isnan(c)):
                m, c = transform(m, c)
            row_cells.append(_fmt(m, c, decs))
    line = f"    {label} & " + " & ".join(row_cells) + " \\\\ \\hline"
    lines.append(line)

lines.append("    \\end{tabular}")
lines.append("    \\label{table:fibre_characteristics_by_squeeze_ratio}")
lines.append("\\end{table*}")

print("\n".join(lines))

\begin{table*}[tbp]
    \centering
    \small
    \caption{Summary across squeezes and excitation ratios (MMa = ratioB 0\%, MMb = ratioB 90\%). Errors are 95\% confidence intervals across replicate fibres.}
    \begin{tabular}{|c|c|c|c|c|c|c|}
    \hline
    \textbf{Measurement}  & \multicolumn{2}{c}{\textbf{squeeze 9}} & \multicolumn{2}{c}{\textbf{squeeze 12}} & \multicolumn{2}{c}{\textbf{squeeze 15}} \\
    \hline
      & \textbf{MMa} & \textbf{MMb} & \textbf{MMa} & \textbf{MMb} & \textbf{MMa} & \textbf{MMb} \\
    \hline
    Diameter (nm) & 5.27 $\pm$ 0.04 & 5.28 $\pm$ 0.07 & 5.52 $\pm$ 0.00 & 5.81 $\pm$ 0.09 & 5.90 $\pm$ 0.01 & – \\ \hline
    Axial Density (motors/nm) & 17.13 $\pm$ 0.37 & 17.20 $\pm$ 0.30 & 19.07 $\pm$ 0.00 & 20.40 $\pm$ 0.06 & 21.41 $\pm$ 0.46 & – \\ \hline
    Volumetric Density (motors/nm$^3$) & 0.79 $\pm$ 0.00 & 0.79 $\pm$ 0.01 & 0.80 $\pm$ 0.00 & 0.77 $\pm$ 0.03 & 0.78 $\pm$ 0.02 & – \\ \hline
    SASA (nm$^2$/motor) & 2.06 $\pm$ 0.07 & 2.03 $\pm$ 0.01 & 1.92

## Plotting functions
### var(squeeze)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from md_styler import MDStyler

# --- style & fixed colors
sty = MDStyler().apply()
squeeze_colors = {9: sty.palette6[0], 12: sty.palette6[2], 15: sty.palette6[1]}

# --- config
GLOBAL_PARAMS = ["radius", "sasa", "packing", "density"]
PARAM_LABEL = {
    "radius": "radius (nm)",
    "sasa": "SASA per molecule (nm²)",
    "packing": "axial density (molecules / nm)",
    "density": "density (molecules / nm³)",
}
RATIOB_TARGET = 0
NBOOT = 1000
rng = np.random.default_rng(42)
all_df['density'] = all_df['packing'] /(np.pi * all_df['radius']**2)
# --- filter all_df to ratioB == 0 and globals
df0 = all_df.query("ratioB == @RATIOB_TARGET").copy()
present = [p for p in GLOBAL_PARAMS if p in df0.columns]

def bootstrap_rho_balanced(df_param, nboot=NBOOT, rng=rng):
    """Bootstrap Pearson ρ between squeeze and parameter value."""
    work = df_param[["squeeze", "value"]].dropna().copy()
    by_sq = {sq: sub.reset_index(drop=True) for sq, sub in work.groupby("squeeze")}
    squeezes_sorted = sorted(by_sq.keys())
    rhos = np.empty(nboot, dtype=float)

    for i in range(nboot):
        xs, ys = [], []
        for sq in squeezes_sorted:
            sub = by_sq[sq]
            idx = rng.integers(0, len(sub), size=len(sub))
            vals = sub.loc[idx, "value"].to_numpy()
            xs.append(np.full(len(sub), float(sq)))
            ys.append(vals)
        x = np.concatenate(xs)
        y = np.concatenate(ys)
        rhos[i], _ = pearsonr(x, y)

    return float(np.mean(rhos)), tuple(np.percentile(rhos, [2.5, 97.5]))

rho_rows = []
for i, p in enumerate(present):
    # tidy parameter subset
    sub = df0[["squeeze", p]].rename(columns={p: "value"}).dropna()
    if sub.empty:
        print(f"[warn] No data for '{p}' at ratioB={RATIOB_TARGET}")
        continue

    rho_mean, (rho_lo, rho_hi) = bootstrap_rho_balanced(sub)

    # --- scatter plot of all raw data points
    fig, ax = sty.fig_square()
    for sq in sorted(sub["squeeze"].unique()):
        g = sub[sub["squeeze"] == sq]
        ax.scatter(
            np.full(len(g), sq),
            g["value"],
            s=16,
            alpha=0.8,
            color=squeeze_colors.get(int(sq), "#777777"),
            label=f"Squeeze {sq}" if i == 0 else None,
        )

    ax.set_xlabel("Iterations of Scaling")
    ax.set_ylabel(PARAM_LABEL.get(p, p))
    if i == 0:
        ax.legend(frameon=False, loc="best")
    plt.show()

    rho_rows.append(dict(parameter=p, rho=rho_mean, CI_low=rho_lo, CI_high=rho_hi))

rho_df = pd.DataFrame(rho_rows).sort_values("rho", ascending=False)
rho_df


### var(ratio)

In [None]:
squeeze_colors = {9: sty.palette6[0], 12: sty.palette6[2]}  # only 9 & 12

# --- config
GLOBAL_PARAMS = ["radius", "sasa", "packing", "density"]
PARAM_LABEL = {
    "radius": "radius (nm)",
    "sasa": "SASA per molecule (nm²)",
    "packing": "axial density (molecules / nm)",
    "density": "density (molecules / nm³)",
}
NBOOT = 1000
rng = np.random.default_rng(42)

# --- subset to squeezes 9 & 12, keep only rows with needed columns
df = all_df.copy()
df = df[df["squeeze"].isin([9, 12])]
present = [p for p in GLOBAL_PARAMS if p in df.columns]
if not present:
    print("[warn] No global parameters present among", GLOBAL_PARAMS)

def bootstrap_rho_by_ratioB(df_param, nboot=NBOOT, rng=rng):
    """
    Bootstrap Pearson ρ between ratioB (x) and parameter value (y),
    balanced across squeezes 9 & 12.
    """
    work = df_param[["squeeze", "ratioB", "value"]].dropna().copy()
    by_sq = {sq: sub.reset_index(drop=True) for sq, sub in work.groupby("squeeze")}
    squeezes_sorted = sorted(by_sq.keys())
    if len(squeezes_sorted) == 0:
        return np.nan, (np.nan, np.nan)

    rhos = np.empty(nboot, dtype=float)
    for i in range(nboot):
        xs, ys = [], []
        sub = by_sq[9]
        idx = rng.integers(0, len(sub), size=len(sub))
        xs.append(sub.loc[idx, "ratioB"].to_numpy(dtype=float))
        ys.append(sub.loc[idx, "value"].to_numpy(dtype=float))
        x = np.concatenate(xs)
        y = np.concatenate(ys)
        rhos[i], _ = pearsonr(x, y)

    return float(np.mean(rhos)), tuple(np.percentile(rhos, [2.5, 97.5]))

rho_rows = []
for i, p in enumerate(present):
    sub = df[["squeeze", "ratioB", p]].rename(columns={p: "value"}).dropna()
    if sub.empty:
        print(f"[warn] No data for '{p}' with squeezes 9 & 12")
        continue

    # --- bootstrap Pearson correlation (ratioB vs value)
    rho_mean, (rho_lo, rho_hi) = bootstrap_rho_by_ratioB(sub)

    # --- scatter of raw data points (ratioB on x), colored by squeeze
    fig, ax = sty.fig_square()
    for sq in (9, 12):
        g = sub[sub["squeeze"] == sq]
        if g.empty:
            continue
        ax.scatter(
            g["ratioB"].astype(float),
            g["value"].astype(float),
            s=16,
            alpha=0.85,
            color=squeeze_colors[sq],
            label=f"Scaling: {sq}" if i == 0 else None,
        )

    ax.set_xlabel("Percent actuated (%)")
    ax.set_ylabel(PARAM_LABEL.get(p, p))
    if i == 0:
        ax.legend(frameon=False, loc="best")
    plt.show()

    rho_rows.append(dict(parameter=p, rho=rho_mean, CI_low=rho_hi if np.isnan(rho_lo) else rho_lo, CI_high=rho_hi))

rho_df = pd.DataFrame(rho_rows).sort_values("rho", ascending=False)
rho_df


## Torsion angles

In [None]:
def plot_torsion_angles(data_path=Path('../data')):
    """
    Create a single scatter plot of torsion-theta vs torsion-alpha for the last frame of each experiment.
    Points are colored by molecule type (A vs B). KDE is drawn per molecule type on a 1/10 subsample.

    Args:
        data_path: Path to data directory
    """
    # ----- apply style (no change to the data pipeline)
    sty = MDStyler().apply()
    sns.reset_orig() 

    torsion_data = []

    for exp_name in valid_experiments:
        # Get ratio for this experiment
        ratio = get_experiment_ratio(exp_name, data_path)
        if ratio is None:
            print(f"[warn] Missing ratio for {exp_name}, skipping…")
            continue

        # Load torsion-theta data
        theta_file = data_path / f"{exp_name}_torsion-theta.csv"
        if not theta_file.exists():
            print(f"[warn] Missing file: {theta_file}")
            continue

        # Load torsion-alpha (phi) data
        phi_file = data_path / f"{exp_name}_torsion-alpha.csv"
        if not phi_file.exists():
            print(f"[warn] Missing file: {phi_file}")
            continue

        df_theta = pd.read_csv(theta_file)
        df_phi   = pd.read_csv(phi_file)

        # Basic structural checks (same shape & resid column)
        if df_theta.shape != df_phi.shape:
            print(f"[warn] Structure mismatch for {exp_name} (theta {df_theta.shape} vs alpha {df_phi.shape}), skipping…")
            continue
        if "resid" not in df_theta.columns or "resid" not in df_phi.columns:
            print(f"[warn] 'resid' column missing for {exp_name}, skipping…")
            continue

        # Last columns (last frame)
        last_theta_col = df_theta.columns[-1]
        last_phi_col   = df_phi.columns[-1]

        # Extract per-molecule values
        for idx, row in df_theta.iterrows():
            resid = row["resid"]

            # Ensure resid exists in phi table at same row or anywhere
            if resid not in df_phi["resid"].values:
                print(f"[warn] resid {resid} not found in alpha for {exp_name}, skipping this resid…")
                continue

            theta_val = row[last_theta_col]
            phi_val   = df_phi.iloc[idx, -1]

            # mirror for B-R that relaxes to A-S (domain-specific rule)
            if resid in [2, 3]:
                theta_val *= -1
                phi_val   *= -1

            # simple wrapping/cleaning to keep angles in expected ranges
            if pd.notna(theta_val) and theta_val > 100:
                theta_val -= 360
            if pd.notna(phi_val) and phi_val < -150:
                phi_val += 360

            # Map resid → molecule type
            molecule = 'A' if resid in [1, 2] else 'B'

            torsion_data.append({
                "experiment": exp_name,
                "resid": resid,
                "molecule": molecule,
                "ratioB": ratio,
                "theta": theta_val,
                "alpha": phi_val
            })

    torsion_df = pd.DataFrame(torsion_data)

    if torsion_df.empty:
        print("[warn] No torsion data collected")
        return None

    # sanity checks
    if torsion_df[["theta", "alpha"]].isna().any().any():
        n_nan = torsion_df[["theta", "alpha"]].isna().sum().sum()
        print(f"[warn] Found {n_nan} NaN values in theta/alpha. They will be ignored by plotting.")

    # ---- plotting (style-only changes)
    # keep original call signature (single axes), but our rcParams are already active
    fig, ax = plt.subplots(figsize=(12, 10))

    # choose styler colors (A = cyan-ish, B = deep blue from PBC box)
    molecule_colors = {
        "A": sty.get_color("cyan"),
        "B": sty.get_color("red"),
    }

    # scatter (keep small marker size, thin black edge like original)
    for molecule_type, color in molecule_colors.items():
        subset = torsion_df[torsion_df["molecule"] == molecule_type]
        ax.scatter(
            subset["theta"], subset["alpha"],
            color=color, marker='o', s=12, alpha=0.3,
            edgecolor='black', linewidth=0.2,
            label=f"Initiated as {molecule_type}"
        )

    # KDE per molecule type on a 1/10 subsample (if seaborn is available)
    for molecule_type, color in molecule_colors.items():
        subset = torsion_df[torsion_df["molecule"] == molecule_type]
        # 1/10 subsample for KDE (reproducible)
        subs = subset.sample(frac=1, random_state=0)
        sns.kdeplot(
            x=subs['theta'], y=subs['alpha'],
            levels=5, linewidths=1, ax=ax,
            color=color, alpha=0.5
        )

    # legend, labels, formatting (match original intent)
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), title="Molecule Type", loc='best', frameon=False)

    ax.set_title('Torsion Angles by Molecule Type', fontsize=24)
    ax.set_xlabel(r'Torsion $\theta$ (°)')
    ax.set_ylabel(r'Torsion $\alpha$ (°)')
    ax.set_aspect('equal')

    # original script enabled grid with light alpha
    ax.grid(True, alpha=0.3)

    # dynamic limits consistent with original max-range logic
    max_range = max(
        abs(torsion_df['theta'].max()), abs(torsion_df['theta'].min()),
        abs(torsion_df['alpha'].max()), abs(torsion_df['alpha'].min())
    )
    # (keep auto limits unless you want to set explicit ones as in your commented code)

    plt.tight_layout()
    return fig, ax, torsion_df

fig, ax, df = plot_torsion_angles()
plt.show()

## r(g)

In [139]:
# analyze_gr.py
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Iterable, List, Tuple, Optional

from scipy.signal import savgol_filter  # for Savitzky–Golay smoothing

@dataclass
class GRAnalysisResult:
    r: np.ndarray
    weighted_gr_df: pd.DataFrame
    smoothed_df: Optional[pd.DataFrame]
    peaks: Dict[str, Dict[str, float]]
    weighted_gr: Dict[str, np.ndarray]
    weighted_std: Dict[str, np.ndarray]
    used_experiments: List[str]
    messages: List[str]


def analyze_weighted_gr_styled(
    experiment_ratios: Dict[str, float],
    data_dir: str = "../data",
    NMOLS: int = 200,
    r_range: Optional[Tuple[float, float]] = None,
    use_ordered_pairs: bool = True,   # True keeps your N_i*(N_i-1) choice
    savgol_window: Optional[int] = None,
    savgol_poly: int = 3,
    interpolate_if_mismatch: bool = True,
) -> GRAnalysisResult:
    """
    Weighted-average g(r) across experiments with optional Savitzky–Golay smoothing.
    Returns data + diagnostics (no plotting). To plot, call plot_weighted_gr_with_styler(...).
    """

    msgs: List[str] = []

    # ---- read all CSVs
    experiment_data: Dict[str, pd.DataFrame] = {}
    molecule_counts: Dict[str, Dict[int, int]] = {}

    for expt, ratioB_pct in experiment_ratios.items():
        if ratioB_pct == None:
            continue
        # compute counts: A vs B split, and R vs S split 50:50
        n_A = int(round((100.0 - ratioB_pct) / 100.0 * NMOLS))
        n_B = NMOLS - n_A
        # split each evenly over R/S
        N_1 = int(round(n_A / 2))  # A-R
        N_2 = n_A - N_1            # A-S
        N_3 = int(round(n_B / 2))  # B-R
        N_4 = n_B - N_3            # B-S

        # sanity
        if (N_1 + N_2 + N_3 + N_4) != NMOLS:
            msgs.append(f"[warn] {expt}: counts round to {N_1+N_2+N_3+N_4} ≠ {NMOLS}. Adjusted by last bin.")
        molecule_counts[expt] = {1: N_1, 2: N_2, 3: N_3, 4: N_4}

        fp = os.path.join(data_dir, f"{expt}_g-r.csv")
        if not os.path.exists(fp):
            msgs.append(f"[warn] file not found: {fp}")
            continue

        df = pd.read_csv(fp)
        if "r (Å)" not in df.columns:
            msgs.append(f"[warn] {expt}: missing 'r (Å)' column")
            continue

        experiment_data[expt] = df

    used = list(experiment_data.keys())
    if not used:
        raise ValueError("No valid data files found.")

    # ---- choose base r-grid and check consistency
    base_r = experiment_data[used[0]]["r (Å)"].to_numpy()
    grids_ok = True
    for expt, df in experiment_data.items():
        r_here = df["r (Å)"].to_numpy()
        if r_here.shape != base_r.shape or not np.allclose(r_here, base_r, rtol=0, atol=1e-8):
            grids_ok = False
            msgs.append(f"[warn] {expt}: r-grid differs from first dataset")
    if not grids_ok:
        if interpolate_if_mismatch:
            msgs.append("[info] interpolating all g(r) columns onto the first dataset’s r-grid")
            for expt, df in experiment_data.items():
                if df["r (Å)"].to_numpy().shape != base_r.shape or not np.allclose(df["r (Å)"], base_r):
                    # interpolate every g(r) column
                    r_src = df["r (Å)"].to_numpy()
                    for col in df.columns:
                        if col.startswith("g(r)"):
                            df[col] = np.interp(base_r, r_src, df[col].to_numpy())
                    df["r (Å)"] = base_r
        else:
            msgs.append("[error] r-grids mismatch and interpolate_if_mismatch=False")

    r = base_r.copy()

    # ---- r-range mask
    if r_range is not None:
        rmin, rmax = r_range
        mask = (r >= rmin) & (r <= rmax)
        r = r[mask]
    else:
        mask = slice(None)

    # ---- define interaction mapping (skip AR–BR and AR–BS later when plotting)
    interaction_mapping = {
        "AR-AR":   ["g(r) 1-1", "g(r) 2-2"],
        "AR-AS": ["g(r) 1-2"],
        "BR-BR":   ["g(r) 3-3", "g(r) 4-4"],
        "BR-BS": ["g(r) 3-4"],
    }

    weighted_gr: Dict[str, np.ndarray] = {}
    weighted_std: Dict[str, np.ndarray] = {}
    weights_sum: Dict[str, float] = {}
    peak_positions: Dict[str, List[Tuple[float, float]]] = {k: [] for k in interaction_mapping}

    for inter, columns in interaction_mapping.items():
        weighted_gr[inter] = np.zeros_like(r)
        weights_sum[inter] = 0.0
        all_values: List[np.ndarray] = []
        all_weights: List[float] = []

        for expt, df in experiment_data.items():
            vals_here: List[np.ndarray] = []
            weights_here: List[float] = []

            for col in columns:
                if col not in df.columns:
                    msgs.append(f"[warn] {expt}: column missing: {col}")
                    continue

                # parse pair ids
                i, j = (int(x) for x in col.split()[-1].split("-"))
                Ni = molecule_counts[expt][i]
                Nj = molecule_counts[expt][j]

                if i == j:
                    weight = Ni * (Ni - 1) if use_ordered_pairs else Ni * (Ni - 1) / 2.0
                else:
                    weight = Ni * Nj

                vals_here.append(df[col].to_numpy()[mask])
                weights_here.append(float(weight))

            if vals_here:
                total_w = float(np.sum(weights_here))
                # within-experiment weighted mean across symmetric columns
                exp_avg = np.zeros_like(r, dtype=float)
                for v, w in zip(vals_here, weights_here):
                    exp_avg += v * (w / total_w)

                all_values.append(exp_avg)
                all_weights.append(total_w)

                # peak for this experiment
                pk_idx = int(np.nanargmax(exp_avg))
                peak_positions[inter].append((float(r[pk_idx]), total_w))

                weighted_gr[inter] += exp_avg * total_w
                weights_sum[inter] += total_w

        if weights_sum[inter] > 0:
            weighted_gr[inter] /= weights_sum[inter]

            # weighted variance (sample); guard small-n
            all_values = np.asarray(all_values)
            all_weights = np.asarray(all_weights, dtype=float)
            if all_values.shape[0] > 1:
                diff2 = np.zeros_like(r, dtype=float)
                for v, w in zip(all_values, all_weights):
                    diff2 += w * (v - weighted_gr[inter]) ** 2
                correction = 1.0 - np.sum(all_weights**2) / (np.sum(all_weights) ** 2)
                if correction <= 0:
                    weighted_std[inter] = np.full_like(r, np.nan, dtype=float)
                else:
                    var = diff2 / (np.sum(all_weights) * correction)
                    weighted_std[inter] = np.sqrt(var)
            else:
                weighted_std[inter] = np.full_like(r, np.nan, dtype=float)
        else:
            weighted_std[inter] = np.full_like(r, np.nan, dtype=float)

    # ---- construct DataFrames
    result_df = pd.DataFrame({"r (Å)": r})
    n_expt = len(used)
    for inter in interaction_mapping:
        result_df[f"g(r) {inter}"] = weighted_gr[inter]
        ci = 1.96 * weighted_std[inter] / np.sqrt(max(n_expt, 1))
        result_df[f"CI95 {inter}"] = ci

    # ---- peaks summary
    peaks: Dict[str, Dict[str, float]] = {}
    for inter in interaction_mapping:
        grv = weighted_gr[inter]
        pk_idx = int(np.nanargmax(grv))
        peak_r = float(r[pk_idx])
        peak_gr = float(grv[pk_idx])
        peak_gr_ci = float(1.96 * (weighted_std[inter][pk_idx] if np.isfinite(weighted_std[inter][pk_idx]) else np.nan) / np.sqrt(max(n_expt, 1)))

        if peak_positions[inter]:
            pos, wts = zip(*peak_positions[inter])
            pos = np.asarray(pos, dtype=float)
            wts = np.asarray(wts, dtype=float)
            if len(pos) > 1:
                w_mean = float(np.sum(pos * wts) / np.sum(wts))
                diff2 = float(np.sum(wts * (pos - w_mean) ** 2))
                corr = 1.0 - float(np.sum(wts**2) / (np.sum(wts) ** 2))
                if corr > 0:
                    std_r = np.sqrt(diff2 / (np.sum(wts) * corr))
                    peak_r_ci = float(1.96 * std_r / np.sqrt(len(pos)))
                else:
                    peak_r_ci = np.nan
            else:
                w_mean = pos[0]
                peak_r_ci = np.nan
        else:
            w_mean = peak_r
            peak_r_ci = np.nan

        peaks[inter] = {"r": w_mean, "r_CI95": peak_r_ci, "g(r)": peak_gr, "g(r)_CI95": peak_gr_ci}

    # ---- optional Savitzky–Golay smoothing (for export convenience)
    smoothed_df = None
    if savgol_window and savgol_window > 2 and savgol_window % 2 == 1:
        smoothed_df = pd.DataFrame({"r (Å)": r})
        for inter in interaction_mapping:
            y = weighted_gr[inter]
            ci = result_df[f"CI95 {inter}"].to_numpy()
            y_sm = savgol_filter(y, savgol_window, savgol_poly, mode="interp")
            ci_sm = savgol_filter(ci, savgol_window, savgol_poly, mode="interp")
            smoothed_df[f"g(r) {inter}"] = y_sm
            smoothed_df[f"CI95 {inter}"] = ci_sm
    elif savgol_window is not None:
        msgs.append("[warn] savgol_window must be odd and >= 3; smoothing skipped")

    return GRAnalysisResult(
        r=r,
        weighted_gr_df=result_df,
        smoothed_df=smoothed_df,
        peaks=peaks,
        weighted_gr=weighted_gr,
        weighted_std=weighted_std,
        used_experiments=used,
        messages=msgs,
    )


def plot_weighted_gr_with_styler(
    res: GRAnalysisResult,
    sty: Optional[MDStyler] = None,
    use_savgol: bool = True,
    exclude_pairs: Iterable[str] = ("AR-BR", "AR-BS"),  # per your instruction
    square: bool = False,
    title: Optional[str] = None,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Plot weighted g(r) + 95% CI using your MDStyler.
    AA solid vs CG dashed is not semantically meaningful here, so we use solid lines
    with shaded CI; line dashes can be used to differentiate families if you wish.
    """
    sty = sty or MDStyler().apply()
    fig, ax = (sty.fig_square() if square else sty.fig_horizontal())

    # choose which table to plot
    df = res.smoothed_df if (use_savgol and res.smoothed_df is not None) else res.weighted_gr_df
    r = res.r

    # which interactions to draw (respect exclusion)
    cols = [c for c in df.columns if c.startswith("g(r) ")]
    inter_names = [c.replace("g(r) ", "") for c in cols]
    draw = [(n, c) for n, c in zip(inter_names, cols) if n not in set(exclude_pairs)]

    pal = sty.get_palette(len(draw))
    for color, (name, col) in zip(pal, draw):
        y = df[col].to_numpy()
        # CI column key
        ci_key = col.replace("g(r)", "CI95")
        ci = df[ci_key].to_numpy() if ci_key in df.columns else np.full_like(y, np.nan)

        ax.plot(r, y, color=color, label=name)
        # matte translucent band
        ax.fill_between(r, y - ci, y + ci, color=color, alpha=0.18)

    ax.set_xlabel("r (Å)")
    ax.set_ylabel("g(r)")
    if title:
        ax.set_title(title)

    ax.legend(frameon=False, ncol=2)
    return fig, ax


In [148]:
res = analyze_weighted_gr_styled(
    experiment_ratios,
    data_dir="../data",
    NMOLS=200,
    r_range=(0.0, 20.0),
    savgol_window=15,   # odd, >=3
    savgol_poly=3,
)
for m in res.messages:
    print(m)

[warn] motor2m_frac3-0: column missing: g(r) 3-3
[warn] motor2m_frac3-0: column missing: g(r) 4-4
[warn] motor2m_squeeze10_frac-0: column missing: g(r) 3-3
[warn] motor2m_squeeze10_frac-0: column missing: g(r) 4-4
[warn] motor2m_squeeze12_frac-0_2: column missing: g(r) 3-3
[warn] motor2m_squeeze12_frac-0_2: column missing: g(r) 4-4
[warn] motor2m_squeeze12_frac-0_3: column missing: g(r) 3-3
[warn] motor2m_squeeze12_frac-0_3: column missing: g(r) 4-4
[warn] motor2m_squeeze14_frac-0_2: column missing: g(r) 3-3
[warn] motor2m_squeeze14_frac-0_2: column missing: g(r) 4-4
[warn] test_nr2: column missing: g(r) 3-3
[warn] test_nr2: column missing: g(r) 4-4
[warn] test_nr3: column missing: g(r) 3-3
[warn] test_nr3: column missing: g(r) 4-4
[warn] motor2m_frac3-0: column missing: g(r) 3-4
[warn] motor2m_squeeze10_frac-0: column missing: g(r) 3-4
[warn] motor2m_squeeze12_frac-0_2: column missing: g(r) 3-4
[warn] motor2m_squeeze12_frac-0_3: column missing: g(r) 3-4
[warn] motor2m_squeeze14_frac-0

In [None]:

fig, ax = plot_weighted_gr_with_styler(
    res,
    sty=sty,
    use_savgol=True,
    exclude_pairs=("AR-BR", "AR-BS"),   # your requirement
    square=False,
    title=None,
)
plt.title("AA", fontsize=11, fontweight="bold", y=1.02)
plt.show()

## Sandbox

In [112]:
# Define categorical colormap for ratios
RATIO_COLORS = {0: 'red', 10: 'purple', 90: 'blue'}

def plot_parameter_vs_packing(all_df, parameter):
    """
    Create scatter plot of a parameter vs packing with error bars, colored by ratio.
    
    Args:
        all_df: DataFrame from concat_all
        parameter: Parameter name to plot against packing
    """
    if parameter not in all_df.columns or 'packing' not in all_df.columns:
        print(f"Missing required columns: {parameter} or packing")
        return
    
    # Check for CI columns
    param_ci_col = f'CI-{parameter}'
    packing_ci_col = 'CI-packing'
    
    if param_ci_col not in all_df.columns or packing_ci_col not in all_df.columns:
        print(f"Missing CI columns: {param_ci_col} or {packing_ci_col}")
        return
    
    # Create plot
    fig, ax = plt.subplots(figsize=(10, 8))
    # Create scatter plot with different markers for molecule types
    for molecule_type, marker in zip(['A', 'B'], ['o', 's']):
        subset = all_df[all_df['molecule'] == molecule_type]

        
        # Create scatter with error bars
        for _, row in subset.iterrows():
            closest_ratio = min(RATIO_COLORS.keys(), key=lambda k: abs(k-row['ratioB']))
            color = RATIO_COLORS[closest_ratio]
            ax.errorbar(
                row['packing'], row[parameter],
                xerr=row[packing_ci_col], yerr=row[param_ci_col],
                fmt=marker, markersize=8, 
                color=color,
                alpha=0.7, elinewidth=0.5, capsize=0,
                label=f"Ratio {row['ratioB']:.0f}, Molecule {molecule_type}"
            )
            
    # Calculate and print statistics at the end of the function
    print(f"\nStatistics for {parameter}:")
    for molecule in ['A', 'B']:
        subset = all_df[all_df['molecule'] == molecule]
        
        # Calculate mean value
        mean_val = subset[parameter].mean()
        
        # Properly account for both measurement uncertainty and data spread
        n = len(subset)
        if n > 1:
            # Standard error of the mean from the spread of data points
            std_error = subset[parameter].std(ddof=1) / np.sqrt(n)
            
            # Root mean square of individual confidence intervals
            individual_ci_rms = np.sqrt(np.mean(subset[param_ci_col]**2))
            
            # Combined uncertainty using error propagation
            combined_uncertainty = np.sqrt(std_error**2 + individual_ci_rms**2)
            
            # 95% confidence interval
            ci_val = 1.96 * combined_uncertainty
        else:
            # If only one data point, use its confidence interval
            ci_val = subset[param_ci_col].iloc[0]
        
        lower = mean_val - ci_val
        upper = mean_val + ci_val
        
        # Calculate Pearson correlation with robust handling
        # First, create a clean subset without any missing values
        clean_subset = subset.dropna(subset=['packing', parameter])
        
        # Check data validity
        x = clean_subset['packing'].values
        y = clean_subset[parameter].values
        
        # Calculate correlation only if we have valid data
        if len(x) >= 2:
            # Check for variance in both variables
            if np.var(x) > 0 and np.var(y) > 0:
                pearson_r, p_value = scipy.stats.pearsonr(x, y)
                correlation_str = f"Pearson r = {pearson_r:.4f} (p = {p_value:.4f})"
            else:
                correlation_str = "No variance in data"
        else:
            correlation_str = "Insufficient data points"
        
        print(f"  Molecule {molecule}: {mean_val:.4f} [{lower:.4f}, {upper:.4f}], {correlation_str}")
        
    # Add legend and labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))  # Keep only unique labels
    ax.legend(by_label.values(), by_label.keys(), loc='best')
    
    ax.set_xlabel('Packing (motors/nm)')
    ax.set_ylabel(parameter)
    ax.set_title(f'{parameter} vs Packing')
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    return fig, ax

def plot_sasa_vs_inverse_packing(global_df):
    """
    Create lineplot showing trajectory of sasa vs packing^-1 for all experiments.
    
    Args:
        global_df: DataFrame from concat_global for sasa_new
    """
    if 'sasa' not in global_df.columns or 'packing' not in global_df.columns:
        print("Missing required columns: sasa_new or packing")
        return
    
    # Create plot
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Get unique experiments
    experiments = global_df['experiment'].unique()

    # Plot each experiment
    for exp in experiments:
        subset = global_df[global_df['experiment'] == exp]
        ratio = subset['ratioB'].iloc[0]  # Get ratio for this experiment
        
        # Sort by packing for a smooth line
        #subset = subset.sort_values('packing')
        closest_ratio = min(RATIO_COLORS.keys(), key=lambda k: abs(k-ratio))
        color = RATIO_COLORS[closest_ratio]
        
        # Calculate inverse packing
        inverse = 1 / subset['packing']
        
        ax.plot(inverse, subset['sasa'], 
                label=f"RatioB {ratio:.0f}",
                color=color, linewidth=2, alpha=0.8)
                
        ax.plot(inverse.iloc[0], subset['sasa'].iloc[0], 
                marker='o', markersize=8, color=color)
    
    
    # Add legend and labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), loc='best')
    ax.set_xlabel('Packing⁻¹ (nm/motor)')
    ax.set_ylabel('SASA per molecule (nm$^2$)')
    ax.set_title('SASA vs Inverse Packing')
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    return fig, ax

def plot_with_rolling_ci(global_df, param_name, window=10):
    """
    Create lineplot of parameter vs time with rolling 95% CI.
    
    Args:
        global_df: DataFrame from concat_global
        param_name: Parameter name to plot (density or sasa_new)
        window: Rolling window size (default: 10)
    """
    if param_name not in global_df.columns or 'time' not in global_df.columns:
        print(f"Missing required columns: {param_name} or time")
        return
    
    # Create plot
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Get unique experiments
    experiments = global_df['experiment'].unique()

    # Calculate confidence interval multiplier for 95% CI
    ci_multiplier = 1.96
    
    # Plot each experiment
    for exp in experiments:
        subset = global_df[global_df['experiment'] == exp].copy()
        ratio = subset['ratioB'].iloc[0]  # Get ratio for this experiment
        
        # Sort by time
        subset = subset.sort_values('time')
        closest_ratio = min(RATIO_COLORS.keys(), key=lambda k: abs(k-ratio))
        color = RATIO_COLORS[closest_ratio]
        
        # Calculate rolling mean and standard deviation
        subset[f'{param_name}_rolling'] = subset[param_name].rolling(window=window, center=True).mean()
        subset[f'{param_name}_std'] = subset[param_name].rolling(window=window, center=True).std()
        subset[f'{param_name}_n'] = subset[param_name].rolling(window=window, center=True).count()
        
        # Calculate confidence intervals
        subset[f'{param_name}_ci'] = ci_multiplier * subset[f'{param_name}_std'] / np.sqrt(subset[f'{param_name}_n'])
        
        # Drop NA rows from rolling calculations
        subset = subset.dropna(subset=[f'{param_name}_rolling', f'{param_name}_ci'])
        
        # Plot the line and confidence band
        ax.plot(subset['time']/1000, subset[f'{param_name}_rolling'], 
                label=f"Ratio {ratio:.0f}",
                color=color, linewidth=2)
        
        ax.fill_between(subset['time']/1000, 
                       subset[f'{param_name}_rolling'] - subset[f'{param_name}_ci'],
                       subset[f'{param_name}_rolling'] + subset[f'{param_name}_ci'],
                       color=color, alpha=0.2)
    
    # Add legend and labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), loc='best')
    ax.set_xlabel('Time (ns)')
    ax.set_ylabel(param_name)
    ax.set_title(f'{param_name} vs Time with 95% CI')
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    return fig, ax

def calculate_density():
    """
    Calculate density as packing / (pi*radius^2)
    
    Args:
        global_df: DataFrame with packing and radius columns
    """
    radius_global = concat_global('radius')
    packing_global = concat_global('packing')
    # Merge dataframes
    density_df = pd.merge(
        packing_global, radius_global[['time', 'experiment', 'radius']], 
        on=['time', 'experiment']
    )
    # Calculate density
    density_df['density'] = density_df['packing'] / (np.pi * density_df['radius']**2)
    
    return density_df

def plot_torsion_angles(data_path=Path('../data')):
    """
    Create two scatter plots of torsion-theta vs torsion-phi for the last frame of each experiment.
    Left plot: colored by ratio, Right plot: colored by molecule type (A: red, B: blue)
    
    Args:
        data_path: Path to data directory
    """
    # Collect data for all experiments
    torsion_data = []
    
    for exp_name in valid_experiments:
        # Get ratio for this experiment
        ratio = get_experiment_ratio(exp_name, data_path)
        if ratio is None:
            continue
            
        # Load torsion-theta data
        theta_file = data_path / f"{exp_name}_torsion-theta.csv"
        if not theta_file.exists():
            continue
            
        # Load torsion-phi data
        phi_file = data_path / f"{exp_name}_torsion-alpha.csv"
        if not phi_file.exists():
            continue
            
        # Read the files
        df_theta = pd.read_csv(theta_file)
        df_phi = pd.read_csv(phi_file)
        
        # Check if they have the same structure
        if df_theta.shape != df_phi.shape or 'resid' not in df_theta.columns or 'resid' not in df_phi.columns:
            print(f"Structure mismatch for {exp_name}, skipping...")
            continue
            
        # Get the last column (last frame) for each file
        last_theta_col = df_theta.columns[-1]
        last_phi_col = df_phi.columns[-1]
        
        # Extract data for each molecule
        for idx, row in df_theta.iterrows():
            resid = row['resid']
            
            # Skip if this resid isn't in the phi data
            if resid not in df_phi['resid'].values:
                continue
                
            # Get theta and phi values
            theta_val = row[last_theta_col]
            phi_val = df_phi.iloc[idx,-1]
            
            # Mirror values for resid 2 and 4
            if resid in [2, 3]: # The B-R isomer relaxes to A-S!!!
                theta_val *= -1
                phi_val *= -1
            if theta_val > 100:
                theta_val = theta_val - 360
            if phi_val < -150:
                phi_val += 360
            
            # Map resid to molecule type
            molecule = 'A' if resid in [1, 2] else 'B'
            
            # Add to data collection
            torsion_data.append({
                'experiment': exp_name,
                'resid': resid,
                'molecule': molecule,
                'ratioB': ratio,
                'theta': theta_val,
                'alpha': phi_val
            })
    
    # Convert to DataFrame
    torsion_df = pd.DataFrame(torsion_data)
    
    if torsion_df.empty:
        print("No torsion data collected")
        return None
        
        # Create a single figure
    fig, ax = plt.subplots(figsize=(12, 10))

    # Plot colored by molecule type (previously on ax2)
    molecule_colors = {'A': 'red', 'B': 'blue'}
    for molecule_type, color in molecule_colors.items():
        molecule_subset = torsion_df[torsion_df['molecule'] == molecule_type]
        sc = ax.scatter(molecule_subset['theta'], molecule_subset['alpha'],
                        color=color, marker='o', s=15, alpha=0.25,
                        edgecolor='black', linewidth=0.2,
                        label=f"Molecule {molecule_type}")

    # Add KDE for each molecule type
    for molecule_type, color in molecule_colors.items():
        subset = torsion_df[torsion_df['molecule'] == molecule_type]
        if len(subset) >= 10:  # Need sufficient points for KDE
            try:
                sns.kdeplot(x=subset['theta'], y=subset['alpha'],
                            levels=5, linewidths=1, ax=ax,
                            color=color, alpha=0.5)
            except Exception as e:
                print(f"Could not create KDE for molecule {molecule_type}: {e}")

    # Add legend
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), title="Molecule Type", loc='best')
    ax.set_title('Torsion Angles by Molecule Type')

    # Set labels and formatting
    ax.set_xlabel(r'Torsion $\theta$ (°)')
    ax.set_ylabel(r'Torsion $\alpha$ (°)')
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)

    # Set limits
    max_range = max(
        abs(torsion_df['theta'].max()), abs(torsion_df['theta'].min()),
        abs(torsion_df['alpha'].max()), abs(torsion_df['alpha'].min())
    )

    # Uncomment if you want specific limits
    # ax.set_xlim(-280*1.1, 100*1.1)
    # ax.set_ylim(-max_range*1.1, max_range*1.1)

    plt.tight_layout()
    return fig, ax, torsion_df

In [8]:
all_df = concat_all()
sasa_global = concat_global('sasa')
packing_global = concat_global('packing')
radius_global = concat_global('radius')
density_df = calculate_density()
print(all_df.shape, packing_global.shape, sasa_global.shape, density_df.shape)
print(packing_global.head())
print(all_df.head())

(24, 25) (2412, 4) (2412, 4) (2412, 6)
     time    packing       experiment  ratioB
0     0.0  16.260718  motor2m_frac-10      10
1  1000.0  16.695486  motor2m_frac-10      10
2  2000.0  16.919118  motor2m_frac-10      10
3  3000.0  16.977865  motor2m_frac-10      10
4  4000.0  17.047296  motor2m_frac-10      10
         experiment  ratioB molecule        RG     CI-RG    packing  \
0   motor2m_frac-10      10        A  8.603780  0.010997  17.297524   
1   motor2m_frac-10      10        B  8.629517  0.033504  17.297524   
2  motor2m_frac2-10      10        A  8.647189  0.009813  16.954748   
3  motor2m_frac2-10      10        B  8.652479  0.040942  16.954748   
4  motor2m_frac2-90      90        A  8.640262  0.027931  17.148262   

   CI-packing      sasa   CI-sasa    torsion  ...  torsion-theta  \
0    0.011721  1.972909  0.003552   0.256950  ...    -105.395219   
1    0.011721  1.972909  0.003552  14.949945  ...     -22.235020   
2    0.009087  1.995068  0.003538        NaN  ...     

In [9]:
def print_CI(global_df, parameter):
    """
    Compute and print 95% confidence intervals for parameter means grouped by ratioB.
    
    Args:
        global_df: DataFrame with columns: time, parameter, experiment, ratioB
        parameter: Name of the parameter column to analyze
    """    
    # Create a DataFrame to store results
    results = []
    # Process each experiment
    for exp_name, exp_data in global_df.groupby('experiment'):
        # Sort by time to ensure we get the last 10%
        exp_data = exp_data.sort_values('time')
        
        # Calculate the index for the last 10% of the data
        last_10_percent_idx = int(0.9 * len(exp_data))
        
        # Get the last 10% of the data
        last_10_percent = exp_data.iloc[last_10_percent_idx:]
        
        # Compute average and std of the parameter in the last 10%
        avg = last_10_percent[parameter].mean()
        std = last_10_percent[parameter].std()
        
        # Get the ratioB value for this experiment
        ratio_b = exp_data['ratioB'].iloc[0]
        if ratio_b == 10:
            ratio_b = 0
        
        # Store results
        results.append({
            'experiment': exp_name,
            'ratioB': ratio_b,
            'mean': avg,
            'std': std
        })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Group by ratioB
    grouped = results_df.groupby('ratioB')
    
    print(f"95% Confidence Intervals for {parameter} by ratioB:")
    print("-" * 50)
    
    # Calculate confidence intervals for each group
    for ratio, group in grouped:
        # Number of experiments in this group
        n_experiments = len(group)
        
        # Calculate the weighted mean
        weighted_mean = group['mean'].mean()
        
        # Propagate the standard error
        # For independent measurements, standard error = sqrt(sum(std_i^2)) / N
        propagated_std = np.sqrt(np.sum(group['std']**2) / n_experiments)
        
        # Calculate 95% confidence interval (using t-distribution for small samples)
        t_value = stats.t.ppf(0.975, n_experiments - 1)  # 95% CI is +/- 1.96 for large samples
        ci_lower = weighted_mean - t_value * propagated_std
        ci_upper = weighted_mean + t_value * propagated_std
        
        print(f"ratioB = {ratio:.2f} (n={n_experiments}):")
        print(f"  Mean {parameter}: {weighted_mean:.4f}")
        print(f"  stdE {parameter}: {propagated_std:.4f}")
        print(f"  95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
        print(f"  CI width: {ci_upper - ci_lower:.4f}")
        print()

print_CI(density_df, 'density')
print_CI(sasa_global, 'sasa')
print_CI(radius_global, 'radius')


95% Confidence Intervals for density by ratioB:
--------------------------------------------------
ratioB = 0.00 (n=7):
  Mean density: 0.7475
  stdE density: 0.0024
  95% CI: [0.7416, 0.7533]
  CI width: 0.0117

ratioB = 90.00 (n=5):
  Mean density: 0.7396
  stdE density: 0.0025
  95% CI: [0.7326, 0.7465]
  CI width: 0.0139

95% Confidence Intervals for sasa by ratioB:
--------------------------------------------------
ratioB = 0.00 (n=7):
  Mean sasa: 1.9525
  stdE sasa: 0.0281
  95% CI: [1.8836, 2.0213]
  CI width: 0.1377

ratioB = 90.00 (n=5):
  Mean sasa: 1.9641
  stdE sasa: 0.0177
  95% CI: [1.9149, 2.0134]
  CI width: 0.0985

95% Confidence Intervals for radius by ratioB:
--------------------------------------------------
ratioB = 0.00 (n=7):
  Mean radius: 2.7898
  stdE radius: 0.0054
  95% CI: [2.7766, 2.8031]
  CI width: 0.0265

ratioB = 90.00 (n=5):
  Mean radius: 2.8201
  stdE radius: 0.0057
  95% CI: [2.8043, 2.8360]
  CI width: 0.0317



In [10]:
def print_molecule_CI(all_df, parameter):
    """
    Compute and print 95% confidence intervals for parameter means grouped by molecule type.
    
    Args:
        all_df: DataFrame generated by concat_all() with columns: experiment, ratioB, molecule, parameter, etc.
        parameter: Name of the parameter column to analyze
    """
   
    # Check if parameter exists in the dataframe
    if parameter not in all_df.columns:
        print(f"Error: '{parameter}' not found in the dataframe.")
        return
    
    # Check if CI column exists
    ci_column = f'CI-{parameter}'
    has_ci_column = ci_column in all_df.columns
    
    # Group by molecule
    grouped = all_df.groupby('molecule')
    
    print(f"95% Confidence Intervals for {parameter} by molecule type:")
    print("-" * 50)
    
    # Calculate confidence intervals for each molecule type
    for molecule_type, group in grouped:
        # Remove NA values
        valid_data = group[~group[parameter].isna()]
        
        if valid_data.empty:
            print(f"No valid data for molecule type {molecule_type}")
            continue
            
        # Count number of molecules of this type across all experiments
        # Each row in the dataframe represents one molecule in one experiment
        n_molecules = len(valid_data)
        
        # Calculate the mean across all molecules
        weighted_mean = valid_data[parameter].mean()
        
        # If CI column exists, use it for propagation
        if has_ci_column:
            # Standard error from existing CIs
            # CI = 1.96 * std / sqrt(n), so std = CI * sqrt(n) / 1.96
            stds = valid_data[ci_column] * np.sqrt(1) / 1.96  # Each CI was for n=1
            propagated_std = np.sqrt(np.sum(stds**2) / (n_molecules))
        else:
            # Use standard deviation directly
            propagated_std = valid_data[parameter].std() / np.sqrt(n_molecules)
        
        # Calculate 95% confidence interval (using t-distribution for small samples)
        t_value = stats.t.ppf(0.975, n_molecules - 1)
        ci_lower = weighted_mean - t_value * propagated_std
        ci_upper = weighted_mean + t_value * propagated_std
        
        print(f"Molecule type: {molecule_type} (n={n_molecules}):")
        print(f"  Mean {parameter}: {weighted_mean:.4f}")
        print(f"  95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
        print(f"  stdE {parameter}: {propagated_std:.4f}")
        print(f"  CI width: {ci_upper - ci_lower:.4f}")
        print()
print_molecule_CI(all_df, 'RG')
print_molecule_CI(all_df, 'voronota-volume')
print_molecule_CI(all_df, 'voronota-area')

95% Confidence Intervals for RG by molecule type:
--------------------------------------------------
Molecule type: A (n=12):
  Mean RG: 8.7261
  95% CI: [8.7017, 8.7505]
  stdE RG: 0.0111
  CI width: 0.0488

Molecule type: B (n=9):
  Mean RG: 8.7821
  95% CI: [8.7542, 8.8100]
  stdE RG: 0.0121
  CI width: 0.0558

95% Confidence Intervals for voronota-volume by molecule type:
--------------------------------------------------
Molecule type: A (n=12):
  Mean voronota-volume: 700.9644
  95% CI: [694.1988, 707.7301]
  stdE voronota-volume: 3.0739
  CI width: 13.5314

Molecule type: B (n=9):
  Mean voronota-volume: 699.3775
  95% CI: [691.7011, 707.0539]
  stdE voronota-volume: 3.3289
  CI width: 15.3529

95% Confidence Intervals for voronota-area by molecule type:
--------------------------------------------------
Molecule type: A (n=12):
  Mean voronota-area: 389.8923
  95% CI: [385.3674, 394.4172]
  stdE voronota-area: 2.0559
  CI width: 9.0499

Molecule type: B (n=9):
  Mean voronota-a

In [None]:

# Create density vs time plot
print("Creating density vs time plot...")
fig1, _ = plot_with_rolling_ci(density_df, 'density')
plt.ylabel('Density (motors/nm$^3$)')
fig1.savefig('../figs/density_vs_time.png', dpi=300, bbox_inches='tight')
plt.show()

print("Creating packing vs time plot...")
fig, ax = plt.subplots(figsize=(12, 8))
color='blue'
window=5
experiments = density_df['experiment'].unique()
param_name='packing'

for exp in experiments:
    subset = density_df[density_df['experiment'] == exp].copy()
    subset = subset.sort_values('time')
    subset[f'{param_name}_rolling'] = subset[param_name].rolling(window=window, center=True).mean()
    subset = subset.dropna(subset=[f'{param_name}_rolling'])
    ax.plot(subset['time']/1000, subset[f'{param_name}_rolling'],
            color=color, linewidth=2)
ax.set_xlabel('Time (ns)')
ax.set_ylabel('Packing (motors/nm)')
ax.grid(True, alpha=0.3)
plt.tight_layout()
fig.savefig('../figs/packing_vs_time.png', dpi=300, bbox_inches='tight')
plt.show()


# Create SASA vs time plot
print("Creating SASA vs time plot...")
fig2, _ = plot_with_rolling_ci(sasa_global, 'sasa')
plt.ylabel('SASA (nm$^2$/motor)')
fig2.savefig('../figs/sasa_vs_time.png', dpi=300, bbox_inches='tight')
plt.show()

# Create SASA vs inverse packing plot
print("Creating SASA vs inverse packing plot...")
fig3, _ = plot_sasa_vs_inverse_packing(pd.merge(
    sasa_global, packing_global[['time', 'experiment', 'packing']], 
    on=['time', 'experiment']
))
fig3.savefig('../figs/sasa_vs_inverse_packing.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
all_df['CI-ratioB'] = all_df['ratioB'] * 0
parameters = [col for col in all_df.columns if col not in 
                     ['experiment', 'ratio', 'molecule', 'g-r', 'packing', 'torsion', 'torsion-theta', 'torsion-phi'] 
                     and not col.startswith('CI-')]
        
for param in parameters:
    print(f"Creating {param} vs packing plot...")
    try:
        fig, _ = plot_parameter_vs_packing(all_df, param)
        fig.savefig(f'../figs/{param}_vs_packing.png', dpi=300, bbox_inches='tight')
    except Exception as e:
        print(f"Error creating plot for {param}: {e}")
    

In [12]:
print('AU')
print(df[(df.molecule == 'B') & (df.theta < -100)].head())
print('Center upper')
print(df[(df.molecule == 'B') & (df.theta < -10) & (df.theta > -100) & (df.alpha > 100)])
print('Center lower')
print(df[(df.molecule == 'B') & (df.theta < -10) & (df.theta > -100) & (df.alpha < 100)])

AU
            experiment  resid molecule  ratioB       theta      alpha
466   motor2m_frac2-90    4.0        B      90 -163.621834  60.270402
566   motor2m_frac2-90    4.0        B      90 -182.321937  52.377337
1166  motor2m_frac3-90    4.0        B      90 -165.929586  60.808019
1266  motor2m_frac4-90    4.0        B      90 -181.523579  51.490146
Center upper
                     experiment  resid molecule  ratioB      theta       alpha
1233           motor2m_frac4-90    3.0        B      90 -66.126522  107.870764
1333           motor2m_frac4-90    3.0        B      90 -52.192386  138.810043
1352           motor2m_frac4-90    3.0        B      90 -64.455858  116.034166
1378           motor2m_frac4-90    4.0        B      90 -64.013042  127.952670
2183  motor2m_squeeze10_frac-90    4.0        B      90 -63.170582  112.542388
Center lower
            experiment  resid molecule  ratioB      theta      alpha
93     motor2m_frac-10    3.0        B      10 -45.694376  81.699121
193    mo

## G(r)

In [13]:
def analyze_weighted_gr(experiment_ratios, r_range=None, rolling_window=None):
    """
    Analyze g(r) data across multiple experiments with weighted averaging.
    
    Parameters:
    -----------
    experiment_ratios : dict
        Dictionary with experiment names as keys and molecule ratios as values
    r_range : tuple, optional
        (min_r, max_r) to filter data range for plotting and analysis
    rolling_window : int, optional
        Window size for rolling average smoothing in plots. If None, no smoothing is applied.
        
    Returns:
    --------
    dict
        Dictionary containing DataFrames with processed data and peak information
    """
    # Dictionary to store dataframes from each experiment
    experiment_data = {}
    
    # Dictionary to store molecule counts for each experiment
    molecule_counts = {}
    
    # Read data for each experiment
    for experiment_name, ratio in experiment_ratios.items():
        # Calculate molecule counts
        N_1 = N_2 = int((100-ratio)/100 * NMOLS/2)
        N_3 = N_4 = int(ratio/100 * NMOLS/2)
        
        molecule_counts[experiment_name] = {
            1: N_1,
            2: N_2,
            3: N_3,
            4: N_4
        }
        
        # Read CSV data
        file_path = f'../data/{experiment_name}_g-r.csv'
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}")
            continue
            
        df = pd.read_csv(file_path)
        experiment_data[experiment_name] = df
    
    if not experiment_data:
        raise ValueError("No valid data files found")
    
    # Get r values from the first dataframe
    r_values = experiment_data[list(experiment_data.keys())[0]]['r (Å)'].values
    
    # Apply r_range filter if provided
    if r_range:
        min_r, max_r = r_range
        r_mask = (r_values >= min_r) & (r_values <= max_r)
        r_values = r_values[r_mask]
    else:
        r_mask = slice(None)  # No masking
    
    # Initialize dictionaries for results
    weighted_gr = {}
    weighted_std = {}
    weights_sum = {}
    

    # Dictionary for mapping interaction pairs
    interaction_mapping = {
        'A-A': ['g(r) 1-1', 'g(r) 2-2'],
        'AR-AS': ['g(r) 1-2'],
        'A-B': ['g(r) 1-3', 'g(r) 2-4'],
        'AR-BS': ['g(r) 1-4', 'g(r) 2-3'],
        'B-B': ['g(r) 3-3', 'g(r) 4-4'],
        'BR-BS': ['g(r) 3-4']
    }    # Dictionary to track peak positions across datasets
    peak_positions = {interaction_name: [] for interaction_name in interaction_mapping.keys()}
    
    
    # Process each type of interaction
    for interaction_name, column_keys in interaction_mapping.items():
        weighted_gr[interaction_name] = np.zeros_like(r_values)
        weights_sum[interaction_name] = 0
        
        # Store values from all experiments for standard deviation calculation
        all_values = []
        all_weights = []
        
        # Process each experiment
        for experiment_name, df in experiment_data.items():
            weights = []
            values_list = []
            
            # Process each column for this interaction type
            for col_key in column_keys:
                # Extract resids from the column name
                parts = col_key.split()[-1].split('-')
                resid_i, resid_j = int(parts[0]), int(parts[1])
                
                # Get molecule counts
                N_i = molecule_counts[experiment_name][resid_i]
                N_j = molecule_counts[experiment_name][resid_j]
                
                # Calculate weight based on number of possible interactions
                if resid_i == resid_j:
                    # For same type interactions (e.g., 1-1)
                    weight = N_i * (N_i - 1)
                else:
                    # For different type interactions
                    weight = N_i * N_j
                
                # Skip if column doesn't exist
                if col_key not in df.columns:
                    continue
                
                # Apply r_range filter if provided
                values = df[col_key].values[r_mask]
                
                # Store for this specific column
                values_list.append(values)
                weights.append(weight)
            
            # If we have values for this experiment and interaction
            if values_list:
                # Average across symmetric columns first (within this experiment)
                exp_avg_values = np.zeros_like(r_values)
                total_weight = sum(weights)
                for val, w in zip(values_list, weights):
                    exp_avg_values += val * (w / total_weight)
                
                # Store for cross-experiment calculations
                all_values.append(exp_avg_values)
                all_weights.append(total_weight)
                
                # Find peak position for this experiment and interaction
                peak_idx = np.argmax(exp_avg_values)
                peak_r = r_values[peak_idx]
                peak_positions[interaction_name].append((peak_r, total_weight))
                
                # Update global weighted sum
                weighted_gr[interaction_name] += exp_avg_values * total_weight
                weights_sum[interaction_name] += total_weight
        
        # Calculate weighted average across experiments
        if weights_sum[interaction_name] > 0:
            weighted_gr[interaction_name] /= weights_sum[interaction_name]
            
            # Calculate weighted standard deviation
            all_values = np.array(all_values)
            all_weights = np.array(all_weights)
            
            # Weighted sample variance calculation
            diff_squared = np.zeros_like(r_values)
            for vals, weight in zip(all_values, all_weights):
                diff_squared += weight * (vals - weighted_gr[interaction_name])**2
            
            # Bessel's correction for weighted std (for sample rather than population)
            correction = 1 - np.sum(all_weights**2) / (np.sum(all_weights)**2)
            weighted_var = diff_squared / (np.sum(all_weights) * correction)
            weighted_std[interaction_name] = np.sqrt(weighted_var)
    
    # Create result dataframe with r values and weighted g(r) for each interaction
    result_df = pd.DataFrame({'r (Å)': r_values})
    
    # Add weighted g(r) and 95% CI to dataframe
    n_experiments = len(experiment_data)
    for interaction_name in interaction_mapping.keys():
        result_df[f'g(r) {interaction_name}'] = weighted_gr[interaction_name]
        ci_95 = 1.96 * weighted_std[interaction_name] / np.sqrt(n_experiments)
        result_df[f'CI95 {interaction_name}'] = ci_95
    
    # Find peaks in g(r) for each interaction
    peaks_info = {}
    for interaction_name in interaction_mapping.keys():
        gr_values = weighted_gr[interaction_name]
        # Simple peak detection - find maximum in weighted average
        peak_idx = np.argmax(gr_values)
        peak_r = r_values[peak_idx]
        peak_gr = gr_values[peak_idx]
        peak_gr_ci = 1.96 * weighted_std[interaction_name][peak_idx] / np.sqrt(n_experiments)
        
        # Calculate statistics for peak positions across datasets
        if peak_positions[interaction_name]:
            # Extract r values and weights
            positions, weights = zip(*peak_positions[interaction_name])
            positions = np.array(positions)
            weights = np.array(weights)
            
            # Calculate weighted mean of peak positions
            weighted_peak_r = np.sum(positions * weights) / np.sum(weights)
            
            # Calculate weighted standard deviation of peak positions
            diff_squared = np.sum(weights * (positions - weighted_peak_r)**2)
            correction = 1 - np.sum(weights**2) / (np.sum(weights)**2)
            weighted_var = diff_squared / (np.sum(weights) * correction)
            weighted_std_r = np.sqrt(weighted_var)
            
            # Calculate 95% CI for peak position
            peak_r_ci = 1.96 * weighted_std_r / np.sqrt(len(positions))
        else:
            weighted_peak_r = peak_r
            peak_r_ci = 0
        
        peaks_info[interaction_name] = {
            'r': weighted_peak_r,
            'r_CI95': peak_r_ci,
            'g(r)': peak_gr,
            'g(r)_CI95': peak_gr_ci
        }
    
    # Plot the results
    plt.figure(figsize=(12, 8))
    
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    for i, (interaction_name, color) in enumerate(zip(interaction_mapping.keys(), colors)):
        gr_values = weighted_gr[interaction_name]
        ci_values = 1.96 * weighted_std[interaction_name] / np.sqrt(n_experiments)
        
        # Apply rolling average if specified
        if rolling_window and rolling_window > 1:
            # Calculate rolling average for smoother plotting
            pd_series = pd.Series(gr_values)
            smoothed_gr = pd_series.rolling(window=rolling_window, center=True).mean()
            
            # Handle NaN values at the edges due to rolling window
            # Use original values at the edges
            edge_points = rolling_window // 2
            smoothed_gr[:edge_points] = gr_values[:edge_points]
            smoothed_gr[-edge_points:] = gr_values[-edge_points:]
            
            # Also smooth the confidence intervals
            pd_ci = pd.Series(ci_values)
            smoothed_ci = pd_ci.rolling(window=rolling_window, center=True).mean()
            smoothed_ci[:edge_points] = ci_values[:edge_points]
            smoothed_ci[-edge_points:] = ci_values[-edge_points:]
            
            # Plot both original (transparent) and smoothed data
            plt.plot(r_values, gr_values, color=color, alpha=0.2)
            plt.plot(r_values, smoothed_gr, label=f'{interaction_name} (smoothed)', color=color)
            plt.fill_between(r_values, 
                            smoothed_gr - smoothed_ci, 
                            smoothed_gr + smoothed_ci, 
                            color=color, alpha=0.2)
        else:
            # Plot original data without smoothing
            plt.plot(r_values, gr_values, label=f'{interaction_name}', color=color)
            plt.fill_between(r_values, 
                            gr_values - ci_values, 
                            gr_values + ci_values, 
                            color=color, alpha=0.2)
    
    plt.xlabel('r (Å)')
    plt.ylabel('g(r)')
    title = 'Weighted Average Radial Distribution Functions with 95% CI'
    if rolling_window and rolling_window > 1:
        title += f' (Smoothed with {rolling_window}-point rolling average)'
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    
    # Print peak information
    print("First peaks in g(r) for each interaction:")
    for interaction_name, peak_data in peaks_info.items():
        g_lower = peak_data['g(r)'] - peak_data['g(r)_CI95']
        g_upper = peak_data['g(r)'] + peak_data['g(r)_CI95']
        r_lower = peak_data['r'] - peak_data['r_CI95'] 
        r_upper = peak_data['r'] + peak_data['r_CI95']
        print(f"{interaction_name}: g(r) = {peak_data['g(r)']:.4f} [{g_lower:.4f}, {g_upper:.4f}] at r = {peak_data['r']:.4f} [{r_lower:.4f}, {r_upper:.4f}] Å")
    
    
    # Create separate smoothed dataframe if rolling average was applied
    smoothed_df = None
    if rolling_window and rolling_window > 1:
        smoothed_df = pd.DataFrame({'r (Å)': r_values})
        
        for interaction_name in interaction_mapping.keys():
            # Apply rolling mean to g(r) values
            pd_series = pd.Series(weighted_gr[interaction_name])
            smoothed_gr = pd_series.rolling(window=rolling_window, center=True).mean()
            
            # Handle NaN values at the edges
            edge_points = rolling_window // 2
            smoothed_gr[:edge_points] = weighted_gr[interaction_name][:edge_points]
            smoothed_gr[-edge_points:] = weighted_gr[interaction_name][-edge_points:]
            
            # Also smooth the confidence intervals
            ci_values = 1.96 * weighted_std[interaction_name] / np.sqrt(n_experiments)
            pd_ci = pd.Series(ci_values)
            smoothed_ci = pd_ci.rolling(window=rolling_window, center=True).mean()
            smoothed_ci[:edge_points] = ci_values[:edge_points]
            smoothed_ci[-edge_points:] = ci_values[-edge_points:]
            
            # Add to dataframe
            smoothed_df[f'g(r) {interaction_name} (smoothed)'] = smoothed_gr
            smoothed_df[f'CI95 {interaction_name} (smoothed)'] = smoothed_ci
    
    return {
        'weighted_gr_df': result_df,
        'smoothed_df': smoothed_df,
        'peaks': peaks_info,
        'r_values': r_values,
        'weighted_gr': weighted_gr,
        'weighted_std': weighted_std
    }

In [None]:
output = analyze_weighted_gr(experiment_ratios,rolling_window=5)

In [2]:
import scipy.constants as constant
import scipy.special as special


def cylindrical_PB_potential(r, debye_length, radius, surface_charge_density, epsilon_r=78.5):
    """
    Calculate the electrostatic potential at distance r from the cylinder axis
    
    Parameters:
    -----------
    r : float
        Distance from the cylinder axis (in meters)
    debye_length : float
        Debye screening length (in meters)
    radius : float
        Radius of the cylinder (in meters)
    surface_charge_density : float
        Surface charge density (in C/m²)
    epsilon_r : float
        Relative permittivity of the medium (default: water at 25°C)
        
    Returns:
    --------
    float
        Potential in Volts
    """
    # Constants
    epsilon_0 = constant.epsilon_0  # Vacuum permittivity (F/m)
    
    # Check if r is outside the cylinder
    if r < radius:
        raise ValueError("r must be greater than or equal to the cylinder radius")
    
    # Calculate the potential
    prefactor = surface_charge_density * radius / (epsilon_r * epsilon_0)
    bessel_ratio = special.k0(r / debye_length) / special.k1(radius / debye_length)
    potential = prefactor * bessel_ratio
    
    return potential

In [None]:
x = 20*1e-9
radius = 2.5*1e-9         # Cylinder radius (1 nm)
r = x/2             # Distance from cylinder axis (5 nm)
dz = 0.05e-9
debye_length = 3.044/(1/constant.N_A/dz)**0.5*x*1e-10     # Debye length (1 nm)
print(f'Debey: {debye_length*1e9:.2f} nm')
surface_charge = -2*constant.e/(radius*dz*2*constant.pi)   # Surface charge density (0.05 C/m²)
print(f'Sigma: {surface_charge:.2f} C/m2')

# Calculate potential at distance r
potential = cylindrical_PB_potential(r, debye_length, radius, surface_charge)
print(f"Potential at {(r-radius)*1e9:.2f} nm from surface: {potential:.4f} V")

# Calculate potential at different distances
distances = np.linspace(radius, radius + 10e-9, 100)
potentials = [cylindrical_PB_potential(d, debye_length, radius, surface_charge) for d in distances]
plt.plot(distances, potentials)