In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns

In [None]:
# ---------------------------------------------------------------------------
# Input locations
# ---------------------------------------------------------------------------
# Alternative (currently disabled) data location, kept for reference:
# dir_data = Path("/Volumes/shared/pyne_group/Shared/Papers/cas9_minicircles/20250908-cas9-paper-our-response-to-reviewers/20250805-quentin-reviewer-response/figure_ext_3_images/csv_data")
# NOTE(review): the paths below are absolute and machine-specific; the asserts
# stop the notebook early when it is run somewhere they do not exist.
dir_data_standard = Path(
    "/Users/sylvi/topo_data/cas9-paper-graphs-updated-nomenclature/fig-1/csv-data-whole-plot"
)
assert dir_data_standard.exists(), f"Data directory {dir_data_standard} does not exist."

dir_data_curvature = Path(
    "/Users/sylvi/topo_data/cas9-paper-our-response-to-reviewers/20250805-quentin-reviewer-response/figure_ext_3_images/csv_data_curvature"
)
assert dir_data_curvature.exists(), f"Data directory {dir_data_curvature} does not exist."

# ---------------------------------------------------------------------------
# Output location
# ---------------------------------------------------------------------------
# Only the parent directory has to exist up front; the workbook itself is
# written later.
output_xlsx_path = Path(
    "/Users/sylvi/topo_data/cas9-paper-graphs-updated-nomenclature/full-plot-data-xlsx/all_plots_xlsx_file.xlsx"
)
assert output_xlsx_path.parent.exists(), f"Output directory {output_xlsx_path.parent} does not exist."

# ---------------------------------------------------------------------------
# Accumulators for the Excel export (name -> dataframe)
# ---------------------------------------------------------------------------
all_plot_data = dict()  # every dataframe that gets loaded
figure_only_plot_data = dict()  # only dataframes whose csv is listed below

# csv file names (with extension) whose data also goes into the
# figure-only workbook.
figure_only_data_names = [
    "kde_contour_length_rel_on_ot1_ot2.csv",
    "kde_feret_ratio_rel_on_ot1_ot2.csv",
    "kde_min_feret_rel_on_ot1_ot2.csv",
    "kde_contour_length_sc_on_ot1_ot2.csv",
    "kde_feret_ratio_sc_on_ot1_ot2.csv",
    "kde_min_feret_sc_on_ot1_ot2.csv",
    "kde_contour_length_sc_cas9_on_ot1_ot2.csv",
    "kde_feret_ratio_sc_cas9_on_ot1_ot2.csv",
    "kde_min_feret_sc_cas9_on_ot1_ot2.csv",
    "kde_protein_area_sc_cas9_on_ot1_ot2.csv",
    "kde_protein_volume_sc_cas9_on_ot1_ot2.csv",
    "ot1_ot2_central_mean_curvature_plot_data.csv",
]

In [None]:
# Check the standard plots
# Every csv in dir_data_standard whose name contains one of the kde plot types
# is loaded and registered for the Excel export; set `plot = True` to also draw
# a KDE per sample type for a visual sanity check.
plot = False
kde_plot_types = [
    "feret_ratio",
    "min_feret",
    "max_feret",
    "contour_length",
    "protein_area",
    "protein_volume",
    "protein_max_height",
]

for kde_plot_type in kde_plot_types:
    print(f"Plotting kde plot type: {kde_plot_type}")

    # glob yields entries in a filesystem-dependent order; sorting keeps the
    # insertion order of the export dictionaries (and therefore the sheet order
    # of the workbook) identical from run to run.
    files_to_plot = sorted(dir_data_standard.glob(f"*{kde_plot_type}*.csv"))
    if not files_to_plot:
        # A silent no-op here would just produce a workbook with missing sheets.
        print(f"WARNING: no csv files matching '*{kde_plot_type}*.csv' found in {dir_data_standard}")

    for file in files_to_plot:
        # Paths returned by glob already include dir_data_standard, so they are
        # read directly (re-joining would double the prefix for a relative dir).
        df = pd.read_csv(file)

        # store the dataframe in the dictionary
        all_plot_data[file.stem] = df
        if file.name in figure_only_data_names:
            figure_only_plot_data[file.stem] = df

        if plot:
            # The csv is expected to hold "sample_type" plus one value column;
            # the first non-"sample_type" column is the one that gets plotted.
            value_col = [col for col in df.columns if col != "sample_type"][0]
            print(f"other columns: {value_col}")

            fig, ax = plt.subplots(figsize=(8, 6))
            # plot kde by sample_type
            for sample_type in df["sample_type"].unique():
                subset = df[df["sample_type"] == sample_type]
                sns.kdeplot(subset[value_col], label=sample_type, fill=True, alpha=0.5, ax=ax)
            ax.set_title(f"KDE Plot for {file.stem}")
            ax.set_xlabel("Value")
            ax.set_ylabel("Density")
            ax.legend()
            plt.show()
            # release the figure so many files do not accumulate open figures
            plt.close(fig)

    print("=" * 120)
    print("=" * 120)
    print("\n\n\n")

In [None]:
# Curvature data: load the csv, print a per-sample summary, draw a violin plot
# as a visual check, then register the dataframe for the Excel export.
curvature_data_filename = "ot1_ot2_central_mean_curvature_plot_data.csv"
df_middle_curvature_stats_ot1_ot2 = pd.read_csv(dir_data_curvature / curvature_data_filename)

# Per-sample-type mean of the curvature column and number of rows.
for sample_type in df_middle_curvature_stats_ot1_ot2["sample_type"].unique():
    is_this_sample = df_middle_curvature_stats_ot1_ot2["sample_type"] == sample_type
    subset = df_middle_curvature_stats_ot1_ot2.loc[is_this_sample]
    print(f"Sample type: {sample_type}, Mean curvature mean: {subset['mean_curvature'].mean()}, N: {len(subset)}")

# Violin plot of the curvature values, one violin per sample type.
# NOTE(review): `scale` appears to be deprecated in newer seaborn releases in
# favour of `density_norm` -- confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=df_middle_curvature_stats_ot1_ot2,
    x="sample_type",
    y="mean_curvature",
    inner="point",
    scale="width",
    ax=ax,
)
ax.set_title("Central Mean Curvature Distribution for OT1 and OT2 Samples")
ax.set_xlabel("Sample Type")
ax.set_ylabel("Mean Curvature (1/nm)")
plt.show()

# Register under a fixed key; the figure-only dictionary is filled only when
# the csv file name is listed in figure_only_data_names.
curvature_plot_key = "central_mean_curvature_ot1_ot2"
all_plot_data[curvature_plot_key] = df_middle_curvature_stats_ot1_ot2
if curvature_data_filename in figure_only_data_names:
    figure_only_plot_data[curvature_plot_key] = df_middle_curvature_stats_ot1_ot2

In [None]:
def shorten_sheet_name(sheet_name: str) -> str:
    """Abbreviate a plot-data name so it can be used as an Excel sheet name.

    A leading ``kde_`` is dropped and the long metric names are replaced by
    short codes. The original and abbreviated names are printed side by side.

    Args:
        sheet_name: The name to abbreviate.

    Returns:
        The abbreviated name.

    Raises:
        ValueError: If the abbreviated name is still longer than 31 characters.
    """
    # (long form, short code) pairs, applied in this order
    abbreviations = (
        ("feret_ratio", "f-ratio"),
        ("contour_length", "c-len"),
        ("protein_area", "p-area"),
        ("protein_volume", "p-vol"),
        ("protein_max_height", "p-max-h"),
    )
    prefix = "kde_"

    # Only a prefix at the very start is removed; later occurrences are kept.
    shortened = sheet_name[len(prefix):] if sheet_name.startswith(prefix) else sheet_name
    for long_form, short_code in abbreviations:
        shortened = shortened.replace(long_form, short_code)

    if len(shortened) > 31:
        raise ValueError(f"Sheet name {shortened} is too long for Excel ({len(shortened)} characters > 31).")

    print(f"Renamed sheet {sheet_name:<45} ->  {shortened}")
    return shortened


def save_plot_data_to_excel(plot_data: dict, xlsx_path: Path) -> None:
    """Write every dataframe in ``plot_data`` to its own sheet of one workbook.

    Sheet names are the dictionary keys passed through ``shorten_sheet_name``.
    All names are resolved before the file is opened, so a naming problem does
    not leave a partially written workbook behind.

    Args:
        plot_data: Mapping of plot name to the dataframe to export.
        xlsx_path: Destination ``.xlsx`` file.

    Raises:
        ValueError: If ``plot_data`` is empty, if a shortened name exceeds the
            Excel length limit (raised by ``shorten_sheet_name``), or if two
            plot names shorten to the same sheet name.
    """
    if not plot_data:
        # An empty export means the expected csv files were never loaded.
        raise ValueError(f"No plot data to save to {xlsx_path}.")

    frames_by_sheet = {}
    source_name_by_sheet = {}
    for plot_name, frame in plot_data.items():
        short_sheet_name = shorten_sheet_name(plot_name)
        # Excel compares sheet names case-insensitively; writing two frames to
        # one sheet name would silently mix their cells in a single sheet.
        collision_key = short_sheet_name.casefold()
        if collision_key in source_name_by_sheet:
            raise ValueError(
                f"Sheet name {short_sheet_name} is produced by both "
                f"{source_name_by_sheet[collision_key]} and {plot_name}."
            )
        source_name_by_sheet[collision_key] = plot_name
        frames_by_sheet[short_sheet_name] = frame

    with pd.ExcelWriter(xlsx_path) as writer:
        for short_sheet_name, frame in frames_by_sheet.items():
            # index=False stops pandas from writing an extra index column
            frame.to_excel(writer, sheet_name=short_sheet_name, index=False)


# save all the plots to excel
save_plot_data_to_excel(all_plot_data, output_xlsx_path)
print(f"Saved all plot data to {output_xlsx_path}")

# save only the figure data to another excel file
figure_only_output_xlsx_path = output_xlsx_path.parent / "ext_data_figure_3_only_plots_data.xlsx"
save_plot_data_to_excel(figure_only_plot_data, figure_only_output_xlsx_path)
print(f"Saved figure only plot data to {figure_only_output_xlsx_path}")