In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger

# Replace loguru's default handler with a single colourised stderr handler.
logger.remove()
# Format: white time, blue origin (name:function:line), yellow level name, green message
logger.add(
    sys.stderr,
    colorize=True,
    format=(
        "<white>{time:HH:mm:ss}</white> "
        "<blue>{name}:{function}:{line}</blue> "
        "<yellow>{level.name}</yellow> "
        "<green>{message}</green>"
    ),
)

logger.info("Logger set up")

In [None]:
# Root data directory; the loading cell reads DIR/<sample>/all_statistics.csv.
# Alternative data location, currently disabled:
# DIR = Path("/Users/sylvi/topo_data/hariborings/testing_all_unbound_data/")
DIR = Path(
    "/Volumes/shared/pyne_group/Shared/AFM_Data/Cas9_Minicircles/feret_diameter_&_binding_angle_comparison_&_cas9_cropped_data/dna_only_output_csvs/"
)
# Fail early and name the missing path (e.g. the volume is not mounted) instead
# of failing later inside pd.read_csv. is_dir() is used rather than exists()
# because DIR is only ever used as a directory.
assert DIR.is_dir(), f"Data directory not found: {DIR}"

# # Load the all_statistics.csv file
# df = pd.read_csv(DIR / "all_statistics.csv")

In [None]:
# Names of the samples to compare; each one is read from
# DIR/<sample>/all_statistics.csv below.
samples = ["ON_REL", "ON_SC", "OT1_REL", "OT1_SC", "OT2_REL", "OT2_SC"]


# (Disabled) directory set-up for the optional numpy export further down.
# NUMPY_DIR = Path(
#     "/Volumes/shared/pyne_group/Shared/AFM_Data/Cas9_Minicircles/feret_diameter_&_binding_angle_comparison_&_cas9_cropped_data/dna_only_feret_&_area_stats/"
# )
# if not NUMPY_DIR.exists():
#     NUMPY_DIR.mkdir()
# for sample in samples:
#     sample_dir = NUMPY_DIR / f"{sample}"
#     if not sample_dir.exists():
#         sample_dir.mkdir()

# Maps sample name -> DataFrame of that sample's statistics.
sample_dataframes = {}

for sample in samples:
    # The first CSV column becomes the DataFrame index.
    sample_data_file = DIR / sample / "all_statistics.csv"
    sample_data = pd.read_csv(sample_data_file, index_col=0)

    # Derived statistic: min_feret divided by area, row by row.
    sample_data["min_feret_area_ratio"] = sample_data["min_feret"].div(sample_data["area"])

    sample_dataframes[sample] = sample_data

    # # Save the stats as numpy files for easy loading later
    # # min_feret
    # np.save(NUMPY_DIR / f"{sample}" / "min_feret.npy", sample_data["min_feret"].values)
    # # max_feret
    # np.save(NUMPY_DIR / f"{sample}" / "max_feret.npy", sample_data["max_feret"].values)
    # # area
    # np.save(NUMPY_DIR / f"{sample}" / "area.npy", sample_data["area"].values)
    # # min_feret_area_ratio
    # np.save(
    #     NUMPY_DIR / f"{sample}" / "min_feret_area_ratio.npy",
    #     sample_data["min_feret_area_ratio"].values,
    # )

In [None]:
# # try loading the numpy files again
# for sample in samples:
#     # min_feret
#     min_feret = np.load(NUMPY_DIR / f"{sample}" / "min_feret.npy")
#     # max_feret
#     max_feret = np.load(NUMPY_DIR / f"{sample}" / "max_feret.npy")
#     # area
#     area = np.load(NUMPY_DIR / f"{sample}" / "area.npy")
#     # min_feret_area_ratio
#     min_feret_area_ratio = np.load(NUMPY_DIR / f"{sample}" / "min_feret_area_ratio.npy")

#     # Check that they are the same
#     assert np.allclose(min_feret, sample_dataframes[sample]["min_feret"].values)
#     assert np.allclose(max_feret, sample_dataframes[sample]["max_feret"].values)
#     assert np.allclose(area, sample_dataframes[sample]["area"].values)
#     assert np.allclose(
#         min_feret_area_ratio,
#         sample_dataframes[sample]["min_feret_area_ratio"].values,
#     )

In [None]:
# Statistics to plot, each given as (column name, lower x-limit, upper x-limit).
stats = [
    ("min_feret", 0.0, 2.5e-8),
    ("max_feret", 0.0, 0.4e-7),
    ("area", 0.0, 0.25e-15),
    ("min_feret_area_ratio", 0.0, 2.5e8),
]

# When False, the x-limits above are ignored and matplotlib autoscales the axis.
apply_xlim = True

# The seaborn theme is global plotting state, so set it once, not once per figure.
sns.set_theme(style="whitegrid")

# One figure per statistic, with one KDE curve per sample overlaid on shared axes.
for stat, lower_xlim, upper_xlim in stats:
    logger.info(f"Plotting {stat}")

    fig, ax = plt.subplots(figsize=(8, 6))

    for sample in samples:
        sample_data = sample_dataframes[sample]

        # Keep only rows with a finite value for this statistic. NaN marks a
        # missing value, and +/-inf can occur in quotient columns such as
        # min_feret_area_ratio; neither can contribute to a density estimate.
        # Filtering both here keeps the legend's n equal to the number of
        # values actually passed to the KDE.
        finite_data = sample_data[np.isfinite(sample_data[stat])]

        # Plot KDE plot
        sns.kdeplot(data=finite_data, x=stat, label=f"{sample} n: {len(finite_data)}", ax=ax)

    ax.set_xlabel(stat)
    ax.set_ylabel("Density")
    ax.set_title(f"Comparison of {stat} distributions")
    if apply_xlim:
        # Only the view is cropped; the KDE is still fitted to all finite values.
        ax.set_xlim(lower_xlim, upper_xlim)
    ax.legend()
    plt.show()