In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger

# Use seaborn's "white" style for the figures drawn in this notebook
sns.set_theme(style="white")

# Log line layout: white time, blue origin (name:function:line),
# yellow level name and green message
LOG_FORMAT = "<white>{time:HH:mm:ss}</white> <blue>{name}:{function}:{line}</blue> <yellow>{level.name}</yellow> <green>{message}</green>"

# Remove the existing handlers first so the coloured stderr sink added below
# is the only place log records are written to
logger.remove()
logger.add(sys.stderr, format=LOG_FORMAT, colorize=True)

logger.info("Logger set up")

In [None]:
# Root directory of the DNA-only results. Later cells read
# `output_<sample>/all_statistics.csv` from it and write `<sample>_<stat>.npy`
# files into it.
# NOTE(review): absolute, user-specific path - must be edited to run this
# notebook on another machine.
DIR = Path("/Users/sylvi/topo_data/hariborings/testing_all_unbound_data/")

# # Load the all_statistics.csv file
# df = pd.read_csv(DIR / "all_statistics.csv")

In [None]:
# Cas9 feret data loading
# NOTE(review): absolute path on a mounted network share - this cell can only
# run on a machine where that volume is available.
CAS9_RESULTS_DIR = Path(
    "/Volumes/shared/pyne_group/Shared/AFM_Data/Cas9_Minicircles/feret_diameter_&_binding_angle_comparison_&_cas9_cropped_data/cas9_crops_p2nm_with_feret_diameters"
)
assert CAS9_RESULTS_DIR.exists(), f"Cas9 results directory not found: {CAS9_RESULTS_DIR}"

cas9_samples = ["ON_SC", "OT1_SC", "OT2_SC"]
# Placeholders for dataframe versions of the sample numpy files
# (not populated in this cell)
cas9_df = pd.DataFrame()
cas9_high_res_df = pd.DataFrame()

# Each entry is (statistic name, lower x-limit, upper x-limit); None leaves
# that limit to matplotlib. This has to be a list of 3-tuples: the previous
# `{...}` literal with bare items built a *set* of strings and None, so the
# three-way unpacking in the loops below raised before any data was loaded.
cas9_plot_stats = [
    ("min_ferets", None, None),
    ("min_ferets_high_res_only", None, None),
    ("max_ferets", None, None),
    ("max_ferets_high_res_only", None, None),
    ("angle_differences", None, None),
    ("angle_differences_high_res_only", None, None),
]

# Nested mapping: statistic name -> sample name -> array loaded from
# <sample>_p2nm/output_plots/<stat>.npy
cas9_plot_data = {}
for stat, _lower_bound, _upper_bound in cas9_plot_stats:
    for sample in cas9_samples:
        # Load the data
        data = np.load(CAS9_RESULTS_DIR / f"{sample}_p2nm" / "output_plots" / f"{stat}.npy")

        # Create the per-statistic entry on first use, then store the sample
        cas9_plot_data.setdefault(stat, {})[sample] = data

# Plot: one figure per statistic so that feret and angle distributions are
# not drawn on top of each other on a shared axis
for stat, lower_bound, upper_bound in cas9_plot_stats:
    fig, ax = plt.subplots()
    for sample, data in cas9_plot_data[stat].items():
        ax.hist(data, bins=100, alpha=0.5, label=f"{sample} {stat}")

    ax.set_title(stat)
    # Only the feret statistics are diameters; fall back to the statistic
    # name for anything else (e.g. the angle differences)
    ax.set_xlabel("Feret Diameter (nm)" if "feret" in stat else stat)
    ax.set_ylabel("Frequency")
    if lower_bound is not None or upper_bound is not None:
        # A None bound leaves that side of the axis unchanged
        ax.set_xlim(lower_bound, upper_bound)
    ax.legend()
    plt.show()

In [None]:
# DNA only stats
samples = ["ON_REL", "ON_SC", "OT1_REL", "OT1_SC", "OT2_REL", "OT2_SC"]

# Maps each sample name to the dataframe read from its all_statistics.csv
sample_dataframes = {}

for sample in samples:
    stats_csv = DIR / f"output_{sample}" / "all_statistics.csv"

    # Read the per-sample statistics and append the derived column holding
    # the ratio of min_feret to area
    sample_dataframes[sample] = pd.read_csv(stats_csv, index_col=0).assign(
        min_feret_area_ratio=lambda frame: frame["min_feret"] / frame["area"]
    )

In [None]:
# Each entry is (statistic column, lower_xlim, upper_xlim)
stats = [
    ("min_feret", 0.0, 2.5e-8),
    ("max_feret", 0.0, 0.4e-7),
    ("area", 0.0, 0.25e-15),
    ("min_feret_area_ratio", 0.0, 2.5e8),
]

# Set to False to let matplotlib choose the x-axis range instead
apply_xlim = True

for stat, lower_xlim, upper_xlim in stats:
    logger.info(f"Plotting {stat}")

    fig, ax = plt.subplots(figsize=(8, 6))

    for sample in samples:
        sample_data = sample_dataframes[sample]

        # Remove the rows that have NaN as a value for this statistic
        sample_data = sample_data[sample_data[stat].notna()]

        # Save the statistic's values. The previous code saved
        # `.notna().to_numpy()`, i.e. a boolean mask that is all True after
        # the filter above, so the .npy files never contained the data.
        np.save(DIR / f"{sample}_{stat}.npy", sample_data[stat].to_numpy())

        # Plot KDE plot; the legend label carries the number of rows used
        sns.kdeplot(data=sample_data, x=stat, label=f"{sample} n: {len(sample_data)}", ax=ax)

    ax.set_xlabel(stat)
    ax.set_ylabel("Density")
    ax.set_title(f"Comparison of {stat} distributions")
    if apply_xlim:
        ax.set_xlim(lower_xlim, upper_xlim)
    ax.legend()
    plt.show()

In [None]:
# Compare feret diameters between DNA only and Cas9