In [None]:
from pathlib import Path
import sys
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger

# Configure loguru: remove the existing handlers and log to stderr with colours.
logger.remove()
# Colour scheme: white time, blue origin (module:function:line), yellow level name, green message.
log_format = (
    "<white>{time:HH:mm:ss}</white> "
    "<blue>{name}:{function}:{line}</blue> "
    "<yellow>{level.name}</yellow> "
    "<green>{message}</green>"
)
logger.add(sys.stderr, format=log_format, colorize=True)

logger.info("Logger set up")

In [None]:
# Root directory of the unbound (DNA-only) statistics. The "Unbound" section reads
# DIR / <sample> / "all_statistics.csv", so DIR must be defined for the notebook to
# survive Restart Kernel -> Run All (it was previously commented out, which left the
# later cell failing with a NameError).
# NOTE(review): path restored from the commented-out line - confirm it is still the right location.
DIR = Path("/Users/sylvi/topo_data/hariborings/testing_all_unbound_data/")

# Root directory of the processed Cas9 grain dictionaries, and the processing date
# used as a sub-directory name when loading the pickles.
CAS9_DIR = Path("/Users/sylvi/topo_data/hariborings/processed_grains/")
DAY = "2024-03-22"
assert CAS9_DIR.exists(), f"Cas9 data directory not found: {CAS9_DIR}"

# Cas9

In [None]:
# Load the Cas9 sample dictionaries, one pickle per sample type.
# NOTE: pickle.load executes arbitrary code from the file - only use on trusted data.
on_sc_pickle = CAS9_DIR / "CAS9_ON_SC" / DAY / "ON_SC_dict.pkl"
ot1_sc_pickle = CAS9_DIR / "CAS9_OT1_SC" / DAY / "OT1_SC_dict.pkl"
ot2_sc_pickle = CAS9_DIR / "CAS9_OT2_SC" / DAY / "OT2_SC_dict.pkl"

with on_sc_pickle.open("rb") as handle:
    ON_SC_dict = pickle.load(handle)
with ot1_sc_pickle.open("rb") as handle:
    OT1_SC_dict = pickle.load(handle)
with ot2_sc_pickle.open("rb") as handle:
    OT2_SC_dict = pickle.load(handle)

# Report how many samples each dictionary holds
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    logger.info(f"Num {sample_type} samples: {len(sample_dict)}")

# Show which statistics are stored for the first ON_SC sample
first_on_sc_sample = list(ON_SC_dict.keys())[0]
logger.info(f"Keys in ON_SC_dict: {ON_SC_dict[first_on_sc_sample].keys()}")

# Intersection midpoint distance ratio

In [None]:
# Plot intersection distance ratios
#
# Collects the "intersection_midpoint_distance_ratio" statistic from every sample of
# each Cas9 sample type and overlays one KDE curve per sample type. Samples that do
# not carry the statistic are skipped with a warning.
ratio_key = "intersection_midpoint_distance_ratio"

intersection_distance_ratios = {}
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    ratios = []
    for sample, sample_stats in sample_dict.items():
        if ratio_key in sample_stats:
            ratios.append(sample_stats[ratio_key])
        else:
            logger.warning(f"Sample {sample} does not have {ratio_key}")
    intersection_distance_ratios[sample_type] = ratios

# Keep the per-sample-type lists available under their original names.
on_sc_intersection_distance_ratios = intersection_distance_ratios["ON_SC"]
ot1_sc_intersection_distance_ratios = intersection_distance_ratios["OT1_SC"]
ot2_sc_intersection_distance_ratios = intersection_distance_ratios["OT2_SC"]

fig, ax = plt.subplots()
for sample_type, ratios in intersection_distance_ratios.items():
    sns.kdeplot(ratios, label=f"{sample_type} n={len(ratios)}", ax=ax)
ax.legend()
ax.set_xlabel("Intersection midpoint distance ratio")
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title("Intersection midpoint distance ratio of samples")
plt.show()

# Binding angle

In [None]:
# Plot binding angles
#
# Collects the "binding_angle" statistic from every sample of each Cas9 sample type,
# converts it with np.degrees and overlays one KDE curve per sample type. Samples
# that do not carry the statistic are skipped with a warning.
# NOTE(review): np.degrees presumes the stored angles are in radians - confirm.
angle_key = "binding_angle"

binding_angles = {}
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    angles = []
    for sample, sample_stats in sample_dict.items():
        if angle_key in sample_stats:
            angles.append(np.degrees(sample_stats[angle_key]))
        else:
            logger.warning(f"Sample {sample} does not have {angle_key}")
    binding_angles[sample_type] = angles

# Keep the per-sample-type lists available under their original names.
on_sc_binding_angles = binding_angles["ON_SC"]
ot1_sc_binding_angles = binding_angles["OT1_SC"]
ot2_sc_binding_angles = binding_angles["OT2_SC"]

fig, ax = plt.subplots()
for sample_type, angles in binding_angles.items():
    sns.kdeplot(angles, label=f"{sample_type} n={len(angles)}", ax=ax)
ax.legend()
# Values were converted with np.degrees above, so state the unit on the axis.
ax.set_xlabel("Binding angle (degrees)")
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title("Binding angle of samples")
plt.show()

# Path elongation distance

In [None]:
# Plot path elongation distances
#
# Collects the "path_elongation_distance_nm" statistic from every sample of each
# Cas9 sample type and overlays one KDE curve per sample type. Samples that do not
# carry the statistic are skipped with a warning.
elongation_key = "path_elongation_distance_nm"

path_elongation_distances = {}
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    distances = []
    for sample, sample_stats in sample_dict.items():
        if elongation_key in sample_stats:
            distances.append(sample_stats[elongation_key])
        else:
            # Report the exact key that was looked up (including the _nm suffix).
            logger.warning(f"Sample {sample} does not have {elongation_key}")
    path_elongation_distances[sample_type] = distances

# Keep the per-sample-type lists available under their original names.
on_sc_path_elongation_distances = path_elongation_distances["ON_SC"]
ot1_sc_path_elongation_distances = path_elongation_distances["OT1_SC"]
ot2_sc_path_elongation_distances = path_elongation_distances["OT2_SC"]

fig, ax = plt.subplots()
for sample_type, distances in path_elongation_distances.items():
    sns.kdeplot(distances, label=f"{sample_type} n={len(distances)}", ax=ax)
ax.legend()
# Unit taken from the "_nm" suffix of the statistic's key.
ax.set_xlabel("Path elongation distance (nm)")
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title("Path elongation distance of samples")
plt.show()

In [None]:
# Plot min feret
#
# Collects the "min_feret" statistic from every sample of each Cas9 sample type and
# overlays one KDE curve per sample type. Samples that do not carry the statistic
# are skipped with a warning.
min_feret_key = "min_feret"

min_ferets = {}
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    ferets = []
    for sample, sample_stats in sample_dict.items():
        if min_feret_key in sample_stats:
            ferets.append(sample_stats[min_feret_key])
        else:
            logger.warning(f"Sample {sample} does not have {min_feret_key}")
    min_ferets[sample_type] = ferets

# Keep the per-sample-type lists available under their original names.
on_sc_min_ferets = min_ferets["ON_SC"]
ot1_sc_min_ferets = min_ferets["OT1_SC"]
ot2_sc_min_ferets = min_ferets["OT2_SC"]

fig, ax = plt.subplots()
for sample_type, ferets in min_ferets.items():
    sns.kdeplot(ferets, label=f"{sample_type} n={len(ferets)}", ax=ax)
ax.legend()
ax.set_xlabel("Min feret")
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title("Min feret of samples")
plt.show()

In [None]:
# Plot average and standard deviation of angles per nm
#
# For every sample that carries "angles_per_nm", records the per-sample mean and
# standard deviation and pools the raw angles per sample type. Three figures are
# drawn: KDE of the per-sample means, of the per-sample standard deviations, and of
# the pooled angles. Samples without the statistic are skipped with a warning.
angles_key = "angles_per_nm"

angle_stats = {}
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    means, stds, pooled = [], [], []
    for sample, sample_stats in sample_dict.items():
        if angles_key not in sample_stats:
            logger.warning(f"Sample {sample} does not have {angles_key}")
            continue
        sample_angles = sample_stats[angles_key]
        means.append(np.mean(sample_angles))
        stds.append(np.std(sample_angles))
        pooled.extend(sample_angles)
    angle_stats[sample_type] = {"mean": means, "std": stds, "all": pooled}

# Keep the per-sample-type lists available under their original names.
on_sc_mean_angles_per_nm = angle_stats["ON_SC"]["mean"]
on_sc_std_angles_per_nm = angle_stats["ON_SC"]["std"]
on_sc_all_angles_per_nm = angle_stats["ON_SC"]["all"]
ot1_sc_mean_angles_per_nm = angle_stats["OT1_SC"]["mean"]
ot1_sc_std_angles_per_nm = angle_stats["OT1_SC"]["std"]
ot1_sc_all_angles_per_nm = angle_stats["OT1_SC"]["all"]
ot2_sc_mean_angles_per_nm = angle_stats["OT2_SC"]["mean"]
ot2_sc_std_angles_per_nm = angle_stats["OT2_SC"]["std"]
ot2_sc_all_angles_per_nm = angle_stats["OT2_SC"]["all"]

for stat_name, axis_label, figure_title in (
    ("mean", "Mean angle per nm", "Mean angle per nm of samples"),
    ("std", "Standard deviation of angle per nm", "Standard deviation of angle per nm of samples"),
    ("all", "Angle per nm", "Angle per nm of samples"),
):
    fig, ax = plt.subplots()
    for sample_type, per_type in angle_stats.items():
        # n is the number of samples that actually contributed angles. The pooled
        # plot previously used the size of the whole dictionary, which also counted
        # samples that were skipped for lacking "angles_per_nm".
        num_samples = len(per_type["mean"])
        sns.kdeplot(per_type[stat_name], label=f"{sample_type} n={num_samples}", ax=ax)
    ax.legend()
    ax.set_xlabel(axis_label)
    # A KDE curve shows probability density, not a count of samples.
    ax.set_ylabel("Density")
    ax.set_title(figure_title)
    plt.show()

In [None]:
# Plot path aspect ratio
#
# Overlays one KDE curve per Cas9 sample type for the statistic named in `stat`.
# Samples that do not carry the statistic are skipped with a warning.
stat = "path_aspect_ratio_length_over_width"

fig, ax = plt.subplots()
# Pair each label with its dictionary explicitly instead of looking names up in
# globals(), so a missing or renamed dictionary fails visibly at this line.
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    path_aspect_ratios = []
    for sample, sample_stats in sample_dict.items():
        if stat in sample_stats:
            path_aspect_ratios.append(sample_stats[stat])
        else:
            logger.warning(f"Sample {sample} does not have {stat}")

    sns.kdeplot(path_aspect_ratios, label=f"{sample_type} n={len(path_aspect_ratios)}", ax=ax)

# Figure-level decoration only needs to happen once, after all curves are drawn.
ax.legend()
ax.set_xlabel(stat)
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title(f"{stat} of samples")
plt.show()

In [None]:
# Plot lengths
#
# Overlays one KDE curve per Cas9 sample type for the statistic named in `stat`.
# Samples that do not carry the statistic are skipped with a warning.
stat = "path_bounding_box_length"

fig, ax = plt.subplots()
# Pair each label with its dictionary explicitly instead of looking names up in
# globals(), so a missing or renamed dictionary fails visibly at this line.
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    lengths = []
    for sample, sample_stats in sample_dict.items():
        if stat in sample_stats:
            lengths.append(sample_stats[stat])
        else:
            logger.warning(f"Sample {sample} does not have {stat}")

    sns.kdeplot(lengths, label=f"{sample_type} n={len(lengths)}", ax=ax)

# Figure-level decoration only needs to happen once, after all curves are drawn.
ax.legend()
ax.set_xlabel(stat)
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title(f"{stat} of samples")
plt.show()

In [None]:
# Plot widths
#
# Overlays one KDE curve per Cas9 sample type for the statistic named in `stat`.
# Samples that do not carry the statistic are skipped with a warning.
stat = "path_bounding_box_width"

fig, ax = plt.subplots()
# Pair each label with its dictionary explicitly instead of looking names up in
# globals(), so a missing or renamed dictionary fails visibly at this line.
for sample_type, sample_dict in (
    ("ON_SC", ON_SC_dict),
    ("OT1_SC", OT1_SC_dict),
    ("OT2_SC", OT2_SC_dict),
):
    widths = []
    for sample, sample_stats in sample_dict.items():
        if stat in sample_stats:
            widths.append(sample_stats[stat])
        else:
            logger.warning(f"Sample {sample} does not have {stat}")

    sns.kdeplot(widths, label=f"{sample_type} n={len(widths)}", ax=ax)

# Figure-level decoration only needs to happen once, after all curves are drawn.
ax.legend()
ax.set_xlabel(stat)
# A KDE curve shows probability density, not a count of samples.
ax.set_ylabel("Density")
ax.set_title(f"{stat} of samples")
plt.show()

# Unbound

In [None]:
# Load all the sample dataframes
# Sample names: every target (ON, OT1, OT2) paired with each suffix (REL, SC),
# ordered target-first.
samples = [
    f"{target}_{suffix}"
    for target in ("ON", "OT1", "OT2")
    for suffix in ("REL", "SC")
]


# NUMPY_DIR = Path(
#     "/Volumes/shared/pyne_group/Shared/AFM_Data/Cas9_Minicircles/feret_diameter_&_binding_angle_comparison_&_cas9_cropped_data/dna_only_feret_&_area_stats/"
# )
# if not NUMPY_DIR.exists():
#     NUMPY_DIR.mkdir()
# for sample in samples:
#     sample_dir = NUMPY_DIR / f"{sample}"
#     if not sample_dir.exists():
#         sample_dir.mkdir()

# Read each sample's "all_statistics.csv" (first column used as the index) into a
# DataFrame keyed by sample name, adding a derived "min_feret_area_ratio" column
# computed as min_feret divided by area.
sample_dataframes = {
    sample: pd.read_csv(DIR / sample / "all_statistics.csv", index_col=0).assign(
        min_feret_area_ratio=lambda frame: frame["min_feret"] / frame["area"]
    )
    for sample in samples
}

    # # Save the stats as numpy files for easy loading later
    # # min_feret
    # np.save(NUMPY_DIR / f"{sample}" / "min_feret.npy", sample_data["min_feret"].values)
    # # max_feret
    # np.save(NUMPY_DIR / f"{sample}" / "max_feret.npy", sample_data["max_feret"].values)
    # # area
    # np.save(NUMPY_DIR / f"{sample}" / "area.npy", sample_data["area"].values)
    # # min_feret_area_ratio
    # np.save(
    #     NUMPY_DIR / f"{sample}" / "min_feret_area_ratio.npy",
    #     sample_data["min_feret_area_ratio"].values,
    # )

In [None]:
# # try loading the numpy files again
# for sample in samples:
#     # min_feret
#     min_feret = np.load(NUMPY_DIR / f"{sample}" / "min_feret.npy")
#     # max_feret
#     max_feret = np.load(NUMPY_DIR / f"{sample}" / "max_feret.npy")
#     # area
#     area = np.load(NUMPY_DIR / f"{sample}" / "area.npy")
#     # min_feret_area_ratio
#     min_feret_area_ratio = np.load(NUMPY_DIR / f"{sample}" / "min_feret_area_ratio.npy")

#     # Check that they are the same
#     assert np.allclose(min_feret, sample_dataframes[sample]["min_feret"].values)
#     assert np.allclose(max_feret, sample_dataframes[sample]["max_feret"].values)
#     assert np.allclose(area, sample_dataframes[sample]["area"].values)
#     assert np.allclose(
#         min_feret_area_ratio,
#         sample_dataframes[sample]["min_feret_area_ratio"].values,
#     )

In [None]:
# Each entry is (statistic column, lower x-limit, upper x-limit).
stats = [
    ("min_feret", 0.0, 2.5e-8),
    ("max_feret", 0.0, 0.4e-7),
    ("area", 0.0, 0.25e-15),
    ("min_feret_area_ratio", 0.0, 2.5e8),
]

# Set to False to let matplotlib choose the x-range instead of the limits above.
apply_xlim = True

sns.set_theme(style="whitegrid")

# One figure per statistic, with one KDE curve per sample.
for stat, lower_xlim, upper_xlim in stats:
    logger.info(f"Plotting {stat}")

    fig, ax = plt.subplots(figsize=(8, 6))

    for sample in samples:
        # Keep only the rows where this statistic is present.
        valid_rows = sample_dataframes[sample].dropna(subset=[stat])
        sns.kdeplot(data=valid_rows, x=stat, label=f"{sample} n: {len(valid_rows)}", ax=ax)

    ax.set(xlabel=stat, ylabel="Density", title=f"Comparison of {stat} distributions")
    if apply_xlim:
        ax.set_xlim(lower_xlim, upper_xlim)
    ax.legend()
    plt.show()