In [None]:
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locate the data directory and load the two CSV tables used throughout the notebook.
# Each path is asserted to exist before it is used so a wrong location fails immediately.
base_dir = Path("/Users/sylvi/topo_data/shelterin")
assert base_dir.exists()

# one row per entry of the all-statistics table
all_stats_file = base_dir.joinpath("output-data-redo", "all_statistics.csv")
assert all_stats_file.exists()
all_stats_df = pd.read_csv(all_stats_file)

# one row per entry of the disordered-segment statistics table
all_disordered_segments_file = base_dir.joinpath("output-data-redo", "all_disordered_segment_statistics.csv")
assert all_disordered_segments_file.exists()
all_disordered_segments_df = pd.read_csv(all_disordered_segments_file)

images_to_delete = ["20250502_5nMTRF2_1ng_tel12picozEE_nicl.0_00010"]
# Remove every row whose "image" value contains an excluded image name, from both tables.
# Each removal is asserted to have dropped at least one row, so a misspelt name fails loudly.
for image_name in images_to_delete:
    len_before = len(all_stats_df)
    # regex=False: match the name literally; image names contain "." which would
    # otherwise be interpreted as a regular-expression wildcard.
    all_stats_df = all_stats_df[~all_stats_df["image"].str.contains(image_name, regex=False)]
    len_after = len(all_stats_df)
    assert len_before - len_after > 0, f"{image_name} not found in all stats"

    len_before = len(all_disordered_segments_df)
    all_disordered_segments_df = all_disordered_segments_df[
        ~all_disordered_segments_df["image"].str.contains(image_name, regex=False)
    ]
    len_after = len(all_disordered_segments_df)
    assert len_before - len_after > 0, f"{image_name} not found in all disordered segments"
    print(f"removed {image_name} from all stats and all disordered segments")

# Derive a short display name: the last "/"-separated component of each basename.
basename_tail = all_stats_df["basename"].str.rsplit("/", n=1).str[-1]
all_stats_df["display_name"] = basename_tail
print(f"all stats display names: {all_stats_df['display_name'].unique()}")

# Rescale length-like columns by 1e-9 and area-like columns by 1e-18 so they read in nm / nm^2
# (presumably the CSV stores metres / square metres - TODO confirm against the data source).
unit_divisors = {
    "total_branch_lengths": 1e-9,
    "smallest_bounding_area": 1e-18,
    "area": 1e-18,
    "height_median": 1e-9,
}
for column_name, divisor in unit_divisors.items():
    all_stats_df[column_name] = all_stats_df[column_name] / divisor

# fill colour shared by the box/violin plots below
boxplotcolour = "lightgrey"

# Print the row count, column count and column names of each loaded table.
for table_label, table_df in (
    ("all stats", all_stats_df),
    ("all disordered segments", all_disordered_segments_df),
):
    print(f"{table_label}: {len(table_df)}, cols: {len(table_df.columns)}")
    for col in table_df.columns:
        print(f"  {col}")

#### pretty plotting

In [None]:
def boxplot(x, y, data):
    """Draw a seaborn boxplot of ``y`` grouped by ``x`` and print per-group summary statistics.

    Args:
        x: Name of the grouping column in ``data``.
        y: Name of the value column in ``data``.
        data: DataFrame holding both columns.

    For every group of ``x`` the median and the interquartile range
    (75th percentile minus 25th percentile) of ``y`` are printed.
    """
    sns.boxplot(data=data, x=x, y=y, color=boxplotcolour)
    for group_key, group_frame in data.groupby(x):
        values = group_frame[y]
        median = values.median()
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        iqr = upper_quartile - lower_quartile
        print(f"{group_key}: median={median}, IQR={iqr}")


def simpleticks():
    """Shorten the x tick labels of the current axes by removing the construct name.

    For each of "TEL12" and "TEL80" (checked in that order, re-reading the label
    after each step): a label containing the construct name and "control" is
    replaced by the construct name alone; any other label containing the
    construct name has that substring removed. The resulting labels are then
    re-applied at evenly spaced positions 0 .. n-1.
    """
    tick_labels = plt.gca().get_xticklabels()
    shortened = []
    for label in tick_labels:
        for construct in ("TEL12", "TEL80"):
            current_text = label.get_text()
            if construct not in current_text:
                continue
            if "control" in current_text:
                label.set_text(construct)
            else:
                label.set_text(current_text.replace(construct, ""))
        shortened.append(label.get_text())
    # one tick per label, positioned at 0, 1, ..., n-1
    label_count = len(tick_labels)
    positions = np.linspace(0, label_count - 1, label_count)
    plt.xticks(positions, shortened)


# default font size for axis labels on every subsequent figure
plt.rcParams["axes.labelsize"] = 16

In [None]:
# Strip plot of the "area" column for each sample type.
sns.stripplot(x="display_name", y="area", data=all_stats_df)
plt.xticks(rotation=90)
plt.ylabel("area (nm^2)")
plt.xlabel("sample type")
plt.title("molecule area")
plt.show()

# Strip plot of the "smallest_bounding_area" column for each sample type.
sns.stripplot(x="display_name", y="smallest_bounding_area", data=all_stats_df)
plt.xticks(rotation=90)
plt.ylabel("smallest bounding box area (nm^2)")
plt.xlabel("sample type")
plt.title("molecule bounding box area")
# Dashed reference line at y = 1000. axhline spans the full width of the axes, so it
# covers every category however many sample types there are and does not stretch the
# x-limits the way a fixed xmin/xmax range would.
plt.axhline(1000, color="red", linestyle="dashed")
plt.show()

In [None]:
# Build a DataFrame with one row per grain (no subgrains, so no double-counting of
# total branch length stats). Rows are keyed by (image, grain_number).
grains_list = []
unique_images = all_stats_df["image"].unique()
print(f"len all stats: {len(all_stats_df)}")
print(f"unique images: {len(unique_images)}")
for image in unique_images:
    all_grains_data = all_stats_df[all_stats_df["image"] == image]
    # Select this image's disordered segments once per image rather than once per grain:
    # the filter does not depend on the grain number.
    image_segment_data = all_disordered_segments_df[all_disordered_segments_df["image"] == image]
    grain_numbers = all_grains_data["grain_number"].unique()
    print(f"  image: {image}, grain numbers: {len(grain_numbers)}")
    for grain_number in grain_numbers:
        # grab just this grain's data
        grain_data = all_grains_data[all_grains_data["grain_number"] == grain_number]
        classes = grain_data["class_number"].unique()
        # class 1 rows are counted as DNA segments, class 2 rows as protein segments
        is_dna = grain_data["class_number"] == 1
        is_protein = grain_data["class_number"] == 2
        num_dna_segments = int(is_dna.sum())
        num_protein_segments = int(is_protein.sum())
        # a grain counts as having protein only when both classes are present
        protein_present = 1 if num_dna_segments > 0 and num_protein_segments > 0 else 0
        print(f"    grain number: {grain_number}, classes: {classes}")

        # get the segment data for this grain from the disordered segments df
        disordered_segment_data = image_segment_data[image_segment_data["grain_number"] == grain_number]

        # get important stats
        branch_distances = disordered_segment_data["branch_distance"]
        # NaN when the grain has no disordered-segment rows
        mean_branch_distance = branch_distances.mean()
        # tbl is the same for all subgrains since it's the total, and just repeated for each row of the grain
        tbl = grain_data["total_branch_lengths"].iloc[0]
        # total protein volume is just the total of all the class 2 volumes for the grain
        total_protein_volume = grain_data.loc[is_protein, "volume"].sum()
        displayname = grain_data["display_name"].iloc[0]
        basename = grain_data["basename"].iloc[0]

        # add to the list
        grains_list.append(
            {
                "image": image,
                "grain_number": grain_number,
                "protein_present": protein_present,
                "num_dna_segments": num_dna_segments,
                "num_protein_segments": num_protein_segments,
                "mean_branch_distance": mean_branch_distance,
                "total_branch_lengths": tbl,
                "total_protein_volume": total_protein_volume,
                "display_name": displayname,
                "basename": basename,
            }
        )

grains_df = pd.DataFrame(grains_list)

# number of grains whose display name mentions each protein
print(f"num TRF1: {len(grains_df[grains_df['display_name'].str.contains('TRF1')])}")
print(f"num TRF2: {len(grains_df[grains_df['display_name'].str.contains('TRF2')])}")

In [None]:
# Every plot in this cell uses only the grains flagged as containing protein.
grains_with_protein = grains_df[grains_df["protein_present"] == 1]


def violin_strip_by_sample(data, y, ylabel, title):
    """Plot a violin plot of ``y`` per sample type with the individual points overlaid.

    Args:
        data: DataFrame with a "display_name" column and the ``y`` column.
        y: Name of the column plotted on the y axis.
        ylabel: Label for the y axis.
        title: Figure title.
    """
    plt.figure()
    sns.violinplot(x="display_name", y=y, data=data, color=boxplotcolour)
    sns.stripplot(x="display_name", y=y, data=data, color="black", alpha=0.5)
    plt.xticks(rotation=90)
    plt.ylabel(ylabel)
    plt.xlabel("sample type")
    plt.title(title)
    simpleticks()
    plt.show()


# Plot TBL vs display_name for only grains with protein
violin_strip_by_sample(
    grains_with_protein,
    y="total_branch_lengths",
    ylabel="total branch lengths (nm)",
    title="total branch lengths for grains with protein",
)

# plot protein volume against TBL for grains with protein, with volume on y and TBL on x, using jointplot
# plot TRF1 in orange and TRF2 in blue
# jointplot builds its own figure, so no plt.figure() call precedes it (one would be left empty).
joint_grid = sns.jointplot(
    x="total_branch_lengths",
    y="total_protein_volume",
    data=grains_with_protein,
    hue="display_name",
    palette={"TEL12TRF1": "orange", "TEL12TRF2": "blue"},
    alpha=0.5,
)
# NOTE(review): the unit-conversion code visible earlier in this notebook rescales lengths,
# areas and heights but not "volume" - confirm the volumes really are in nm^3.
joint_grid.set_axis_labels("total branch lengths (nm)", "total protein volume (nm^3)")
# plt.title("total protein volume vs total branch lengths for grains with protein")
plt.tight_layout()
plt.show()

# number of DNA segments vs display name
violin_strip_by_sample(
    grains_with_protein,
    y="num_dna_segments",
    ylabel="number of DNA segments",
    title="number of DNA segments for grains with protein",
)

# number of protein segments vs display name
violin_strip_by_sample(
    grains_with_protein,
    y="num_protein_segments",
    ylabel="number of protein segments",
    title="number of protein segments for grains with protein",
)
