In [None]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# --- Locate and load the input tables ---------------------------------------
# The data directory defaults to the original analysis location, but can be
# redirected with the SHELTERIN_DATA_DIR environment variable so the notebook
# runs on other machines without editing this cell.
base_dir = Path(os.environ.get("SHELTERIN_DATA_DIR", "/Users/sylvi/topo_data/shelterin"))
assert base_dir.exists(), f"data directory not found: {base_dir}"
all_stats_file = base_dir / "output-data-redo" / "all_statistics.csv"
assert all_stats_file.exists(), f"statistics file not found: {all_stats_file}"
all_stats = pd.read_csv(all_stats_file)

all_disordered_segments_file = base_dir / "output-data-redo" / "all_disordered_segment_statistics.csv"
assert all_disordered_segments_file.exists(), (
    f"disordered segment statistics file not found: {all_disordered_segments_file}"
)
all_disordered_segments = pd.read_csv(all_disordered_segments_file)

# create a date column from the text before the first underscore of the "image" column
all_stats["date"] = all_stats["image"].str.split("_").str[0]
all_disordered_segments["date"] = all_disordered_segments["image"].str.split("_").str[0]

# # vet by date
# data_exclude_range_start = "20250101"
# data_exclude_range_end = "20250301"
# # exclude any rows of all_stats that contain dates in this range
# all_stats["date"] = pd.to_datetime(all_stats["date"], format="%Y%m%d")
# all_disordered_segments["date"] = pd.to_datetime(all_disordered_segments["date"], format="%Y%m%d")
# all_stats = all_stats[
#     ~((all_stats["date"] >= data_exclude_range_start) & (all_stats["date"] <= data_exclude_range_end))
# ]
# all_disordered_segments = all_disordered_segments[
#     ~(
#         (all_disordered_segments["date"] >= data_exclude_range_start)
#         & (all_disordered_segments["date"] <= data_exclude_range_end)
#     )
# ]


# create a display name column holding the second "/"-separated component of the basename,
# eg: "data-redo/TEL80shelterin" would be displayed as "TEL80shelterin"
all_stats["display_name"] = all_stats["basename"].str.split("/").str[1]
# print unique display names
unique_display_names = all_stats["display_name"].unique()
print("Unique display names:")
for name in unique_display_names:
    print(name)

# Rescale the length-like columns: lengths/heights are divided by 1e-9 and areas by 1e-18.
# Later cells label these columns as nm and nm^2, so the CSV presumably stores metres and
# square metres - TODO confirm. The cell re-reads the CSV above, so re-running it does not
# apply the scaling twice.
all_stats["total_branch_lengths"] /= 1e-9
all_stats["smallest_bounding_area"] /= 1e-18
all_stats["area"] /= 1e-18
all_stats["height_median"] /= 1e-9

# plotting options (read by the boxplot() helper and the plotting cells)
boxplotcolour = "lightgrey"

print(all_stats.columns)
print(len(all_stats))
print("----")
print(all_disordered_segments.columns)

In [None]:
def boxplot(x, y, data, order=None):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` and print each group's median and IQR.

    The boxes are drawn on the current axes in the notebook-wide ``boxplotcolour``.

    Parameters
    ----------
    x : str
        Column of ``data`` holding the category of each row.
    y : str
        Column of ``data`` holding the values to summarise.
    data : pandas.DataFrame
        Table containing both columns.
    order : list, optional
        Category order forwarded to ``sns.boxplot``. Pass the same list that was given to a
        strip plot drawn on the same axes so boxes and points share category positions.
        When omitted, seaborn chooses the order (the original behaviour).
    """
    sns.boxplot(x=x, y=y, data=data, color=boxplotcolour, order=order)

    # median and interquartile range of y for every category of x
    grouped = data.groupby(x)[y]
    medians = grouped.median()
    iqrs = grouped.quantile(0.75) - grouped.quantile(0.25)

    # without an explicit order, report groups in groupby (sorted) order as before;
    # with one, follow it and skip categories that have no rows
    if order is None:
        categories = list(medians.index)
    else:
        categories = [category for category in order if category in medians.index]
    for category in categories:
        print(f"{category}: median={medians[category]}, IQR={iqrs[category]}")


def simpleticks():
    """Shorten the x tick labels of the current axes by dropping the "TEL12"/"TEL80" prefix.

    For each of "TEL12" and "TEL80", a label containing that text becomes just that text
    when it also contains "control"; otherwise the text is removed from the label
    (e.g. "TEL12TRF1" -> "TRF1", "TEL80control" -> "TEL80").

    The labels are written back at the positions matplotlib reports for the existing ticks,
    rather than at an assumed 0..n-1 range.
    """
    ax = plt.gca()
    positions = ax.get_xticks()

    new_ticks = []
    for tick in ax.get_xticklabels():
        text = tick.get_text()
        # applied in this order so a label already reduced to "TEL12" is left alone
        for construct in ("TEL12", "TEL80"):
            if construct in text:
                text = construct if "control" in text else text.replace(construct, "")
        new_ticks.append(text)

    # set the ticks
    plt.xticks(positions, new_ticks)


# axis labels on every figure in this notebook default to 16 pt
plt.rcParams["axes.labelsize"] = 16

# areas

In [None]:
# Molecule area for every sample type.
sns.stripplot(x="display_name", y="area", data=all_stats)
plt.xticks(rotation=90)
plt.ylabel("area (nm^2)")
plt.xlabel("sample type")
plt.title("molecule area")
plt.show()

# Smallest bounding-box area for every sample type, with a dashed reference line at 1000 nm^2.
sns.stripplot(x="display_name", y="smallest_bounding_area", data=all_stats)
plt.xticks(rotation=90)
plt.ylabel("smallest bounding box area (nm^2)")
plt.xlabel("sample type")
plt.title("molecule bounding box area")
# axhline spans the full width of the axes, so the line no longer depends on a hard-coded
# x range (previously -1..10) matching the number of sample types
plt.axhline(1000, color="red", linestyle="dashed")
plt.show()

## remove anomalies

In [None]:
# # tel12 controls sometimes overlaid with each other artificially increasing TBL so remove controls that have absurd TBLs
# anomaly_threshold_tbl_too_low = 700
# data_tbl_tel12_control_anomalies = all_stats[
#     all_stats["basename"].str.contains("TEL12control")
#     & (all_stats["total_branch_lengths"] >= anomaly_threshold_tbl_too_low)
# ]
# all_stats = all_stats[
#     ~(
#         all_stats["basename"].str.contains("TEL12control")
#         & (all_stats["total_branch_lengths"] >= anomaly_threshold_tbl_too_low)
#     )
# ]
# print(f"removed tel12 control anomalies: {len(data_tbl_tel12_control_anomalies)}")

# anomaly_bbox_area_threshold = 1000
# # get rid of any row where the bounding box area is less than 1000 nm^2
# # LESS THAN
# data_bbox_area_anomalies = all_stats[all_stats["smallest_bounding_area"] < anomaly_bbox_area_threshold]
# all_stats = all_stats[all_stats["smallest_bounding_area"] >= anomaly_bbox_area_threshold]
# print(f"removed bounding box area anomalies: {len(data_bbox_area_anomalies)}")

# TEL80S protein volumes

In [None]:
# protein volume for TEL80 S
# grab TEL80shelterin by selecting rows with "TEL80shelterin" in the "basename" column
data_tel80s = all_stats[all_stats["basename"].str.contains("TEL80shelterin")]
print(f"tel80s: {len(data_tel80s)}")
# class 2 rows are the ones treated as protein throughout this notebook
data_tel80s_protein = data_tel80s[data_tel80s["class_number"] == 2]
print(f"tel80s protein: {len(data_tel80s_protein)}")

# rescale once so the printed statistics and the plot use the same nm^3 values
tel80s_protein_volume_nm3 = data_tel80s_protein["volume"] / (1e-9) ** 3

mean_tel80s_protein_volume = tel80s_protein_volume_nm3.mean()
std_tel80s_protein_volume = tel80s_protein_volume_nm3.std()
minimum_tel80s_protein_volume = tel80s_protein_volume_nm3.min()
maximum_tel80s_protein_volume = tel80s_protein_volume_nm3.max()
print(
    f"mean tel80s protein volume: {mean_tel80s_protein_volume:.2f} nm^3 | std: {std_tel80s_protein_volume:.2f} nm^3 | min: {minimum_tel80s_protein_volume:.2f} nm^3 | max: {maximum_tel80s_protein_volume:.2f} nm^3"
)

plt.figure()
# previously the unscaled "volume" column was plotted with no axis label
sns.stripplot(y=tel80s_protein_volume_nm3)
plt.ylabel("volume ($nm^3$)")
plt.title("Volume of TEL80shelterin proteins")
plt.show()

# TBL for TEL80C & TEL80S

In [None]:
# TEL80 control vs TEL80 + shelterin (the control rows used to be stored in a variable
# named data_tel12c, which clashed with the TEL12 cells below)
data_tel80c = all_stats[all_stats["basename"].str.contains("TEL80control")]
print(f"tel80c: {len(data_tel80c)}")
data_tel80s = all_stats[all_stats["basename"].str.contains("TEL80shelterin")]
print(f"tel80s: {len(data_tel80s)}")

# grab only class 1 subgrain 0 since data is duplicated for each subgrain and class
data_tel80c_s0_c1 = data_tel80c[(data_tel80c["subgrain_number"] == 0) & (data_tel80c["class_number"] == 1)]
data_tel80s_s0_c1 = data_tel80s[(data_tel80s["subgrain_number"] == 0) & (data_tel80s["class_number"] == 1)]
# labels now say s0 to match the subgrain 0 filter above
print(f"tel80c_s0_c1: {len(data_tel80c_s0_c1)}")
print(f"tel80s_s0_c1: {len(data_tel80s_s0_c1)}")

data_tel80c_tel80s = pd.concat([data_tel80c_s0_c1, data_tel80s_s0_c1])

# median and interquartile range of the total branch length for each sample
tel80c_tbl = data_tel80c_s0_c1["total_branch_lengths"]
tel80s_tbl = data_tel80s_s0_c1["total_branch_lengths"]
tel80c_median_tbl = tel80c_tbl.median()
tel80c_iqr_tbl = tel80c_tbl.quantile(0.75) - tel80c_tbl.quantile(0.25)
tel80s_median_tbl = tel80s_tbl.median()
tel80s_iqr_tbl = tel80s_tbl.quantile(0.75) - tel80s_tbl.quantile(0.25)
print(f"TEL80control median TBL: {tel80c_median_tbl:.2f} nm | IQR: {tel80c_iqr_tbl:.2f} nm")
print(f"TEL80shelterin median TBL: {tel80s_median_tbl:.2f} nm | IQR: {tel80s_iqr_tbl:.2f} nm")


# plot TBL for both
plt.figure()
sns.stripplot(x="display_name", y="total_branch_lengths", data=data_tel80c_tel80s)
boxplot("display_name", "total_branch_lengths", data_tel80c_tel80s)
plt.ylabel("total branch length (nm)")
plt.xlabel("")
# redraw x ticks to be more readable
simpleticks()
plt.show()

In [None]:
# Pair each TEL80 + shelterin grain's total branch length with its protein (class 2) volume.
data_tel80s = all_stats[all_stats["basename"].str.contains("TEL80shelterin")]

tbls = []
total_volumes = []
maximum_volumes = []
# walk images, then grains within each image, in order of first appearance
for _, image_rows in data_tel80s.groupby("image", sort=False):
    for _, grain_rows in image_rows.groupby("grain_number", sort=False):
        protein_rows = grain_rows[grain_rows["class_number"] == 2]
        has_class_1 = (grain_rows["class_number"] == 1).any()
        # only grains that contain both a class 1 and a class 2 row are used
        if not has_class_1 or protein_rows.empty:
            continue
        tbls.append(grain_rows.iloc[0]["total_branch_lengths"])
        total_volumes.append(protein_rows["volume"].sum() / (1e-9) ** 3)
        maximum_volumes.append(protein_rows["volume"].max() / (1e-9) ** 3)

tbl_total_volume_pairs = list(zip(tbls, total_volumes))
tbl_maximum_volume_pairs = list(zip(tbls, maximum_volumes))

plt.figure()
ax = plt.gca()
ax.scatter(tbls, total_volumes)
# ax.set_title("Total branch lengths vs total volume for TEL80 + shelterin")
ax.set_xlabel("Total branch length ($nm$)")
ax.set_ylabel("Protein volume ($nm^3$)")
plt.show()

# plt.figure()
# plt.scatter(tbls, maximum_volumes)
# # plt.title("Total branch lengths vs maximum volume")
# plt.xlabel("Total branch length ($nm$)")
# plt.ylabel("Maximum volume ($nm^3$)")
# plt.show()

# TBL & height for TEL12C

In [None]:
# Total branch length and median height of every class 1 subgrain in the TEL12 control data.
data_tel12c = all_stats[all_stats["basename"].str.contains("TEL12control")]
print(f"tel12c: {len(data_tel12c)}")
images = data_tel12c["image"].unique()
tbl_height_pairs = []
for image in images:
    subdata_images = data_tel12c[data_tel12c["image"] == image]
    grain_numbers = subdata_images["grain_number"].unique()
    for grain_number in grain_numbers:
        subdata_grain = subdata_images[subdata_images["grain_number"] == grain_number]
        subdata_subgrains = subdata_grain["subgrain_number"].unique()
        for subgrain_number in subdata_subgrains:
            subdata_subgrain = subdata_grain[subdata_grain["subgrain_number"] == subgrain_number]
            # each (image, grain, subgrain) is expected to be a single row
            assert len(subdata_subgrain) == 1, f"len(subdata_subgrain) = {len(subdata_subgrain)}"
            class_number = subdata_subgrain["class_number"].values[0]
            if class_number == 1:
                tbl = subdata_subgrain["total_branch_lengths"].values[0]
                if np.isnan(tbl):
                    continue
                median_height = subdata_subgrain["height_median"].values[0]
                if np.isnan(median_height):
                    continue
                # only subgrains with both values present are kept
                tbl_height_pairs.append((tbl, median_height))

tbls = [pair[0] for pair in tbl_height_pairs]
heights = [pair[1] for pair in tbl_height_pairs]

# The IQRs are computed here again: they had been commented out while the print below still
# used iqr_total_branch_lengths, which raised NameError on a fresh kernel (or silently
# reused a value left behind by another cell).
median_median_height = np.median(heights)
height_q25, height_q75 = np.percentile(heights, [25, 75])
iqr_median_height = height_q75 - height_q25
median_total_branch_lengths = np.median(tbls)
tbl_q25, tbl_q75 = np.percentile(tbls, [25, 75])
iqr_total_branch_lengths = tbl_q75 - tbl_q25

print(f"TEL12control | median median height: {median_median_height:.2f} nm | iqr: {iqr_median_height:.2f} nm")
print(
    f"TEL12control | median total branch lengths: {median_total_branch_lengths:.2f} nm | iqr: {iqr_total_branch_lengths:.2f} nm"
)

# get rows where tbl is less than 200
# NOTE(review): this looks at every sample in all_stats, not just the TEL12 control rows
data_tbl_anomaly = all_stats[all_stats["total_branch_lengths"] < 200]
print(f"tbl anomalies: {len(data_tbl_anomaly)}")
# print image names
print(data_tbl_anomaly["image"].unique())

print("median heights vs total branch lengths for tel12 control")
plt.figure()
plt.scatter(tbls, heights)
plt.xlabel("Total branch lengths (nm)")
plt.ylabel("Median height (nm)")
plt.ylim(0, 3.0)
plt.show()

# TBL & median height for TEL80C

In [None]:
# Total branch length and median height of every class 1 subgrain in the TEL80 control data.
data_tel80c = all_stats[all_stats["basename"].str.contains("TEL80control")]
images = data_tel80c["image"].unique()
tbl_height_pairs = []
for image in images:
    subdata_images = data_tel80c[data_tel80c["image"] == image]
    grain_numbers = subdata_images["grain_number"].unique()
    for grain_number in grain_numbers:
        subdata_grain = subdata_images[subdata_images["grain_number"] == grain_number]
        subdata_subgrains = subdata_grain["subgrain_number"].unique()
        for subgrain_number in subdata_subgrains:
            subdata_subgrain = subdata_grain[subdata_grain["subgrain_number"] == subgrain_number]
            # each (image, grain, subgrain) is expected to be a single row
            assert len(subdata_subgrain) == 1, f"len(subdata_subgrain) = {len(subdata_subgrain)}"
            class_number = subdata_subgrain["class_number"].values[0]
            if class_number == 1:
                tbl = subdata_subgrain["total_branch_lengths"].values[0]
                if np.isnan(tbl):
                    continue
                median_height = subdata_subgrain["height_median"].values[0]
                if np.isnan(median_height):
                    continue
                # only subgrains with both values present are kept
                tbl_height_pairs.append((tbl, median_height))

tbls = [pair[0] for pair in tbl_height_pairs]
heights = [pair[1] for pair in tbl_height_pairs]

# These are means and standard deviations, so they are named as such. They used to be stored
# in median_*/iqr_* variables, which overwrote the TEL12 control cell's names with a
# different statistic.
mean_median_height = np.mean(heights)
std_median_height = np.std(heights)
mean_total_branch_lengths = np.mean(tbls)
std_total_branch_lengths = np.std(tbls)

print(f"TEL80control | mean median height: {mean_median_height:.2f} nm | std: {std_median_height:.2f} nm")
print(
    f"TEL80control | mean total branch lengths: {mean_total_branch_lengths:.2f} nm | std: {std_total_branch_lengths:.2f} nm"
)
print(f"N: {len(tbls)}")

# the y values are per-subgrain median heights, as in the TEL12 control plot
print("median heights vs total branch lengths for tel80 control")

plt.figure()
plt.scatter(tbls, heights)
plt.xlabel("Total branch lengths (nm)")
plt.ylabel("Median height (nm)")
plt.ylim(0, 3.0)
plt.show()

# TBL for TEL12

In [None]:
all_stats_tbl_in_nm = all_stats.copy()
# NOTE(review): str.contains("") matches every non-null image name, so nothing is filtered
# here even though the count below is labelled "2025" - confirm the intended pattern.
all_stats_data_filtered = all_stats_tbl_in_nm[all_stats_tbl_in_nm["image"].str.contains("")]
print(f"2025: {len(all_stats_data_filtered)}")
data_tel12c = all_stats_data_filtered[all_stats_data_filtered["basename"].str.contains("TEL12control")]
print(f"tel12c: {len(data_tel12c)}")
data_tel12s = all_stats_data_filtered[all_stats_data_filtered["basename"].str.contains("TEL12shelterin")]
print(f"tel12s: {len(data_tel12s)}")
data_tel12trf1 = all_stats_data_filtered[all_stats_data_filtered["basename"].str.contains("TEL12TRF1")]
print(f"tel12trf1: {len(data_tel12trf1)}")
# "TEL12TRF2" also matches the TEL12TRF2c* variants, which are removed for the second plot
data_tel12trf2 = all_stats_data_filtered[all_stats_data_filtered["basename"].str.contains("TEL12TRF2")]
print(f"tel12trf2: {len(data_tel12trf2)}")

# keep one row per grain: subgrain 0, class 1
data_tel12c_s0_c1 = data_tel12c[(data_tel12c["subgrain_number"] == 0) & (data_tel12c["class_number"] == 1)]
print(f"tel12c_s0_c1: {len(data_tel12c_s0_c1)}")
data_tel12s_s0_c1 = data_tel12s[(data_tel12s["subgrain_number"] == 0) & (data_tel12s["class_number"] == 1)]
print(f"tel12s_s0_c1: {len(data_tel12s_s0_c1)}")
data_tel12trf1_s0_c1 = data_tel12trf1[(data_tel12trf1["subgrain_number"] == 0) & (data_tel12trf1["class_number"] == 1)]
print(f"tel12trf1_s0_c1: {len(data_tel12trf1_s0_c1)}")
data_tel12trf2_s0_c1 = data_tel12trf2[(data_tel12trf2["subgrain_number"] == 0) & (data_tel12trf2["class_number"] == 1)]
print(f"tel12trf2_s0_c1: {len(data_tel12trf2_s0_c1)}")

data_tbl_tel12 = pd.concat([data_tel12c_s0_c1, data_tel12s_s0_c1, data_tel12trf1_s0_c1, data_tel12trf2_s0_c1])
plt.figure()
# Plot the column the title describes. This figure previously used "total_contour_length",
# which is not one of the columns rescaled to nm when the data is loaded.
sns.stripplot(x="display_name", y="total_branch_lengths", data=data_tbl_tel12)
sns.boxplot(x="display_name", y="total_branch_lengths", data=data_tbl_tel12, color=boxplotcolour)
# make labels vertical
plt.xticks(rotation=90)
plt.title("Total branch lengths of TEL12 proteins class 1 only")
plt.ylabel("total branch length (nm)")
plt.show()


# replot without TEL12TRF2cb, TEL12TRF2cTRFH, TEL12TRF2cmyb, TEL12TRF2ch
# (one alternation pattern replaces four successive filters)
excluded_basename_pattern = "|".join(["TEL12TRF2cb", "TEL12TRF2cTRFH", "TEL12TRF2cmyb", "TEL12TRF2ch"])
data_tel12_s0_c1_without_mutants = data_tbl_tel12[
    ~data_tbl_tel12["basename"].str.contains(excluded_basename_pattern)
]
plt.figure()
sns.stripplot(x="display_name", y="total_branch_lengths", data=data_tel12_s0_c1_without_mutants)
sns.boxplot(
    x="display_name",
    y="total_branch_lengths",
    data=data_tel12_s0_c1_without_mutants,
    color=boxplotcolour,
)
plt.title("Total branch lengths of TEL12 proteins")
plt.xticks(rotation=90)
plt.xlabel("sample type")
plt.ylabel("total branch length (nm)")
plt.show()



In [None]:
# Row counts: everything, then only rows whose basename mentions TEL12.
is_tel12 = all_stats["basename"].str.contains("TEL12")
data_tel12 = all_stats.loc[is_tel12]
print(len(all_stats))
print(len(data_tel12))

# List the grains of every image and the classes found in each grain,
# calling out grains that contain both class 1 and class 2.
for image in pd.unique(all_stats["image"]):
    subdata_images = all_stats.loc[all_stats["image"].eq(image)]
    grain_numbers = pd.unique(subdata_images["grain_number"])
    print(f"grain numbers: {grain_numbers}")
    for grain_number in grain_numbers:
        print(f"grain number: {grain_number}")
        subdata_grain = subdata_images.loc[subdata_images["grain_number"].eq(grain_number)]
        class_numbers = pd.unique(subdata_grain["class_number"])
        print(f"class numbers: {class_numbers}")
        has_both_classes = all(wanted in class_numbers for wanted in (1, 2))
        if has_both_classes:
            print(f"image: {image} | grain number: {grain_number} | class numbers: {class_numbers}")

# median branch length for TEL12TRF1 vs TEL12TRF2 vs TEL12shelterin

In [None]:
data_tel12c = all_stats[all_stats["basename"].str.contains("TEL12control")]
print(f"tel12c: {len(data_tel12c)}")
data_tel12s = all_stats[all_stats["basename"].str.contains("TEL12shelterin")]
print(f"tel12s: {len(data_tel12s)}")
data_tel12trf1 = all_stats[all_stats["basename"].str.contains("TEL12TRF1")]
print(f"tel12trf1: {len(data_tel12trf1)}")
# "TEL12TRF2" also matches the TEL12TRF2c* variants
data_tel12trf2 = all_stats[all_stats["basename"].str.contains("TEL12TRF2")]
print(f"tel12trf2: {len(data_tel12trf2)}")

# Mean branch distance per (image, grain), computed in one pass instead of re-filtering the
# whole disordered-segment table for every grain. Grains with no segment rows are absent
# from the mapping and get NaN below.
mean_branch_distance_by_grain = (
    all_disordered_segments.groupby(["image", "grain_number"])["branch_distance"].mean().to_dict()
)

# One summary record per grain, visiting samples, images and grains in order of appearance.
bigdf_list = []
for data in [data_tel12c, data_tel12s, data_tel12trf1, data_tel12trf2]:
    for image, subdata_images in data.groupby("image", sort=False):
        for grain_number, subdata_grain in subdata_images.groupby("grain_number", sort=False):
            classes = subdata_grain["class_number"].unique()
            mean_branch_distance = mean_branch_distance_by_grain.get((image, grain_number), np.nan)
            # the first row's value is taken as the grain's total branch length
            tbl = subdata_grain["total_branch_lengths"].values[0]
            # add to df list
            bigdf_list.append(
                {
                    "image": image,
                    "grain_number": grain_number,
                    "mean_branch_distance": mean_branch_distance,
                    "basename": subdata_grain["basename"].values[0],
                    "display_name": subdata_grain["display_name"].values[0],
                    "mean_branch_distance_divided_by_total_branch_length": mean_branch_distance / tbl,
                    "total_branch_length": tbl,
                    # summed class 2 volume, divided by (1e-9)^3 (plotted as nm^3 below)
                    "total_protein_volume": subdata_grain[subdata_grain["class_number"] == 2]["volume"].sum()
                    / (1e-9) ** 3,
                    # 1 when the grain has both a class 1 and a class 2 row
                    "class_1_and_2": 1 if 1 in classes and 2 in classes else 0,
                }
            )

alldf = pd.DataFrame(bigdf_list)
print(f"num grains in bigdf: {len(alldf)}")
alldf_only_c1c2 = alldf[alldf["class_1_and_2"] == 1]
bigdf_tel12c = alldf[alldf["display_name"] == "TEL12control"]

################ Mean branch distance: TRF1 vs TRF2, all grains

print("Mean branch length for only TRF1 and TRF2")
is_trf1_or_trf2 = alldf["display_name"].isin(["TEL12TRF1", "TEL12TRF2"])
mean_branch_length_trf1_trf2 = alldf[is_trf1_or_trf2]
plt.figure()
ax = plt.gca()
sns.stripplot(data=mean_branch_length_trf1_trf2, x="display_name", y="mean_branch_distance", ax=ax)
sns.boxplot(
    data=mean_branch_length_trf1_trf2,
    x="display_name",
    y="mean_branch_distance",
    color=boxplotcolour,
    ax=ax,
)
ax.set_ylabel("Mean branch distance ($nm$)")
ax.set_xlabel("")
simpleticks()
plt.show()

# Same quantity for grains holding both class 1 and class 2 (TRF1, TRF2, shelterin),
# with every TEL12 control grain appended for comparison.
print("Mean branch length C1C2 for only TRF1 and TRF2")
protein_sample_names = ["TEL12TRF1", "TEL12TRF2", "TEL12shelterin"]
c1c2_protein_rows = alldf_only_c1c2[alldf_only_c1c2["display_name"].isin(protein_sample_names)]
df_mean_bl_plot = pd.concat([c1c2_protein_rows, bigdf_tel12c])
print(df_mean_bl_plot["display_name"].unique())
plt.figure()
ax = plt.gca()
sns.stripplot(data=df_mean_bl_plot, x="display_name", y="mean_branch_distance", ax=ax)
sns.boxplot(data=df_mean_bl_plot, x="display_name", y="mean_branch_distance", color=boxplotcolour, ax=ax)
ax.set_ylabel("Mean branch distance ($nm$)")
simpleticks()
ax.set_xlabel("")
plt.show()

############## total branch length for all except shelterin

print("Total branch length for all mutants except shelterin, control")
plot_tbl_all_but_s_c_order = [
    "TEL12TRF1",
    "TEL12TRF2",
    "TEL12TRF2cb",
    "TEL12TRF2cTRFH",
    "TEL12TRF2ch",
    "TEL12TRF2cmyb",
]
mean_branch_length_df = alldf[alldf["display_name"].isin(plot_tbl_all_but_s_c_order)]

# total branch length
plt.figure()
# Both layers receive the same explicit order. Previously only the strip plot did, so the
# boxes could be placed in a different category order from the points they summarise.
sns.stripplot(x="display_name", y="total_branch_length", data=mean_branch_length_df, order=plot_tbl_all_but_s_c_order)
sns.boxplot(
    x="display_name",
    y="total_branch_length",
    data=mean_branch_length_df,
    order=plot_tbl_all_but_s_c_order,
    color=boxplotcolour,
)

# median and IQR per sample, reported in the plotted order; samples with no rows are skipped
tbl_by_sample = mean_branch_length_df.groupby("display_name")["total_branch_length"]
tbl_medians = tbl_by_sample.median()
tbl_iqrs = tbl_by_sample.quantile(0.75) - tbl_by_sample.quantile(0.25)
for sample_name in plot_tbl_all_but_s_c_order:
    if sample_name in tbl_medians.index:
        print(f"{sample_name}: median={tbl_medians[sample_name]}, IQR={tbl_iqrs[sample_name]}")

plt.xlabel("")
plt.ylabel("Total branch length ($nm$)")
simpleticks()
plt.xticks(rotation=45)
plt.show()

######################## total branch length, grains with class 1 and 2: TRF1, TRF2, shelterin

print("total branch length C1&2 for TRF1, TRF2, shelterin")
is_protein_sample = alldf_only_c1c2["display_name"].isin(["TEL12TRF1", "TEL12TRF2", "TEL12shelterin"])
mean_branch_length_trf1_trf2_shelterin = alldf_only_c1c2[is_protein_sample]

plt.figure()
ax = plt.gca()
sns.stripplot(data=mean_branch_length_trf1_trf2_shelterin, x="display_name", y="total_branch_length", ax=ax)
sns.boxplot(
    data=mean_branch_length_trf1_trf2_shelterin,
    x="display_name",
    y="total_branch_length",
    color=boxplotcolour,
    ax=ax,
)
ax.set_ylabel("Total branch length ($nm$)")
ax.set_xlabel("")
simpleticks()
plt.show()


######################## same plot for the four main samples, all grains

print("total branch length for main samples only")
main_sample_names = ["TEL12control", "TEL12TRF1", "TEL12TRF2", "TEL12shelterin"]
mean_branch_length_control_trf1_trf2_shelterin = alldf[alldf["display_name"].isin(main_sample_names)]
plt.figure()
ax = plt.gca()
sns.stripplot(data=mean_branch_length_control_trf1_trf2_shelterin, x="display_name", y="total_branch_length", ax=ax)
sns.boxplot(
    data=mean_branch_length_control_trf1_trf2_shelterin,
    x="display_name",
    y="total_branch_length",
    color=boxplotcolour,
    ax=ax,
)
ax.set_title("Total branch length")
ax.set_ylabel("Total branch length ($nm$)")
plt.show()

# number of grains behind each category of the plot above
main_sample_column = mean_branch_length_control_trf1_trf2_shelterin["display_name"]
for display_name in main_sample_column.unique():
    print(f"{display_name}: {(main_sample_column == display_name).sum()}")

########################## bridged vs non bridged


def _print_bridged_counts(grains):
    """Print, for each display name in ``grains``, the bridged / not bridged grain counts.

    ``grains`` needs a "display_name" column and a boolean "bridged" column. A sample with
    no unbridged grains reports a ratio of inf instead of raising ZeroDivisionError.
    """
    for displayname in grains["display_name"].unique():
        is_bridged = grains.loc[grains["display_name"] == displayname, "bridged"]
        n_bridged = int(is_bridged.sum())
        n_not_bridged = int((~is_bridged).sum())
        n_total = n_bridged + n_not_bridged
        percentage_bridged = n_bridged / n_total * 100 if n_total else float("nan")
        if n_not_bridged:
            ratio = n_bridged / n_not_bridged
        else:
            ratio = float("inf") if n_bridged else float("nan")
        # print with padding so it's all lined up
        print(
            f"{displayname:<20} bridged: {n_bridged:<8} not bridged: {n_not_bridged:<8} percentage: {percentage_bridged:>10,.2f}%   ratio: {ratio:<10,.2f}"
        )


def _plot_bridged_split(grains):
    """Overlay strip and box plots of total branch length for bridged and not-bridged grains.

    All four layers are given the same category order, so a sample keeps the same x
    position even when it is missing from one of the two subsets.
    """
    category_order = list(grains["display_name"].dropna().unique())
    plt.figure()
    for subset in (grains[grains["bridged"]], grains[~grains["bridged"]]):
        sns.stripplot(x="display_name", y="total_branch_length", data=subset, order=category_order)
        sns.boxplot(
            x="display_name",
            y="total_branch_length",
            data=subset,
            order=category_order,
            color=boxplotcolour,
        )
    plt.title("Total branch length bridged vs not bridged")
    plt.xticks(rotation=90)
    plt.ylabel("total branch length ($nm$)", fontsize=14)
    plt.xlabel("")
    plt.show()


print("bridged vs non bridged")
# a grain counts as bridged when its total branch length exceeds this value (nm)
total_branch_bridge_threshold = 600
alldf["bridged"] = alldf["total_branch_length"] > total_branch_bridge_threshold
bridged = alldf[alldf["bridged"]]
not_bridged = alldf[~alldf["bridged"]]
_plot_bridged_split(alldf)

# For each display name, print number of bridged vs not bridged. This now covers every
# sample in alldf; it used to loop over the names found in alldf_only_c1c2 while counting
# rows of alldf, which skipped samples with no class 1 + class 2 grains.
_print_bridged_counts(alldf)


########################## TBL bridged vs non bridged class 1&2 only
# copy first: alldf_only_c1c2 is a slice of alldf and a column is added to it here
alldf_only_c1c2 = alldf_only_c1c2.copy()
alldf_only_c1c2["bridged"] = alldf_only_c1c2["total_branch_length"] > total_branch_bridge_threshold
print("TBL bridged vs non bridged class 1&2 only")
_plot_bridged_split(alldf_only_c1c2)
# for each display name, print number of bridged vs not bridged
_print_bridged_counts(alldf_only_c1c2)

########################## protein volume vs total branch length, every class 1 + 2 grain

print("tbl vs volume scatter for all")

plt.figure()
ax = plt.gca()
sns.scatterplot(data=alldf_only_c1c2, x="total_protein_volume", y="total_branch_length", ax=ax)
plt.show()

############# the same scatter split by sample: TRF1, TRF2, shelterin

c1c2_sample = alldf_only_c1c2["display_name"]
trf1_only = alldf_only_c1c2[c1c2_sample == "TEL12TRF1"]
trf2_only = alldf_only_c1c2[c1c2_sample == "TEL12TRF2"]
shelterin_only = alldf_only_c1c2[c1c2_sample == "TEL12shelterin"]

# each subset with its legend entry (sample name and grain count)
labelled_subsets = [
    (trf1_only, f"TRF1 n={len(trf1_only)}"),
    (trf2_only, f"TRF2 n={len(trf2_only)}"),
    (shelterin_only, f"shelterin n={len(shelterin_only)}"),
]

# the figure is drawn twice: first points only, then points plus a regression line per sample
scatter_figures = [
    ("tbl vs protein volume scatter for TRF1, TRF2, shelterin", False),
    ("tbl vs protein volume scatter for TRF1, TRF2, shelterin with regression", True),
]
for heading, with_regression in scatter_figures:
    print(heading)
    plt.figure()
    ax = plt.gca()
    for subset, legend_label in labelled_subsets:
        sns.scatterplot(data=subset, x="total_protein_volume", y="total_branch_length", label=legend_label, ax=ax)
    if with_regression:
        # regression plus confidence intervals for each scatter plot
        for subset, _ in labelled_subsets:
            sns.regplot(data=subset, x="total_protein_volume", y="total_branch_length", scatter=False, ax=ax)
    ax.set_xlabel("Total protein volume ($nm^3$)")
    ax.set_ylabel("Total branch length ($nm$)")
    ax.legend(loc="lower right")
    plt.show()

############### total protein volume per sample, grains with class 1 and 2 only

plt.figure()
ax = plt.gca()
sns.stripplot(data=alldf_only_c1c2, x="display_name", y="total_protein_volume", ax=ax)
# the helper draws the boxes and prints each sample's median and IQR
boxplot("display_name", "total_protein_volume", alldf_only_c1c2)
ax.set_title("Total protein volume")
ax.set_xlabel("sample type", fontsize=14)
ax.set_ylabel("Total protein volume ($nm^3$)", fontsize=14)
plt.xticks(rotation=90)
plt.show()

In [None]:
print(len(all_stats))

# Mean branch distance per (image, grain), computed in one pass instead of re-filtering the
# whole disordered-segment table for every grain. Grains with no segment rows are absent
# from the mapping and get NaN below.
mean_branch_distance_by_grain = (
    all_disordered_segments.groupby(["image", "grain_number"])["branch_distance"].mean().to_dict()
)

# One summary record per grain across every sample, in order of first appearance.
all_df_list = []
for image, subdata_images in all_stats.groupby("image", sort=False):
    for grain_number, subdata_grain in subdata_images.groupby("grain_number", sort=False):
        classes = subdata_grain["class_number"].unique()
        mean_branch_distance = mean_branch_distance_by_grain.get((image, grain_number), np.nan)
        # the first row's value is taken as the grain's total branch length
        tbl = subdata_grain["total_branch_lengths"].values[0]
        # add to df list
        all_df_list.append(
            {
                "image": image,
                "grain_number": grain_number,
                "mean_branch_distance": mean_branch_distance,
                "basename": subdata_grain["basename"].values[0],
                "display_name": subdata_grain["display_name"].values[0],
                "mean_branch_distance_divided_by_total_branch_length": mean_branch_distance / tbl,
                "total_branch_length": tbl,
                # summed class 2 volume, divided by (1e-9)^3
                "total_protein_volume": subdata_grain[subdata_grain["class_number"] == 2]["volume"].sum()
                / (1e-9) ** 3,
                # 1 when the grain has both a class 1 and a class 2 row
                "class_1_and_2": 1 if 1 in classes and 2 in classes else 0,
            }
        )

# NOTE: this replaces the TEL12-only alldf / alldf_only_c1c2 built in the previous cell
alldf = pd.DataFrame(all_df_list)
print(f"num grains in alldf: {len(alldf)}")

# only if they have class 1 and 2
alldf_only_c1c2 = alldf[alldf["class_1_and_2"] == 1]
alldf_only_c1 = alldf[alldf["class_1_and_2"] == 0]

# print sample types with number of samples, for all grains and for each subset;
# every section lists all sample types found in alldf, including those with a zero count
count_sections = [
    ("-------- all samples ----------", alldf),
    ("-------- only grains with proteins ----------", alldf_only_c1c2),
    ("-------- only grains without proteins ----------", alldf_only_c1),
]
for section_heading, section_df in count_sections:
    print(section_heading)
    for display_name in alldf["display_name"].unique():
        print(f"{display_name}: {len(section_df[section_df['display_name'] == display_name])}")
    print(f"total: {len(section_df)}")
    print("\n")
print("\n")