In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory holding the exported statistics. Defaults to the original analysis
# location; set the SHELTERIN_DATA_DIR environment variable to point the
# notebook at a copy of the data on another machine.
base_dir = Path(os.environ.get("SHELTERIN_DATA_DIR", "/Users/sylvi/topo_data/shelterin/"))
if not base_dir.is_dir():
    raise FileNotFoundError(f"Data directory not found: {base_dir}")

# Single CSV with one row per measured object; every later cell filters this frame.
all_stats_file = base_dir / "all_statistics.csv"
if not all_stats_file.is_file():
    raise FileNotFoundError(f"Statistics file not found: {all_stats_file}")
all_stats = pd.read_csv(all_stats_file)

# Show the available columns so the filters used below can be checked against them.
print(all_stats.columns)

## Protein volume for TEL80 & TEL80+S

In [None]:
# Protein volumes for shelterin-treated (TEL80+S) and control (TEL80) samples.
# Proteins are the rows with class_number == 2.
#
# Every mask is built on all_stats and applied to all_stats, so the boolean
# index always lines up with the frame being filtered. Applying a full-length
# mask to an already-filtered frame forces pandas to reindex the mask and
# emits "Boolean Series key will be reindexed to match DataFrame index".
is_protein = all_stats["class_number"] == 2
is_shelterin_tel80 = all_stats["category"] == "Shelterin_TEL80"
is_control_tel80 = all_stats["category"] == "Control_TEL80"

data_volume_tel80_s = all_stats.loc[is_shelterin_tel80 & is_protein, "volume"]
data_volume_tel80_c = all_stats.loc[is_control_tel80 & is_protein, "volume"]

# Strip plot of both volume distributions on the same axes
plt.figure()
sns.stripplot(data_volume_tel80_s, jitter=True, label="TEL80+S")
sns.stripplot(data_volume_tel80_c, jitter=True, label="TEL80")
plt.title("Protein Volumes")
plt.legend()
plt.show()

In [None]:
# Total protein volume per grain for the shelterin-treated TEL80 sample.
# Select the class 2 (protein) rows, keep only the grouping keys and the
# volume, then sum the volumes within each (image, grain_number) pair.
data_volume_tel80_s = (
    all_stats.loc[
        (all_stats["category"] == "Shelterin_TEL80") & (all_stats["class_number"] == 2),
        ["image", "grain_number", "volume"],
    ]
    .groupby(["image", "grain_number"])
    .sum()
)

print(data_volume_tel80_s)

# Strip plot of the summed volumes, one point per grain
per_grain_total_volume = data_volume_tel80_s["volume"]
plt.figure()
sns.stripplot(per_grain_total_volume, jitter=True)
plt.title("Total Protein Volumes\n(sum of each grain's protein masks' volumes)")
plt.show()

## TBL for TEL80 & TEL80+S

In [None]:
# Total branch lengths (TBL) for shelterin-treated vs control TEL80.
# Rows are restricted to class_number == 1 and subgrain_number == 0.
# NOTE(review): presumably this leaves one row per grain - confirm against the
# statistics export.
#
# The masks are built on all_stats and applied to all_stats so the boolean
# index matches the frame being filtered; a full-length mask applied to an
# already-filtered frame makes pandas reindex the mask and warn.
is_tbl_row = (all_stats["class_number"] == 1) & (all_stats["subgrain_number"] == 0)
is_shelterin_tel80 = all_stats["category"] == "Shelterin_TEL80"
is_control_tel80 = all_stats["category"] == "Control_TEL80"

data_tbl_tel80_s = all_stats.loc[is_shelterin_tel80 & is_tbl_row, "total_branch_lengths"]
data_tbl_tel80_c = all_stats.loc[is_control_tel80 & is_tbl_row, "total_branch_lengths"]

# Strip plot of both TBL distributions on the same axes
plt.figure()
sns.stripplot(data_tbl_tel80_s, jitter=True, label="TEL80+S")
sns.stripplot(data_tbl_tel80_c, jitter=True, label="TEL80 C")
plt.title("Total Branch Lengths")
plt.legend()

plt.show()

In [None]:
# 2D plots of TBL against protein volume for TEL80+S grains.
# For every grain that has both class 1 and class 2 rows, pair its TBL with
# (a) the summed volume and (b) the largest volume of its class 2 (protein) rows.

# Rows of category Shelterin_TEL80
data_tels80s = all_stats[all_stats["category"] == "Shelterin_TEL80"]
print(len(data_tels80s))

images = data_tels80s["image"].unique()

tbl_total_volume_pairs = []
tbl_maximum_volume_pairs = []

for image in images:
    # Filter to this image once and reuse the subset for each of its grains,
    # instead of re-scanning the whole category frame for every grain.
    subdata_image = data_tels80s[data_tels80s["image"] == image]
    grain_numbers = subdata_image["grain_number"].unique()
    for grain_number in grain_numbers:
        subdata_grain = subdata_image[subdata_image["grain_number"] == grain_number]

        # Classes present in this grain
        class_numbers = subdata_grain["class_number"].unique()

        # Only grains containing both class 1 and class 2 rows contribute a point
        if 1 in class_numbers and 2 in class_numbers:
            # NOTE(review): TBL is read from the grain's first row whatever its
            # class or subgrain; this assumes total_branch_lengths is identical
            # on every row of a grain - confirm, since the strip-plot cells
            # restrict TBL to class 1 / subgrain 0 rows.
            tbl = subdata_grain.iloc[0]["total_branch_lengths"]

            # Volumes of the protein (class 2) rows, selected once for both statistics
            protein_volumes = subdata_grain.loc[subdata_grain["class_number"] == 2, "volume"]
            total_volume = protein_volumes.sum()
            maximum_protein_volume = protein_volumes.max()

            tbl_total_volume_pairs.append((tbl, total_volume))
            tbl_maximum_volume_pairs.append((tbl, maximum_protein_volume))


# Number of grains that contributed a point
print(len(tbl_total_volume_pairs))

# Split the pairs into coordinate lists; both pair lists share the same TBL order
tbls = [pair[0] for pair in tbl_total_volume_pairs]
total_volumes = [pair[1] for pair in tbl_total_volume_pairs]
maximum_volumes = [pair[1] for pair in tbl_maximum_volume_pairs]

plt.figure()
plt.scatter(tbls, total_volumes)
plt.xlabel("Total Branch Lengths")
plt.ylabel("Total Protein Volumes")
plt.title("TEL80+S Total Branch Lengths vs Total Protein Volumes")
plt.show()

plt.figure()
plt.scatter(tbls, maximum_volumes)
plt.xlabel("Total Branch Lengths")
plt.ylabel("Maximum Protein Volume")
plt.title("TEL80+S Total Branch Lengths vs Maximum Protein Volume")
plt.show()

## TBL & median height for control TEL80 (CTEL80)

In [None]:
# Control TEL80: median height against total branch length.
# Keep only the class_number == 1 rows of the Control_TEL80 category.
control_class1_mask = (all_stats["category"] == "Control_TEL80") & (all_stats["class_number"] == 1)
data_tel80c = all_stats.loc[control_class1_mask]

tbls = data_tel80c["total_branch_lengths"]
median_heights = data_tel80c["height_median"]

# Scatter of median height (y) against TBL (x)
plt.figure()
plt.scatter(tbls, median_heights)
plt.xlabel("Total Branch Lengths")
plt.ylabel("Median Heights")
plt.title("Control TEL80 Median Heights vs Total Branch Lengths")
plt.show()

# Summary statistics for both plotted quantities
mean_median_height, std_median_height = np.mean(median_heights), np.std(median_heights)
mean_tbl, std_tbl = np.mean(tbls), np.std(tbls)
print(f"mean median height: {mean_median_height:.2e}, std median height: {std_median_height:.2e}")
print(f"mean tbl: {mean_tbl:.2e}, std tbl: {std_tbl:.2e}")

In [None]:
# TBL for the TEL12 control, TEL12 + shelterin, TRF1 and TRF2 categories.

# Optional, currently disabled: restrict to recent data, i.e. rows whose image
# filename contains a 2025 date.
# cutoff_date = "2025"
# recent_data = all_stats[all_stats["image"].str.contains(cutoff_date)]

# Short label used in the printed counts -> value of the "category" column.
# Dict order is the order the categories appear along the plot's x axis.
tbl_categories = {
    "tel12c": "Control_TEL12",
    "tel12s": "Shelterin_TEL12",
    "trf1": "TRF1",
    "trf2": "TRF2",
}

# Apply the same selection to every category: report the raw row count, then
# keep only class_number == 1 rows with subgrain_number == 0.
tbl_rows_by_label = {}
for label, category in tbl_categories.items():
    category_rows = all_stats[all_stats["category"] == category]
    print(f"all {label} {len(category_rows)}")
    category_rows = category_rows[category_rows["class_number"] == 1]
    tbl_rows_by_label[label] = category_rows[category_rows["subgrain_number"] == 0]

data_tbl_tel12c = tbl_rows_by_label["tel12c"]
data_tbl_tel12s = tbl_rows_by_label["tel12s"]
data_tbl_trf1 = tbl_rows_by_label["trf1"]
data_tbl_trf2 = tbl_rows_by_label["trf2"]

print(f"trf1 after only class 1 and only subgrain 0 {len(data_tbl_trf1)}")
print(f"trf2 after only class 1 and only subgrain 0 {len(data_tbl_trf2)}")

# Stack the four filtered frames into one long-form frame for plotting
data_tbl_tel12 = pd.concat([data_tbl_tel12c, data_tbl_tel12s, data_tbl_trf1, data_tbl_trf2])

plt.figure()
sns.stripplot(x="category", y="total_branch_lengths", data=data_tbl_tel12)
plt.title("Total Branch Lengths for TEL12, TRF1, TRF2")
plt.show()

In [None]:
# Area distribution of the TRF1 category, restricted to class_number == 1 rows.
trf1_class1_mask = (all_stats["category"] == "TRF1") & (all_stats["class_number"] == 1)
data_area_trf1 = all_stats.loc[trf1_class1_mask]

# Strip plot of the "area" column
trf1_areas = data_area_trf1["area"]
plt.figure()
sns.stripplot(trf1_areas, jitter=True)
plt.title("TRF1 Area")
plt.show()