In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from topostats.io import LoadScans
from topostats.plotting import Colormap

# Colour map shared by the example-image cells further down.
colormap = Colormap()
cmap = colormap.get_cmap()

# Lower / upper limits of the colour scale handed to ``plt.imshow``.
vmin, vmax = -3, 4

In [None]:
# NOTE: these are absolute, machine-specific locations; edit them when running elsewhere.
base_dir = Path("/Users/sylvi/topo_data/topostats_2/datasets/picoz-nicked-sc")
output_dir = base_dir / "output_abs_07"
figure_dir = Path("/Users/sylvi/topo_data/topostats_2/figures/fig-dataset-separation/")

# Fail early and say which directory is missing, instead of a bare AssertionError.
for required_dir in (base_dir, output_dir, figure_dir):
    assert required_dir.exists(), f"Directory not found: {required_dir}"

# Statistics table read from the output directory; every later cell works on this frame.
df_allstats = pd.read_csv(output_dir / "all_statistics.csv")

# m to nm
df_allstats["total_contour_length"] /= 1e-9


def calculate_num_char_in_string(input_string: str, character: str) -> int:
    """
    Count how many times ``character`` occurs in ``input_string``.

    Parameters
    ----------
    input_string : str
        Text to search. A missing value (anything ``pd.isna`` reports as NA, e.g. NaN or None)
        is treated as containing no occurrences.
    character : str
        Substring to count, using ``str.count`` (non-overlapping matches).

    Returns
    -------
    int
        Number of occurrences, or 0 when ``input_string`` is missing.
    """
    is_missing = pd.isna(input_string)
    return 0 if is_missing else input_string.count(character)


def remove_datapoints_outside_n_std(df: pd.DataFrame, column: str, n_std: float) -> pd.DataFrame:
    """
    Keep only the rows whose ``column`` value lies within ``n_std`` standard deviations of the mean.

    The mean and (sample) standard deviation are computed over the whole of ``column``, not per
    group. Both bounds are inclusive. Rows with a missing value in ``column`` are always dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Table to filter; it is not modified.
    column : str
        Name of the numeric column to test.
    n_std : float
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pd.DataFrame
        New frame holding the surviving rows, with their original index labels.
    """
    values = df[column]
    centre = values.mean()
    spread = values.std()
    if pd.isna(spread):
        # With fewer than two valid values the standard deviation is undefined (NaN). Comparing
        # against NaN bounds would discard every row, so keep whatever valid rows there are.
        return df[values.notna()]
    half_width = n_std * spread
    # ``between`` is inclusive on both ends by default and is False for NaN values.
    return df[values.between(centre - half_width, centre + half_width)]


# Count the "+" and "-" symbols in every row's writhe string (a missing string counts as zero),
# then add the two counts together.
for count_column, symbol in (("num_plusses", "+"), ("num_minuses", "-")):
    df_allstats[count_column] = df_allstats["writhe_string"].apply(calculate_num_char_in_string, character=symbol)
df_allstats["num_plusses_or_minuses"] = df_allstats["num_plusses"].add(df_allstats["num_minuses"])

print(df_allstats.columns)

In [None]:
# Violin plot of the number of strand crossings per dataset (values beyond 3 std removed).
# seaborn orders string categories by first appearance in the data, so the category order is
# pinned explicitly to guarantee that the hand-written tick labels below match the violins.
crossings_violin_order = ["data/supercoiled", "data/nicked"]
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(
    data=remove_datapoints_outside_n_std(df_allstats, "num_plusses_or_minuses", 3),
    x="basename",
    y="num_plusses_or_minuses",
    order=crossings_violin_order,
    ax=ax,
    bw_adjust=0.7,  # < 1 narrows the KDE bandwidth, i.e. less smoothing than the default
    inner=None,  # violin without the boxplot inside
    linewidth=2,
)
ax.set_xticks(ticks=[0, 1])
ax.set_xticklabels(labels=["PicoZ Supercoiled", "PicoZ Nicked"])
ax.set_ylabel("Number of strand crossings", fontsize=16)
ax.set_xlabel("")
# line thickness for axes thicker
axes_linewidth = 2
for spine in ax.spines.values():
    spine.set_linewidth(axes_linewidth)
# make y ticks be integers only
ax.set_yticks(ticks=np.arange(0, 4))
# text size
ax.tick_params(axis="both", which="major", labelsize=16)
plt.savefig(figure_dir / "fig-dataset-separation-violin", bbox_inches="tight")
plt.show()


# stacked bar chart of num_plusses_or_minuses
fig, ax = plt.subplots(figsize=(6, 6))
df_counts = df_allstats.groupby(["basename", "num_plusses_or_minuses"]).size().unstack(fill_value=0)
df_counts = df_counts.loc[:, sorted(df_counts.columns)]
# groupby sorts the row index alphabetically ("data/nicked" before "data/supercoiled"), which is
# the opposite of the tick labels written below. Reorder the rows so each bar gets its own label.
df_counts = df_counts.reindex(["data/supercoiled", "data/nicked"])
df_counts.plot(kind="bar", stacked=True, ax=ax)
ax.set_xticks(ticks=[0, 1])
ax.set_xticklabels(labels=["PicoZ Supercoiled", "PicoZ Nicked"], rotation=0)
ax.set_ylabel("Count", fontsize=16)
ax.set_xlabel("")
# line thickness for axes thicker
axes_linewidth = 2
for spine in ax.spines.values():
    spine.set_linewidth(axes_linewidth)
# text size
ax.tick_params(axis="both", which="major", labelsize=16)
plt.savefig(figure_dir / "fig-dataset-separation-stacked-bar", bbox_inches="tight")
plt.show()

# stacked bar chart of % num_plusses_or_minuses rather than counts
fig, ax = plt.subplots(figsize=(6, 6))
# convert the dataframe to just be number of counts using groupby. groupby(x).size() returns a series with
# multiple indices, where the first index is the groupby column and the second index is the value column
# We then need to use unstack to convert the second index to columns, and fill_value=0 to fill in any missing
# values with 0 since the series doesn't require each group to have all possible values
# might want to consider using multiindex series elsewhere, since we often have data that doesn't have all values for
# each group?
df_counts_num_plus_minus_series = df_allstats.groupby(["basename", "num_plusses_or_minuses"]).size()
print(" --- Grouped counts --- ")
print(df_counts_num_plus_minus_series)
print("--- Unstacked with fill_value=0 ---")
df_counts = df_counts_num_plus_minus_series.unstack(fill_value=0)
print(df_counts)
# divide by the row sums and scale by 100 so the plotted values really are percentages,
# matching the "Percentage" axis label
df_percents = df_counts.div(df_counts.sum(axis=1), axis=0) * 100

# groupby sorts the rows alphabetically; put them in the order the tick labels are written in
order = ["data/supercoiled", "data/nicked"]
df_counts = df_counts.reindex(order)
df_percents = df_percents.reindex(order)

df_percents.plot(kind="bar", stacked=True, ax=ax, width=0.7, colormap="Blues_r")
ax.margins(x=0.1)
ax.set_xticks(ticks=[0, 1])
ax.set_xticklabels(labels=["Supercoiled", "Nicked"], rotation=0)
ax.set_ylabel("Percentage", fontsize=16)
ax.set_xlabel("")
# line thickness for axes thicker
axes_linewidth = 2
for spine in ax.spines.values():
    spine.set_linewidth(axes_linewidth)
# y ticks every 10 %, from 0 to 100
ax.set_yticks(ticks=np.arange(0, 101, 10))
# text size
ax.tick_params(axis="both", which="major", labelsize=16)
# legend: the x range extends past the two bars, presumably to leave empty space for it on the right
ax.set_xlim(-0.5, 2.5)
ax.legend(title="No. crossings", title_fontsize=14, fontsize=12, loc="upper right", frameon=False)
plt.savefig(figure_dir / "fig-dataset-separation-stacked-bar-percent", bbox_inches="tight")
plt.show()

In [None]:
# Example scans for each dataset. The paths are built from ``output_dir`` (set in the loading cell)
# so they follow the same output folder the statistics were read from, instead of repeating the
# absolute path.
nicked_molecule_example_file = output_dir / "nicked" / "processed" / "20250926_nicked_picoz.0_00008.topostats"
assert nicked_molecule_example_file.exists(), f"File not found: {nicked_molecule_example_file}"
supercoiled_molecule_example_file = (
    output_dir / "supercoiled" / "processed" / "20250926_supercoiled_picoz.0_00003.topostats"
)
assert supercoiled_molecule_example_file.exists(), f"File not found: {supercoiled_molecule_example_file}"

# NOTE(review): "dummy" looks like a placeholder channel name — presumably unused for .topostats
# files; confirm against the LoadScans documentation.
loadscans = LoadScans([nicked_molecule_example_file, supercoiled_molecule_example_file], channel="dummy")
loadscans.get_data()
loadscans_image_dictionary = loadscans.img_dict

# The image dictionary is looked up by file stem (file name without the final suffix).
nicked_example_image_data = loadscans_image_dictionary[nicked_molecule_example_file.stem]
supercoiled_example_image_data = loadscans_image_dictionary[supercoiled_molecule_example_file.stem]

# Crop window for the nicked example: (x, y) is the start column / row of the window,
# crop_size[0] the number of rows and crop_size[1] the number of columns.
nicked_crop_x = 200
nicked_crop_y = 650
nicked_crop_size = (230, 230)
nicked_example_image = nicked_example_image_data["image"]
nicked_example_image_crop = nicked_example_image[
    nicked_crop_y : nicked_crop_y + nicked_crop_size[0], nicked_crop_x : nicked_crop_x + nicked_crop_size[1]
]

plt.imshow(nicked_example_image_crop, cmap=cmap, vmin=vmin, vmax=vmax)
plt.show()

# The supercoiled example is shown uncropped.
plt.imshow(supercoiled_example_image_data["image"], cmap=cmap, vmin=vmin, vmax=vmax)
plt.show()

# actually can just crop from output images directly.

In [None]:
# Quick-look violin plots of two columns of the statistics table, one figure per column,
# split by dataset ("basename").
for quick_look_column in ("aspect_ratio", "smallest_bounding_area"):
    sns.violinplot(data=df_allstats, x="basename", y=quick_look_column)
    plt.show()

# Violin plot of total contour length per dataset (values beyond 3 std removed).
# seaborn orders string categories by first appearance in the data, so the category order is
# pinned explicitly to guarantee that the hand-written tick labels below match the violins.
contour_violin_order = ["data/supercoiled", "data/nicked"]
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(
    data=remove_datapoints_outside_n_std(df=df_allstats, column="total_contour_length", n_std=3),
    ax=ax,
    x="basename",
    y="total_contour_length",
    order=contour_violin_order,
)
ax.set_xlabel("")
ax.set_ylabel("Plasmid length (nm)", fontsize=16)
# line thickness for axes thicker
axes_linewidth = 2
for spine in ax.spines.values():
    spine.set_linewidth(axes_linewidth)
ax.set_xticks(ticks=[0, 1])
ax.set_xticklabels(labels=["PicoZ Supercoiled", "PicoZ Nicked"])
ax.tick_params(axis="both", which="major", labelsize=16)
plt.savefig(figure_dir / "fig-dataset-separation-violin-contour-length", bbox_inches="tight")
plt.show()

# box plot with dots
# Filter once and reuse the result, so the box and the dots are drawn from exactly the same rows.
df_contour_length_filtered = remove_datapoints_outside_n_std(
    df=df_allstats, column="total_contour_length", n_std=3
)
# Pin the category order for both layers so they line up with each other and with the
# hand-written tick labels below (seaborn otherwise uses order of first appearance).
contour_boxplot_order = ["data/supercoiled", "data/nicked"]
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(
    data=df_contour_length_filtered,
    ax=ax,
    x="basename",
    y="total_contour_length",
    order=contour_boxplot_order,
    showfliers=False,
)
sns.stripplot(
    data=df_contour_length_filtered,
    ax=ax,
    x="basename",
    y="total_contour_length",
    order=contour_boxplot_order,
    color="black",
    jitter=True,
    size=5,
    alpha=0.7,
)
ax.set_xlabel("")
ax.set_ylabel("Plasmid length (nm)", fontsize=16)
# line thickness for axes thicker
axes_linewidth = 2
for spine in ax.spines.values():
    spine.set_linewidth(axes_linewidth)
ax.set_xticks(ticks=[0, 1])
ax.set_xticklabels(labels=["Supercoiled", "Nicked"])
ax.tick_params(axis="both", which="major", labelsize=16)
plt.savefig(figure_dir / "fig-dataset-separation-boxplot-contour-length", bbox_inches="tight")
plt.show()

# Medians are computed on the full, unfiltered table (not the 3-std-filtered rows plotted above).
print(f"medians: {df_allstats.groupby('basename')['total_contour_length'].median()}")