In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Locations of the results analysed in this notebook.
# NOTE(review): this is an absolute, machine-specific path; it must be edited (or made
# configurable) before the notebook can run on another machine.
base_dir = Path("/Users/sylvi/topo_data/topostats_2/datasets/picoz-nicked-sc")
# Fail early, and say which path is missing, rather than erroring later inside read_csv.
assert base_dir.exists(), f"Dataset directory not found: {base_dir}"
output_dir = base_dir / "output_std_10"
assert output_dir.exists(), f"Output directory not found: {output_dir}"

# Statistics table that every later cell reads from and adds columns to.
df_allstats = pd.read_csv(output_dir / "all_statistics.csv")


def calculate_num_char_in_string(input_string: str, character: str) -> int:
    """Count how many times ``character`` appears in ``input_string``.

    Missing values (anything for which ``pd.isna`` is true, e.g. NaN or None)
    are treated as containing no occurrences and give 0.
    """
    return 0 if pd.isna(input_string) else input_string.count(character)

def remove_datapoints_outside_n_std(df: pd.DataFrame, column: str, n_std: float) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is within ``n_std`` standard deviations of the mean.

    The mean and standard deviation are taken over ``df[column]``. Both bounds
    are inclusive. ``df`` itself is not modified; a filtered frame is returned.
    """
    values = df[column]
    centre = values.mean()
    half_width = n_std * values.std()
    # `between` is inclusive on both ends by default, i.e. lower <= value <= upper.
    within_bounds = values.between(centre - half_width, centre + half_width)
    return df[within_bounds]


# Count the "+" and "-" symbols in each row's writhe string.
df_allstats["num_plusses"] = df_allstats["writhe_string"].apply(calculate_num_char_in_string, args=("+",))
df_allstats["num_minuses"] = df_allstats["writhe_string"].apply(calculate_num_char_in_string, args=("-",))
# Combined count of both symbols per row.
df_allstats["num_plusses_or_minuses"] = df_allstats["num_plusses"].add(df_allstats["num_minuses"])

# List the available columns, including the three just added.
print(df_allstats.columns)

In [None]:
# Filter outliers (more than 3 standard deviations from the mean) once and reuse the same
# frame for both crossing plots, so the two figures always show identical data.
# NOTE(review): the original comment here read "with increased smoothing" - presumably a
# description of the dataset being plotted; confirm.
crossings_within_3_std = remove_datapoints_outside_n_std(df_allstats, "num_plusses_or_minuses", 3)

# NOTE(review): these tick labels assume the first category on the x-axis is the supercoiled
# sample and the second is the nicked one - verify against the actual `basename` values.
sample_labels = ["PicoZ Supercoiled", "PicoZ Nicked"]

# Alternative view:
# sns.stripplot(data=crossings_within_3_std, x="basename", y="num_plusses_or_minuses", jitter=True)
sns.violinplot(data=crossings_within_3_std, x="basename", y="num_plusses_or_minuses")
plt.xticks(ticks=[0, 1], labels=sample_labels)
plt.ylabel("Number of crossings")
plt.show()

# Grouped histogram: one bar per crossing count (hue) within each sample (x).
hist_ax = sns.histplot(data=crossings_within_3_std, x="basename", hue="num_plusses_or_minuses", multiple="dodge", shrink=0.8)
# rename the xlabels to be more descriptive
plt.xticks(ticks=[0, 1], labels=sample_labels)
# The bar height is a row count, not a number of crossings; the crossing count is encoded
# by the hue, so it belongs on the legend title instead of the y-axis.
plt.ylabel("Count")
hist_legend = hist_ax.get_legend()
if hist_legend is not None:
    hist_legend.set_title("Number of crossings")
plt.show()

# Distribution of the smallest bounding area per sample, with the same 3-std outlier filter.
sns.violinplot(data=remove_datapoints_outside_n_std(df_allstats, "smallest_bounding_area", 3), x="basename", y="smallest_bounding_area")
plt.show()