In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

In [None]:
# Load the data
# NOTE(review): DATA_DIR is an absolute path (it looks like a mounted shared volume),
# so this cell only runs where that path exists — adjust it for your machine.
DATA_DIR = Path("/Volumes/shared/pyne_group/Shared/Papers/cas9_minicircles/figure_1/")
# Raise an explicit, descriptive error instead of a bare `assert`: an assert carries
# no message here and is skipped entirely when Python runs with -O.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data directory not found: {DATA_DIR}")
filename = "feret_data_2024-05-21_all_without_anomalies.csv"

df = pd.read_csv(DATA_DIR / filename)

# Check for the columns this notebook reads, so that a schema change is reported
# here with a clear message rather than as a KeyError in a later cell.
missing_columns = {"sample_type", "min_feret"} - set(df.columns)
if missing_columns:
    raise KeyError(f"{filename} is missing expected column(s): {sorted(missing_columns)}")

print(df.head())

# print all unique sample types
print(df["sample_type"].unique())

In [None]:
# Demo: run scipy's independent two-sample t-test on two small hand-written samples.
dist_a = [1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 6, 8]
dist_b = [
    1, 1, 2, 2, 3, 3, 4, 4, 4, 5,
    5, 5, 5, 5, 5, 6, 6, 7, 9,
]

# Keep the result object and read the statistic and p-value from it by name.
demo_result = stats.ttest_ind(dist_a, dist_b)
t_stat = demo_result.statistic
p_val = demo_result.pvalue
print(f"t_stat: {t_stat}, p_val: {p_val}")

# Overlay a kernel density estimate of each sample on the same axes;
# the legend labels carry the sample sizes.
ax = sns.kdeplot(dist_a, color="blue", label=f"dist_a n={len(dist_a)}")
sns.kdeplot(dist_b, color="red", label=f"dist_b n={len(dist_b)}", ax=ax)
ax.legend()

In [None]:
# Each pair is (unbound sample type, cas9 sample type). Both entries are values of the
# "sample_type" column; the two groups are compared on their "min_feret" values.
pairs = [("unbound_ON_SC", "cas9_ON_SC"), ("unbound_OT1_SC", "cas9_OT1_SC"), ("unbound_OT2_SC", "cas9_OT2_SC")]

# Significance threshold applied to the p-value of each t-test.
ALPHA = 0.05

for unbound_sample_type, cas9_sample_type in pairs:
    print(f"Comparing {unbound_sample_type} and {cas9_sample_type}")

    unbound_df = df[df["sample_type"] == unbound_sample_type]
    print(unbound_df.head())
    # Drop missing measurements: ttest_ind propagates NaN by default, which would give
    # p_val = nan and then be reported below as "not significant".
    unbound_series = unbound_df["min_feret"].dropna()
    print(unbound_series.head())

    cas9_df = df[df["sample_type"] == cas9_sample_type]
    print(cas9_df.head())
    cas9_series = cas9_df["min_feret"].dropna()
    print(cas9_series.head())

    print(f"Ns: {unbound_sample_type}: {len(unbound_series)}, {cas9_sample_type}: {len(cas9_series)}")

    # A group with no data cannot be tested; say so instead of reporting a NaN result.
    if unbound_series.empty or cas9_series.empty:
        print(f"SKIPPED - no min_feret data for {unbound_sample_type} and/or {cas9_sample_type}.")
        continue

    # Perform the t-test
    # NOTE(review): with default arguments this is Student's t-test, which assumes equal
    # variances in both groups; consider equal_var=False (Welch's t-test) if that
    # assumption is not justified for these samples.
    t_stat, p_val = stats.ttest_ind(unbound_series, cas9_series)

    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    if p_val < ALPHA:
        print(
            f"YES - The difference between {unbound_sample_type} and {cas9_sample_type} samples IS statistically significant."
        )
    else:
        print(
            f"NO - The difference between {unbound_sample_type} and {cas9_sample_type} samples IS NOT statistically significant."
        )

    # Plot the KDE of both groups on one figure; plt.show() ends the figure so each
    # pair gets its own plot.
    sns.kdeplot(unbound_series, label=f"{unbound_sample_type} (N={len(unbound_series)})")
    sns.kdeplot(cas9_series, label=f"{cas9_sample_type} (N={len(cas9_series)})")
    plt.legend()
    plt.title(f"{unbound_sample_type} vs {cas9_sample_type} min feret, p={p_val:.2e}")
    plt.show()