In [None]:
# Import dependencies
%matplotlib inline
import os
import scanpy as sc
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import anndata as ad
import numpy as np

matplotlib.rcParams['font.family'] = 'sans-serif'

# Initialize random seeds
import random
random.seed(111)
# Seed NumPy's global RNG as well: seaborn's strip-plot jitter appears to draw
# from it, so without this the jittered points move on every run.
np.random.seed(111)

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

# set a working directory (all relative paths below are resolved against it)
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir( wdir )

# folder structures
RESULTS_FOLDERNAME = "adult/ImageAnalysis/results/"
FIGURES_FOLDERNAME = "adult/ImageAnalysis/figures/"

# exist_ok=True makes folder creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write `fig` to `folder`/`fname` in SVG (vector) format.

    `fname` is used as given; no file extension is appended.
    """
    target_path = os.path.join(folder, fname)
    fig.savefig(target_path, format='svg')

# Scanpy logging level: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Print the versions of the packages in use
sc.logging.print_versions()
# Default figure resolution (on screen / when saved) and font size
sc.set_figure_params(
    dpi=150,
    dpi_save=600,
    fontsize=10,
)

In [None]:
# Load the exported measurements; the first CSV column is used as the row index
raw_results_path = os.path.join(RESULTS_FOLDERNAME, 'Tendon project.csv')
results = pd.read_csv(raw_results_path, index_col=0)
results

In [None]:
# Split every row label once on the pattern " (" : the text before it is the
# image's full name, the text after it is kept as the coordinates string
# (presumably labels look like "<name> (<coordinates>)" - verify against the CSV).
# A raw string is required here: '\(' is an invalid escape sequence in a normal
# string literal and triggers a warning on current Python versions.
label_parts = results.index.str.split(r' \(')
results['fullname'] = label_parts.str[0]
results['coordinates'] = label_parts.str[1]

In [None]:
results.groupby('fullname').size()

In [None]:
def extract_info(fullname):
    """
    Derive sample ID, age label and tissue name from an image's full name.

    Names starting with 'OMB' yield the text before the first underscore as
    the sample ID and the age label 'adult'. Any other name yields 'DEV' plus
    its first underscore-separated field as the sample ID, and its second
    field with every 'w' replaced by 'pcw' as the age label (an IndexError is
    raised if there is no second field).

    The tissue is decided by the first of the markers 'Ach', 'Quad', 'Pat',
    'SSP' (checked in that order) that occurs anywhere in the name; if none
    occurs the tissue is 'Unknown'.

    Returns a (sampleID, age_weeks, tissue) tuple of strings.
    """
    fields = fullname.split('_')
    if fullname.startswith('OMB'):
        sample_id, age_label = fields[0], 'adult'
    else:
        sample_id = f'DEV{fields[0]}'
        age_label = fields[1].replace('w', 'pcw')

    # Order matters: the first marker found in the name wins.
    tissue_markers = (
        ('Ach', 'Achilles tendon'),
        ('Quad', 'Quadriceps tendon'),
        ('Pat', 'Patellar tendon'),
        ('SSP', 'Supraspinatus tendon'),
    )
    tissue_name = next(
        (label for marker, label in tissue_markers if marker in fullname),
        'Unknown',
    )

    return sample_id, age_label, tissue_name

In [None]:
# Parse every full name once, then assign each derived column in one go.
# This replaces a row-by-row iterrows()/.at loop, which is slow and enlarges
# the frame one cell at a time; assigning whole columns also creates the three
# columns when the table is empty.
parsed_info = [extract_info(name) for name in results['fullname']]
results['sample'] = [sample_id for sample_id, _, _ in parsed_info]
results['age'] = [age_label for _, age_label, _ in parsed_info]
results['tissue'] = [tissue_name for _, _, tissue_name in parsed_info]

In [None]:
results

In [None]:
results.groupby(['sample', 'tissue']).size()

In [None]:
results.columns

In [None]:
# Preview 'age' as a categorical column (the result is displayed, not stored).
# pandas' dtype alias is 'category'; 'categorical' is not a recognised dtype
# string and raises a TypeError.
results['age'].astype('category')

In [None]:
results

In [None]:
# Age label to use for each adult sample, keyed by sample ID
adult_ages = {
    'OMB1250': '45yr',
    'OMB1267': '52yr',
    'OMB1270': '84yr',
    'OMB1272': '65yr',
    'OMB1276': '66yr',
    'OMB1284': '50yr',
    'OMB1291': '69yr',
}

# Rows currently labelled 'adult' whose sample has an entry in adult_ages get
# that entry as their age; every other row keeps its existing age value.
known_adult = results['age'].eq('adult') & results['sample'].isin(list(adult_ages))
results['age'] = np.where(
    known_adult,
    results['sample'].map(adult_ages),
    results['age'],
)
results['age'].value_counts()

In [None]:
from pandas.api.types import CategoricalDtype

# Age labels in the order they should sort and plot
age_order = [
    '11pcw', '12pcw', '13pcw', '14pcw', '15pcw', '16pcw', '17pcw', '19pcw', '20pcw',
    '45yr', '50yr', '52yr', '65yr', '66yr', '69yr', '84yr'
]

# Casting to a categorical dtype turns every value that is not one of its
# categories into NaN without any message. Warn first, so an age label missing
# from age_order is noticed instead of silently disappearing downstream.
unexpected_ages = sorted(set(results['age'].dropna().astype(str)) - set(age_order))
if unexpected_ages:
    import warnings
    warnings.warn(f"Ages not listed in age_order will become NaN: {unexpected_ages}")

# Create a CategoricalDtype with the specific order
age_dtype = CategoricalDtype(categories=age_order, ordered=True)

# Convert the 'age' column to the categorical type
results['age'] = results['age'].astype(age_dtype)

In [None]:
# Samples in this list are labelled 'Not Healthy'; all other rows 'Healthy'
not_healthy_samples = ['OMB1267', 'OMB1272']
health_labels = {True: 'Not Healthy', False: 'Healthy'}
results['health'] = results['sample'].isin(not_healthy_samples).map(health_labels)
results['health'].value_counts()

In [None]:
results

In [None]:
# Write the annotated table (row index included) to the results folder
fulldata_path = os.path.join(RESULTS_FOLDERNAME, 'stardist_analysis_fulldata.csv')
results.to_csv(fulldata_path)

In [None]:
sns.set(style="whitegrid")

# Scatter plot comparing max distance to centroid to nucleus area, colored by age.
# Column names carry a leading space at this point in the notebook.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=results,
    x=' Nucleus_area',
    y=' max_distance_to_centroid',
    hue='age',
    palette="tab20b",
    alpha=0.9,
    s=100,
    edgecolor='w',
    linewidth=0.5,
    ax=ax,
)
ax.set_title('Max Distance to Centroid vs Nucleus Area')
ax.set_xlabel('Nucleus Area')
ax.set_ylabel('Max Distance to Centroid')
# The legend is anchored outside the axes (to the right of the plot area)
ax.legend(title='Sample Age', bbox_to_anchor=(1.05, 1), loc='upper left')
# bbox_inches='tight' expands the saved canvas to include the outside legend;
# without it the legend is cut off in the SVG even though it shows inline.
fig.savefig(
    os.path.join(FIGURES_FOLDERNAME, 'Area_vs_Max_dist_to_centroid_scatter_age.svg'),
    bbox_inches='tight',
)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of the visualization
sns.set(style="whitegrid")


def _boxplot_by_age(data, column, title, ylabel, filename):
    """
    Draw a box plot of `column` per age with the raw points overlaid, save it
    under FIGURES_FOLDERNAME as `filename`, and show it.

    data     : table holding an 'age' column and `column`
    column   : name of the measurement column plotted on the y axis
    title    : figure title
    ylabel   : y-axis label
    filename : output file name (the format follows from its extension)
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=data, x='age', y=column, palette="tab20b", ax=ax)
    # Overlay the individual observations with horizontal jitter
    sns.stripplot(data=data, x='age', y=column, color='black', size=5, jitter=True, alpha=0.5, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Sample Age')
    ax.set_ylabel(ylabel)
    fig.savefig(os.path.join(FIGURES_FOLDERNAME, filename))
    plt.show()


# One entry per figure: (column, title, y-axis label, output file).
# Column names carry a leading space at this point in the notebook.
boxplot_specs = [
    (' Nucleus_number', 'Nucleus Numbers per Sample Age', 'Nucleus Numbers',
     'Nucleus_number_boxplot_age.svg'),
    (' Nucleus_area', 'Nucleus Areas per Sample Age', 'Nucleus Areas',
     'Nucleus_area_boxplot_age.svg'),
    (' max_distance_to_centroid', 'Max Distance to Centroid per Sample Age', 'Max Distance to Centroid',
     'Max_dist_to_centroid_boxplot_age.svg'),
]

for spec_column, spec_title, spec_ylabel, spec_filename in boxplot_specs:
    _boxplot_by_age(results, spec_column, spec_title, spec_ylabel, spec_filename)


In [None]:
results[['age', 'health']].value_counts()

In [None]:
# Reload the annotated table from disk; the first CSV column is the row index.
# No dtypes are passed, so column types are whatever read_csv infers.
fulldata_path = os.path.join(RESULTS_FOLDERNAME, 'stardist_analysis_fulldata.csv')
results = pd.read_csv(fulldata_path, index_col=0)
results

In [None]:
# Clean column names: remove leading/trailing whitespace from every header
results.columns = [name.strip() for name in results.columns]

In [None]:
results.columns

In [None]:
# Descriptive statistics of each measurement per age group, one CSV per column.
# `stats` holds the table for the last column once the loop finishes.
age_groups = results.groupby('age')
for column in ('Nucleus_number', 'Nucleus_area', 'max_distance_to_centroid'):
    stats = age_groups[column].describe()
    stats_path = os.path.join(RESULTS_FOLDERNAME, f'stats_{column}.csv')
    stats.to_csv(stats_path)

In [None]:
stats

In [None]:
from scipy.stats import shapiro
import statsmodels.api as sm

# Shapiro-Wilk test for normality, plus a histogram and a Q-Q plot per column
for column in ['Nucleus_number', 'Nucleus_area', 'max_distance_to_centroid']:
    stat, p = shapiro(results[column])
    print(f'Shapiro-Wilk Test for {column}: Statistics={stat}, p={p}')

    # Plot histogram (with a kernel density estimate overlaid)
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.histplot(results[column], kde=True, ax=ax)
    ax.set_title(f'Histogram of {column}')
    plt.show()

    # Plot Q-Q plot. qqplot creates its own figure unless an axes is passed,
    # so hand it the axes of the sized figure; otherwise an empty 10x4 figure
    # is shown and the Q-Q plot ignores the requested size.
    fig, ax = plt.subplots(figsize=(10, 4))
    sm.qqplot(results[column], line='s', ax=ax)
    ax.set_title(f'Q-Q Plot of {column}')
    plt.show()

In [None]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# ANOVA of each measurement with age as a categorical factor
for column in ('Nucleus_number', 'Nucleus_area', 'max_distance_to_centroid'):
    formula = f'{column} ~ C(age)'
    model = ols(formula, data=results).fit()
    anova_results = anova_lm(model)
    print(f'ANOVA Results for {column}:\n', anova_results)

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey's HSD post-hoc test between age groups; one CSV of results per column
for column in ('Nucleus_number', 'Nucleus_area', 'max_distance_to_centroid'):
    tukey = pairwise_tukeyhsd(endog=results[column], groups=results['age'], alpha=0.05)
    print(f'Tukey HSD Results for {column}:\n', tukey)

    # The summary table's first row is used as the header, the rest as data rows
    tukey_summary = tukey.summary()
    header, *rows = tukey_summary.data
    results_df = pd.DataFrame(data=rows, columns=header)

    tukey_path = os.path.join(RESULTS_FOLDERNAME, f'tukey_{column}.csv')
    results_df.to_csv(tukey_path, index=False)
    

In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis H-test across age groups (one sample of values per age)
for column in ['Nucleus_number']:
    samples_by_age = [age_group[column].values for _, age_group in results.groupby('age')]
    kruskal_stat, kruskal_p = kruskal(*samples_by_age)
    print(f'Kruskal-Wallis H-test for {column}: Statistics={kruskal_stat}, p={kruskal_p}')

In [None]:
import scikit_posthocs as sp

# Dunn's test for pairwise comparisons of nucleus number between ages,
# with Bonferroni-adjusted p-values
dunn = sp.posthoc_dunn(
    results,
    val_col='Nucleus_number',
    group_col='age',
    p_adjust='bonferroni',
)
dunn_df = pd.DataFrame(dunn)
dunn_path = os.path.join(RESULTS_FOLDERNAME, 'dunn_nucleus_number.csv')
dunn_df.to_csv(dunn_path)
print(dunn)