In [None]:
import json
import os
import sys

import numpy as np

sys.path.append('/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/EpiStats/src/statistics_collection/scripts/')

import StatsAnalytics as sa
import StatsPlots as sp

In [None]:
# os.chdir('../../')
# Display the current working directory: the relative paths used further down
# ('../outputs/outputs_v5' and '../images/') are resolved against it.
os.getcwd()

## 1. Load the tissue dataframes

In [None]:
# Locations of the per-tissue cell-statistics tables.
# `out_root_path` is the common root folder; `df_files` holds one CSV per sample,
# expressed relative to that root.
out_root_path = '../outputs/outputs_v5'
df_files = [
    'output_bladder_control_curated_segmentation_s_10_e_6_d_8/cell_stats/stats_dataset_bladder.csv',
    'output_esophagus_Z2_curated_crop_s_10_e_6_d_8/cell_stats/stats_dataset_esophagus.csv',
    'output_intestine_sample2_b_curated_segmentation_relabel_seq_s_10_e_6_d_8/cell_stats/stats_dataset_intestine_villus.csv',
    'output_lung_new_sample_b_curated_segmentation_central_crop_relabel_seq_s_10_e_6_d_8/cell_stats/stats_dataset_lung_bronchiole.csv',
    'output_lung_pseudostratified_from_harold_s_10_e_6_d_8/cell_stats/stats_dataset_lung.csv',
]

# Prefix every relative file name with the root folder (order is preserved)
df_paths = list(map(lambda relative_csv: os.path.join(out_root_path, relative_csv), df_files))

# Hand the CSV paths listed in `df_paths` to sa.prepare_df; the returned
# `cell_stats_df` is the table that the processing and plotting cells below work on.
cell_stats_df = sa.prepare_df(paths_to_dfs=df_paths)

In [None]:
# import ast
# import pandas as pd

# # If using v4 of collected statistics run this cell 
# cell_stats_df['principal_axis_and_elongation'] = cell_stats_df['principal_axis_and_elongation'].apply(
#     lambda x: x.split(',')[0].replace('(', '')
# )
# cell_stats_df['principal_axis_and_elongation'] = cell_stats_df['principal_axis_and_elongation'].apply(
#     lambda x: ast.literal_eval(x)
# )
# cell_stats_df['elongation'] = cell_stats_df['principal_axis_and_elongation']
# cell_stats_df.drop(columns=['principal_axis_and_elongation'], inplace=True)

# cell_stats_df["num_neighbors_2D_principal"] = cell_stats_df['neighbors_2D_principal'].apply(lambda x: [len(l) for l in x])

# df_files = ['output_bladder_control_curated_segmentation_s_10_e_6_d_8/cell_stats/stats_dataset_bladder.csv']
# df_paths = [os.path.join(out_root_path, df_file) for df_file in df_files]
# bladder_stats_df = sa.prepare_df(paths_to_dfs=df_paths)

# cell_stats_df = pd.concat(objs=[cell_stats_df, bladder_stats_df], axis=0, ignore_index=True, join="inner")

## 2. Process dataframe (outlier detection, numerical features extraction, standardization, ...)

In [None]:
# Rename columns: each key is the current column name, each value its new name
column_renames = {'area': 'surface_area'}
cell_stats_df = sa.rename_features(
    df=cell_stats_df,
    old_names=list(column_renames.keys()),
    new_names=list(column_renames.values()),
)

In [None]:
# Outlier detection
# The quantile level is forwarded as-is to sa.detect_outliers, which defines its meaning.
outlier_quantile_level = 0.025
cell_stats_df = sa.detect_outliers(
    df=cell_stats_df,
    quantile_level=outlier_quantile_level,
)

In [None]:
# cell_stats_df.to_csv("/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/stats_table.csv")

In [None]:
# Numeric features to extract from the data (one name per line)
num_features = [
    'surface_area',
    'volume',
    'isoperimetric_ratio',
    'num_neighbors',
    'elongation',
    'contact_area_fraction',
    'mean_contact_area',
]

In [None]:
# Extract a dataframe with only the ids and the numerical features, which is
# handier for plotting. The function also removes NA's (remove_na=True).
extraction_options = {
    "numeric_features": num_features,
    "remove_na": True,
}
numeric_cell_stats_df = sa.extract_numerical(df=cell_stats_df, **extraction_options)

In [None]:
# numeric_cell_stats_df.to_csv("/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/numerical_stats_table.csv")

In [None]:
# Get the standardized dataset (only numerical features)
std_cell_stats_df = sa.standardize(df=numeric_cell_stats_df, numeric_features=num_features)

# Last expression of the cell: show the resulting table
std_cell_stats_df

##### TEMPORARY

Append PCA data to the numerical dataframe, store loadings and explained variance in a separate JSON file.

In [None]:
### TEMPORARY ###
# Run the PCA; the three returned objects (`pcs`, `loadings`, `ex_var`) are used
# by the next cells to add PCs to the table and to export the PCA metadata.
# NOTE(review): the un-standardized table is passed here although
# `std_cell_stats_df` exists -- confirm that sa.apply_PCA scales the features itself.
pca_inputs = {"df": numeric_cell_stats_df, "numeric_features": num_features}
pcs, loadings, ex_var = sa.apply_PCA(**pca_inputs)

In [None]:
# Attach columns 0 and 1 of `pcs` as "PC1"/"PC2" to a copy of the numerical
# table; `numeric_cell_stats_df` itself is left unchanged.
pca_cell_stats_df = numeric_cell_stats_df.assign(PC1=pcs[:, 0], PC2=pcs[:, 1])

# NOTE(review): the inputs are read from '../outputs/outputs_v5' while this file
# is written under outputs_v4 -- confirm the destination is the intended one.
pca_table_path = "/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/numerical_stats_table.csv"
pca_cell_stats_df.to_csv(pca_table_path)

In [None]:
# Bundle the PCA by-products -- feature names, the first two entries of
# `loadings`, and the explained variance -- and store them as a JSON file.
#
# `.tolist()` converts NumPy scalars into builtin Python numbers. A plain
# `list(array)` keeps the NumPy scalar types, and `json.dump` raises TypeError
# for any of them that does not subclass a builtin type (e.g. float32).
out_dict = {
    "features": num_features,
    "PC1_coeffs": np.asarray(loadings[0]).tolist(),
    "PC2_coeffs": np.asarray(loadings[1]).tolist(),
    "explained_variance": np.asarray(ex_var).tolist()
}

with open("/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/pca_others.json", "w") as file:
    json.dump(out_dict, file, indent=4)

Collect Aboav_Law and Lewis_Law data

In [None]:
### TEMPORARY ###
# Dictionary with the Lewis Law data, requested with principal_axis=True
lewis_dict = sa._get_lewis_law_2D_stats(df=cell_stats_df, principal_axis=True)

# Save the dictionary as JSON
lewis_json_path = "/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/lewis_data.json"
with open(lewis_json_path, "w") as lewis_file:
    json.dump(lewis_dict, lewis_file, indent=2)

In [None]:
### TEMPORARY ###
# Dictionary with the Aboav Law data, requested with principal_axis=True
aboav_dict = sa._get_aboav_law_2D_stats(df=cell_stats_df, principal_axis=True)

# Save the dictionary as JSON
aboav_json_path = "/nas/groups/iber/Users/Federico_Carrara/Statistics_Collection/outputs/outputs_v4/aboav_data.json"
with open(aboav_json_path, "w") as aboav_file:
    json.dump(aboav_dict, aboav_file, indent=2)

## 3. Make plots

All the plotting functions need a list of the numerical features of the dataframe to plot. <br>
We also need to specify a path to a directory in which to save all the plots.

In [None]:
# Directory handed to the plotting functions as `save_dir`
save_plots_dir = '../images/'

# Numerical features to plot; here you can also set the order of the variables
# for the plots
numerical_features = [
    'surface_area',
    'volume',
    'isoperimetric_ratio',
    'num_neighbors',
    'elongation',
    'contact_area_fraction',
    'mean_contact_area',
]

In [None]:
# Colors of the discrete colormap used for the plots (not used for the correlation matrix).
# Note: in the plotting functions you can either choose a user-defined cmap like this one,
# or a normal matplotlib colormap defined by a string (e.g., 'viridis')
palette_rgb_255 = [
    (102, 194, 165),
    (252, 141, 98),
    (141, 160, 203),
    (231, 138, 195),
    (166, 216, 84),
]
# Rescale the 0-255 channel values to the [0, 1] range
colors = np.array(palette_rgb_255) / 255
# Build the colormap object from the `colors` array; `cell_cmap` is the value
# passed as `color_map` to most plotting calls below.
cell_cmap = sp.create_cmap(color_list = colors)
# Last expression of the cell: display the colormap
cell_cmap

### 3.1. Correlation Matrix

In [None]:
# Correlation matrix of the numerical features.
# The already standardized table is passed, hence standardize_data=False.
corr_matrix_options = dict(
    numerical_features=numerical_features,
    standardize_data=False,
    remove_outliers=True,
    color_map='coolwarm',
    save_dir=save_plots_dir,
    show=True,
)
sp.corr_matrix_plot(df=std_cell_stats_df, **corr_matrix_options)

### 3.2. PCA plots

In [None]:
# PCA plots, drawn with the custom discrete colormap.
# The already standardized table is passed, hence standardize_data=False.
pca_plot_options = dict(
    numerical_features=numerical_features,
    standardize_data=False,
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.pca_plots(df=std_cell_stats_df, **pca_plot_options)

### 3.3. Grid of KDE plots for different numerical features

In [None]:
# Unit-of-measure labels: micro sign + 'm' followed by a superscript 2 or 3
square_micrometers = '\u00B5m\u00B2'
cubic_micrometers = '\u00B5m\u00B3'

# One entry per plotted feature (None = no unit label).
# NOTE(review): presumably aligned with the order of `numerical_features` -- confirm.
uoms = [
    square_micrometers,
    cubic_micrometers,
    None,
    None,
    None,
    None,
    square_micrometers,
]

# Values passed as `y_lims` to the KDE grid, one per entry of `uoms`
y_lims = [0.002, 0.002, 0.012, 0.25, 3.0, 14, 0.040]

# Grid of density plots, one per entry of `numerical_features`; `y_lims` and
# `uoms` supply the per-feature limits and unit labels.
kde_grid_options = dict(
    features=numerical_features,
    y_lims=y_lims,
    remove_outliers=True,
    units_of_measure=uoms,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.features_grid_kdplots(df=numeric_cell_stats_df, **kde_grid_options)

### 3.4. Lewis' Law plots

Lewis' law states that the average apical area $\bar{A}_n$ of cells having $n$ neighbors is linearly related to the number of neighbors $n$. <br>
In particular, with $\bar{A}$ denoting the average area over all cells, we have the following relation:
$$ \frac{\bar{A}_n}{\bar{A}} = \frac{n - 2}{4} $$
In 3D we could have a similar relation for volumes, namely:
$$ \frac{\bar{V}_n}{\bar{V}} \sim n $$


In [None]:
# Lewis' law plots for the 'volume' feature, with fit degrees 1 and 2
lewis_volume_options = dict(
    feature='volume',
    fit_degrees=[1, 2],
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.lewis_law_plots(df=numeric_cell_stats_df, **lewis_volume_options)

### 3.5. Violin Plots for individual samples

In [None]:
# Violin plots of the numerical features for a single tissue ('bladder').
# save_dir=None here, whereas the other plotting cells pass `save_plots_dir`.
# NOTE(review): units_of_measure is 'm' while the `uoms` list of section 3.3
# uses micrometer-based units -- confirm which unit string is expected here.
violin_options = dict(
    tissue='bladder',
    features=numerical_features,
    units_of_measure='m',
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=None,
    show=True,
)
sp.violin_plots(df=numeric_cell_stats_df, **violin_options)

### 3.7. Plots of 2D statistics along standard cartesian axes

#### 3.7.0. Number of 2D neighbors barplots

In [None]:
# Barplots of the number of neighbors, '2D' version, computed from the full table
neighbors_barplot_options = dict(
    version='2D',
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.num_neighbors_barplots(df=cell_stats_df, **neighbors_barplot_options)

#### 3.7.1. Lewis Law plots

In [None]:
# 2D Lewis' law plots, 'standard' version, with no fit degrees requested
lewis_2d_standard_options = dict(
    fit_degrees=None,
    version='standard',
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.lewis_law_2D_plots(df=cell_stats_df, **lewis_2d_standard_options)

#### 3.7.2. Aboav-Weaire Law plots

The Aboav-Weaire law states that the average number of neighbours $m(n)$ of the cells that border a cell with $n$ neighbours follows:

$$ m(n) = 5 + \frac{8}{n}$$

In [None]:
# 2D Aboav-Weaire law plots, 'standard' version
aboav_2d_standard_options = dict(
    version='standard',
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.aboav_wearie_2D_plots(df=cell_stats_df, **aboav_2d_standard_options)

### 3.8. Plots of 2D statistics along apical-basal axis of cells

#### 3.8.1. Lewis Law plots

In [None]:
# 2D Lewis' law plots, 'principal' version, with no fit degrees requested
lewis_2d_principal_options = dict(
    fit_degrees=None,
    version='principal',
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.lewis_law_2D_plots(df=cell_stats_df, **lewis_2d_principal_options)

#### 3.8.2. Aboav-Weaire Law plots

The Aboav-Weaire law states that the average number of neighbours $m(n)$ of the cells that border a cell with $n$ neighbours follows:

$$ m(n) = 5 + \frac{8}{n}$$

In [None]:
# 2D Aboav-Weaire law plots, 'principal' version.
# This call also passes fitted=True, which the 'standard' call in 3.7.2 does not.
aboav_2d_principal_options = dict(
    version='principal',
    fitted=True,
    remove_outliers=True,
    color_map=cell_cmap,
    save_dir=save_plots_dir,
    show=True,
)
sp.aboav_wearie_2D_plots(df=cell_stats_df, **aboav_2d_principal_options)