In [None]:
from anndata import AnnData
import scanpy as sc
import squidpy as sq
import numba
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pytometry as pm
from anndata import read_h5ad
from ipywidgets import FloatProgress
from utag import utag
import holoviews as hv
from utag.utils import celltype_connectivity, domain_connectivity
from utag.vizualize import draw_network

In [None]:
# Switch matplotlib to interactive mode and set scanpy's figure defaults
# (200 dpi, 10 pt font) for the plots produced below.
plt.ion()
sc.settings.set_figure_params(dpi=200, fontsize=10)

# Loading data

load selected area csv for corresponding week that you are running (selection is done using Perseus).

In [None]:
# Comma-separated table of the Perseus-selected area for week 11.
dat1 = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_selectedarea.csv')

In [None]:
# The DAPI1 column is not used in the downstream analysis.
dat1 = dat1.drop(columns=["DAPI1"])


In [None]:
# Keep only the marker columns; size, id and coordinates are split off as metadata later.
dat = dat1.drop(columns=['size', 'cell_id', 'x', 'y'])

In [None]:
# Display the marker-only table to check which columns remain.
dat

# Defining/ subtracting cut off values

Insert the list of cut-off values to subtract as background (the values for each week are provided in the supplementary files).

In [None]:
# Background cut-off values chosen from the images (test values; higher for CD44, CD90, DCN and WT1).
# 30 values; they are subtracted positionally, so presumably one per marker column of `dat`
# in column order -- confirm against the supplementary table for the week being analysed.
bg_cut = [0.8668,8.1543,0.5302,0.0528,1.4586,0.2816,4.908,4.2471,2.2077,20.3258,7.287,2.3969,3.5992,0.3883,0.3179,
0.8679,10,0.2211,7.5438,1.2001,0.1298,0.1221,3.0316,3.6212,0.143,0.4125,0.5709,6.1809,0.9834,3.5]

In [None]:
# Remove background: each cut-off is taken off the marker column at the same position.
dat = dat - bg_cut

In [None]:
# Expression matrix (background-subtracted markers) and the matching per-cell metadata.
exp = dat
meta = dat1.loc[:, ["cell_id", "size", "x", "y"]]


In [None]:
# Display the expression table that becomes AnnData.X.
exp

In [None]:
# Display the metadata table (cell_id, size, x, y) that becomes AnnData.obs.
meta

# Anndata

here we create an anndata object with exp and meta we created

In [None]:
import numpy as np
# (n_cells, 2) array holding the x/y position of every cell.
coordinates = dat1[['x', 'y']].to_numpy()

In [None]:
# Mapping under which the coordinates are stored: key "spatial".
obsm = dict(spatial=coordinates)

In [None]:

# Assemble the AnnData object: marker intensities as X, per-cell metadata as obs,
# and the x/y coordinates under obsm["spatial"].
adata = sc.AnnData(exp,obs=meta,obsm={"spatial": coordinates})

print(adata)

# Only the last expression of a cell is rendered, so adata.var is evaluated but not shown here.
adata.var # shows the gene names
adata.X # LOOKS for expression



# Arcsinh normalization

In [None]:
# Arcsinh-transform the intensities with cofactor 150 using pytometry.
# NOTE(review): the return value is discarded, so this presumably modifies adata in place -- confirm.
pm.tl.normalize_arcsinh(adata, cofactor=150)

# Clustering

In [None]:
# Clustering pipeline on all cells: PCA -> 10-nearest-neighbour graph -> UMAP embedding ->
# Leiden clustering at resolution 1 (stored in adata.obs['leiden']) -> UMAP coloured by cluster.
# The cluster numbers produced here are what the annotation dictionaries below refer to.
sc.tl.pca(adata)
sc.pp.neighbors(adata,n_neighbors= 10)
sc.tl.umap(adata,min_dist= 0.1, spread= 0.3, negative_sample_rate= 4)
sc.tl.leiden(adata,resolution=1)
sc.pl.umap(adata, color='leiden')

In [None]:
# Rank markers per Leiden cluster with a t-test, then plot the top 5 per cluster;
# the save argument also writes the figure to disk.
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=5, sharey=False,save='ranking.pdf')

In [None]:
# Plot every cell at its tissue coordinates, coloured by Leiden cluster.
sc.pl.spatial(adata, color='leiden', spot_size=30, ncols = 1, cmap = 'turbo')

# Dot plot

In [None]:
# Marker expression per Leiden cluster, with clusters ordered by a dendrogram.
adata.obs['leiden'] = adata.obs['leiden'].astype(str)
markers = adata.var_names
sc.pl.dotplot(adata, markers, groupby='leiden', dendrogram=True, size_title=None)

For running other weeks, the annotations from the supplementary figures can be used. Below we provide annotations for week 11 as it is the example dataset

initial annotation for week 11

In [None]:
# Initial annotation of the week-11 Leiden clusters (cluster number -> cell type).
# Labels must be spelled identically for clusters of the same type, otherwise they end up
# as separate categories: cluster 1 previously read 'S0X9+' (digit zero) and clusters 9/21
# used 'Artifact' while 29-32 used 'Artefact'.
old_to_new = {
    '0':'Endothelial',
    '1':'SOX9+ epithelial',
    '2':'Endothelial',
    '3':'Mesenchymal',
    '4':'Endothelial',
    '5':'Mesenchymal',
    '6':'Proliferating mesenchymal',
    '7':'Mesenchymal',
    '8':'Mesenchymal',
    '9':'Artefact',
    '10':'Mesenchymal',
    '11':'Lymphathic endothelial',
    '12':'SOX2+ epithelial',
    '13':'SOX2+ epithelial',
    '14':'Smooth muscle',
    '15':'Endothelial',
    '16':'Immune',
    '17':'Neuronal',
    '18':'Immune',
    '19':'Chondroblast',
    '20':'Mesenchymal',
    '21':'Artefact',
    '22':'SOX9+ epithelial',
    '23':'Smooth muscle',
    '24':'Mesenchymal',
    '25':'Vascular smooth muscle',
    '26':'Mesenchymal',
    '27':'Immune',
    '28':'Mesenchymal',
    '29':'Artefact',
    '30':'Artefact',
    '31':'Artefact',
    '32':'Artefact'

}
# Clusters missing from the dictionary map to NaN.
adata.obs['annotation'] = adata.obs['leiden'].map(old_to_new)

# REMOVING ARTEFACT CLUSTERS

removing artefact clusters

In [None]:
# Leiden clusters annotated as artefacts above; every other cell is kept in an independent copy.
artefact_clusters = ['9', '21', '29', '30', '31', '32']
is_real_cell = ~adata.obs['leiden'].isin(artefact_clusters)
adata_new = adata[is_real_cell, :].copy()

reclustering

In [None]:
# Same pipeline as the first clustering, re-run on the artefact-free cells.
# The new clusters go to obs['leiden_sub'], so the original obs['leiden'] labels are kept.
sc.tl.pca(adata_new)
sc.pp.neighbors(adata_new,n_neighbors= 10)
sc.tl.umap(adata_new,min_dist= 0.1, spread= 0.3, negative_sample_rate= 4)
sc.tl.leiden(adata_new,resolution=1,key_added='leiden_sub')
sc.pl.umap(adata_new, color='leiden_sub')

For running other weeks, the annotations from the supplementary figures can be used. Below we provide annotations for week 11 as it is the example dataset

annotations after excluding artifacts for week 11

In [None]:
# Week-11 annotation of the 'leiden_sub' clusters (cluster number -> cell type).
# This reuses the name old_to_new and overwrites the dictionary of the first annotation round.
old_to_new = {
    '0':'Vim+ mes.',
    '1':'Vim+ mes.',
    '2':'Endothelial',
    '3':'SOX2 high epit.',
    '4':'Endothelial',
    '5':'Endothelial',
    '6':'SOX9 high epit.',
    '7':'Neuronal',
    '8':'Lymp.endo.',
    '9':'Vim+ mes.',
    '10':'Ki67+ mes.',
    '11':'Vim+ mes.',
    '12':'SOX9 high epit.',
    '13':'Immune',
    '14':'Immune',
    '15':'Vim+ mes.',
    '16':'Air. fibro.',
    '17':'Chondroblast',
    '18':'Adv. fibro.',
    '19':'Ki67+ mes.',
    '20':'ASM',
    '21':'Vim+ mes.',
    '22':'Vim+ mes.',
    '23':'VSM',
    '24':'SOX9 high epit.',
    '25':'Air. fibro.',
    '26':'Immune',
    '27':'Vim+ mes.',
    '28':'Neuronal',
    '29':'SOX2 high epit.',
    '30':'Endothelial',
    '31':'Endothelial',
    '32':'Endothelial',

}
# Replaces the 'annotation' column copied over from adata; clusters missing from the dictionary map to NaN.
adata_new.obs['annotation'] = adata_new.obs['leiden_sub'].map(old_to_new)

In [None]:
# Fixed palette for the 'annotation' plots: 13 colours, one per annotated cell type.
# NOTE(review): scanpy presumably assigns these in category order -- confirm the order matches the intended colours.
adata_new.uns['annotation_colors'] = ['#e6194b', '#3cb44b', '#9A6324', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080','#a9a9a9', '#e6beff']

In [None]:
# Tissue map of the artefact-free cells coloured by annotated cell type.
sc.pl.spatial(adata_new, color='annotation', spot_size=30, ncols = 1, cmap = 'turbo')

In [None]:
# UMAP of the artefact-free cells coloured by annotated cell type.
sc.pl.umap(adata_new, color='annotation')

dot plot with the annotations

In [None]:
# Dendrogram over the annotated cell types, then a marker dot plot grouped the same way (saved as PDF).
sc.tl.dendrogram(adata_new, groupby='annotation')
adata_new.obs['leiden'] = adata_new.obs['leiden'].astype(str)
markers = adata_new.var_names
sc.pl.dotplot(adata_new, markers, groupby='annotation', dendrogram=True, size_title=None, save='annotated_dotplot.pdf')

writing a csv file after artefact cluster removal, this csv will be used in the next section.

In [None]:
# Export the expression matrix of the artefact-free cells (rows = cells, columns = markers).
t = adata_new.X
artefact_free_table = pd.DataFrame(data=t, index=adata_new.obs_names, columns=adata_new.var_names)
artefact_free_table.to_csv('11w_artefactremoved.csv')

The ASM cluster was reclustered for weeks 12 and 13. When running these datasets, you can perform this reclustering and integrate the neuronal cells into the main 'annotations'. Integrate cluster 9 for week 12 and cluster 17 for week 13 as 'neuronal'.

isolating ASM cluster and reclustering it

In [None]:
#ASM = adata_new[adata_new.obs['annotation'].isin(['ASM'])]
#reclustering ASM cluster
#genes_to_keep = ['ACTA2', 'CD44', 'CD56','CD90','WT1']  # Add the names of the genes you want to keep
#ASM =ASM[:, ASM.var_names.isin(genes_to_keep)]
#sc.tl.pca(ASM)
#sc.pp.neighbors(ASM,n_neighbors= 10)
#sc.tl.umap(ASM,min_dist= 0.1, spread= 0.3, negative_sample_rate= 4)
#sc.tl.leiden(ASM,resolution=0.4,key_added='leiden_asm')
#sc.pl.umap(ASM, color='leiden_asm',legend_loc= 'on data')
#dotplot
#sc.tl.dendrogram(ASM,groupby='leiden_asm')
#ASM.obs.leiden_asm= ASM.obs.leiden_asm.astype(str)
#markers = ASM.var.index
#sc.pl.dotplot(ASM, markers,groupby= 'leiden_asm', dendrogram=True, size_title=None)

integrating neuronal cell cluster (#9) to the annotations

In [None]:
#ASM.obs.to_csv('13w_ASM.csv')
#ASM_csv= pd.read_table('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/14w/13w_ASM.csv',sep=',')

updating the annotations with neuronal cells 
Filter cell IDs from ASM_csv where 'leiden_asm' column equals 17

In [None]:
#selected_cell_ids = ASM_csv.loc[ASM_csv['leiden_asm'] == 17, 'cell_id']

# Create a dictionary mapping selected cell IDs to the desired annotation ("Neuronal")
#cell_id_to_annotation = dict(zip(selected_cell_ids, ['Neuronal'] * len(selected_cell_ids)))

# Update 'annotation' column in adata_new2 using the map function
#adata_new.obs['annotation'] = adata_new.obs['cell_id'].map(cell_id_to_annotation).combine_first(adata_new.obs['annotation'])

# MARKER CORRELATION

In [None]:
import numba
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

update the path to the filepath that you saved above.

In [None]:
# Re-load the artefact-free expression table written in the previous section.
data = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_artefactremoved.csv')

In [None]:
# Discard the saved row labels ("Unnamed: 0") and renumber the rows from zero.
data = data.drop(columns=["Unnamed: 0"]).reset_index(drop=True)

In [None]:
# Marker names = the remaining column headers.
markers = data.columns.tolist()


calculating correlation values between markers

In [None]:
# Pairwise correlation between markers; `data` now holds the correlation matrix, not the raw table.
data = data.loc[:, markers].corr()

In [None]:
# Annotated heatmap of the marker-marker correlation matrix on a fixed [-1, 1] colour scale.
# A single figure is created here; the earlier extra plt.figure() call only left an empty
# figure open.
fig1, ax1 = plt.subplots(figsize=(8,8))
sns_heatmap = sns.heatmap(data, annot=True, annot_kws={"fontsize":5}, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, center = 0, square = False, linewidths=.1, cbar=False, ax=ax1)
# NOTE(review): the file name says "pvalues" although the figure shows correlation coefficients;
# the name is kept so previously generated outputs stay comparable.
fig1.savefig('11weekpvalues.pdf')

In [None]:
# Hierarchically clustered view of the same correlation matrix.
sns_clustermap = sns.clustermap(data, figsize=(8,8))
# plt.savefig writes the current figure, i.e. the one created by the clustermap call above.
plt.savefig('11weekcorrelation.pdf')

# Reclustering immune cells

isolating immune cluster and reclustering it

In [None]:
# Restrict to the cells whose annotation is 'Immune'.
is_immune = adata_new.obs['annotation'].isin(['Immune'])
immune_cells = adata_new[is_immune]



reclustering immune cells

In [None]:
# Immune-relevant markers used for the re-clustering; names absent from var_names are silently skipped.
genes_to_keep = ['CD3', 'CD4', 'CD19','CD44','CD45','CD56','CD68','CD163','HLADR','KI67','MRC1']  # Add the names of the genes you want to keep


# Slicing an AnnData returns a view of its parent. The following cells write PCA / neighbours /
# UMAP / Leiden results into immune_cells, so take an explicit copy instead of relying on
# anndata's implicit copy-on-write (which emits ImplicitModificationWarning).
immune_cells = immune_cells[:, immune_cells.var_names.isin(genes_to_keep)].copy()

In [None]:
# Same clustering pipeline as for the full data set, restricted to the immune cells and immune markers.
sc.tl.pca(immune_cells)


sc.pp.neighbors(immune_cells,n_neighbors= 10)

sc.tl.umap(immune_cells,min_dist= 0.1, spread= 0.3, negative_sample_rate= 4)

# Lower resolution (0.2) than the main clustering; clusters are stored in obs['leiden_imm'].
sc.tl.leiden(immune_cells,resolution=0.2,key_added='leiden_imm')

sc.pl.umap(immune_cells, color='leiden_imm',legend_loc= 'on data')


In [None]:
# Lower the figure resolution to 150 dpi from here on, then map the immune clusters on the tissue.
plt.ion()
sc.settings.set_figure_params(dpi=150, fontsize=10)
sc.pl.spatial(immune_cells, color='leiden_imm', spot_size=30, ncols = 1, cmap = 'turbo')

In [None]:
# spatial_scatter draws edges from immune_cells.obsp["spatial_connectivities"], which does not
# exist yet at this point of a top-to-bottom run (the spatial graph was only built further down).
# Build it here with the same settings used for the immune adjacency analysis below
# (10 nearest neighbours, generic coordinates).
sq.gr.spatial_neighbors(immune_cells, n_neighs=10, coord_type="generic")
# Cropped view of the immune clusters with neighbour edges drawn in grey.
sq.pl.spatial_scatter(immune_cells, shape=None, color='leiden_imm', connectivity_key="spatial_connectivities",edges_width=0.4, size=3,crop_coord=[(10000, 10000, 15900, 19000)],library_id=None,edges_color='#5e5c5b')

dot plots for the immune clusters

In [None]:
# Dendrogram and marker dot plot for the immune sub-clusters.
sc.tl.dendrogram(immune_cells, groupby='leiden_imm')
immune_cells.obs['leiden_imm'] = immune_cells.obs['leiden_imm'].astype(str)
markers = immune_cells.var_names
sc.pl.dotplot(immune_cells, markers, groupby='leiden_imm', dendrogram=True, size_title=None)

Annotating the immune clusters based on the dot plots. For the other weeks, the annotations for each cluster are provided in the supplementary figure.

In [None]:
# Week-11 annotation of the immune sub-clusters ('leiden_imm' number -> immune cell type).
old_to_new2 = {
    '0':'Macrophage',
    '1':'Macrophage',
    '2':'Macrophage',
    '3':'Macrophage',
    '4':'ILC2 & T cell',
    '5':'ILC2 & T cell',
    '6':'Macrophage',
    '7':'Macrophage',
    '8':'Dendritic cell',

}
# Clusters missing from the dictionary map to NaN; the result is stored as a categorical column.
immune_cells.obs['imm_ann'] = immune_cells.obs['leiden_imm'].map(old_to_new2)
immune_cells.obs['imm_ann'] = immune_cells.obs['imm_ann'].astype('category')

dot plots for the annotated immune cells

In [None]:
# Dendrogram and marker dot plot for the annotated immune cell types (saved as PDF).
sc.tl.dendrogram(immune_cells, groupby='imm_ann')
immune_cells.obs['imm_ann'] = immune_cells.obs['imm_ann'].astype(str)
markers = immune_cells.var_names
sc.pl.dotplot(immune_cells, markers, groupby='imm_ann', dendrogram=True, size_title=None, save='imm_dotplot.pdf')

In [None]:
# Working copy of the immune annotation (categorical) that receives the manually selected cell types.
immune_cells.obs['imm_ann_refined'] = pd.Categorical(immune_cells.obs['imm_ann'])

integrating CD19+ CD45+ B cells (selected manually from the segmented image). As B cells were very low in number we selected them manually based on the overlaid CD45 and CD19 images.

In [None]:
# List of cell IDs to be assigned "B cell" annotation
cell_ids_to_update = [88818,89080,93154,95974,134998,142137,144827,150800,158249,158283,165963,201194,206374,214565,216697,218731,223325,231923,235643,258893,342895,368725,421475,425331,455036,457540,459825,483990,484131,485858,485877,487019,554400,555658,561378,639589,691107,696833,697404,708754,731412]


# "B cell" must be a known category before it can be assigned to a categorical column.
if "B cell" not in immune_cells.obs['imm_ann_refined'].cat.categories:
    immune_cells.obs['imm_ann_refined'] = immune_cells.obs['imm_ann_refined'].cat.add_categories("B cell")

# Relabel all selected cells in one vectorised assignment instead of rebuilding the full
# cell-id list and re-scanning the table once per selected id.
immune_cell_ids = immune_cells.obs['cell_id']
immune_cells.obs.loc[immune_cell_ids.isin(cell_ids_to_update), 'imm_ann_refined'] = "B cell"

# Report selected ids that are not part of the immune subset (same message as before).
known_immune_ids = set(immune_cell_ids)
for cell_id in cell_ids_to_update:
    if cell_id not in known_immune_ids:
        print(f"Cell ID '{cell_id}' not found in the immune_cells.obs DataFrame.")


integrating CD45+ CD56+ NK cells (selected manually from the segmented image). As NK cells were low in number, we selected them manually based on the overlaid CD45 and CD56 images. NK cell_ids are provided for week 11, 12 and 13 in the data repository.

In [None]:
# List of cell IDs to be assigned "NK cell" annotation
cell_ids_to_update = pd.read_table('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_NKcell_selection.csv',sep=',')
cell_ids_to_update = cell_ids_to_update['cell_id'].to_list()

# "NK cell" must be a known category before it can be assigned to a categorical column.
if "NK cell" not in immune_cells.obs['imm_ann_refined'].cat.categories:
    immune_cells.obs['imm_ann_refined'] = immune_cells.obs['imm_ann_refined'].cat.add_categories("NK cell")

# Relabel all selected cells in one vectorised assignment instead of rebuilding the full
# cell-id list and re-scanning the table once per selected id.
immune_cell_ids = immune_cells.obs['cell_id']
immune_cells.obs.loc[immune_cell_ids.isin(cell_ids_to_update), 'imm_ann_refined'] = "NK cell"

# Report selected ids that are not part of the immune subset (same message as before).
known_immune_ids = set(immune_cell_ids)
for cell_id in cell_ids_to_update:
    if cell_id not in known_immune_ids:
        print(f"Cell ID '{cell_id}' not found in the immune_cells.obs DataFrame.")


In [None]:
# Dendrogram and marker dot plot for the refined immune annotation, including B and NK cells (saved as PDF).
sc.tl.dendrogram(immune_cells, groupby='imm_ann_refined')
immune_cells.obs['imm_ann'] = immune_cells.obs['imm_ann'].astype('str')
markers = immune_cells.var_names
sc.pl.dotplot(immune_cells, markers, groupby='imm_ann_refined', dendrogram=True, size_title=None, save='dotplot_imm.pdf')

In [None]:
# Fixed palette for the refined immune annotation: 5 colours.
# NOTE(review): presumably applied in category order -- confirm the order matches the intended colours.
immune_cells.uns['imm_ann_refined_colors'] = ['#0000ff','#ffd700','#bc8f8f','#00bfff','#ff1493']

In [None]:
# Tissue map of the immune cells by refined annotation; larger spots (90) than the other maps.
sc.pl.spatial(immune_cells, color='imm_ann_refined', spot_size=90, ncols = 1, cmap = 'turbo')

creating the csv file for immune_cells

In [None]:
# Write the immune-cell metadata to the current working directory.
# NOTE(review): the next cell reads this file back through an absolute path, so both must
# point at the same folder.
immune_cells.obs.to_csv('11w_immunecells.csv')

reading it back to loop through the cell_ids and annotations

In [None]:
# Re-load the immune-cell metadata table written above.
immune_cells_csv = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_immunecells.csv')

In [None]:
# Display the re-loaded immune-cell metadata.
immune_cells_csv

# Proliferating immune cells

In [None]:
# Start the proliferation-aware annotation as a categorical copy of the refined immune labels.
immune_cells.obs['imm_ann_pro'] = pd.Categorical(immune_cells.obs['imm_ann_refined'])

reading the proliferating cells which are selected via Perseus

In [None]:
# Table of the proliferating cells selected in Perseus (week 11).
dat_pro = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_proliferation.csv')

In [None]:
# Display the proliferating-cell table.
dat_pro

tagging immune cells as 'proliferating' if they are in the proliferating cells list

In [None]:
# Boolean mask: immune cells whose id appears in the proliferating-cell table.
proliferating_idx_imm = immune_cells.obs['cell_id'].isin(dat_pro['cell_id'])

# Plain strings are needed so the new "Proliferating ..." labels can be written freely.
immune_cells.obs['imm_ann_pro'] = immune_cells.obs['imm_ann_pro'].astype(str)

# Prefix the label of every proliferating cell.
# Re-running this cell adds the prefix a second time.
prolif_res_imm = immune_cells.obs.loc[proliferating_idx_imm, 'imm_ann_pro'].map(
    lambda label: 'Proliferating ' + str(label)
)
immune_cells.obs.loc[proliferating_idx_imm, 'imm_ann_pro'] = prolif_res_imm


In [None]:
# Back to categorical, as expected by the plotting and neighbourhood functions below.
immune_cells.obs['imm_ann_pro'] = immune_cells.obs['imm_ann_pro'].astype('category')

In [None]:
# Fixed palette for the proliferation-aware immune annotation: 10 colours.
# NOTE(review): presumably applied in category order -- confirm the order matches the intended colours.
immune_cells.uns['imm_ann_pro_colors'] = ['#00bfff','#0000ff','#ffd700','#bc8f8f','#ff1493','#6495ed','#00ff00','#f8b4ff','#2e8b57','#ba55d3']

calculating cell-cell adjacency

In [None]:
# Spatial graph among immune cells: each cell is linked to its 10 nearest neighbours.
sq.gr.spatial_neighbors(immune_cells, n_neighs=10, coord_type="generic")
# Whole-tissue view with the graph edges overlaid.
sq.pl.spatial_scatter(immune_cells, shape=None, color='imm_ann_pro', connectivity_key="spatial_connectivities", size=2,figsize=(8, 6),library_id=None)

plotting cell-cell adjacency spatial plots by cropping the tissue

In [None]:
# Same plot restricted to the crop window given by crop_coord, with thicker grey edges and larger points.
sq.pl.spatial_scatter(immune_cells, shape=None, color='imm_ann_pro',figsize=(8,6), connectivity_key="spatial_connectivities",edges_width=0.8, size=20,crop_coord=[(5000, 5000, 10000, 10000)],library_id=None,edges_color='#5e5c5b')

In [None]:
# Neighbourhood enrichment between the immune categories; the result is read back below from
# immune_cells.uns['imm_ann_pro_nhood_enrichment'].
sq.gr.nhood_enrichment(immune_cells, cluster_key="imm_ann_pro")

adding a dataframe to immune_cells.uns['imm_ann_pro_neighbours'] by combining the neighborhood enrichment counts for imm_ann_pro

In [None]:
# Label the neighbour-count matrix with the category order of 'imm_ann_pro'. squidpy builds the
# matrix in that order, so sorted unique values (np.unique) would mislabel rows/columns whenever
# the category order is not alphabetical, and fail on shape when a category has no cells.
imm_ann_pro_labels = immune_cells.obs['imm_ann_pro'].astype('category').cat.categories
immune_cells.uns['imm_ann_pro_neighbours'] = pd.DataFrame(
    immune_cells.uns['imm_ann_pro_nhood_enrichment']['count'],
    index=imm_ann_pro_labels,
    columns=imm_ann_pro_labels,
)

drawing a network using the immune_cells.uns['imm_ann_pro_neighbours'] dataframe

In [None]:
# Draw the immune cell-to-cell network with utag's draw_network: nodes come from 'imm_ann_pro',
# edges from the neighbour-count table stored above under uns['imm_ann_pro_neighbours'].
fig = draw_network(
    adata=immune_cells,
    node_key = 'imm_ann_pro',
    adjacency_matrix_key= 'imm_ann_pro_neighbours',
    font_size=30,
    edge_weight = 5,
    edge_weight_baseline=1,
    dpi=150,
    node_size_max=3000,
    node_size_min=1000,
)
# Saves the current matplotlib figure.
plt.savefig("11w_imm_celltocell_network.pdf")

integrating immune cell annotations to adata_new to be able to show adjacency of immune cells with other cell types

In [None]:
# Start 'immune_ann' as a categorical copy of the main annotation; immune subtypes are merged in below.
adata_new.obs['immune_ann'] = pd.Categorical(adata_new.obs['annotation'])

re-creating the csv file for immune_cells this time it will contain 'imm_ann_pro' column that we will loop through in the next part

In [None]:
# Overwrite the earlier export so the file now also contains the 'imm_ann_pro' column.
# Written to the current working directory; the next cell reads it through an absolute path.
immune_cells.obs.to_csv('11w_immunecells.csv')

reading it back to loop through the cell_ids and annotations

In [None]:
# Re-load the immune-cell metadata, now including 'imm_ann_pro'.
immune_cells_csv = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_immunecells.csv')

In [None]:
# Lookup table: cell id -> proliferation-aware immune subtype.
subtype_pairs = zip(immune_cells_csv['cell_id'], immune_cells_csv['imm_ann_pro'])
cell_id_to_subtype = dict(subtype_pairs)

# Update 'immune_ann': immune cells take their subtype, every other cell keeps its existing label.
mapped_subtypes = adata_new.obs['cell_id'].map(cell_id_to_subtype)
adata_new.obs['immune_ann'] = mapped_subtypes.combine_first(adata_new.obs['immune_ann'])


In [None]:
# Store the merged labels as a categorical column again.
adata_new.obs['immune_ann'] = adata_new.obs['immune_ann'].astype('category')

calculating the adjacency and plotting it as a heatmap

In [None]:
# Spatial graph linking all cells within a radius of 50 (coordinate units).
# A preceding default-parameter spatial_neighbors call was removed: it wrote to the same
# 'spatial' keys and was overwritten by this call before anything read its result.
sq.gr.spatial_neighbors(adata_new, radius=50, coord_type="generic")
# Neighbourhood enrichment between all cell types, with immune cells split into subtypes.
sq.gr.nhood_enrichment(adata_new,cluster_key='immune_ann')
# Clustered z-score heatmap, colour scale clipped to [-100, 100].
sq.pl.nhood_enrichment(adata_new, cluster_key='immune_ann', method="ward",mode='zscore',vmax=100,vmin=-100,cmap='coolwarm',show=False)


# NEIGHBOURHOOD ANALYSIS

# BANKSY

reading the anndata with calculated banksy domains. This is calculated by the Jupyter notebook named as 'Domains_Banksy'. Here in this section we visualize the domains and annotate them according to the cell types that they contain

In [None]:
# Load the AnnData with pre-computed BANKSY domains; the path is relative to the working directory.
adata_domain = read_h5ad('100524_adata_11w_domains.h5ad')

In [None]:
# Tissue map coloured by raw BANKSY domain number.
sc.pl.spatial(adata_domain,spot_size=20,color='banksy_domain')

heatmap showing marker expression in each domain

In [None]:
# Marker expression per BANKSY domain as a matrix plot, with domains ordered by a dendrogram
# and the colour scale clipped to [-1, 1].
plt.ion()
sc.settings.set_figure_params(dpi=150, fontsize=10)
sc.pl.matrixplot(
    adata_domain,
    var_names = adata_domain.var.index,
    groupby="banksy_domain",
    dendrogram=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    show=True
)

annotating the domains

In [None]:
# Week-11 names for the BANKSY domains (domain number -> descriptive label).
# Several domain numbers share a label, so the 11 domains collapse into 7 named domains.
banksy_map = {
    "0": "Endothelial-mesenchymal rich",
    "1": "Mesenchymal-immune rich",
    "2": "Mesenchymal rich",
    "3": "SOX9high epithelial rich",
    "4": "Mesenchymal rich",
    "5": "Chondroblast rich",
    "6": "SOX2high epithelial rich",
    "7": "SOX9high epithelial rich",
    "8": "ASM rich",
    "9": "ASM rich",
    "10": "Endothelial-mesenchymal rich",

}

# The keys are strings -- assumes 'banksy_domain' holds string labels; TODO confirm.
adata_domain.obs['banksy_domain_annotation'] = adata_domain.obs['banksy_domain'].map(banksy_map)
adata_domain.obs['banksy_domain_annotation'] = adata_domain.obs['banksy_domain_annotation'].astype('category')

In [None]:
# Tissue map of the named domains, drawn before the custom palette is set in the next cell.
sc.pl.spatial(adata_domain,spot_size=20,color='banksy_domain_annotation')

In [None]:
# Custom palette for the 7 named domains.
# NOTE(review): presumably applied in category order -- confirm the order matches the intended colours.
adata_domain.uns["banksy_domain_annotation_colors"] = ['#C87FAA', '#6396dd', '#41030D', '#ffbc89', '#0000FF', '#657213', '#15e45b']

In [None]:
# Same tissue map, re-drawn after the custom palette was set.
sc.pl.spatial(adata_domain,spot_size=20,color='banksy_domain_annotation')

In [None]:
# Constant helper column; the bar-plot cell below counts it per group to get cell numbers.
adata_domain.obs["slide"] = "1"

bar plots showing the cell types in each domain

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Grouping and preparing data: after .count() every non-key column holds the number of cells per
# (annotation, domain) pair; the constant "slide" column is the one plotted as bar height.
data = adata_domain.obs.groupby(["annotation", "banksy_domain_annotation"]).count().reset_index()

# One facet per named domain, two facets per row, each with its own x axis.
g = sns.FacetGrid(
    data=data,
    hue="annotation",
    col="banksy_domain_annotation",
    col_wrap=2,
    aspect=1.2,
    palette="colorblind",
    sharex=False,
    height=2,
)

# Mapping barplot to FacetGrid
g.map(sns.barplot, "annotation", "slide")

# Title every facet with the bare domain name. The default title is
# "banksy_domain_annotation = <name>", which the former replace("Banksy domain = ", "")
# never matched, so the prefix was not being removed.
g.set_titles(col_template="{col_name}", size=8)

for ax in g.axes.flat:
    # Small, vertical cell-type labels so they fit under the narrow facets.
    ax.tick_params(axis="x", labelrotation=90, labelsize=6)
    ax.set_ylabel("", fontsize=8)  # no y-axis label
    ax.set_xlabel("", fontsize=8)  # no x-axis label

plt.tight_layout()
plt.savefig("zone_barplot.pdf")

pie charts for showing the cell types in each domain 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Number of cells for every (domain, cell type) combination.
data = (
    adata_domain.obs
    .groupby(["banksy_domain_annotation", "annotation"])
    .size()
    .reset_index(name='count')
)

# Palette listed in the alphabetical order of the cell-type names.
annotation_colors = ['#e6194b', '#3cb44b', '#9A6324', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#a9a9a9', '#e6beff']

# Cell types present in the data, sorted alphabetically.
sorted_annotations = sorted(adata_domain.obs['annotation'].unique())

# Pair each cell type with the colour at the same position.
color_mapping = dict(zip(sorted_annotations, annotation_colors))

def pie_chart(ax, data):
    """Draw a donut chart of the cell-type composition of one domain.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : rows of the count table for a single domain; the columns
        'banksy_domain_annotation', 'annotation' and 'count' are read.

    Wedges and legend entries are ordered from the largest to the smallest
    share; colours are looked up in the module-level ``color_mapping``.
    """
    domain_name = data['banksy_domain_annotation'].iloc[0]
    counts = data['count']
    grand_total = sum(counts)

    # (cell type, percentage) pairs, largest share first.
    shares = [(name, n / grand_total * 100) for name, n in zip(data['annotation'], counts)]
    shares.sort(key=lambda pair: pair[1], reverse=True)
    ordered_names, ordered_pcts = zip(*shares)

    wedge_colors = [color_mapping[name] for name in ordered_names]
    ax.pie(ordered_pcts, startangle=140, colors=wedge_colors, wedgeprops=dict(width=0.3), radius=0.7)
    ax.set_title(domain_name, fontsize=10)

    # Custom legend: one coloured dot per cell type, with its percentage.
    handles = []
    for name, pct in shares:
        handles.append(
            plt.Line2D([0], [0], marker='o', color='w', label=f'{name} ({pct:.1f}%)',
                       markerfacecolor=color_mapping[name], markersize=10)
        )
    ax.legend(handles=handles, loc='upper center', bbox_to_anchor=(1.3, 0.5), fontsize=8, frameon=False)

# Grid with two charts per row, one chart per named domain.
unique_annotations = data['banksy_domain_annotation'].unique()
num_annotations = len(unique_annotations)
cols = 2
rows = int(np.ceil(num_annotations / cols))

fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
flat_axes = axes.flatten()

for i, annotation in enumerate(unique_annotations):
    data_subset = data[data['banksy_domain_annotation'] == annotation]
    pie_chart(flat_axes[i], data_subset)

# Drop the axes left over when the number of domains does not fill the grid.
for unused_ax in flat_axes[i + 1:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig("zone_piecharts.pdf")
plt.show()


Annotating the domains with domain numbers. For weeks other than week 11, you can find this information in the main figure.

In [None]:
# Week-11 numbering of the named domains (descriptive label -> "Domain N").
# This reuses the name banksy_map and overwrites the number -> label dictionary defined earlier.
banksy_map = {
    "Endothelial-mesenchymal rich": "Domain 1",
    "Mesenchymal-immune rich": "Domain 3",
    "Mesenchymal rich": "Domain 2",
    "Chondroblast rich": "Domain 7",
    "SOX2high epithelial rich": "Domain 5",
    "SOX9high epithelial rich": "Domain 6",
    "ASM rich": "Domain 4",

}

adata_domain.obs['banksy_domain_numbers'] = adata_domain.obs['banksy_domain_annotation'].map(banksy_map)
adata_domain.obs['banksy_domain_numbers'] = adata_domain.obs['banksy_domain_numbers'].astype('category')

In [None]:
# Number of cells per named domain.
adata_domain.obs['banksy_domain_annotation'].value_counts()

In [None]:
# Number of cells per numbered domain; should mirror the counts per named domain above.
adata_domain.obs['banksy_domain_numbers'].value_counts()

to check the dispersion of the domains you can run ripley's L.

In [None]:
# Ripley's L statistic per numbered domain, evaluated up to a distance of 500 (coordinate units).
mode = "L"
sq.gr.ripley(adata_domain, cluster_key="banksy_domain_numbers", mode=mode, max_dist=500)
# Plot the curves; the save argument also writes the figure to disk.
sq.pl.ripley(adata_domain, cluster_key="banksy_domain_numbers", mode=mode, save='11wripleysL.pdf')

# cell to cell adjacency

In [None]:
# Spatial graph linking all cells within a radius of 50 (coordinate units).
sq.gr.spatial_neighbors(adata_new, radius=50, coord_type="generic")
# Whole-tissue view coloured by cell type with the graph edges overlaid.
sq.pl.spatial_scatter(adata_new, shape=None, color='annotation', connectivity_key="spatial_connectivities", size=1,library_id=None,figsize=(8, 3))

zoomed in regions for cell cell adjacency plots

In [None]:
# Same plot with thinner grey edges and smaller points.
# NOTE(review): no crop_coord is passed, so this call plots all cells; add crop_coord to get
# the zoomed-in regions mentioned in the text.
sq.pl.spatial_scatter(adata_new, shape=None, color='annotation', connectivity_key="spatial_connectivities", edges_width=0.6, size=0.4,library_id=None,edges_color='#5e5c5b',figsize=(8, 3))

In [None]:
# Neighbourhood enrichment between the annotated cell types on the spatial graph built above.
sq.gr.nhood_enrichment(adata_new, cluster_key="annotation")

In [None]:
# Clustered z-score heatmap of the enrichment, colour scale clipped to [-100, 100].
sq.pl.nhood_enrichment(adata_new, cluster_key='annotation', method="ward", mode='zscore', vmax=100, vmin=-100, cmap='coolwarm', show=False)

# Proliferation

In [None]:
# Table of the proliferating cells selected in Perseus (week 11); same file as in the immune section.
dat_pro = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_proliferation.csv')

In [None]:
# Display the proliferating-cell table.
dat_pro

For simplicity, we changed the names of the cell subtypes (e.g. Vim+ mes.) to the main cell types (e.g. Mesenchymal).

In [None]:
# Collapse the detailed cell types into main cell types for the proliferation analysis.
# NOTE(review): 'Neuronal' and 'Chondroblast' are folded into 'Mesenchymal' here, whereas the
# arterial-immune mapping further down keeps them separate -- confirm this is intended.
proliferation_annotations = {
    'Vim+ mes.':'Mesenchymal',
    'Endothelial':'Endothelial',
    'SOX2 high epit.':'SOX2 high epit.',
    'Ki67+ mes.':'Mesenchymal',
    'SOX9 high epit.':'SOX9 high epit.',
    'Neuronal':'Mesenchymal',
    'Air. fibro.':'Mesenchymal',
    'Lymp.endo.':'Lymp.endo.',
    'Immune':'Immune',
    'Chondroblast':'Mesenchymal',
    'Adv. fibro.':'Mesenchymal',
    'ASM':'ASM',
    'VSM':'VSM',

}
# Annotations missing from the dictionary map to NaN.
adata_new.obs['proliferation'] = adata_new.obs['annotation'].map(proliferation_annotations)

adding proliferation tag to the annotations based on the proliferating cell_ids

In [None]:
# Boolean mask: cells whose id appears in the proliferating-cell table.
proliferating_idx = adata_new.obs['cell_id'].isin(dat_pro['cell_id'])

# Prefix the main cell type of every proliferating cell.
# Re-running this cell adds the prefix a second time.
prolif_res = adata_new.obs.loc[proliferating_idx, 'proliferation'].map(
    lambda label: 'Proliferating ' + label
)
adata_new.obs.loc[proliferating_idx, 'proliferation'] = prolif_res

In [None]:
# Tissue map coloured by main cell type, with proliferating cells as separate categories.
sc.pl.spatial(adata_new, color='proliferation', spot_size=30, ncols = 1, color_map='viridis')

calculating neighbors of proliferating cells

In [None]:
# Rebuild the radius-50 spatial graph, then compute and plot neighbourhood enrichment between
# the proliferation-aware categories (z-scores, colour scale clipped to [-100, 100]).
sq.gr.spatial_neighbors(adata_new, radius=50, coord_type="generic")
sq.gr.nhood_enrichment(adata_new, cluster_key="proliferation")
sq.pl.nhood_enrichment(adata_new, cluster_key='proliferation', method="ward", mode='zscore', vmax=100, vmin=-100, cmap='coolwarm', show=False)

In [None]:
# Label the neighbour-count matrix with the category order of 'proliferation'. squidpy builds
# the matrix in that order, so sorted unique values (np.unique) would mislabel rows/columns
# whenever the category order is not alphabetical, and fail on shape when a category has no cells.
proliferation_labels = adata_new.obs['proliferation'].astype('category').cat.categories
adata_new.uns['proliferating_neighbors'] = pd.DataFrame(
    adata_new.uns['proliferation_nhood_enrichment']['count'],
    index=proliferation_labels,
    columns=proliferation_labels,
)

In [None]:
# Annotated heatmap of the neighbour counts. A single figure is created here; the earlier extra
# plt.figure() call only left an empty figure open.
# NOTE(review): vmin/vmax of -1/1 look inherited from the correlation heatmap; the values plotted
# here are neighbour counts, so nearly every cell saturates the colour scale -- confirm the range.
fig1, ax1 = plt.subplots(figsize=(7,5))
sns_heatmap = sns.heatmap(adata_new.uns['proliferating_neighbors'], annot=True, annot_kws={"fontsize":5}, fmt='.2f', cmap='Spectral', vmin=-1, vmax=1, center = 0, square = False, linewidths=.1, cbar=False, ax=ax1)


# Clustered view of the same counts on a symmetric colour scale.
sns_clustermap = sns.clustermap(adata_new.uns['proliferating_neighbors'],cmap='coolwarm', figsize=(8,8),vmin=-915, vmax=915)
# plt.savefig writes the current figure, i.e. the clustermap; the heatmap above is not saved.
plt.savefig('11week_cellcell_prolif.pdf')

# arterial immune cells

This part is performed for week 11,12 and 13. 

Arterial immune cell selection is done manually and cell_ids are provided in the data repository.

In [None]:
# Table of the manually selected arterial immune cells (week 11).
arterial_imm = pd.read_csv('/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_arterial_immune.csv')

For simplicity, we changed the names of the cell subtypes (e.g. Vim+ mes.) to the main cell types (e.g. Mesenchymal).

In [None]:
# Collapse the detailed cell types into main cell types for the arterial immune analysis.
# Unlike the proliferation mapping, 'Neuronal' and 'Chondroblast' stay separate here, and
# lymphatic endothelium is spelled 'Lymp. endo.' (with a space).
arterialimm_annotations = {
    'Vim+ mes.':'Mesenchymal',
    'Endothelial':'Endothelial',
    'SOX2 high epit.':'SOX2 high epit.',
    'Ki67+ mes.':'Mesenchymal',
    'SOX9 high epit.':'SOX9 high epit.',
    'Neuronal':'Neuronal',
    'Air. fibro.':'Mesenchymal',
    'Lymp.endo.':'Lymp. endo.',
    'Immune':'Immune',
    'Chondroblast':'Chondroblast',
    'Adv. fibro.':'Mesenchymal',
    'ASM':'ASM',
    'VSM':'VSM',
}
# Annotations missing from the dictionary map to NaN.
adata_new.obs['arterialimm'] = adata_new.obs['annotation'].map(arterialimm_annotations)

adding the 'Arterial' tag to the selected arterial immune cells

In [None]:
# Boolean mask: cells whose id appears in the arterial immune selection.
arterialimm_idx = adata_new.obs['cell_id'].isin(arterial_imm['cell_id'])

# Prefix the main cell type of every selected cell.
# Re-running this cell adds the prefix a second time.
arterialimm_res = adata_new.obs.loc[arterialimm_idx, 'arterialimm'].map(
    lambda label: 'Arterial ' + str(label)
)
adata_new.obs.loc[arterialimm_idx, 'arterialimm'] = arterialimm_res

In [None]:
# Tissue map coloured by main cell type, with the selected arterial cells as separate categories.
sc.pl.spatial(adata_new, color='arterialimm', spot_size=30, ncols = 1, color_map='viridis')

some selected arterial immune cells are from endothelial and mesenchymal clusters. Therefore we exclude these cells from the analysis for arterial immune cells. In order to not exclude cells from adata_new, we copy it and rename it as adata_new_imm

In [None]:
# Selected arterial cells that fell into endothelial or mesenchymal clusters are left out;
# adata_new itself stays untouched because the result is an independent copy.
excluded_arterial_labels = ['Arterial Endothelial', 'Arterial Mesenchymal']
keep_for_arterial = ~adata_new.obs['arterialimm'].isin(excluded_arterial_labels)
adata_new_imm = adata_new[keep_for_arterial, :].copy()

In [None]:
# Same tissue map after excluding the arterial endothelial / mesenchymal cells.
sc.pl.spatial(adata_new_imm, color='arterialimm', spot_size=30, ncols = 1,color_map='viridis')

In [None]:
# Fixed palette for the 'arterialimm' categories: 12 colours.
# NOTE(review): presumably applied in category order -- confirm the order matches the intended colours.
adata_new_imm.uns['arterialimm_colors'] = ['#2c6fcd','#009905','#2ff7f7','#f96d00','#d50104','#15e45b','#ffbc89','#0044f9','#00b3dc','#fe8a8b','#8e0002','#193f75']

volcano plot for Immune vs arterial immune

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import hvplot.pandas
def rank_genes_groups_df(adata, group, pval_cutoff : float =None, logfc_cutoff=None): 
    """Collect the ``rank_genes_groups`` result of one group in a DataFrame.

    Parameters
    ----------
    adata
        Object whose ``.uns["rank_genes_groups"]`` holds the test results.
    group
        Key of the group whose results are extracted.
    pval_cutoff
        If given, keep only rows with ``pvals_adj`` strictly below this value.
    logfc_cutoff
        If given, keep only rows whose absolute ``logfoldchanges`` is strictly
        above this value.

    Returns
    -------
    DataFrame with the columns ``scores``, ``names``, ``logfoldchanges``,
    ``pvals`` and ``pvals_adj``; filtered rows keep their original index.
    """
    ranked = adata.uns["rank_genes_groups"]
    fields = ('scores', 'names', 'logfoldchanges', 'pvals', 'pvals_adj')
    table = pd.DataFrame({field: ranked[field][group] for field in fields})

    if pval_cutoff is not None:
        table = table.loc[table["pvals_adj"] < pval_cutoff]
    if logfc_cutoff is not None:
        table = table.loc[table["logfoldchanges"].abs() > logfc_cutoff]
    return table

# Differential expression of 'Arterial Immune' against the reference group 'Immune'
# (Wilcoxon test, Benjamini-Hochberg correction); n_genes is set so every variable is ranked.
sc.tl.rank_genes_groups(adata_new, 'arterialimm', groups=['Arterial Immune'], reference='Immune', method='wilcoxon', n_genes=adata_new.var_names.size, corr_method='benjamini-hochberg')
de_df = rank_genes_groups_df(adata_new, "Arterial Immune")

# Calculate -log10 of the adjusted p-values.
# An adjusted p-value that underflows to exactly 0 would give +inf and the point would
# disappear from the plot, so p-values are clipped to the smallest positive normal float first.
de_df['minus_log10_pvals_adj'] = -np.log10(de_df['pvals_adj'].clip(lower=np.finfo(float).tiny))

# Define thresholds for significant points
significant_thresh = 0.05  # Example threshold for adjusted p-values
logfc_thresh = 1.0         # Example threshold for log fold change

# One mask shared by the point colouring and the text annotations, so both always agree.
is_significant = (de_df['pvals_adj'] < significant_thresh) & (de_df['logfoldchanges'].abs() > logfc_thresh)

# Create the volcano plot
plt.figure(figsize=(10, 8))  # You can adjust the figure size to fit your needs
plt.scatter(
    de_df['logfoldchanges'],
    de_df['minus_log10_pvals_adj'],
    color=np.where(is_significant, 'red', 'gray'),
    alpha=1,
    s=200
)

# Annotations for significant genes
significant = de_df[is_significant]
for _, row in significant.iterrows():
    plt.text(row['logfoldchanges'], row['minus_log10_pvals_adj'], row['names'], fontsize=9, rotation=45, ha='right')

# Adding labels and title
plt.title('Volcano Plot of Differential Expression')
plt.xlabel('Log Fold Change')
plt.ylabel('-log10(Adjusted P-value)')

# Add grid for better readability and visual appeal
plt.grid(True)

# Save first, then display the plot
plt.savefig('arterial_volcano.pdf')
plt.show()

integrating immune cell subtypes to the arterial immune cell analysis (for detailed analysis)

In [None]:
# Start the subtype column as a categorical copy of the main-cell-type labels.
adata_new.obs['arterialimm_subtypes'] = adata_new.obs['arterialimm'].astype('category')

In [None]:
# Comma-separated table of the immune cells; the 'cell_id' and 'imm_ann_pro' columns are used below.
immune_cells_csv = pd.read_csv(
    '/Users/sanem.sanyar/Python_squidpy_analysis/fetallung_analysis_python/11w/11w_immunecells.csv'
)

updating the arterialimm_subtypes with immune subtypes.

In [None]:
# Lookup table: cell_id -> immune subtype label (column 'imm_ann_pro').
cell_id_to_subtype = {
    cell_id: subtype
    for cell_id, subtype in zip(immune_cells_csv['cell_id'], immune_cells_csv['imm_ann_pro'])
}

# Cells found in the lookup receive their immune subtype; combine_first fills all
# remaining (missing) entries with the label already stored in 'arterialimm_subtypes'.
immune_subtype_labels = adata_new.obs['cell_id'].map(cell_id_to_subtype)
adata_new.obs['arterialimm_subtypes'] = immune_subtype_labels.combine_first(
    adata_new.obs['arterialimm_subtypes']
)


adding "arterial" tag to the immune subcell types

In [None]:

# Boolean mask over adata_new.obs: True where the cell_id is part of the arterial selection.
arterialimm_idx = adata_new.obs.cell_id.isin(arterial_imm.cell_id)

# Prefix the subtype label of every selected cell with 'Arterial '.
# str(x) mirrors the tagging of the 'arterialimm' column and prevents a TypeError when a
# label is not a string (e.g. NaN for a cell that received no annotation).
arterialimm_res = adata_new.obs.loc[arterialimm_idx].arterialimm_subtypes.apply(lambda x: 'Arterial ' + str(x))
adata_new.obs.loc[arterialimm_idx, 'arterialimm_subtypes'] = arterialimm_res

preparation for selecting random immune cells

In [None]:
# Export the per-cell metadata (obs) of the filtered object to a CSV file in the
# current working directory; the index is written as the first column.
adata_new_imm.obs.to_csv('11w_obs_adatanew_imm.csv')

In [None]:
# Read back the obs table exported in the previous cell. The file was written with a
# relative path, so it is read from the same relative path; a hard-coded absolute path
# only pointed at that file when the working directory happened to be that folder.
arterial_obs = pd.read_csv('11w_obs_adatanew_imm.csv')

In [None]:
# Display the re-imported obs table for a visual check.
arterial_obs

# Quantification of neighbours of arterial immune cells and random immune cells

this analysis was performed for week 11, 12 and 13.

change the cell_types list for corresponding week

In [None]:
import math  # not needed by the vectorised code below; kept because a later cell uses `math` without importing it
from cycler import cycler

# Labels (values of the 'arterialimm' column) that are quantified; neighbours with any
# other label are ignored. Change this list for the corresponding week.
cell_types = ['Immune', 'Endothelial', 'Mesenchymal',
              'Chondroblast', 'Lymp. endo.',
              'SOX2 high epit.', 'SOX9 high epit.', 'Neuronal',
              'ASM', 'VSM',
              'Arterial Immune']

# Create distance bins
distance_bins = np.arange(0, 51, 5)  # Adjust the bin width as needed
n_bins = len(distance_bins) - 1
max_distance = distance_bins[-1]
bin_counts = np.zeros((len(cell_types), n_bins))

# Filter rows where 'arterialimm' is 'Arterial Immune'
arterial_immune_rows = arterial_obs[arterial_obs['arterialimm'] == 'Arterial Immune']

# Coordinates of every candidate neighbour and the row of bin_counts its label belongs to
# (-1 for labels that are not in cell_types, including missing labels).
neighbour_xy = arterial_obs[['x', 'y']].to_numpy(dtype=float)
neighbour_ct_index = pd.Categorical(arterial_obs['arterialimm'], categories=cell_types).codes.astype(np.intp)

for x1, y1 in arterial_immune_rows[['x', 'y']].to_numpy(dtype=float):
    dx = neighbour_xy[:, 0] - x1
    dy = neighbour_xy[:, 1] - y1
    # 0.51 rescales the coordinate distance (presumably the pixel size in µm; TODO confirm).
    distance = np.sqrt(dx ** 2 + dy ** 2) * 0.51
    # Ignore cells beyond the last bin edge, cells at identical coordinates (the centre
    # cell itself) and cells whose label is not quantified. NaN distances are dropped too.
    keep = (distance <= max_distance) & ((dx != 0) | (dy != 0)) & (neighbour_ct_index >= 0)
    # np.digitize puts a distance lying exactly on the last edge into an extra bin;
    # clamp it into the last real bin instead of indexing past the end of bin_counts.
    bin_index = np.minimum(np.digitize(distance[keep], distance_bins) - 1, n_bins - 1)
    # np.add.at accumulates correctly when the same (cell type, bin) pair occurs repeatedly.
    np.add.at(bin_counts, (neighbour_ct_index[keep], bin_index), 1)

# Calculate percentages (bins without any neighbour stay NaN, without a divide-by-zero warning)
total_cells = bin_counts.sum(axis=0)
percentage_data = np.divide(
    bin_counts, total_cells,
    out=np.full_like(bin_counts, np.nan), where=total_cells > 0,
) * 100

# Save percentage data to a CSV file
percentage_df = pd.DataFrame(percentage_data, index=cell_types, columns=distance_bins[:-1])
percentage_df.to_csv('percentage_values_arterial.csv')


# Create a custom color cycle for unique colors for all cell types
unique_colors = plt.cm.tab20.colors  # You can choose any colormap or set of colors
custom_color_cycle = cycler(color=unique_colors)

# Apply the custom color cycle to the plot
plt.rc('axes', prop_cycle=custom_color_cycle)

# Plot one line per cell type (x = lower edge of each distance bin)
for ct_index, cell_type in enumerate(cell_types):
    plt.plot(distance_bins[:-1], percentage_data[ct_index], label=cell_type)

plt.xlabel('Distance')
plt.ylabel('Percentage of Cells (%)')
plt.title('Cell Type Percentages in Distance Bins')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Save the figure as a PDF; bbox_inches='tight' keeps the legend, which sits outside
# the axes, inside the saved page.
plt.savefig('11pcw_arterial_neighbours.pdf', bbox_inches='tight')
plt.show()



This analysis was performed for weeks 11, 12 and 13. In total there were 365, 68 and 45 arterial immune cells for these weeks, respectively. As the example code provided is for week 11, we select 365 cells here; change this number according to the week you are analysing.

# Quantification of the neighbours of randomly picked cells (to compare with arterial cells)

In [None]:
from random import sample

# Fixed seed so that the same random subset is drawn on every run.
np.random.seed(100)

# Shuffle all row positions and keep the first 365 of them.
shuffled_positions = np.random.permutation(len(arterial_obs))
random_cells = arterial_obs.take(shuffled_positions[:365])

random_cells


change the cell_types list for corresponding week

In [None]:
import math  # not needed by the vectorised code below; kept because a later cell uses `math` without importing it
from cycler import cycler

# Labels must match the values of the 'arterialimm' column exactly: a neighbour whose
# label is not in this list is silently ignored and distorts the percentages. These are
# the week-11 labels, identical to the list used for the arterial immune quantification.
# Change this list for the corresponding week.
cell_types = ['Immune', 'Endothelial', 'Mesenchymal',
              'Chondroblast', 'Lymp. endo.',
              'SOX2 high epit.', 'SOX9 high epit.', 'Neuronal',
              'ASM', 'VSM',
              'Arterial Immune']

# Create distance bins
distance_bins = np.arange(0, 51, 5)  # Adjust the bin width as needed
n_bins = len(distance_bins) - 1
max_distance = distance_bins[-1]
bin_counts = np.zeros((len(cell_types), n_bins))

# Coordinates of every candidate neighbour and the row of bin_counts its label belongs to
# (-1 for labels that are not in cell_types, including missing labels).
neighbour_xy = arterial_obs[['x', 'y']].to_numpy(dtype=float)
neighbour_ct_index = pd.Categorical(arterial_obs['arterialimm'], categories=cell_types).codes.astype(np.intp)

for x1, y1 in random_cells[['x', 'y']].to_numpy(dtype=float):
    dx = neighbour_xy[:, 0] - x1
    dy = neighbour_xy[:, 1] - y1
    # 0.51 rescales the coordinate distance (presumably the pixel size in µm; TODO confirm).
    distance = np.sqrt(dx ** 2 + dy ** 2) * 0.51
    # Ignore cells beyond the last bin edge, cells at identical coordinates (the centre
    # cell itself) and cells whose label is not quantified. NaN distances are dropped too.
    keep = (distance <= max_distance) & ((dx != 0) | (dy != 0)) & (neighbour_ct_index >= 0)
    # np.digitize puts a distance lying exactly on the last edge into an extra bin;
    # clamp it into the last real bin instead of indexing past the end of bin_counts.
    bin_index = np.minimum(np.digitize(distance[keep], distance_bins) - 1, n_bins - 1)
    # np.add.at accumulates correctly when the same (cell type, bin) pair occurs repeatedly.
    np.add.at(bin_counts, (neighbour_ct_index[keep], bin_index), 1)

# Calculate percentages (bins without any neighbour stay NaN, without a divide-by-zero warning)
total_cells = bin_counts.sum(axis=0)
percentage_data = np.divide(
    bin_counts, total_cells,
    out=np.full_like(bin_counts, np.nan), where=total_cells > 0,
) * 100

# Save percentage data to a CSV file
percentage_df = pd.DataFrame(percentage_data, index=cell_types, columns=distance_bins[:-1])
percentage_df.to_csv('percentage_values_random.csv')

# Create a custom color cycle for unique colors for all cell types
unique_colors = plt.cm.tab20.colors  # You can choose any colormap or set of colors
custom_color_cycle = cycler(color=unique_colors)

# Apply the custom color cycle to the plot
plt.rc('axes', prop_cycle=custom_color_cycle)

# Plot one line per cell type (x = lower edge of each distance bin)
for ct_index, cell_type in enumerate(cell_types):
    plt.plot(distance_bins[:-1], percentage_data[ct_index], label=cell_type)

plt.xlabel('Distance')
plt.ylabel('Percentage of Cells (%)')
plt.title('Cell Type Percentages in Distance Bins')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Save the figure as a PDF; bbox_inches='tight' keeps the legend, which sits outside
# the axes, inside the saved page.
plt.savefig('11pcw_randomcell_neighbours.pdf', bbox_inches='tight')
plt.show()



# Quantification of the neighbors of randomly picked immune cells.

In [None]:
# All non-arterial immune cells (label exactly 'Immune').
is_immune = arterial_obs['arterialimm'] == 'Immune'
immune_rows = arterial_obs[is_immune]

# Fixed seed so that the same random subset is drawn on every run.
np.random.seed(100)

# Shuffle the row positions of the immune cells and keep the first 365 of them.
shuffled_immune_positions = np.random.permutation(len(immune_rows))
random_imm_cells = immune_rows.take(shuffled_immune_positions[:365])

random_imm_cells


In [None]:
from cycler import cycler

# Defined explicitly so this cell does not depend on a list left over from an earlier
# cell. Labels must match the values of the 'arterialimm' column exactly (week-11 labels);
# neighbours with any other label are ignored. Change this list for the corresponding week.
cell_types = ['Immune', 'Endothelial', 'Mesenchymal',
              'Chondroblast', 'Lymp. endo.',
              'SOX2 high epit.', 'SOX9 high epit.', 'Neuronal',
              'ASM', 'VSM',
              'Arterial Immune']

# Create distance bins
distance_bins = np.arange(0, 51, 5)  # Adjust the bin width as needed
n_bins = len(distance_bins) - 1
max_distance = distance_bins[-1]
bin_counts = np.zeros((len(cell_types), n_bins))

# Coordinates of every candidate neighbour and the row of bin_counts its label belongs to
# (-1 for labels that are not in cell_types, including missing labels).
neighbour_xy = arterial_obs[['x', 'y']].to_numpy(dtype=float)
neighbour_ct_index = pd.Categorical(arterial_obs['arterialimm'], categories=cell_types).codes.astype(np.intp)

for x1, y1 in random_imm_cells[['x', 'y']].to_numpy(dtype=float):
    dx = neighbour_xy[:, 0] - x1
    dy = neighbour_xy[:, 1] - y1
    # 0.51 rescales the coordinate distance (presumably the pixel size in µm; TODO confirm).
    distance = np.sqrt(dx ** 2 + dy ** 2) * 0.51
    # Ignore cells beyond the last bin edge, cells at identical coordinates (the centre
    # cell itself) and cells whose label is not quantified. NaN distances are dropped too.
    keep = (distance <= max_distance) & ((dx != 0) | (dy != 0)) & (neighbour_ct_index >= 0)
    # np.digitize puts a distance lying exactly on the last edge into an extra bin;
    # clamp it into the last real bin instead of indexing past the end of bin_counts.
    bin_index = np.minimum(np.digitize(distance[keep], distance_bins) - 1, n_bins - 1)
    # np.add.at accumulates correctly when the same (cell type, bin) pair occurs repeatedly.
    np.add.at(bin_counts, (neighbour_ct_index[keep], bin_index), 1)

# Calculate percentages (bins without any neighbour stay NaN, without a divide-by-zero warning)
total_cells = bin_counts.sum(axis=0)
percentage_data = np.divide(
    bin_counts, total_cells,
    out=np.full_like(bin_counts, np.nan), where=total_cells > 0,
) * 100

# Save percentage data to a CSV file
percentage_df = pd.DataFrame(percentage_data, index=cell_types, columns=distance_bins[:-1])
percentage_df.to_csv('percentage_values_random_imm.csv')

# Create a custom color cycle for unique colors for all cell types
unique_colors = plt.cm.tab20.colors  # You can choose any colormap or set of colors
custom_color_cycle = cycler(color=unique_colors)

# Apply the custom color cycle to the plot
plt.rc('axes', prop_cycle=custom_color_cycle)

# Plot one line per cell type (x = lower edge of each distance bin)
for ct_index, cell_type in enumerate(cell_types):
    plt.plot(distance_bins[:-1], percentage_data[ct_index], label=cell_type)

plt.xlabel('Distance')
plt.ylabel('Percentage of Cells (%)')
plt.title('Cell Type Percentages in Distance Bins')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Save the figure as a PDF; bbox_inches='tight' keeps the legend, which sits outside
# the axes, inside the saved page.
plt.savefig('11pcw_randomimmune_neighbours.pdf', bbox_inches='tight')
plt.show()




write the Anndata object to use it in other notebooks such as 'merging&clustering_all_weeks', 'cell-cell adjacency' and 'Domain_banksy'.

In [None]:
# Persist the full (unfiltered) AnnData object, including the obs columns added above,
# as an .h5ad file in the current working directory.
adata_new.write_h5ad('adata_11w.h5ad')

# SOX2 SOX9 comparison

first version of the plot

these values are coming from the supplementary file 'Cell counts for analysis of proliferation in aiways.pdf'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Percentages per category; the 'Pro…' categories are the proliferating fraction of each population.
data = {
    'Percentage': [82, 17.9, 97.8, 2.2, 96.2, 3.8],
    'Category': ['SOX9+', 'ProSOX9+', 'SOX2+', 'ProSOX2+', 'Large SOX2+', 'Pro Large SOX2+']
}
df = pd.DataFrame(data)

# Each group becomes one stacked bar made of its two categories.
groups = {
    'SOX9': ['SOX9+', 'ProSOX9+'],
    'SOX2': ['SOX2+', 'ProSOX2+'],
    'Large SOX2+': ['Large SOX2+', 'Pro Large SOX2+']
}

# Invert the group definition into a Category -> Group lookup and label every row.
category_to_group = {
    category: group
    for group, members in groups.items()
    for category in members
}
df['Group'] = df['Category'].map(category_to_group)

# Wide table: one row per group, one column per category.
pivot_table = df.pivot(index='Group', columns='Category', values='Percentage')

# Stacked bar plot drawn on the axes returned by pandas.
ax = pivot_table.plot.bar(stacked=True, colormap="tab20", figsize=(10, 6))
ax.set_xlabel('Group')
ax.set_ylabel('Percentage')
ax.set_title('Stacked Bar Plot')
ax.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')

# Fit everything into the figure, save it, then display it.
plt.tight_layout()
plt.savefig('SOX2_SOX9_largeSOX2.pdf')
plt.show()


2nd version of the plot

In [None]:
# these values are coming from the supplementary file 'Cell counts for analysis of proliferation in aiways.pdf'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Percentage of proliferating cells in each of the three populations.
data = {
    'Percentage': [17.9, 2.2, 3.8],
    'Category': ['ProSOX9+', 'ProSOX2+', 'Pro Large SOX2+']
}
df = pd.DataFrame(data)

# Simple bar plot: one bar per category.
fig_pro, ax_pro = plt.subplots(figsize=(8, 6))
ax_pro.bar(df['Category'], df['Percentage'], color='skyblue')
ax_pro.set_xlabel('Category')
ax_pro.set_ylabel('Percentage')
ax_pro.set_title('Percentage by Category')
ax_pro.tick_params(axis='x', labelrotation=45)  # Rotate x-axis labels for better readability
fig_pro.tight_layout()  # Ensure labels and titles fit within the figure

# Save the figure, then display it.
plt.savefig('SOX2_SOX9_largeSOX2_pro.pdf')
plt.show()

Calculating the significance of the differences between groups, based on the cell counts provided in the supplementary file 'Cell counts for analysis of proliferation in aiways.pdf'.

In [None]:
import pandas as pd
from scipy.stats import fisher_exact

# Cell counts per population: first row non-proliferating, second row proliferating.
table = {'SOX2': [1496, 33],
        'Large SOX2': [6629, 265],
        'SOX9': [2399, 525]}

index = ['Non pro', 'pro']

df = pd.DataFrame(table, index=index)

# Fisher's exact test for every unordered pair of populations (columns).
results = {}

columns = df.columns
num_columns = len(columns)

for first_pos, first_name in enumerate(columns):
    for second_name in columns[first_pos + 1:]:
        # 2x2 contingency table: rows = Non pro / pro, columns = the two populations
        pair_counts = df[[first_name, second_name]]
        pair_odds_ratio, pair_p_value = fisher_exact(pair_counts)
        results[f"{first_name} vs {second_name}"] = {
            'Odds Ratio': pair_odds_ratio,
            'P-Value': pair_p_value
        }

# One row per comparison, columns 'Odds Ratio' and 'P-Value'.
results_df = pd.DataFrame(results).T
print(results_df)
