In [None]:
# Import dependencies
%matplotlib inline
import os
import scanpy as sc
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import anndata as ad
import numpy as np

#from pydeseq2.dds import DeseqDataSet
#from pydeseq2.ds import DeseqStats

matplotlib.rcParams['font.family'] = 'sans-serif'

# Initialize random seeds.
# The stdlib generator is the one used by random.shuffle() when pseudoreplicates
# are created further down; NumPy's global generator is seeded too so that any
# NumPy-based randomness is reproducible as well.
import random
random.seed(111)
np.random.seed(111)

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

# set a working directory
# The PSEUDOBULK_WDIR environment variable overrides the default location, so the
# notebook can run on another machine without editing this cell.
wdir = os.environ.get(
    "PSEUDOBULK_WDIR",
    "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks",
)
os.chdir( wdir )

# folder structures (relative to the working directory set above)
RESULTS_FOLDERNAME = "adult/PseudobulkDGE/results/"
FIGURES_FOLDERNAME = "adult/PseudobulkDGE/figures"

# exist_ok=True creates the folders when missing and is a no-op otherwise.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to disk in SVG (vector) format.

    Parameters
    ----------
    fname
        File name of the image, joined onto `folder`.
    fig
        Object exposing a ``savefig(path, format=...)`` method.
    folder
        Destination directory; defaults to the notebook's figures folder.
    """
    destination = os.path.join(folder, fname)
    fig.savefig(destination, format='svg')

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Record the versions of the loaded packages in the notebook output.
sc.logging.print_versions()
# On-screen figures at 150 dpi, saved figures at 600 dpi, base font size 10.
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
# Load the annotated dataset (path is relative to the working directory set above)
# and display its summary.
adata = sc.read_h5ad('adult/annotation/results/adultdev_cellhint.h5ad')
adata

In [None]:
# Largest value currently stored in X (quick check of what kind of values X holds).
adata.X.max()

In [None]:
# Replace X with a copy of the 'counts' layer and re-check the maximum value.
adata.X = adata.layers['counts'].copy()
adata.X.max()

In [None]:
# Labels in obs['annotations_upd2'] that are retained for the analysis.
keep = [
    'COL6A6 FSTL1 DCLK1 Progenitors',
    'Embryonic Chondrocytes',
    'MKX TNMD ABI3BP GAS2 Progenitors',
    'MSC Precursors',
    'RUNX2 THBS2 COL11A1 Progenitors',
    'SCX FGF14 THBS4 FSTL5 Progenitors',
    'SOX5 CREB5 Chondrocyte Progenitors',
    'ABI3BP GAS2 Fibroblasts',
    'COL3A1 PI16 Fibroblasts',
    'COL6A6 FNDC1 Fibroblasts',
    'Chondrocytes',
    'FGF14 THBS4 Fibroblasts',
    'NEGR1 SCN7A Fibroblasts',
    'COL3A1hi fibroblasts',
    'ABCA10hi fibroblasts',
    'FBLNhi fibroblasts',
    'NR4A1hi fibroblasts',
]

# .copy() turns the subset into a standalone object. Without it `adata` is a view
# of the full dataset, and the in-place filtering/annotation in the next cells
# would have to convert it implicitly (with an ImplicitModificationWarning).
adata = adata[adata.obs.annotations_upd2.isin(keep)].copy()
adata

In [None]:
# Basic filtering: drop cells with fewer than 300 detected genes, then genes
# detected in fewer than 5 cells; display the resulting object.
sc.pp.filter_cells(adata, min_genes=300)
sc.pp.filter_genes(adata, min_cells=5)
adata

In [None]:
# Boolean gene flags used as QC variables below:
#   mt   - symbol starts with "MT-"
#   ribo - symbol starts with "RPS" or "RPL"
#   hb   - symbol starts with "HB" followed by any character other than "(", "P" or ")"
# NOTE(review): the "hb" pattern also matches non-haemoglobin symbols that start
# with "HB" (anything but "HBP..."); confirm this is acceptable for the QC metric.
adata.var["mt"] = adata.var_names.str.startswith("MT-")
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))
        
# Adds per-cell / per-gene QC columns in place, including the share of counts in
# the top 20 genes and log1p-transformed variants of the metrics.
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True)

In [None]:
# Number of cells per value of obs['sampletype'].
adata.obs['sampletype'].value_counts()

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """
    Flag observations whose `metric` is more than `nmads` median absolute
    deviations (MADs) away from the median.

    Parameters
    ----------
    adata
        Object with an ``obs`` table containing the column `metric`.
    metric
        Name of the numeric ``obs`` column to test.
    nmads
        Number of MADs on either side of the median that is still accepted.

    Returns
    -------
    Boolean pandas Series (indexed like ``adata.obs``), True for outliers.
    """
    M = adata.obs[metric]
    median = np.median(M)
    # Median absolute deviation, computed explicitly: pandas' Series.mad() was the
    # *mean* absolute deviation and no longer exists in pandas >= 2.0.
    mad = np.median(np.abs(M - median))
    outlier = (M < median - nmads * mad) | (median + nmads * mad < M)
    return outlier

# A cell is an outlier when any of the three QC metrics lies more than 5 MADs
# from its median (as judged by is_outlier).
outlier_mask = None
for qc_metric in (
    "log1p_total_counts",
    "log1p_n_genes_by_counts",
    "pct_counts_in_top_20_genes",
):
    flagged = is_outlier(adata, qc_metric, 5)
    outlier_mask = flagged if outlier_mask is None else (outlier_mask | flagged)

adata.obs["outlier"] = outlier_mask
adata.obs.outlier.value_counts()

In [None]:
# Mitochondrial outliers: more than 3 MADs from the median of pct_counts_mt
# (per is_outlier) OR an absolute mitochondrial fraction above 8 %.
adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
    adata.obs["pct_counts_mt"] > 8
)
adata.obs.mt_outlier.value_counts()

In [None]:
# Keep only cells flagged neither as general nor as mitochondrial outliers;
# .copy() makes the result an independent object rather than a view.
print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

In [None]:
# Library-size normalisation followed by log1p, both applied to X in place.
# target_sum=None lets scanpy choose the target (per its documentation: the
# median of total counts across cells).
sc.pp.normalize_total(adata, target_sum=None, inplace=True)
print(adata.X[0:10,0:10])  # peek at the normalised values
sc.pp.log1p(adata)
print(adata.X[0:10,0:10])  # peek at the log-transformed values
# Keep a copy of the log-normalised matrix so X can be restored from it later.
adata.layers["log1p_norm"] = adata.X.copy()

In [None]:
# Store the current (log-normalised) state as .raw; the plotting cells below
# read it via use_raw=True.
adata.raw = adata

In [None]:
# The preprocessing steps below are disabled, so nothing in this notebook builds
# a PCA or neighbour graph before UMAP.
# NOTE(review): sc.tl.umap therefore presumably relies on a neighbour graph that
# was already stored in the loaded .h5ad file - confirm.
#adata.X = adata.layers["log1p_norm"].copy()
#sc.pp.highly_variable_genes(adata, flavor='seurat', batch_key='sampletype')
#sc.pp.scale(adata)
#sc.pp.pca(adata)
#sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.umap(adata)
# UMAP coloured by cell annotation and by sample type.
sc.pl.umap(adata, color=['annotations_upd2'])
sc.pl.umap(adata, color=['sampletype'])

In [None]:
# Matrisome master list, without retired entries and without secreted factors.
matrisome_all = pd.read_csv('resources/matrisome_hs_masterlist.csv')
is_active = matrisome_all['Division'] != "Retired"
is_not_secreted_factor = matrisome_all['Category'] != "Secreted Factors"
matrisome = matrisome_all[is_active & is_not_secreted_factor]
matrisome

In [None]:
geneset = set(adata.var_names) #all genes list
categories = ["ECM Regulators", "ECM Glycoproteins", "ECM-affiliated Proteins", "Collagens", "Proteoglycans"]
# Maps each matrisome category to the genes of that category present in the dataset.
ecmgenes_dict={}
for ecmtype in categories:
    selection = matrisome[matrisome['Category'] == ecmtype]
    selection_genes = set(selection['Gene Symbol'].values)
    # sorted(): iteration order of a set of strings changes between Python
    # sessions (hash randomisation), which would reshuffle the gene order - and
    # the genes assigned to each split panel - on every kernel restart.
    genes = sorted(selection_genes.intersection(geneset))
    ecmgenes_dict[ecmtype] = genes

In [None]:
# Report how many genes of each matrisome category are present in the dataset.
for category in ecmgenes_dict.keys():
    print(f'{category}: {len(ecmgenes_dict[category])} genes')

In [None]:
# Make sure X holds the log-normalised values; the two maxima printed below are
# expected to be identical.
adata.X = adata.layers['log1p_norm'].copy()
print(adata.layers['log1p_norm'].max())
print(adata.X.max())

In [None]:
# obs column used as the grouping variable in the plotting and DGE cells below.
cell_type = 'annotations_upd2'

In [None]:
# Hierarchical clustering of the annotation groups, computed on obsm['X_pca'].
# NOTE(review): no PCA is computed in this notebook before this point, so
# 'X_pca' presumably comes from the loaded .h5ad file - confirm.
sc.tl.dendrogram(adata, 'annotations_upd2', use_rep='X_pca')

In [None]:
# Matrix plots of mean normalised expression (taken from .raw) for every
# matrisome category, one file per panel.
vmin = 0  # lower bound of the colour scale
for category, genes in ecmgenes_dict.items():
    gene_count = len(genes)
    if gene_count == 0:
        # A category without any gene in the dataset cannot be plotted.
        print(f'Skipping {category}: no genes to plot')
        continue

    # Long gene lists are split into panels: thirds from 150 genes upwards,
    # halves between 111 and 149 genes, a single panel otherwise.
    if gene_count >= 150:
        bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
    elif gene_count > 110:
        bounds = [0, gene_count // 2, gene_count]
    else:
        bounds = [0, gene_count]

    n_panels = len(bounds) - 1
    for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        # Split categories carry a 1-based panel number in the file name.
        suffix = f'_{panel}' if n_panels > 1 else ''
        sc.pl.matrixplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=True,
                         vmin=vmin, colorbar_title='mean normalised\nexpression', title=f'{category}', cmap='Reds',
                         swap_axes=True, save=f'{category}{suffix}_mean.tiff')


In [None]:
# Dot plots of mean normalised expression (taken from .raw) for every matrisome
# category, one file per panel.
vmin = 0  # lower bound of the colour scale
for category, genes in ecmgenes_dict.items():
    gene_count = len(genes)
    if gene_count == 0:
        # A category without any gene in the dataset cannot be plotted.
        print(f'Skipping {category}: no genes to plot')
        continue

    # Long gene lists are split into panels: thirds from 150 genes upwards,
    # halves between 111 and 149 genes, a single panel otherwise.
    if gene_count >= 150:
        bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
    elif gene_count > 110:
        bounds = [0, gene_count // 2, gene_count]
    else:
        bounds = [0, gene_count]

    n_panels = len(bounds) - 1
    for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        # Split categories carry a 1-based panel number in the file name.
        suffix = f'_{panel}' if n_panels > 1 else ''
        sc.pl.dotplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=True,
                      vmin=vmin, colorbar_title='mean normalised\nexpression', title=f'{category}', cmap='Reds',
                      swap_axes=True, save=f'{category}{suffix}_mean.tiff')


In [None]:
# Marker genes per annotation group (Wilcoxon rank-sum test), then a dot plot of
# the top 5 genes of each group.
sc.tl.rank_genes_groups(adata, groupby=cell_type, method="wilcoxon")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)

In [None]:
# Same top-5 marker dot plot, but coloured by log fold change instead of mean
# expression: only genes with log fold change >= 2 are shown, on a symmetric
# blue-white-red scale clipped to [-7, 7].
sc.pl.rank_genes_groups_dotplot(
    adata,
    n_genes=5,
    values_to_plot="logfoldchanges",
    min_logfoldchange=2,
    vmax=7,
    vmin=-7,
    cmap="bwr",
)

# File preparation for DESeq2 pseudobulk DGE analysis

In [None]:
# Number of cells for every (sampletype, annotation) combination.
adata.obs.groupby(['sampletype', 'annotations_upd2']).size()

In [None]:
# Make the annotation labels safe for use in identifiers and file names:
# spaces become underscores and "+" signs are removed.
adata.obs["annotations_upd2"] = [
    label.replace(" ", "_").replace("+", "")
    for label in adata.obs["annotations_upd2"]
]
adata.obs["annotations_upd2"].value_counts()

In [None]:
# Pseudobulk sample identifier: "<sampletype>_<annotation>" for every cell.
sample_and_annotation = zip(adata.obs["sampletype"], adata.obs["annotations_upd2"])
adata.obs["bulksample"] = [
    "_".join((str(sample), str(annotation)))
    for sample, annotation in sample_and_annotation
]
adata.obs["bulksample"].value_counts()

In [None]:
# List the available per-cell metadata columns.
adata.obs.columns

In [None]:
# Store the metadata columns used for pseudobulking as pandas categoricals.
for obs_column in (
    "bulksample",
    "annotations_upd2",
    "sampletype",
    "grouptype",
    "megagrouptype",
    "tendon_status",
    "microanat",
    "type",
    "age",
    "phase",
    "libbatch",
):
    adata.obs[obs_column] = adata.obs[obs_column].astype("category")

# Pseudobulking

Plan:

1. Subset by cell type.
2. Then subset by sample.
3. Add the age condition.
4. Add a replicate number column.

In [None]:
# subset by cell type
#cell_subset = {}
#for cell_type in adata.obs['C_scANVI'].unique():
#    cell_subset[cell_type] = adata[adata.obs['C_scANVI'] == cell_type]
    
#cell_subset

In [None]:
# Number of distinct pseudobulk sample identifiers.
len(adata.obs['bulksample'].unique())

In [None]:
NUM_OF_CELL_PER_DONOR = 30 # samples with this many cells or fewer (per cell type) are filtered out

def aggregate_and_filter(
    adata,
    donor_key="bulksample",
    cell_identity_key="annotations_upd2",
    replicates_per_patient=3,
):
    """
    Build pseudobulk count profiles for every cell type.

    For each category of ``adata.obs[cell_identity_key]`` the cells are grouped
    by ``adata.obs[donor_key]``. Groups with ``NUM_OF_CELL_PER_DONOR`` cells or
    fewer are dropped. The cells of every remaining group are shuffled with the
    stdlib ``random`` module and split into ``replicates_per_patient``
    pseudoreplicates; the ``counts`` layer of each pseudoreplicate is summed over
    its cells to give one pseudobulk observation.

    Parameters
    ----------
    adata
        AnnData with a ``counts`` layer, a categorical
        ``obs[cell_identity_key]`` column and the metadata columns listed in
        ``metadata_keys`` below.
    donor_key
        ``obs`` column identifying the sample whose cells are aggregated.
    cell_identity_key
        ``obs`` column holding the cell type labels.
    replicates_per_patient
        Number of pseudoreplicates created per sample.

    Returns
    -------
    dict
        Cell type -> AnnData with one observation per pseudoreplicate. Cell types
        for which no sample passes the size filter are left out.
    """
    # Metadata copied from the first cell of a sample onto each of its replicates
    # (in this column order, after 'replicate').
    metadata_keys = [
        donor_key,
        "libbatch",
        cell_identity_key,
        "sampletype",
        "grouptype",
        "megagrouptype",
        "tendon_status",
        "microanat",
        "type",
        "age",
        "phase",
    ]

    pbs_cell_type_dict = {}
    cell_types = adata.obs[cell_identity_key].cat.categories
    for type_idx, cell_type in enumerate(cell_types):
        print(
            f'Processing {cell_type} ({type_idx+1} out of {len(cell_types)})...'
        )
        # subset adata to the given cell identity
        adata_cell_pop = adata[adata.obs[cell_identity_key] == cell_type].copy()
        # check which donors to keep according to the number of cells specified with NUM_OF_CELL_PER_DONOR
        size_by_donor = adata_cell_pop.obs.groupby([donor_key]).size()
        donors_to_drop = [
            donor
            for donor in size_by_donor.index
            if size_by_donor[donor] <= NUM_OF_CELL_PER_DONOR
        ]
        if len(donors_to_drop) > 0:
            print("Dropping the following samples:")
            print(donors_to_drop)

        pbs = []

        samples = adata_cell_pop.obs[donor_key].unique()
        for sample_idx, sample in enumerate(samples):
            print(f"\tProcessing donor {sample_idx+1} out of {len(samples)}...", end="\r")
            if sample in donors_to_drop:
                continue

            # .copy() so that X can be replaced without writing into a view
            samp_cell_subset = adata_cell_pop[adata_cell_pop.obs[donor_key] == sample].copy()
            samp_cell_subset.X = samp_cell_subset.layers['counts'] #make sure to use raw data

            # create pseudoreplicates: shuffle the cells, then split them into
            # (nearly) equally sized, non-overlapping chunks
            indices = list(samp_cell_subset.obs_names)
            random.shuffle(indices)
            indices = np.array_split(np.array(indices), replicates_per_patient)

            for k, pseudo_rep in enumerate(indices):
                # Sum ONLY the cells assigned to this pseudoreplicate. Summing the
                # whole sample here would make all replicates identical.
                rep_cells = samp_cell_subset[list(pseudo_rep)]
                # np.asarray/reshape gives a plain (1, n_genes) array for both
                # dense and sparse count matrices.
                rep_counts = np.asarray(rep_cells.X.sum(axis = 0)).reshape(1, -1)
                rep_adata = sc.AnnData(X = rep_counts,
                                       var = samp_cell_subset.var[[]])

                rep_adata.obs['replicate'] = k
                rep_adata.obs_names = [sample + '_' + str(k)]
                for key in metadata_keys:
                    rep_adata.obs[key] = samp_cell_subset.obs[key].iloc[0]

                pbs.append(rep_adata)
        print("\n")

        if not pbs:
            # Every sample was filtered out; there is nothing to concatenate.
            print(f"No sample with more than {NUM_OF_CELL_PER_DONOR} cells for {cell_type}; skipping.")
            continue
        pbs_cell_type_dict[cell_type] = sc.concat(pbs)

    return pbs_cell_type_dict

In [None]:
# Pseudobulk profiles per cell type (dict: cell type -> AnnData).
pb = aggregate_and_filter(adata)

In [None]:
# Show the pseudobulk objects that were created.
pb

In [None]:
# Inspect the aggregated count matrix of one cell type as a sanity check.
pb['SCX_FGF14_THBS4_FSTL5_Progenitors'].X

In [None]:
# Export one genes x samples count matrix and one sample metadata table per cell type.
for celltype, pseudobulk in pb.items():
    sample_names = pseudobulk.obs_names

    counts = pd.DataFrame(pseudobulk.X, columns = pseudobulk.var_names, index=sample_names)
    counts_path = os.path.join(RESULTS_FOLDERNAME, f'counts_matrix_{celltype}.csv')
    counts.T.to_csv(counts_path)

    metadata = pd.DataFrame(pseudobulk.obs, index=sample_names)
    metadata_path = os.path.join(RESULTS_FOLDERNAME, f'metadata_{celltype}.csv')
    metadata.to_csv(metadata_path)

In [None]:
# Stack the per-cell-type pseudobulk objects into a single AnnData.
# index_unique=None keeps the observation names unchanged; join='outer' keeps the
# union of the variables.
adata_list = list(pb.values())
concatenated_adata = ad.concat(adata_list, index_unique=None, join='outer')
concatenated_adata

In [None]:
# Metadata of all pseudobulk samples.
concatenated_adata.obs

In [None]:
# Export the combined pseudobulk data: the count matrix is written transposed
# (genes as rows, samples as columns) next to the matching sample metadata.
counts = pd.DataFrame(concatenated_adata.X, columns = concatenated_adata.var_names, index=concatenated_adata.obs_names)
counts.T.to_csv(os.path.join(RESULTS_FOLDERNAME, f'counts_matrix_full.csv'))
metadata = pd.DataFrame(concatenated_adata.obs, index=concatenated_adata.obs_names)
metadata.to_csv(os.path.join(RESULTS_FOLDERNAME, f'metadata_full.csv'))

In [None]:
# Samples x genes table of the combined pseudobulk counts.
counts

In [None]:
# Keep the summed counts in a layer and record the library size of every
# pseudobulk sample before X is transformed.
concatenated_adata.layers['counts'] = concatenated_adata.X.copy()
concatenated_adata.obs["lib_size"] = np.sum(concatenated_adata.layers["counts"], axis=1)
concatenated_adata.obs["log_lib_size"] = np.log(concatenated_adata.obs["lib_size"])

# Normalise, log-transform and z-score X, then run PCA on the result.
sc.pp.normalize_total(concatenated_adata, target_sum=None)
sc.pp.log1p(concatenated_adata)
sc.pp.scale(concatenated_adata)
sc.pp.pca(concatenated_adata)
# One PCA panel per metadata column. `color` is given the list of column names
# explicitly rather than the obs DataFrame itself, which only worked because
# iterating a DataFrame happens to yield its column names.
sc.pl.pca(concatenated_adata, color=list(concatenated_adata.obs.columns), ncols=1, size=300,
         save='PCA_plots.svg')

In [None]:
# Gene loadings of the pseudobulk PCA computed in the previous cell. The PCA was
# run on `concatenated_adata`, so that is the object whose loadings are plotted
# (not the single-cell `adata`).
sc.pl.pca_loadings(concatenated_adata)

In [None]:
# Save the processed pseudobulk object to the results folder.
concatenated_adata.write(os.path.join(RESULTS_FOLDERNAME, 'concatenated_pseudobulk.h5ad'))

# Plotting of DESeq2 normalised counts

In [None]:
# Normalised counts are read from a tab-separated file in the R notebooks folder;
# the sample metadata is the table exported earlier by this notebook.
# NOTE(review): the counts file is presumably written by the DESeq2 R notebook - confirm.
norm_counts_deseq = pd.read_csv('../rnotebooks/DevAdultCombined_PseudobulkDGE/normalised_counts_full.txt', sep='\t', index_col=0)
metadata_deseq = pd.read_csv(os.path.join(RESULTS_FOLDERNAME, 'metadata_full.csv'), index_col=0)

In [None]:
# Build an AnnData from the normalised counts: transpose so that the rows index
# the metadata table, and reorder the metadata to match.
# NOTE: this rebinds `adata`, which previously held the single-cell object.
counts_df = norm_counts_deseq.T
metadata_df = metadata_deseq
# .loc raises a KeyError if a row label of counts_df is missing from the metadata.
metadata_df = metadata_df.loc[counts_df.index]

adata = sc.AnnData(X=counts_df.values, obs=metadata_df)
adata.var_names = counts_df.columns
adata.obs_names = counts_df.index

In [None]:
# Summary of the object built from the normalised counts.
adata

In [None]:
# Keep the unscaled normalised values in .raw before z-scoring X: the
# 'mean normalised expression' dot plots further down request use_raw=True and
# cannot run on an object whose .raw is unset.
# Note that scanpy functions whose use_raw defaults to None will from now on read
# .raw unless use_raw=False is passed.
adata.raw = adata
# z-score every gene (values clipped at 10), then PCA on the scaled matrix.
sc.pp.scale(adata, max_value=10)
sc.pp.pca(adata)

In [None]:
# The metadata was read back from CSV, so the annotation column is cast to a
# categorical again before it is used for grouping.
adata.obs['annotations_upd2'] = adata.obs['annotations_upd2'].astype('category')

In [None]:
# Hierarchical clustering of the annotation groups for the dendrogram-ordered plots below.
sc.tl.dendrogram(adata, 'annotations_upd2')

In [None]:
# Dot plots of mean normalised expression (read from .raw) for every matrisome
# category, one file per panel.
for category, genes in ecmgenes_dict.items():
    vmin = 0  # lower bound of the colour scale

    # ecmgenes_dict may have been built against a different gene universe than
    # this object; keep only symbols that exist here so plotting cannot fail on
    # a missing gene.
    genes = [gene for gene in genes if gene in adata.var_names]
    gene_count = len(genes)
    if gene_count == 0:
        print(f'Skipping {category}: no genes to plot')
        continue

    # Long gene lists are split into panels: thirds from 150 genes upwards,
    # halves between 111 and 149 genes, a single panel otherwise.
    if gene_count >= 150:
        bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
    elif gene_count > 110:
        bounds = [0, gene_count // 2, gene_count]
    else:
        bounds = [0, gene_count]

    n_panels = len(bounds) - 1
    for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        # Split categories carry a 1-based panel number in the file name.
        suffix = f'_{panel}' if n_panels > 1 else ''
        sc.pl.dotplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=True,
                      vmin=vmin, colorbar_title='mean normalised\nexpression', title=f'{category}', cmap='Reds',
                      swap_axes=True, save=f'{category}{suffix}_mean_deseqnorm.tiff')


In [None]:
# Matrix plots of the mean z-score (scaled X) for every matrisome category.
# Symmetric colour limits for the diverging palette.
vmin=-10
vmax=10

for category, genes in ecmgenes_dict.items():
    # Keep only symbols that exist in this object so plotting cannot fail on a
    # missing gene.
    genes = [gene for gene in genes if gene in adata.var_names]
    gene_count = len(genes)
    if gene_count == 0:
        print(f'Skipping {category}: no genes to plot')
        continue

    # Exactly ONE layout is chosen per category (thirds / halves / single panel).
    # Two independent `if` statements would draw large categories both as thirds
    # and as halves, the halves overwriting the '_1' and '_2' files of the thirds.
    # The thresholds match the other plotting cells of this notebook.
    if gene_count >= 150:
        bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
    elif gene_count > 110:
        bounds = [0, gene_count // 2, gene_count]
    else:
        bounds = [0, gene_count]

    n_panels = len(bounds) - 1
    for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        # Split categories carry a 1-based panel number in the file name.
        suffix = f'_{panel}' if n_panels > 1 else ''
        sc.pl.matrixplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=False,
                         vmin=vmin, vmax=vmax, colorbar_title='mean z-score',
                         title=f'{category}', cmap='RdBu_r', swap_axes=True,
                         save=f'{category}{suffix}_deseqnorm_scaled.tiff')

In [None]:
# Dot plots of the mean z-score (scaled X) for every matrisome category.
# Symmetric colour limits for the diverging palette.
vmin=-10
vmax=10

for category, genes in ecmgenes_dict.items():
    # Keep only symbols that exist in this object so plotting cannot fail on a
    # missing gene.
    genes = [gene for gene in genes if gene in adata.var_names]
    gene_count = len(genes)
    if gene_count == 0:
        print(f'Skipping {category}: no genes to plot')
        continue

    # Exactly ONE layout is chosen per category (thirds / halves / single panel).
    # Two independent `if` statements would draw large categories both as thirds
    # and as halves, the halves overwriting the '_1' and '_2' files of the thirds.
    # The thresholds match the other plotting cells of this notebook.
    if gene_count >= 150:
        bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
    elif gene_count > 110:
        bounds = [0, gene_count // 2, gene_count]
    else:
        bounds = [0, gene_count]

    n_panels = len(bounds) - 1
    for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        # Split categories carry a 1-based panel number in the file name.
        suffix = f'_{panel}' if n_panels > 1 else ''
        sc.pl.dotplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=False,
                      vmin=vmin, vmax=vmax, colorbar_title='mean z-score',
                      title=f'{category}', cmap='RdBu_r', swap_axes=True,
                      save=f'{category}{suffix}_deseqnorm_scaled.tiff')

In [None]:
# Keep a single pseudoreplicate (replicate == 1) per sample for the clustermap;
# .copy() detaches the subset from the full object.
adata = adata[adata.obs['replicate'] == 1].copy()

In [None]:
# Hierarchically clustered heatmap of the samples, annotated by cell type.
# use_raw=False pins the input to the z-scored X that the diverging palette is
# meant for; with the default (use_raw=None) scanpy switches to .raw whenever
# the object has one.
sc.pl.clustermap(
    adata,
    obs_keys='annotations_upd2',
    use_raw=False,
    cmap='RdBu_r',
    method='average',
    metric='euclidean',
    figsize=(20, 20)
)

# Groups analysis

In [None]:
# Normalised counts per group: one table for every 'group*' folder.
norm_counts_deseq = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing (and dict) order identical on every run.
for groupname in sorted(os.listdir('../rnotebooks/DevAdultCombined_PseudobulkDGE/')):
    if groupname.startswith("group"):
        print(groupname)
        path = f'../rnotebooks/DevAdultCombined_PseudobulkDGE/{groupname}/normalised_counts.txt'
        norm_counts_deseq[groupname] = pd.read_csv(path, sep='\t', index_col=0)

In [None]:
# Example: normalised counts table of group4.
norm_counts_deseq['group4']

In [None]:
# Sample metadata per group: one table for every 'group*' folder.
metadata_deseq = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing (and dict) order identical on every run.
for groupname in sorted(os.listdir('../rnotebooks/DevAdultCombined_PseudobulkDGE/')):
    if groupname.startswith("group"):
        print(groupname)
        path = f'../rnotebooks/DevAdultCombined_PseudobulkDGE/{groupname}/group_metadata.csv'
        metadata_deseq[groupname] = pd.read_csv(path, index_col=0)

In [None]:
# Example: metadata table of group4.
metadata_deseq['group4']

In [None]:
# One AnnData per group, built from its normalised counts (transposed so that the
# rows index the metadata table) and its metadata reordered to match.
# NOTE: the loop rebinds the global name `adata`; after the loop it refers to the
# object of the last group.
adata_objects = {}
for groupname in norm_counts_deseq:
    counts_df = norm_counts_deseq[groupname].T
    metadata_df = metadata_deseq[groupname]
    # .loc raises a KeyError if a row label of counts_df is missing from the metadata.
    metadata_df = metadata_df.loc[counts_df.index]
    
    adata = sc.AnnData(X=counts_df.values, obs=metadata_df)
    adata.var_names = counts_df.columns
    adata.obs_names = counts_df.index
    
    adata_objects[groupname] = adata

In [None]:
# Overview of the per-group objects.
adata_objects

In [None]:
# Store the normalised values of every group as .raw before X is scaled in a
# later cell; the use_raw=True plots below read them.
for data in adata_objects.values():
    data.raw = data

In [None]:
# Per group: dot plots of mean normalised expression (read from .raw) for every
# matrisome category, one file per panel.
vmin=0
categories = ["ECM Regulators", "ECM Glycoproteins", "ECM-affiliated Proteins", "Collagens", "Proteoglycans"]

for groupchoice, adata in adata_objects.items():
    geneset = set(adata.var_names) #all genes list
    # Matrisome genes of each category that are present in this group's object.
    ecmgenes_dict={}
    for ecmtype in categories:
        selection = matrisome[matrisome['Category'] == ecmtype]
        selection_genes = set(selection['Gene Symbol'].values)
        # sorted(): iteration order of a set of strings changes between Python
        # sessions (hash randomisation), which would reshuffle the gene order and
        # the genes assigned to each split panel on every kernel restart.
        genes = sorted(selection_genes.intersection(geneset))
        ecmgenes_dict[ecmtype] = genes

    for category, genes in ecmgenes_dict.items():
        vmin = 0  # lower bound of the colour scale
        gene_count = len(genes)
        if gene_count == 0:
            # A category without any gene in this group cannot be plotted.
            print(f'Skipping {groupchoice} / {category}: no genes to plot')
            continue

        # Long gene lists are split into panels: thirds from 150 genes upwards,
        # halves between 111 and 149 genes, a single panel otherwise.
        if gene_count >= 150:
            bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
        elif gene_count > 110:
            bounds = [0, gene_count // 2, gene_count]
        else:
            bounds = [0, gene_count]

        n_panels = len(bounds) - 1
        for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
            # Split categories carry a 1-based panel number in the file name.
            suffix = f'_{panel}' if n_panels > 1 else ''
            sc.pl.dotplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=True,
                          vmin=vmin, colorbar_title='mean normalised\nexpression', title=f'{category}', cmap='Reds',
                          swap_axes=True, save=f'{groupchoice}_{category}{suffix}_mean_deseqnorm.tiff')


In [None]:
# Maximum value of X per group (before scaling).
for data in adata_objects.values():
    print(data.X.max())

In [None]:
# z-score every group's X in place (the dict already references these objects),
# then print the maximum of X per group after scaling.
for group_adata in adata_objects.values():
    sc.pp.scale(group_adata)

for data in adata_objects.values():
    print(data.X.max())

In [None]:
# Per group: dot plots of the mean z-score (scaled X) for every matrisome category.
# Symmetric colour limits for the diverging palette.
vmin=-4
vmax=4

for groupchoice, adata in adata_objects.items():
    geneset = set(adata.var_names) #all genes list
    # Matrisome genes of each category that are present in this group's object.
    ecmgenes_dict={}
    for ecmtype in categories:
        selection = matrisome[matrisome['Category'] == ecmtype]
        selection_genes = set(selection['Gene Symbol'].values)
        # sorted(): iteration order of a set of strings changes between Python
        # sessions (hash randomisation), which would reshuffle the gene order and
        # the genes assigned to each split panel on every kernel restart.
        genes = sorted(selection_genes.intersection(geneset))
        ecmgenes_dict[ecmtype] = genes

    for category, genes in ecmgenes_dict.items():
        gene_count = len(genes)
        if gene_count == 0:
            # A category without any gene in this group cannot be plotted.
            print(f'Skipping {groupchoice} / {category}: no genes to plot')
            continue

        # Exactly ONE layout is chosen per category (thirds / halves / single
        # panel). Two independent `if` statements would draw large categories both
        # as thirds and as halves, the halves overwriting the '_1' and '_2' files
        # of the thirds. The thresholds match the other plotting cells.
        if gene_count >= 150:
            bounds = [0, gene_count // 3, 2 * gene_count // 3, gene_count]
        elif gene_count > 110:
            bounds = [0, gene_count // 2, gene_count]
        else:
            bounds = [0, gene_count]

        n_panels = len(bounds) - 1
        for panel, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
            # Split categories carry a 1-based panel number in the file name.
            suffix = f'_{panel}' if n_panels > 1 else ''
            sc.pl.dotplot(adata, genes[start:stop], groupby=cell_type, dendrogram=True, use_raw=False,
                          vmin=vmin, vmax=vmax, colorbar_title='mean z-score',
                          title=f'{category}', cmap='RdBu_r', swap_axes=True,
                          save=f'{groupchoice}_{category}{suffix}_deseqnorm_scaled.tiff')