In [None]:
# Import dependencies
%matplotlib inline
import os
import scanpy as sc
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import anndata as ad
import numpy as np

#from pydeseq2.dds import DeseqDataSet
#from pydeseq2.ds import DeseqStats

# Use a sans-serif font for all matplotlib text
matplotlib.rcParams['font.family'] = 'sans-serif'

# Initialize random seed (the pseudo-replicate shuffling below relies on `random`)
import random
random.seed(111)

# Print date and time of this run:
import datetime
e = datetime.datetime.now()
print(f"Current date and time = {e}")

# set a working directory; all folder names below are relative to it
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir(wdir)

# folder structures
HARMONY_FOLDERNAME = "foetal/results/Harmony/"
RESULTS_FOLDERNAME = "foetal/results/DGE/"
FIGURES_FOLDERNAME = "foetal/figures/DGE/"

# Create the output folders; exist_ok avoids the check-then-create race and
# fails loudly if the path exists but is not a directory
for output_folder in (RESULTS_FOLDERNAME, FIGURES_FOLDERNAME):
    os.makedirs(output_folder, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to disk as an SVG (vector) image.

    fname  -- file name of the image, joined onto `folder`
    fig    -- object exposing `savefig` (e.g. a matplotlib figure)
    folder -- destination directory; defaults to FIGURES_FOLDERNAME
    """
    destination = os.path.join(folder, fname)
    fig.savefig(destination, format='svg')

# Scanpy runtime configuration
# verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Record the package versions used for this run
sc.logging.print_versions()
# Figure defaults: 150 dpi on screen, 600 dpi when saved, 10 pt font
sc.set_figure_params(
    dpi=150,
    fontsize=10,
    dpi_save=600,
)

In [None]:
# Load the dataset saved in the Harmony results folder
harmony_path = os.path.join(HARMONY_FOLDERNAME, 'dev_harmony.h5ad')
adata = sc.read_h5ad(harmony_path)
adata

In [None]:
# Largest value stored in the main expression matrix
# (presumably a quick check of whether X holds raw counts or transformed values — confirm)
adata.X.max()

In [None]:
# Basic QC thresholds; both filters modify `adata` in place
MIN_GENES_PER_CELL = 200  # minimum number of expressed genes for a cell to be kept
MIN_CELLS_PER_GENE = 3    # minimum number of expressing cells for a gene to be kept

sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)
adata

In [None]:
# Label every cell with its pseudobulk group: "<sampletype>_<age>"
bulk_labels = [
    f"{sample_type}_{age}"
    for sample_type, age in zip(adata.obs["sampletype"], adata.obs["age"])
]
adata.obs["bulksample"] = bulk_labels

In [None]:
# Make the cell-type labels file-name friendly: spaces become underscores, "+" is removed
adata.obs["C_scANVI"] = [
    label.replace(" ", "_").replace("+", "")
    for label in adata.obs["C_scANVI"]
]
adata.obs["C_scANVI"]

In [None]:
# Store the grouping / annotation columns as pandas categoricals
categorical_columns = ["sample", "age", "bulksample", "C_scANVI", "libbatch", "type"]
for column in categorical_columns:
    adata.obs[column] = adata.obs[column].astype("category")

# Pseudobulking

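Plan for building the pseudobulk samples:

1. Subset the data by cell type.
2. Within each cell type, subset by sample.
3. Add the age condition to each pseudobulk sample.
4. Add a replicate-number column.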

In [None]:
# Number of cells for every (sampletype, age) combination
adata.obs.groupby(['sampletype', 'age']).size()

In [None]:
# List the distinct cell-type labels, one per line
unique_cell_types = adata.obs['C_scANVI'].unique()
for cell_type in unique_cell_types:
    print(cell_type)

In [None]:
# subset by cell type
#cell_subset = {}
#for cell_type in adata.obs['C_scANVI'].unique():
#    cell_subset[cell_type] = adata[adata.obs['C_scANVI'] == cell_type]
    
#cell_subset

In [None]:
# Enumerate the distinct pseudobulk groups
unique_bulksamples = adata.obs['bulksample'].unique()
for i, sample in enumerate(unique_bulksamples):
    print(i, sample)

In [None]:
# How many distinct pseudobulk groups there are
len(adata.obs['bulksample'].unique())

In [None]:
NUM_OF_CELL_PER_DONOR = 30  # donors with this many cells or fewer (within a cell type) are dropped


def aggregate_and_filter(
    adata,
    #cell_identity,
    donor_key="bulksample",
    condition_key="age",
    cell_identity_key="C_scANVI",
    #obs_to_keep=[],  # which additional metadata to keep, e.g. gender, age, etc.
    replicates_per_patient=3,
):
    """
    Build pseudobulk samples (summed raw counts) per cell type and donor.

    For every category of ``cell_identity_key`` the cells are split by
    ``donor_key``. Donors with ``NUM_OF_CELL_PER_DONOR`` cells or fewer are
    skipped. The cells of each remaining donor are shuffled with the global
    ``random`` module and partitioned into ``replicates_per_patient`` disjoint
    pseudo-replicates; the ``'counts'`` layer is summed over the cells of each
    pseudo-replicate.

    Parameters
    ----------
    adata : AnnData
        Must provide ``layers['counts']``, a categorical
        ``obs[cell_identity_key]`` and the obs columns ``donor_key``,
        ``condition_key``, ``'libbatch'``, ``'sample'``, ``'type'`` and
        ``'sampletype'``.
    donor_key : str
        obs column identifying the donor / bulk sample.
    condition_key : str
        obs column holding the condition, copied to every pseudobulk sample.
    cell_identity_key : str
        Categorical obs column holding the cell-type label.
    replicates_per_patient : int
        Number of pseudo-replicates created per donor.

    Returns
    -------
    dict
        Maps cell type to an AnnData with one row per pseudo-replicate, named
        ``"<donor>_<replicate>"``. Cell types for which no donor passes the
        cell-count filter are omitted.
    """
    pbs_cell_type_dict = {}
    cell_types = adata.obs[cell_identity_key].cat.categories
    # Metadata copied onto each pseudobulk sample (deduplicated in case keys overlap)
    obs_columns = list(dict.fromkeys(
        ['libbatch', condition_key, cell_identity_key, 'sample', 'type', 'sampletype']
    ))

    for ct_idx, cell_type in enumerate(cell_types):
        print(
            f'Processing {cell_type} ({ct_idx+1} out of {len(cell_types)})...'
        )
        # subset adata to the given cell identity
        adata_cell_pop = adata[adata.obs[cell_identity_key] == cell_type].copy()
        # decide which donors to drop; observed=True ignores donor categories
        # that have no cells at all in this cell type
        size_by_donor = adata_cell_pop.obs.groupby(donor_key, observed=True).size()
        donors_to_drop = list(
            size_by_donor.index[size_by_donor <= NUM_OF_CELL_PER_DONOR]
        )
        if donors_to_drop:
            print("Dropping the following samples:")
            print(donors_to_drop)

        pbs = []
        gene_index = adata_cell_pop.var[[]]  # var index only, no annotation columns
        donors = adata_cell_pop.obs[donor_key].unique()

        for donor_idx, sample in enumerate(donors):
            print(f"\tProcessing donor {donor_idx+1} out of {len(donors)}...", end="\r")
            if sample in donors_to_drop:
                continue

            donor_adata = adata_cell_pop[adata_cell_pop.obs[donor_key] == sample]
            # read the raw counts straight from the layer instead of assigning to
            # .X of a view (which would force an implicit copy)
            raw_counts = donor_adata.layers['counts']
            # metadata is taken from the donor's first cell
            # NOTE(review): assumes these columns are constant within a donor — confirm
            first_cell = donor_adata.obs.iloc[0]

            # create pseudoreplicates: shuffle the row positions, then split them
            # into disjoint chunks
            positions = list(range(donor_adata.n_obs))
            random.shuffle(positions)
            chunks = np.array_split(
                np.array(positions, dtype=int), replicates_per_patient
            )

            for k, chunk in enumerate(chunks):
                # Sum ONLY the cells of this pseudo-replicate, so replicates are
                # distinct partitions of the donor rather than identical copies.
                # np.asarray handles the np.matrix returned by sparse sums.
                summed = np.asarray(raw_counts[chunk].sum(axis=0)).reshape(1, -1)

                rep_obs = pd.DataFrame(
                    {column: [first_cell[column]] for column in obs_columns},
                    index=[f"{sample}_{k}"],
                )
                rep_obs['replicate'] = k

                pbs.append(ad.AnnData(X=summed, obs=rep_obs, var=gene_index))
        print("\n")

        if not pbs:
            # concatenating an empty list would raise; skip this cell type instead
            print(f"No donor passed the cell-count filter for {cell_type}; skipping.")
            continue
        pbs_cell_type_dict[cell_type] = ad.concat(pbs)

    return pbs_cell_type_dict

In [None]:
# Build the pseudobulk samples with the default keys; `pb` maps cell type -> AnnData
pb = aggregate_and_filter(adata)

In [None]:
# Inspect the dictionary of per-cell-type pseudobulk objects
pb

In [None]:
# Look at the summed count matrix of one cell type
pb['lymEndothelial_Cells'].X

In [None]:
# Write one counts table (genes x pseudobulk samples) and one metadata table per cell type
for celltype, pseudobulk in pb.items():
    counts = pd.DataFrame(
        pseudobulk.X,
        index=pseudobulk.obs_names,
        columns=pseudobulk.var_names,
    )
    counts_path = os.path.join(RESULTS_FOLDERNAME, f'counts_matrix_{celltype}.csv')
    counts.T.to_csv(counts_path)

    metadata = pd.DataFrame(pseudobulk.obs, index=pseudobulk.obs_names)
    metadata_path = os.path.join(RESULTS_FOLDERNAME, f'metadata_{celltype}.csv')
    metadata.to_csv(metadata_path)

In [None]:
# Gather the per-cell-type pseudobulk objects from the dictionary
adata_list = list(pb.values())

# Stack them row-wise into a single AnnData; the outer join keeps the union of genes
# and index_unique=None leaves the observation names unchanged
concatenated_adata = ad.concat(
    adata_list,
    join='outer',
    index_unique=None,
)

# Show the combined object
concatenated_adata

In [None]:
#concatenated_adata.obs_names_make_unique()
# Inspect the per-sample metadata of the combined object
concatenated_adata.obs

In [None]:
# Keep the original "<sample>_<replicate>" name in its own column. The guard makes
# the cell safe to re-run: without it a second execution would store the already
# suffixed names and append the cell type to the index again.
if 'sampletyperep' not in concatenated_adata.obs.columns:
    concatenated_adata.obs['sampletyperep'] = concatenated_adata.obs.index

# New index: "<sample>_<replicate>_<cell type>"
concatenated_adata.obs.index = (
    concatenated_adata.obs['sampletyperep'].astype('str')
    + '_'
    + concatenated_adata.obs['C_scANVI'].astype('str')
)
concatenated_adata.obs

In [None]:
# Write the combined tables: counts as genes x pseudobulk samples, plus per-sample metadata
counts = pd.DataFrame(
    concatenated_adata.X,
    index=concatenated_adata.obs_names,
    columns=concatenated_adata.var_names,
)
counts.T.to_csv(os.path.join(RESULTS_FOLDERNAME, 'counts_matrix_full.csv'))

metadata = pd.DataFrame(concatenated_adata.obs, index=concatenated_adata.obs_names)
metadata.to_csv(os.path.join(RESULTS_FOLDERNAME, 'metadata_full.csv'))

In [None]:
# Display the combined counts table (pseudobulk samples x genes)
counts

In [None]:
# Display the combined per-sample metadata table
metadata

In [None]:
# Counts table for a single cell type (rows keep the default integer index)
fibroblast_pb = pb['ABI3BP_GAS2_Fibroblasts_1']
counts = pd.DataFrame(fibroblast_pb.X, columns=fibroblast_pb.var_names)
counts

In [None]:
# Metadata of the pseudobulk samples for the same cell type
pb['ABI3BP_GAS2_Fibroblasts_1'].obs

In [None]:
#     df = pd.DataFrame(columns=[*adata_cell_pop.var_names, *obs_to_keep])
#     adata_cell_pop.obs[donor_key] = adata_cell_pop.obs[donor_key].astype("category")
#     donors = adata_cell_pop.obs[donor_key].cat.categories
#     for i, donor in enumerate(donors):
#         print(f"\tProcessing donor {i+1} out of {len(donors)}...", end="\r")
#         if donor not in donors_to_drop:
#             adata_donor = adata_cell_pop[adata_cell_pop.obs[donor_key] == donor]
#             # create replicates for each donor
#             indices = list(adata_donor.obs_names)
#             random.shuffle(indices)
#             indices = np.array_split(np.array(indices), replicates_per_patient)
#             for i, rep_idx in enumerate(indices):
#                 adata_replicate = adata_donor[rep_idx]
#                 # specify how to aggregate: sum gene expression for each gene for each donor and also keep the condition information
#                 agg_dict = {gene: "sum" for gene in adata_replicate.var_names}
#                 for obs in obs_to_keep:
#                     agg_dict[obs] = "first"
#                 # create a df with all genes, donor and condition info
#                 df_donor = pd.DataFrame(adata_replicate.X.A)
#                 df_donor.index = adata_replicate.obs_names
#                 df_donor.columns = adata_replicate.var_names
#                 df_donor = df_donor.join(adata_replicate.obs[obs_to_keep])
#                 # aggregate
#                 df_donor = df_donor.groupby(donor_key).agg(agg_dict)
#                 df_donor[donor_key] = donor
#                 df.loc[f"donor_{donor}_{i}"] = df_donor.loc[donor]
#     print("\n")
#     # create AnnData object from the df
#     adata_cell_pop = sc.AnnData(
#         df[adata_cell_pop.var_names], obs=df.drop(columns=adata_cell_pop.var_names)
#     )
#     return adata_cell_pop

# obs_to_keep = ["age", "C_scANVI", "bulksample", "type", "sample", "libbatch"]
# pbs = []
# for i, cell_type in enumerate(adata.obs["C_scANVI"].cat.categories):
#     print(
#         f'Processing {cell_type} ({i+1} out of {len(adata.obs["C_scANVI"].cat.categories)})...'
#     )
#     adata_cell_type = aggregate_and_filter(adata, cell_type, obs_to_keep=obs_to_keep)
#     pbs.append(adata_cell_type)