In [None]:
%%capture

from pathlib import Path
from scanpy import read_h5ad
from umap import UMAP
import pandas as pd
import numpy as np
import anndata

#load filepaths
root_folder = '/allen/programs/celltypes/workgroups/hct/SEA-AD/'
# Materialise the glob as a sorted list: a bare generator is exhausted after one pass,
# so re-running the per-section cell would otherwise silently loop over nothing.
mapping_result_paths = sorted(Path(root_folder).glob('MERSCOPE/resegmentation_and_corr_mapping_data/mtg_manuscript/*/cbg_cpum_mapped.csv'))
original_taxonomy_path = Path(root_folder).joinpath('RNAseq/scANVI/output/MTG_AD/final.2022-04-14.h5ad')
mapping_taxonomy_path = Path(root_folder).joinpath('MERSCOPE/resegmentation_and_corr_mapping_data/mtg_taxonomy_spatial_gene_subset.h5ad')

#load metadata tracker, keeping only rows whose final QC status is not 'f'
h5ad_tracker = pd.read_csv(root_folder+'MERSCOPE/MTG_data_tracker/MTG_h5ad_tracker_SEAAD_072924.csv')
h5ad_tracker = h5ad_tracker[h5ad_tracker['Final QC status']!='f']
# Drop missing barcodes BEFORE converting to text: astype(str) would turn NaN into the
# literal 'nan' (which dropna() can no longer remove) and float barcodes into '123.0'.
# str(int(...)) is the same normalisation applied to 'Barcode' when the tracker is
# merged into the combined object, so both places produce identical keys.
passing_h5ads = [str(int(barcode_value)) for barcode_value in h5ad_tracker['Barcode'].dropna()]

#load pseudotime information
# The array is transposed so that each row holds one (uwa, donor_pseudotime) pair.
pseudotime = np.load(Path(root_folder).joinpath('RNAseq/ingest/input/quant_neuropath/MTG/processed/pseudotime.npy'))
pseudotime = pd.DataFrame(pseudotime.T, columns=["uwa", "donor_pseudotime"])
# Store the id as text without a trailing ".0" so it can be used as a merge key.
pseudotime["uwa"] = [str(i).replace(".0", "") for i in pseudotime["uwa"]]
# The stored value is flipped (1 - x) before use.
pseudotime["donor_pseudotime"] = 1 - pseudotime["donor_pseudotime"]

## Create h5ad for each section

In [None]:
# Reference taxonomy; only its per-cell annotation table (obs) is used below.
tax = read_h5ad(original_taxonomy_path)

# list of columns you'd like to merge with the mapping results
# The label table is the same for every section, so it is built and de-duplicated
# once here instead of on every loop iteration.
labels_df = tax.obs[['class', 'supertype_scANVI_leiden', 'subclass']]
unique_labels = labels_df.drop_duplicates()

for mapped_result in mapping_result_paths:

    section_dir = mapped_result.parent

    #collect relevant subfiles and unique IDs
    if 'cellpose' in str(mapped_result):
        barcode = section_dir.parent.parent.stem
        meta_path = str(section_dir)+'/cellpose_metadata.csv'
        cirro_path = str(section_dir)+'/mapped_102023.h5ad'
    else:
        barcode = section_dir.stem
        # first cellpose_metadata.csv found anywhere below the section folder
        meta_path = str([x for x in section_dir.glob('**/cellpose_metadata.csv')][0])
        cirro_path = str(Path(meta_path).parent)+'/mapped_102023.h5ad'

    #skip failed sections
    if barcode not in passing_h5ads:
        continue

    #skip sections that are already mapped
    if Path(cirro_path).exists():
        continue

    #load mapping results
    mapping = pd.read_csv(mapped_result, index_col=0)
    mapping = mapping.rename(columns={'score.Corr': 'avg.Cor'})
    # attach class/subclass to every cell through its mapped supertype
    combo = mapping.reset_index().merge(unique_labels, how="left", on='supertype_scANVI_leiden').set_index('index')
    # drop the first character of every cell id
    # NOTE(review): presumably a one-letter prefix added upstream -- confirm
    combo.index = combo.index.map(lambda x: str(x)[1:])
    # .copy() so the 'filename' column added below is written to an independent frame
    combo = combo[~combo.index.duplicated()].copy()
    combo.index.name = None

    #load cell by gene table
    cbg_cpum = pd.read_csv(str(section_dir)+'/cbg_cpum.csv', index_col=0)
    if 'x' in str(cbg_cpum.index.values.tolist()[0]).lower():
        cbg_cpum.index = cbg_cpum.index.map(lambda x: str(x)[1:])
    else:
        cbg_cpum.index = cbg_cpum.index.astype(str)
    # keep only the mapped cells, in the same order as the mapping table
    cbg_cpum = cbg_cpum.loc[combo.index.values,:]

    #load filtered cell by gene table (used for mapping)
    cbg_filter = pd.read_csv(str(section_dir)+'/cbg_filtered.csv', index_col=0)
    # The filtered table is attached positionally (its index is overwritten below), so
    # at minimum the row counts must agree; fail with a clear message if they do not.
    # NOTE(review): row ORDER is assumed to match the mapping table -- confirm upstream.
    if len(cbg_filter) != len(cbg_cpum):
        raise ValueError(
            f'{barcode}: cbg_filtered.csv has {len(cbg_filter)} rows but '
            f'{len(cbg_cpum)} mapped cells were found'
        )

    #create h5ad with all components combined
    combo['filename'] = barcode
    cirro_h5ad = anndata.AnnData(X = cbg_cpum,
                                 obs = combo,
                                 # paths are stored as plain strings: the h5ad writer can
                                 # serialise str values but not pathlib.Path objects
                                 uns = {'fname' : barcode,
                                        'original_taxonomy_path' : str(original_taxonomy_path),
                                        'mapping_taxonomy_path': str(mapping_taxonomy_path)}
                                )
    cirro_h5ad.obsm['umap'] = UMAP().fit_transform(cbg_cpum)
    cirro_h5ad.layers['raw'] = cbg_filter.set_index(cbg_cpum.index.values)

    #add spatial information
    meta = pd.read_csv(meta_path, index_col=0)
    # the metadata table is looked up with integer cell ids
    meta = meta.loc[combo.index.values.astype(np.int64),:]
    spatial = meta[['center_x', 'center_y']].values
    cirro_h5ad.obsm['spatial'] = spatial

    #save
    print('saving: '+ str(cirro_path))
    cirro_h5ad.write(cirro_path)


## Combine into single h5ad, adding metadata
- tracker metadata such as age, ADNC level, etc.
- pseudotime
- spatial_cirro for organized cirro plotting

In [None]:
# concatenate individual h5ads into a single file with metadata included
# glob() yields files in arbitrary filesystem order; sorting makes the row order of the
# combined object (and anything that depends on it) the same on every run.
h5ad_paths = Path(root_folder).glob('MERSCOPE/resegmentation_and_corr_mapping_data/mtg_manuscript/*/*.h5ad')
concat_h5ads = sorted(x for x in h5ad_paths if x.parent.stem in passing_h5ads)

#initial combination
# Files without a 'filename' column get their obs columns renamed to the names used in
# the rest of this notebook.
# NOTE(review): presumably these come from an older export -- confirm.
alternate_obs_names = {'avg.cor' : 'avg.Cor',
                       'cluster': 'supertype_scANVI_leiden',
                       'subclass_scANVI':'subclass',
                       'filename_x': 'filename'}
h5ad_raws = []
for file in concat_h5ads:
    x = read_h5ad(file)
    if 'filename' not in x.obs.columns:
        x.obs = x.obs.rename(columns=alternate_obs_names)
    h5ad_raws.append(x)
combo_anndata = anndata.concat(h5ad_raws, uns_merge='first')

# sanity check on the number of distinct sections that were combined
expected_n_sections = 69
n_sections = combo_anndata.obs['filename'].nunique(dropna=False)
assert n_sections == expected_n_sections, (
    f'expected {expected_n_sections} sections after concatenation, found {n_sections}'
)
combo_anndata.obs_names_make_unique()

# prepare to add data from h5ad tracker by making tracker table names uniform
# .copy() gives an independent frame, so the column assignment below does not write
# through a slice of h5ad_tracker (avoids pandas' SettingWithCopyWarning).
selected_rows = h5ad_tracker[['Donor', 'Specimen', 'Barcode', 'UWA ID','Specimen Type', 'Unique Donor ID','technical replicates', 'Dementia', 'Age (Y)', 'Sex', 'Final QC status']].copy()
selected_rows['Barcode'] = selected_rows['Barcode'].apply(lambda x: x if pd.isnull(x) else str(int(x)))
# NOTE(review): the column selected above is 'Specimen Type' (capital T), so the
# 'Specimen type' -> 'ADNC level' entry matches nothing and is currently a no-op.
# Left unchanged because fixing it would rename a column in the saved output -- confirm intent.
selected_rows = selected_rows.rename(columns={'Barcode': 'filename', 'UWA ID': 'uwa', 'Specimen type': 'ADNC level'})
selected_rows = selected_rows[selected_rows['uwa'].notna()]

# join pseudotime data on selected h5ad_tracker information
selected_rows_pt = pd.merge(selected_rows, pseudotime, on='uwa')
selected_rows_pt['uwa'] = selected_rows_pt['uwa'].astype('int')
um = combo_anndata.obs['filename'].unique().tolist()
selected_rows_pt = selected_rows_pt[selected_rows_pt.filename.isin(um)]

# validate='many_to_one' makes duplicate tracker rows for one section fail here with a
# clear MergeError instead of producing extra cell rows.
h5ad_with_metadata = pd.merge(combo_anndata.obs.reset_index(), selected_rows_pt,
                              on = 'filename', how='left', validate='many_to_one')
# merge() returns a fresh 0..n-1 index; a left merge keeps the left row order, so the
# original cell ids can be restored positionally as the obs index. The column created
# by reset_index() is kept as well.
h5ad_with_metadata.index = combo_anndata.obs_names.values
combo_anndata.obs = h5ad_with_metadata

# remove mangled components from specimen IDs
# Each key is a value as it currently appears in the table; each value is what it is
# rewritten to. The replacement is applied to the whole obs table.
specimen_id_fixes = {
    'H21.33.019.Cx30.MTG.02.007.5.1 (H21.33.019.Cx30.MTG.02.007.5.01.02)': 'H21.33.019.Cx30.MTG.02.007.5.1',
    'H21.33.019.Cx30.MTG.02.007.5.0 (H21.33.019.Cx30.MTG.02.007.5.01.01)': 'H21.33.019.Cx30.MTG.02.007.5.0',
    'H21.33.031.CX24.MTG.02.007.1.01.02?':'H21.33.031.CX24.MTG.02.007.1.01.02',
    'H20.33.036.CX24.MTG.02.007.2.01.01?': 'H20.33.036.CX24.MTG.02.007.2.01.01',
    'H20.33.036.CX24.MTG.02.007.2.01.02?': 'H20.33.036.CX24.MTG.02.007.2.01.02',
    'H20.33.036.CX24.MTG.02.007.2.01.04?': 'H20.33.036.CX24.MTG.02.007.2.01.04',
    'H21.33.040.Cx22.MTG.02.007.3.03.01?': 'H21.33.040.Cx22.MTG.02.007.3.03.01',
    'H21.33.040.Cx22.MTG.02.007.3.03.03?': 'H21.33.040.Cx22.MTG.02.007.3.03.03',
    'H21.33.040.Cx22.MTG.02.007.3.03.04?': 'H21.33.040.Cx22.MTG.02.007.3.03.04',
    'H20.33.015.CX24.MTG.02.007.1.03.01?': 'H20.33.015.CX24.MTG.02.007.1.03.01',
    'H20.33.015.CX24.MTG.02.007.1.03.02?': 'H20.33.015.CX24.MTG.02.007.1.03.02',
    'H20.33.015.CX24.MTG.02.007.1.03.03?': 'H20.33.015.CX24.MTG.02.007.1.03.03',
}
combo_anndata.obs = combo_anndata.obs.replace(to_replace=specimen_id_fixes)

# quick visual check of the merged table
display(combo_anndata.obs.head())

# create unique pt label within donor id
# Every section (filename) gets its own consecutive integer, ordered by ascending
# donor_pseudotime. Sections sharing a pseudotime keep their order of first appearance
# (mergesort is stable). Sections with no pseudotime (NaN after the left merge) are
# ranked last instead of being left out of the lookup, which previously made the
# final assignment fail with a KeyError.
section_pseudotime = combo_anndata.obs[['filename', 'donor_pseudotime']].drop_duplicates(subset='filename')
section_pseudotime = section_pseudotime.sort_values('donor_pseudotime', kind='mergesort', na_position='last')
upt = {section: rank for rank, section in enumerate(section_pseudotime['filename'])}
combo_anndata.obs['unique_pseudotime'] = [upt[x] for x in combo_anndata.obs['filename']]

# organize h5ad.obsm['spatial_cirro'] by h5ad.obs['unique_pseudotime']
# Sections are laid out on a grid, left to right then downwards, in ascending
# unique_pseudotime order: grid_width sections per row, columns xdist apart and
# rows ydist apart (same units as obsm['spatial']).
ydist = 20000
xdist = 30000
grid_width = 10

# NaN-initialised so that any row not assigned below is recognisable rather than
# holding arbitrary memory contents.
spatial_cirro = np.full(combo_anndata.obsm['spatial'].shape, np.nan)
section_labels = combo_anndata.obs['unique_pseudotime'].to_numpy()

for slot, ii in enumerate(np.unique(section_labels)): #ii = unique_pseudotime label of one section
    grid_row, grid_col = divmod(slot, grid_width)
    in_section = section_labels == ii  # computed once, used for both read and write
    spatial = combo_anndata.obsm['spatial'][in_section, :].astype(float)
    # x: shift so the section starts at 0, then move it to its grid column
    cirro_x = (spatial[:, 0] - spatial[:, 0].min()) + (grid_col*xdist)
    # y: centre on the section mean, flip the sign, then move it down to its grid row
    cirro_y = -(spatial[:, 1] - spatial[:, 1].mean()) - (grid_row*ydist)
    spatial_cirro[in_section] = np.column_stack((cirro_x, cirro_y))

combo_anndata.obsm['spatial_cirro'] = spatial_cirro

# add umap and save
# Embed the combined expression matrix with default UMAP settings and keep it under
# obsm['X_umap'] before writing the combined object to disk.
combined_embedding = UMAP().fit_transform(combo_anndata.X) #takes about 40m
combo_anndata.obsm['X_umap'] = combined_embedding
combined_h5ad_path = root_folder+'MERSCOPE/results/mtg_noselectedcells_072924.h5ad'
combo_anndata.write_h5ad(combined_h5ad_path)
