In [1]:
import h5py
import os
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc


  # NOTE(review): neither `yaml` nor `f` is imported/defined in the visible cells, and this
  # line is indented without an enclosing block — it looks like a stray paste; confirm and remove.
  # If it is meant to stay, prefer `yaml.safe_load`: `yaml.load` without an explicit Loader
  # can construct arbitrary Python objects from untrusted input.
  data = yaml.load(f.read()) or {}


### Set up h5ad file

In [2]:
# Load every per-batch matrix from the merged HDF5 file and stack them row-wise
# (axis 0) into one peak matrix and one gene-activity matrix.
#
# The file is opened in a `with` block so the handle is closed as soon as the
# datasets have been read into memory, even if one of the reads fails.
with h5py.File('raw/activity_mtx/merge.h5', 'r') as hf:
    key_list = list(hf.keys())

    # Datasets are told apart by their name prefix; the order of `key_list` is kept.
    atac_key = [key for key in key_list if key.startswith('peak')]
    act_key = [key for key in key_list if key.startswith('ACT')]

    # Read each dataset fully into a NumPy array, then concatenate once per matrix.
    # Concatenating inside a loop would re-copy the growing result on every step.
    peaks_mtx = np.concatenate([np.array(hf.get(key)) for key in atac_key], axis=0)
    act_mtx = np.concatenate([np.array(hf.get(key)) for key in act_key], axis=0)


In [3]:
# Sanity check: dimensions of the peak matrix after stacking all batches along axis 0.
peaks_mtx.shape

(25522, 60587)

In [4]:
# Sanity check: dimensions of the activity matrix; the first dimension should match peaks_mtx.
act_mtx.shape

(25522, 23181)

In [5]:
def _read_first_column(path):
    """Read a header-less CSV file and return its first column as a NumPy array."""
    table = pd.read_csv(path, header=None)
    return table.iloc[:, 0].to_numpy()


# Labels that are attached to the matrices below as var names (peaks, genes)
# and obs names (cells).
peaks_name = _read_first_column('raw/peaks_index.txt')
genes_name = _read_first_column('raw/genes_index.txt')
cells_name = _read_first_column('raw/cells_index.txt')

In [6]:
# Gene-activity object: wrap the matrix, then label its rows (cells) and columns (genes).
anndat_atac_act = ad.AnnData(X=act_mtx)
anndat_atac_act.obs_names = cells_name
anndat_atac_act.var_names = genes_name

# Peak object: same cell labels on the rows, peak labels on the columns.
anndat_atac_peak = ad.AnnData(X=peaks_mtx)
anndat_atac_peak.obs_names = cells_name
anndat_atac_peak.var_names = peaks_name



In [7]:
# Sanity check: both objects must list the same cells in the same order (element-wise comparison).
np.all(anndat_atac_peak.obs.index == anndat_atac_act.obs.index)

True

In [8]:
# Persist both freshly built objects; the peak file is written first, then the activity file.
for adata, out_path in (
    (anndat_atac_peak, 'JEM_ATAC.peaks.h5ad'),
    (anndat_atac_act, 'JEM_ATAC.act.h5ad'),
):
    adata.write_h5ad(out_path)

### Add metadata

In [2]:
# Reload the two AnnData files written by the previous section.
anndat_atac_peak, anndat_atac_act = (
    ad.read_h5ad(h5ad_path)
    for h5ad_path in ('JEM_ATAC.peaks.h5ad', 'JEM_ATAC.act.h5ad')
)

# Per-cell metadata table; later cells read its cell_id, Sample, cell_type and
# current_severity_bin columns.
meta = pd.read_csv('atac_cell_metadata.csv')

In [3]:
# Prepend 'ATAC_' to every cell name in both objects before they are intersected
# with meta['cell_id'].
# NOTE: not idempotent — running this cell twice prepends the prefix twice.
for adata in (anndat_atac_peak, anndat_atac_act):
    adata.obs_names = 'ATAC_' + adata.obs_names

In [4]:
# Cell IDs present both in the AnnData objects and in the metadata table.
# np.intersect1d returns sorted, unique values, so `cell_ids` is in sorted order,
# not in the original file order.
atac_ids = anndat_atac_peak.obs_names.to_numpy()
meta_ids = meta['cell_id'].to_numpy()
cell_ids = np.intersect1d(atac_ids, meta_ids)

In [5]:
# Keep only the cells that also appear in the metadata, in the order given by `cell_ids`.
# Indexing an AnnData object returns a view of the parent; `.copy()` materialises
# the subset right away so that the later writes to `.obs` do not trigger the implicit
# "Trying to set attribute `.obs` of view, copying." conversion.
anndat_atac_peak = anndat_atac_peak[cell_ids, :].copy()
anndat_atac_act = anndat_atac_act[cell_ids, :].copy()


In [6]:
# Index the metadata by cell ID (keeping the 'cell_id' column as well) and reorder /
# restrict its rows to `cell_ids`, so the rows follow the same order used to subset
# the AnnData objects.
meta = meta.set_index('cell_id', drop=False).loc[cell_ids, :]

In [7]:
# Display the aligned metadata table (pandas truncates the middle rows in the rendered output).
meta

Unnamed: 0_level_0,Sample,cell_id,cell_type,current_severity_bin,is_doublet,UMAP1,UMAP2
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ATAC_555_1#AAACGAATCGTCCCAT-1,ATAC_555_1,ATAC_555_1#AAACGAATCGTCCCAT-1,CD14 Mono,4-5,False,-12.066025,-3.170494
ATAC_555_1#AAACTCGGTGGTTCTA-1,ATAC_555_1,ATAC_555_1#AAACTCGGTGGTTCTA-1,CD14 Mono,4-5,False,-13.042558,-1.140425
ATAC_555_1#AAACTGCTCGGTTGTA-1,ATAC_555_1,ATAC_555_1#AAACTGCTCGGTTGTA-1,CD14 Mono,4-5,False,-11.194732,-3.347949
ATAC_555_1#AAAGATGCATCGGCTG-1,ATAC_555_1,ATAC_555_1#AAAGATGCATCGGCTG-1,CD8 TEM,4-5,False,8.443998,4.560493
ATAC_555_1#AAAGATGGTACTATGC-1,ATAC_555_1,ATAC_555_1#AAAGATGGTACTATGC-1,CD4 TCM,4-5,False,8.428380,-0.713068
...,...,...,...,...,...,...,...
ATAC_HIP045#TTGTTGTTCTGTAGAC-1,ATAC_HIP045,ATAC_HIP045#TTGTTGTTCTGTAGAC-1,CD8 TEM,0,False,8.049340,3.950382
ATAC_HIP045#TTTACGTAGACCATAA-1,ATAC_HIP045,ATAC_HIP045#TTTACGTAGACCATAA-1,B naive,0,False,1.007606,-8.486456
ATAC_HIP045#TTTACGTTCGCCACTT-1,ATAC_HIP045,ATAC_HIP045#TTTACGTTCGCCACTT-1,CD4 TCM,0,False,8.048807,-0.918113
ATAC_HIP045#TTTGGTTGTGCCCAGT-1,ATAC_HIP045,ATAC_HIP045#TTTGGTTGTGCCCAGT-1,CD8 TEM,0,True,5.913862,2.662103


In [9]:
# Copy selected metadata columns into `.obs` of both objects.
# Values are assigned positionally (as plain arrays), so this relies on `meta` rows
# being in the same order as the cells of each AnnData object.
obs_from_meta = {
    'sample': 'Sample',
    'cell_type': 'cell_type',
    'current_severity_bin': 'current_severity_bin',
}
for obs_col, meta_col in obs_from_meta.items():
    for adata in (anndat_atac_peak, anndat_atac_act):
        adata.obs[obs_col] = meta[meta_col].to_numpy()

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [10]:
# Number of cells in each severity bin after the metadata has been attached.
anndat_atac_peak.obs['current_severity_bin'].value_counts()

0      6341
4-5    2119
6-7    1778
Name: current_severity_bin, dtype: int64

In [11]:
# Sanity check: cell names of the activity object match the metadata index element-wise,
# which is what the positional `.to_numpy()` assignments into `.obs` rely on.
np.all(anndat_atac_act.obs_names == meta.index)

True

In [12]:
# Keep an untouched copy of the activity matrix in layers['counts'] before X is transformed.
# NOTE(review): this cell does not look idempotent — the scanpy calls below are used for
# their in-place effect (return values are discarded), so a second run would store the
# already-transformed X as 'counts' and transform it again. Confirm before re-running.
anndat_atac_act.layers['counts'] = anndat_atac_act.X.copy()
# Scale with target_sum=1e6, then apply log1p.
sc.pp.normalize_total(anndat_atac_act, target_sum=1e6)
sc.pp.log1p(anndat_atac_act)
# Scaling (clipped at 10) is intentionally left disabled here.
#sc.pp.scale(anndat_atac_act, max_value=10)

In [13]:
# Save the annotated objects under new file names; the activity file is written first,
# then the peak file.
for adata, out_path in (
    (anndat_atac_act, 'JEM_ATAC.act.2.h5ad'),
    (anndat_atac_peak, 'JEM_ATAC.peaks.2.h5ad'),
):
    adata.write_h5ad(out_path)

... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'current_severity_bin' as categorical
... storing 'sample' as categorical
... storing 'cell_type' as categorical
... storing 'current_severity_bin' as categorical
