In [20]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
def load_mtx(mtx_path: str, barcodes_path: str, features_path: str, sample_name: str | None = None) -> ad.AnnData:
    adata = sc.read_mtx(mtx_path).transpose()
    
    barcodes = pd.read_csv(barcodes_path, header=None, sep='\t', names=['barcodes'])
    features = pd.read_csv(features_path, header=None, sep='\t', names=['gene_ids', 'gene_names'])

    adata.obs_names = sample_name + "_" + barcodes['barcodes'] if sample_name is not None else barcodes['barcodes']
    adata.var_names = features['gene_ids']
    adata.var['gene_names'] = features['gene_names'].values
    
    if sample_name is not None:
        adata.obs['sample'] = sample_name

    return adata

In [6]:
data_path = "/nfs/data/COST_IBD/data/atopic_dermatitis/datasets/Gao"
sample_names = os.listdir(data_path)

In [7]:
adata_list = []

for sample_name in tqdm(sample_names):
    sample_dir = os.path.join(data_path, sample_name)

    matrix_path = os.path.join(sample_dir, "matrix.mtx.gz")
    barcodes_path = os.path.join(sample_dir, "barcodes.tsv.gz")
    features_path = os.path.join(sample_dir, "features.tsv.gz")

    single_adata: ad.AnnData = load_mtx(matrix_path, barcodes_path, features_path, sample_name)
    
    # The contents of gene_names are worthless
    single_adata.var.drop(columns=["gene_names"], inplace=True)

    # The dataset contains duplicated gene names
    single_adata.var_names_make_unique()

    adata_list.append(single_adata)

100%|██████████| 4/4 [01:44<00:00, 26.25s/it]


In [8]:
adata = ad.concat(adata_list, join="outer")
adata

AnnData object with n_obs × n_vars = 35890 × 36601
    obs: 'sample'

In [9]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/Gao.h5ad')

In [10]:
adata

AnnData object with n_obs × n_vars = 35890 × 36601
    obs: 'sample'

In [21]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/Gao.h5ad')

In [22]:
adata.obs['patient'] = adata.obs['sample']
adata.obs.head()

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control


In [23]:
adata.obs['tissue'] = 'heparinized blood'

In [4]:
def assign_condition(sample):
    if 'AD' in sample:
        return 'atopic dermatitis'
    elif 'Con' in sample:
        return 'healthy'
    else:
        return 'unknown'

adata.obs['condition'] = adata.obs['sample'].apply(assign_condition)

adata.obs.head()

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy control


In [25]:
adata.obs['cell_type'] = 'PBMC cells'

In [26]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE230575_healthy control
...,...,...,...,...,...,...
AD1_TTTGTCATCAGTTAGC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE230575_atopic dermatitis
AD1_TTTGTCATCCTTGGTC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE230575_atopic dermatitis
AD1_TTTGTCATCTCGCTTG-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE230575_atopic dermatitis
AD1_TTTGTCATCTCTTGAT-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE230575_atopic dermatitis


In [27]:
adata.obs['condition'].unique()

array(['healthy control', 'atopic dermatitis'], dtype=object)

In [5]:
adata.obs['batch'] = 'GSE193096_' + adata.obs['condition']
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
...,...,...,...,...,...,...
AD1_TTTGTCATCAGTTAGC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCCTTGGTC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCGCTTG-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCTTGAT-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis


In [17]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/Gao.h5ad')

In [2]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/Gao.h5ad')

In [5]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE193096_healthy control
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy control,PBMC cells,GSE193096_healthy control
...,...,...,...,...,...,...
AD1_TTTGTCATCAGTTAGC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCCTTGGTC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCGCTTG-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCTTGAT-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis


In [6]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/filtered/Gao_filtered.h5ad')
adata

AnnData object with n_obs × n_vars = 8622 × 36601
    obs: 'batch', 'cell_type', 'condition', 'sex', 'patient', 'tissue', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [6]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy
...,...,...,...,...,...,...
AD1_TTTGTCATCAGTTAGC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCCTTGGTC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCGCTTG-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis
AD1_TTTGTCATCTCTTGAT-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis


In [7]:
adata.obs['severity'] = 'unknown' 
adata.obs.loc[adata.obs['condition'] == 'healthy', 'severity'] = 'unknown_healthy'

In [8]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,cell_type,batch,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Con4_AAACCTGAGAAGCCCA-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy,unknown_healthy
Con4_AAACCTGAGAGAGCTC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy,unknown_healthy
Con4_AAACCTGAGATAGCAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy,unknown_healthy
Con4_AAACCTGAGGGTCGAT-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy,unknown_healthy
Con4_AAACCTGAGTTATCGC-1,Con4,Con4,heparinized blood,healthy,PBMC cells,GSE193096_healthy,unknown_healthy
...,...,...,...,...,...,...,...
AD1_TTTGTCATCAGTTAGC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis,unknown
AD1_TTTGTCATCCTTGGTC-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis,unknown
AD1_TTTGTCATCTCGCTTG-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis,unknown
AD1_TTTGTCATCTCTTGAT-1,AD1,AD1,heparinized blood,atopic dermatitis,PBMC cells,GSE193096_atopic dermatitis,unknown
