In [1]:
import anndata as ad
import scanpy as sc
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import toolbox as tb

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
def load_mtx(mtx_path: str, barcodes_path: str, features_path: str, sample_name: str | None = None) -> ad.AnnData:
    adata = sc.read_mtx(mtx_path).transpose()
    
    barcodes = pd.read_csv(barcodes_path, header=None, sep='\t', names=['barcodes'])
    features = pd.read_csv(features_path, header=None, sep='\t', names=['gene_ids', 'gene_names'])

    adata.obs_names = sample_name + "_" + barcodes['barcodes'] if sample_name is not None else barcodes['barcodes']
    adata.var_names = features['gene_ids']
    adata.var['gene_names'] = features['gene_names'].values
    
    if sample_name is not None:
        adata.obs['sample'] = sample_name

    return adata

In [4]:
data_path = "/nfs/data/COST_IBD/data/atopic_dermatitis/datasets/Bangert"
sample_names = os.listdir(data_path)

In [6]:
adata_list = []

for sample_name in tqdm(sample_names):
    sample_dir = os.path.join(data_path, sample_name)

    matrix_path = os.path.join(sample_dir, "matrix.mtx.gz")
    barcodes_path = os.path.join(sample_dir, "barcodes.tsv.gz")
    features_path = os.path.join(sample_dir, "features.tsv.gz")

    single_adata: ad.AnnData = load_mtx(matrix_path, barcodes_path, features_path, sample_name)
    
    single_adata.var.drop(columns=["gene_names"], inplace=True)

    single_adata.var_names_make_unique()

    adata_list.append(single_adata)

100%|██████████| 20/20 [09:42<00:00, 29.11s/it]


In [7]:
adata = ad.concat(adata_list, join="outer")
adata

AnnData object with n_obs × n_vars = 306537 × 36601
    obs: 'sample'

In [9]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/bangert.h5ad')

In [2]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/bangert.h5ad')

# Metadata

In [4]:
adata.obs['patient'] = adata.obs['sample']
adata.obs.head()

Unnamed: 0_level_0,sample,patient
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3


In [5]:
adata.obs['tissue'] = 'Skin'
adata.obs

Unnamed: 0_level_0,sample,patient,tissue
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin
...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin


In [12]:
def assign_condition(sample):
    if 'AD' in sample:
        return 'atopic dermatitis'
    elif 'HC' in sample:
        return 'healthy'
    elif 'DAHND' in sample:
        return 'dupilumab-associated head and neck dermatitis'
    else:
        return 'unknown'

adata.obs['condition'] = adata.obs['sample'].apply(assign_condition)

adata.obs.head()


Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige,batch,iga_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate


In [7]:
adata.obs    

Unnamed: 0_level_0,sample,patient,tissue,condition
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis
...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy control
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy control
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy control
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy control


In [8]:
serum_ige_dict = {
    'P120_DAHND1': '82.8',
    'P126_DAHND2': '2874',
    'P133_DAHND3': '>5000',
    'P137_DAHND4': '2247',
    'P147_DAHND5': '>5000',
    'P161_DAHND6': '1648',
    'P148_HeadNeckAD1': '2650',
    'P151_HeadNeckAD2': '3803',
    'P176_HeadNeckAD3': '111',
    'P177_HeadNeckAD4': '2349',
    'P180_HeadNeckAD5': '991',
    'P74_TrunkAD1': '351',
    'P75_TrunkAD2': '8',
    'P77_TrunkAD3': '2045',
    'P81_TrunkAD4': '>5000',
    'P96_TrunkAD5': '>5000',
    'P112_HC1': '<100',
    'P115_HC2': '<100',
    'P116_HC3': '<100',
    'P121_HC4': '<100'
}
adata.obs['serum_ige'] = adata.obs['sample'].map(serum_ige_dict)
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045
...,...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy control,<100
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy control,<100


In [10]:
iga_dict = {
    'P120_DAHND1': 1,
    'P126_DAHND2': 1,
    'P133_DAHND3': 1,
    'P137_DAHND4': 1,
    'P147_DAHND5': 2,
    'P161_DAHND6': 1,
    'P148_HeadNeckAD1': 3,
    'P151_HeadNeckAD2': 2,
    'P176_HeadNeckAD3': 4,
    'P177_HeadNeckAD4': 3,
    'P180_HeadNeckAD5': 4,
    'P74_TrunkAD1': 3,
    'P75_TrunkAD2': 4,
    'P77_TrunkAD3': 3,
    'P81_TrunkAD4': 4,
    'P96_TrunkAD5': 4,
}
adata.obs['iga_score'] = adata.obs['sample'].map(iga_dict)
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige,batch,iga_score
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0
...,...,...,...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,


In [11]:
def assign_severity(iga_score):
    if pd.isna(iga_score):
        return None
    elif iga_score == 0:
        return 'clear'
    elif iga_score in [1, 2]:
        return 'mild'
    elif iga_score == 3:
        return 'moderate'
    elif iga_score >= 4:
        return 'severe'
    else:
        return 'unknown'

adata.obs['severity'] = adata.obs['iga_score'].apply(assign_severity).values
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige,batch,iga_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
...,...,...,...,...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,,
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,,
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,,
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy control,<100,GSE230575_healthy control,,


In [14]:
adata.obs['batch'] = 'GSE230575_' + adata.obs['condition']
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige,batch,iga_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
...,...,...,...,...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,


In [None]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/bangert.h5ad')

In [2]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/filtered/bangert_filtered.h5ad')

In [13]:
adata.obs['condition'].unique()

array(['atopic dermatitis', 'healthy',
       'dupilumab-associated head and neck dermatitis'], dtype=object)

In [16]:
adata.obs['severity'] = adata.obs['severity'].astype('category')
adata.obs['severity'] = adata.obs['severity'].cat.add_categories('unknown_healthy')

adata.obs.loc[adata.obs['condition'] == 'healthy', 'severity'] = 'unknown_healthy'

In [17]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,serum_ige,batch,iga_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P77_TrunkAD3_AAACCTGAGAAGGGTA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGAGAGCTC-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGATCGATA-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCCCTAAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
P77_TrunkAD3_AAACCTGAGCGTCTAT-1,P77_TrunkAD3,P77_TrunkAD3,Skin,atopic dermatitis,2045,GSE230575_atopic dermatitis,3.0,moderate
...,...,...,...,...,...,...,...,...
P116_HC3_TTTGTCACATCCGCGA-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,unknown_healthy
P116_HC3_TTTGTCAGTAGCGTAG-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,unknown_healthy
P116_HC3_TTTGTCAGTGACCAAG-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,unknown_healthy
P116_HC3_TTTGTCAGTGGACGAT-1,P116_HC3,P116_HC3,Skin,healthy,<100,GSE230575_healthy,,unknown_healthy


In [4]:
adata.obs['sample']

barcodes
P77_TrunkAD3_AAACCTGAGAAGGGTA-1    P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGAGAGCTC-1    P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGATCGATA-1    P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGCCCTAAT-1    P77_TrunkAD3
P77_TrunkAD3_AAACCTGAGCGTCTAT-1    P77_TrunkAD3
                                       ...     
P116_HC3_TTTGTCACATCCGCGA-1            P116_HC3
P116_HC3_TTTGTCAGTAGCGTAG-1            P116_HC3
P116_HC3_TTTGTCAGTGACCAAG-1            P116_HC3
P116_HC3_TTTGTCAGTGGACGAT-1            P116_HC3
P116_HC3_TTTGTCATCCCATTAT-1            P116_HC3
Name: sample, Length: 306537, dtype: category
Categories (20, object): ['P74_TrunkAD1', 'P75_TrunkAD2', 'P77_TrunkAD3', 'P81_TrunkAD4', ..., 'P161_DAHND6', 'P176_HeadNeckAD3', 'P177_HeadNeckAD4', 'P180_HeadNeckAD5']

In [5]:

samples_to_remove = ["P96_TrunkAD5", "P77_TrunkAD3"]

mask = ~adata.obs["sample"].str.contains("|".join(samples_to_remove))

adata_filtered = adata[mask, :]



In [7]:
adata_filtered.obs['sample']

barcodes
P115_HC2_AAACCTGAGAAGGCCT-1    P115_HC2
P115_HC2_AAACCTGAGCAGCCTC-1    P115_HC2
P115_HC2_AAACCTGAGCCCAGCT-1    P115_HC2
P115_HC2_AAACCTGAGTGTGAAT-1    P115_HC2
P115_HC2_AAACCTGCACATCCAA-1    P115_HC2
                                 ...   
P116_HC3_TTTGTCACATCCGCGA-1    P116_HC3
P116_HC3_TTTGTCAGTAGCGTAG-1    P116_HC3
P116_HC3_TTTGTCAGTGACCAAG-1    P116_HC3
P116_HC3_TTTGTCAGTGGACGAT-1    P116_HC3
P116_HC3_TTTGTCATCCCATTAT-1    P116_HC3
Name: sample, Length: 252149, dtype: category
Categories (18, object): ['P74_TrunkAD1', 'P75_TrunkAD2', 'P81_TrunkAD4', 'P112_HC1', ..., 'P161_DAHND6', 'P176_HeadNeckAD3', 'P177_HeadNeckAD4', 'P180_HeadNeckAD5']

In [None]:
adata_filtered.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/bangert_without_patients.h5ad')