In [1]:
import anndata as ad
import scanpy as sc
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import toolbox as tb

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
def load_mtx(mtx_path: str, barcodes_path: str, features_path: str, sample_name: str | None = None) -> ad.AnnData:
    adata = sc.read_mtx(mtx_path).transpose()
    
    barcodes = pd.read_csv(barcodes_path, header=None, sep='\t', names=['barcodes'])
    features = pd.read_csv(features_path, header=None, sep='\t', names=['gene_ids', 'gene_names'])

    adata.obs_names = sample_name + "_" + barcodes['barcodes'] if sample_name is not None else barcodes['barcodes']
    adata.var_names = features['gene_ids']
    adata.var['gene_names'] = features['gene_names'].values
    
    if sample_name is not None:
        adata.obs['sample'] = sample_name

    return adata

In [4]:
data_path = "/nfs/data/COST_IBD/data/atopic_dermatitis/datasets/Rojahn"
sample_names = os.listdir(data_path)

In [5]:
adata_list = []

for sample_name in tqdm(sample_names):
    sample_dir = os.path.join(data_path, sample_name)

    matrix_path = os.path.join(sample_dir, "matrix.mtx.gz")
    barcodes_path = os.path.join(sample_dir, "barcodes.tsv.gz")
    features_path = os.path.join(sample_dir, "features.tsv.gz")

    single_adata: ad.AnnData = load_mtx(matrix_path, barcodes_path, features_path, sample_name)
    
    # The contents of gene_names are worthless
    single_adata.var.drop(columns=["gene_names"], inplace=True)

    # The dataset contains duplicated gene names
    single_adata.var_names_make_unique()

    adata_list.append(single_adata)

100%|██████████| 15/15 [01:57<00:00,  7.87s/it]


In [6]:
adata = ad.concat(adata_list, join="outer")
adata

AnnData object with n_obs × n_vars = 60584 × 33538
    obs: 'sample'

In [7]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/rojahn.h5ad')

In [9]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/rojahn.h5ad')

In [10]:
adata.obs


Unnamed: 0_level_0,sample
barcodes,Unnamed: 1_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7
HC7_AAACCCACAAAGGGCT-1,HC7
HC7_AAACCCACAACGATTC-1,HC7
HC7_AAACCCAGTCCAACGC-1,HC7
HC7_AAACCCATCAGGAGAC-1,HC7
...,...
AD7_TTTGTTGGTGCATCTA-1,AD7
AD7_TTTGTTGGTGTTGACT-1,AD7
AD7_TTTGTTGGTTAGAAGT-1,AD7
AD7_TTTGTTGTCGGCCAAC-1,AD7


In [11]:
adata.obs['patient'] = adata.obs['sample']
adata.obs.head()

Unnamed: 0_level_0,sample,patient
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7
HC7_AAACCCACAAAGGGCT-1,HC7,HC7
HC7_AAACCCACAACGATTC-1,HC7,HC7
HC7_AAACCCAGTCCAACGC-1,HC7,HC7
HC7_AAACCCATCAGGAGAC-1,HC7,HC7


In [12]:
tissue_mapping = {
    'AD1': 'Skin Suction Blister',
    'AD2': 'Skin Suction Blister',
    'AD3': 'Skin Suction Blister',
    'AD4': 'Skin Suction Blister',
    'AD5': 'Skin',
    'AD6': 'Skin',
    'AD7': 'Skin',
    'AD8': 'Skin',
    'HC1': 'Skin Suction Blister',
    'HC2': 'Skin Suction Blister',
    'HC3': 'Skin Suction Blister',
    'HC4': 'Skin Suction Blister',
    'HC5': 'Skin Suction Blister',
    'HC6': 'Skin',
    'HC7': 'Skin'
}
adata.obs['tissue'] = adata.obs['sample'].map(tissue_mapping)
adata.obs

Unnamed: 0_level_0,sample,patient,tissue
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin
...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin


In [13]:
adata.obs.loc[adata.obs['sample'].str.startswith('AD'), 'condition'] = 'atopic dermatitis'
adata.obs.loc[adata.obs['sample'].str.startswith('HC'), 'condition'] = 'healthy'
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy
...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis


In [8]:
easi_score_mapping = {
    'AD1': 34.2,
    'AD2': 44.6,
    'AD3': 44.7,
    'AD4': 5.5,
    'AD5': 24.1,
    'AD6': 28.1,
    'AD7': 42.8,
    'AD8': 46.5
}

adata.obs['easi_score'] = adata.obs['sample'].map(easi_score_mapping)
adata.obs

adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,easi_socre,batch,easi_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
...,...,...,...,...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,unknown
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,unknown
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,unknown
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,unknown


In [17]:
adata.obs['batch'] = 'GSE153760_' + adata.obs['condition'] + '_' + adata.obs['tissue']
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,easi_socre,batch
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin
...,...,...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin


In [13]:
adata.write('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/rojahn.h5ad')

In [2]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/rojahn.h5ad')

In [9]:
def assign_severity(easi_score):
    if pd.isna(easi_score):
        return None
    elif easi_score >= 0 and easi_score <= 1:
        return 'clear'
    elif easi_score > 1 and easi_score <= 7:
        return 'mild'
    elif easi_score > 7 and easi_score <= 21:
        return 'moderate'
    elif easi_score > 21 and easi_score <= 50:
        return 'severe'
    else:
        return 'unknown'

adata.obs['severity'] = adata.obs['easi_score'].apply(assign_severity).values
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,easi_socre,batch,easi_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy,,GSE153760_healthy_Skin,,
...,...,...,...,...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis,42.8,GSE153760_atopic dermatitis_Skin,42.8,severe


In [10]:
adata.obs = adata.obs.drop(columns=['easi_socre'])

In [11]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,batch,easi_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
...,...,...,...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe


In [3]:
adata = sc.read('/nfs/data/COST_IBD/data/atopic_dermatitis/anndata/filtered/rojahn_filtered.h5ad')
adata

AnnData object with n_obs × n_vars = 6971 × 33538
    obs: 'batch', 'cell_type', 'condition', 'sex', 'patient', 'tissue', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [3]:
adata.obs

Unnamed: 0_level_0,sample,patient,tissue,condition,batch,easi_score,severity
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HC7_AAACCCAAGAGCCCAA-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCACAAAGGGCT-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCACAACGATTC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCAGTCCAACGC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
HC7_AAACCCATCAGGAGAC-1,HC7,HC7,Skin,healthy,GSE153760_healthy_Skin,,
...,...,...,...,...,...,...,...
AD7_TTTGTTGGTGCATCTA-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTGTTGACT-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGGTTAGAAGT-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe
AD7_TTTGTTGTCGGCCAAC-1,AD7,AD7,Skin,atopic dermatitis,GSE153760_atopic dermatitis_Skin,42.8,severe


In [10]:
adata.obs['severity'] = adata.obs['severity'].astype('category')
adata.obs['severity'] = adata.obs['severity'].cat.add_categories('unknown_healthy')

adata.obs.loc[adata.obs['condition'] == 'healthy', 'severity'] = 'unknown_healthy'


In [14]:
adata.obs['condition'].unique()

['healthy', 'atopic dermatitis']
Categories (2, object): ['atopic dermatitis', 'healthy']