### Setup

In [5]:
import os

import anndata as ad
import numpy as np
import pandas as pd

In [6]:
# Directory that is scanned for the per-dataset ``*.h5ad`` inputs.
ANN_DATA_DIR = "../data/processed/anndata_orthologs"

# Every embedding file follows the same "<dataset>_transcriptome_embeddings.npy" pattern,
# so the mapping is derived from the dataset names instead of being spelled out by hand.
_EMBEDDINGS_DIR = "../data/processed/bulkformer_embeddings"
_EMBEDDED_DATASETS = (
    "human_burn",
    "human_trauma",
    "human_sepsis",
    "mouse_burn",
    "mouse_trauma",
    "mouse_sepsis",
    "mouse_infection",
)

# dataset name -> path of the saved embedding array
EMBEDDINGS = {
    dataset: f"{_EMBEDDINGS_DIR}/{dataset}_transcriptome_embeddings.npy"
    for dataset in _EMBEDDED_DATASETS
}

In [7]:
# load adatas
# Only files named "<dataset>_orthologs.h5ad" are inputs. Matching on the full suffix (instead
# of any ".h5ad" plus str.replace) keeps unrelated .h5ad files in the directory from being
# loaded under a bogus name and then failing on the EMBEDDINGS lookup.
ORTHOLOG_SUFFIX = "_orthologs.h5ad"

adatas = {}
for f in sorted(os.listdir(ANN_DATA_DIR)):
    if not f.endswith(ORTHOLOG_SUFFIX):
        continue

    # Strip the suffix from the end only; str.replace would also remove it mid-name.
    name = f[: -len(ORTHOLOG_SUFFIX)]
    path = os.path.join(ANN_DATA_DIR, f)
    adatas[name] = ad.read_h5ad(path)
    print(f"Loaded {name}: {adatas[name].shape}")

    # add embeddings to adata (stored per sample under obsm["X_bulkformer"])
    embedding = np.load(EMBEDDINGS[name])
    adatas[name].obsm["X_bulkformer"] = embedding

adatas

Loaded human_burn: (590, 19914)
Loaded human_sepsis: (30, 19914)
Loaded human_trauma: (857, 19914)
Loaded mouse_burn: (32, 13833)
Loaded mouse_infection: (72, 10248)
Loaded mouse_sepsis: (50, 13833)
Loaded mouse_trauma: (96, 13833)


{'human_burn': AnnData object with n_obs × n_vars = 590 × 19914
     obs: 'group', 'patient_id', 'tissue', 'sex', 'age', 'time_point_hours', 'qc_flag'
     var: 'symbol'
     uns: 'provenance'
     obsm: 'X_bulkformer',
 'human_sepsis': AnnData object with n_obs × n_vars = 30 × 19914
     obs: 'tissue', 'group', 'qc_flag'
     var: 'symbol'
     uns: 'provenance'
     obsm: 'X_bulkformer',
 'human_trauma': AnnData object with n_obs × n_vars = 857 × 19914
     obs: 'group', 'patient_id', 'tissue', 'sex', 'age', 'time_point_hours', 'qc_flag'
     var: 'symbol'
     uns: 'provenance'
     obsm: 'X_bulkformer',
 'mouse_burn': AnnData object with n_obs × n_vars = 32 × 13833
     obs: 'title', 'cell_type', 'qc_flag', 'sex', 'strain', 'time_point_hours', 'group', 'patient_id'
     var: 'symbol', 'human_ensembl'
     uns: 'provenance'
     obsm: 'X_bulkformer',
 'mouse_infection': AnnData object with n_obs × n_vars = 72 × 10248
     obs: 'age', 'strain', 'qc_flag', 'infection_status_detail', '

### Clean datasets

In [8]:
def _label_takao_groups(obs, inflamed, control, time_col="time_point_hours"):
    """Add the ``takao_*`` label columns to ``obs`` in place.

    Parameters
    ----------
    obs : pandas.DataFrame
        Per-sample annotation table (``adata.obs``); must contain a ``group`` column.
    inflamed, control : boolean pandas.Series aligned with ``obs``
        Samples to label ``"takao_inflamed"`` / ``"takao_control"``.
    time_col : str or None
        Column that is blanked (``pd.NA``) for every ``group == "control"`` sample.
        ``None`` skips the blanking step.

    Writes ``takao_inflamed``, ``takao_control`` and the categorical ``takao_status``.
    ``takao_status`` stays missing for samples matching neither mask; if a sample matched
    both, the control label would win because it is assigned last.
    """
    obs["takao_inflamed"] = inflamed
    obs["takao_control"] = control
    obs["takao_status"] = pd.NA
    obs.loc[obs["takao_inflamed"], "takao_status"] = "takao_inflamed"
    obs.loc[obs["takao_control"], "takao_status"] = "takao_control"
    if time_col is not None:
        # Masks are computed by the caller before this point, so blanking cannot affect them.
        obs.loc[obs["group"] == "control", time_col] = pd.NA
    obs["takao_status"] = obs["takao_status"].astype("category")


# human_burn / human_trauma: numeric age, and inflamed samples must fall strictly between
# 28 and 90 days (the column is in hours).
for cohort in ("human_burn", "human_trauma"):
    obs = adatas[cohort].obs
    obs["age"] = pd.to_numeric(obs["age"], errors="coerce")
    in_window = (obs["time_point_hours"] > (28 * 24)) & (obs["time_point_hours"] < (90 * 24))
    _label_takao_groups(
        obs,
        inflamed=(obs["group"] == "inflammation") & in_window,
        control=obs["group"] == "control",
    )

# human_sepsis: every inflammation sample qualifies; no time column is blanked here.
obs = adatas["human_sepsis"].obs
_label_takao_groups(
    obs,
    inflamed=obs["group"] == "inflammation",
    control=obs["group"] == "control",
    time_col=None,
)

# mouse_burn: inflamed samples at 150 h or later.
obs = adatas["mouse_burn"].obs
_label_takao_groups(
    obs,
    inflamed=(obs["group"] == "inflammation") & (obs["time_point_hours"] >= 150),
    control=obs["group"] == "control",
)

# mouse_trauma: inflamed samples later than 72 h.
obs = adatas["mouse_trauma"].obs
_label_takao_groups(
    obs,
    inflamed=(obs["group"] == "inflammation") & (obs["time_point_hours"] > 72),
    control=obs["group"] == "control",
)

# mouse_sepsis: only the C57BL/6J strain, inflamed samples at time point 4.0.
# The raw column is called "time_point"; it is renamed to "time_point_hours" below. On a
# re-run the raw column is already gone, so fall back to the harmonised name instead of
# raising KeyError.
obs = adatas["mouse_sepsis"].obs
sepsis_time_col = "time_point" if "time_point" in obs.columns else "time_point_hours"
is_c57bl6j = obs["strain"] == "C57BL/6J"
_label_takao_groups(
    obs,
    inflamed=(obs["group"] == "inflammation") & (obs[sepsis_time_col] == 4.0) & is_c57bl6j,
    control=(obs["group"] == "control") & is_c57bl6j,
    time_col=sepsis_time_col,
)
if sepsis_time_col == "time_point":
    # Copy then drop (rather than rename) so the new column lands at the end, as before.
    obs["time_point_hours"] = obs["time_point"]
    adatas["mouse_sepsis"].obs = obs.drop(columns=["time_point"])

# mouse_infection: candida-infected samples at exactly 24 h.
obs = adatas["mouse_infection"].obs
_label_takao_groups(
    obs,
    inflamed=(
        (obs["group"] == "inflammation")
        & (obs["time_point_hours"] == 24)
        & (obs["infection_status_detail"] == "candida")
    ),
    control=obs["group"] == "control",
)

In [9]:
# add embeddings as obsm to each anndata
# (the loading cell above already sets obsm["X_bulkformer"]; this re-reads the same files)
for dataset_name in adatas:
    adatas[dataset_name].obsm["X_bulkformer"] = np.load(EMBEDDINGS[dataset_name])

In [10]:
# save all adatas
output_dir = "../data/processed/anndata_combined"
# Create the target directory up front so a fresh checkout does not fail on the first write.
os.makedirs(output_dir, exist_ok=True)

for name, adata in adatas.items():
    save_path = os.path.join(output_dir, f"{name}.h5ad")
    adata.write_h5ad(save_path)
    print(f"Saved {name} with embeddings to {save_path}")

Saved human_burn with embeddings to ../data/processed/anndata_combined/human_burn.h5ad
Saved human_sepsis with embeddings to ../data/processed/anndata_combined/human_sepsis.h5ad
Saved human_trauma with embeddings to ../data/processed/anndata_combined/human_trauma.h5ad
Saved mouse_burn with embeddings to ../data/processed/anndata_combined/mouse_burn.h5ad
Saved mouse_infection with embeddings to ../data/processed/anndata_combined/mouse_infection.h5ad
Saved mouse_sepsis with embeddings to ../data/processed/anndata_combined/mouse_sepsis.h5ad
Saved mouse_trauma with embeddings to ../data/processed/anndata_combined/mouse_trauma.h5ad


### Combine mouse and human datasets separately

In [11]:
# combine all human and mouse adatas separately
# Split the dataset names by species prefix once, then reuse them for both the object
# lists and the concat keys (dict iteration order is preserved).
human_keys = [key for key in adatas if key.startswith("human")]
mouse_keys = [key for key in adatas if key.startswith("mouse")]
human_adatas = [adatas[key] for key in human_keys]
mouse_adatas = [adatas[key] for key in mouse_keys]

# concatenate
# join="outer" keeps the union of genes; the source dataset name goes into obs["dataset"].
# NOTE(review): no `merge=` strategy is passed, so per-gene `var` columns (e.g. "symbol")
# are presumably not carried over into the combined objects — confirm this is intended.
concat_options = dict(join="outer", label="dataset")
human_adata_combined = ad.concat(human_adatas, keys=human_keys, **concat_options)
mouse_adata_combined = ad.concat(mouse_adatas, keys=mouse_keys, **concat_options)

In [12]:
# Display the per-sample annotations of the combined human object.
human_adata_combined.obs

Unnamed: 0,group,patient_id,tissue,sex,age,time_point_hours,qc_flag,takao_inflamed,takao_control,takao_status,dataset
GSM909644,control,19297865.0,White Blood Cells,F,30.0,,0,False,True,takao_control,human_burn
GSM909645,control,35028656.0,White Blood Cells,M,35.0,,0,False,True,takao_control,human_burn
GSM909646,control,16952213.0,White Blood Cells,F,30.0,,0,False,True,takao_control,human_burn
GSM909647,control,20591195.0,White Blood Cells,M,19.0,,0,False,True,takao_control,human_burn
GSM909648,control,19107727.0,White Blood Cells,F,18.0,,1,False,True,takao_control,human_burn
...,...,...,...,...,...,...,...,...,...,...,...
GSM902165,inflammation,9754329.0,White Blood Cells,F,48.0,83.6,0,False,False,,human_trauma
GSM902166,inflammation,9973279.0,White Blood Cells,F,44.0,162.6,0,False,False,,human_trauma
GSM902167,inflammation,9973279.0,White Blood Cells,F,44.0,22.7,0,False,False,,human_trauma
GSM902168,inflammation,9973279.0,White Blood Cells,F,44.0,5.4,0,False,False,,human_trauma


In [13]:
# Display the per-sample annotations of the combined mouse object.
mouse_adata_combined.obs

Unnamed: 0,title,cell_type,qc_flag,sex,strain,time_point_hours,group,patient_id,takao_inflamed,takao_control,takao_status,age,infection_status_detail,tissue,dataset
GSM178608,Burn Blood 2 hr rep 1,mouse leukocytes,0,Male,C57BL/6J,2.0,inflammation,1.0,False,False,,,,,mouse_burn
GSM178609,Burn Blood 2 hr rep 2,mouse leukocytes,0,Male,C57BL/6J,2.0,inflammation,2.0,False,False,,,,,mouse_burn
GSM178610,Burn Blood 2 hr rep 3,mouse leukocytes,1,Male,C57BL/6J,2.0,inflammation,3.0,False,False,,,,,mouse_burn
GSM178611,Burn Blood 2 hr rep 4,mouse leukocytes,0,Male,C57BL/6J,2.0,inflammation,4.0,False,False,,,,,mouse_burn
GSM178612,Burn Blood 1 day rep 1,mouse leukocytes,0,Male,C57BL/6J,24.0,inflammation,1.0,False,False,,,,,mouse_burn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM178731,Trauma Hemorrhage Sham Spleen 3 day rep 4,mouse splenocytes,0,Male,C57BL/6J,,control,4.0,False,True,takao_control,,,,mouse_trauma
GSM178732,Trauma Hemorrhage Sham Spleen 7 day rep 1,mouse splenocytes,0,Male,C57BL/6J,,control,1.0,False,True,takao_control,,,,mouse_trauma
GSM178733,Trauma Hemorrhage Sham Spleen 7 day rep 2,mouse splenocytes,0,Male,C57BL/6J,,control,2.0,False,True,takao_control,,,,mouse_trauma
GSM178734,Trauma Hemorrhage Sham Spleen 7 day rep 3,mouse splenocytes,0,Male,C57BL/6J,,control,3.0,False,True,takao_control,,,,mouse_trauma


In [14]:
# save combined adatas
combined_dir = "../data/processed/anndata_combined"
for species, combined in (("human", human_adata_combined), ("mouse", mouse_adata_combined)):
    # One file per species: "<species>_combined.h5ad".
    combined.write_h5ad(f"{combined_dir}/{species}_combined.h5ad")
print("Saved combined human and mouse adatas.")

Saved combined human and mouse adatas.
