# Use the State Embedding (SE) model to generate embeddings

## 1. Prepare data

In [None]:
# Download Tabula Sapiens data: three .h5ad files fetched from the CZ CELLxGENE
# dataset host. `-O` stores each download under a short local name
# (heart.h5ad, kidney.h5ad, lung.h5ad); these are the paths the next cell reads.
# The leading `!` runs each line in the system shell rather than as Python.

! wget https://datasets.cellxgene.cziscience.com/762edb8f-1207-4814-831e-99d7a801fdec.h5ad -O heart.h5ad
! wget https://datasets.cellxgene.cziscience.com/01e3402f-a625-4c4d-b94a-bc5996f738f7.h5ad -O kidney.h5ad
! wget https://datasets.cellxgene.cziscience.com/d6761a21-e226-434f-9370-fbcc7e549aa0.h5ad -O lung.h5ad

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import anndata as ad
# Load the three downloaded tissue files (same order: heart, kidney, lung).
tissue_files = ("heart.h5ad", "kidney.h5ad", "lung.h5ad")
heart_adata, kidney_adata, lung_adata = map(ad.read_h5ad, tissue_files)

# Stack the cells of all three datasets into a single AnnData object.
# merge="unique" is passed through to anndata's concat unchanged.
concatenated_adata = ad.concat(
    [heart_adata, kidney_adata, lung_adata],
    merge="unique",
)

# Persist the combined object; this file is the input of the embedding step.
concatenated_adata.write("processed.h5ad")

print("Concatenated data saved to processed.h5ad")

## 2. Compute the embedding

In [None]:
# Embed the data with the State CLI (adds a .obsm['X_state'] property).
# The leading `!` hands the command to the system shell; without it the
# notebook kernel tries to parse the line as Python and the cell fails with a
# SyntaxError. The trailing backslashes continue the single command.
# NOTE(review): assumes the SE-600M model folder is already present in the
# working directory; nothing in this notebook downloads it. Please confirm.
! state emb transform \
    --model-folder SE-600M \
    --input processed.h5ad \
    --output processed_emb.h5ad

## 3. Compute PCA

In [None]:
# Load the embedded dataset and run PCA on the expression matrix (.X).
# This cell is the baseline; the X_state embedding is analysed separately.

adata = sc.read("processed_emb.h5ad")

# Default scanpy PCA, computed on adata.X
sc.tl.pca(adata)

# One row, two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (obs column used for colouring, panel title); both panels show the PCA of .X
_x_panels = (
    ("donor_id", "PCA Separation of Donor using .X"),
    ("tissue_in_publication", "PCA Separation of Tissue using .X"),
)

# Draw PC1 vs PC2 on each axis; show=False defers rendering to plt.show()
for _axis, (_color_key, _panel_title) in zip(axes, _x_panels):
    sc.pl.pca(
        adata,
        color=_color_key,
        components=['1,2'],
        ax=_axis,
        show=False,
        title=_panel_title,
    )

# Adjust layout and show
plt.tight_layout()
plt.show()

In [None]:
# Wrap the X_state embedding in its own AnnData object so PCA runs on the
# embedding instead of on .X.
adata_state = ad.AnnData(adata.obsm['X_state'])
adata_state.obs_names = adata.obs_names

# Carry over the two annotation columns used to colour the plots.
for _obs_column in ('tissue_in_publication', 'donor_id'):
    adata_state.obs[_obs_column] = adata.obs[_obs_column]

sc.tl.pca(adata_state)

# One row, two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (obs column used for colouring, panel title); both panels show the PCA of
# .obsm['X_state']
_state_panels = (
    ("donor_id", "PCA Separation of Donor using .obsm['X_state']"),
    ("tissue_in_publication", "PCA Separation of Tissue using .obsm['X_state']"),
)

# Draw PC1 vs PC2 on each axis; show=False defers rendering to plt.show()
for _axis, (_color_key, _panel_title) in zip(axes, _state_panels):
    sc.pl.pca(
        adata_state,
        color=_color_key,
        components=['1,2'],
        ax=_axis,
        show=False,
        title=_panel_title,
    )

# Adjust layout and show
plt.tight_layout()
plt.show()