# Domain Data

## Library Import

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns

## Helper Functions

In [None]:
def get_files(data_dir, prefix):
    """Return the paths of the entries in ``data_dir`` whose names start with ``prefix``.

    Args:
        data_dir: Directory to list (it is not searched recursively).
        prefix: Leading string an entry name must have to be included.

    Returns:
        List of ``os.path.join(data_dir, name)`` paths. The listing is sorted
        by entry name so the result does not depend on the arbitrary order in
        which ``os.listdir`` returns entries.
    """
    return [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if name.startswith(prefix)
    ]

In [None]:
def create_sample_df(files, domain_label, dataset):
    """Summarise a list of ``.h5ad`` files as a table with one row per file.

    Args:
        files: Paths of the files to read with ``sc.read_h5ad``.
        domain_label: Value stored unchanged in the ``Domain_label`` column of
            every row.
        dataset: Value stored unchanged in the ``Dataset`` column of every row.

    Returns:
        DataFrame with the columns ``Sample``, ``Dataset``, ``Num_genes``,
        ``Num_cells``, ``Domain_label`` and ``Has_spatial``, with rows in the
        order of ``files``. An empty ``files`` gives an empty frame that still
        has these columns.
    """
    columns = ["Sample", "Dataset", "Num_genes", "Num_cells", "Domain_label", "Has_spatial"]

    rows = []
    for file in files:
        adata = sc.read_h5ad(file)
        rows.append(
            {
                # File name with everything from ".h5ad" onwards removed.
                "Sample": os.path.basename(file).split(".h5ad")[0],
                "Dataset": dataset,
                "Num_genes": adata.shape[1],
                "Num_cells": adata.shape[0],
                "Domain_label": domain_label,
                "Has_spatial": "spatial" in adata.obsm,
            }
        )

    # Build the frame once from the collected rows instead of concatenating
    # inside the loop, which re-copies the whole table for every file.
    df = pd.DataFrame(rows, columns=columns)

    df["Num_genes"] = pd.to_numeric(df["Num_genes"], errors="coerce")
    df["Num_cells"] = pd.to_numeric(df["Num_cells"], errors="coerce")

    return df

In [None]:
def plot_statistics(df):
    """Show two side-by-side bar charts: gene count and cell count per sample.

    Args:
        df: Table providing the ``Sample``, ``Num_genes`` and ``Num_cells``
            columns that are plotted.
    """
    sns.set(style="whitegrid")
    _, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=False)

    # One entry per panel, left to right: (y column, palette, quantity label).
    panels = (
        ("Num_genes", "Blues_d", "Number of Genes"),
        ("Num_cells", "Greens_d", "Number of Cells"),
    )

    for ax, (column, palette, label) in zip(axes, panels):
        sns.barplot(
            data=df,
            x="Sample",
            hue="Sample",
            y=column,
            ax=ax,
            palette=palette,
            legend=False,
        )
        ax.set_title(f"{label} per Sample", fontsize=14)
        ax.set_xlabel("Sample", fontsize=12)
        ax.set_ylabel(label, fontsize=12)
        ax.tick_params(axis="x", labelsize=8)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_merfish_zhuang(adata, color_key):
    """Draw the ``spatial`` embedding of ``adata`` on a 6x6 inch figure and show it.

    Args:
        adata: Object passed to ``sc.pl.embedding``; it is plotted with
            ``basis="spatial"``.
        color_key: Passed as ``color`` to ``sc.pl.embedding``.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    # show=False so the axes can still be adjusted before plt.show() below.
    sc.pl.embedding(adata, basis="spatial", color=color_key, size=30, ax=ax, show=False)

    # y limits are given as (11, 0), i.e. with the larger value first.
    # NOTE(review): ax.axis("equal") runs after the limits are set and may
    # re-adjust them - confirm the 0-11 window is what ends up displayed.
    ax.set_ylim(11, 0)
    ax.set_xlim(0, 11)
    ax.axis("equal")
    # Hide tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title("Single Section Plot")

    # Show the plot
    plt.show()

## Dataset 1 - MERFISH small

In [None]:
# Collect the MERFISH_small files and order them by the integer that follows
# "small" in the file name.
files = get_files("../data/domain/raw_123", "MERFISH_small")
files.sort(key=lambda path: int(os.path.basename(path).split("small")[1].split(".")[0]))
print(files)

In [None]:
# Per-file summary table for MERFISH_small; "domain" is stored in the
# Domain_label column and used below as the colour key.
df_mf = create_sample_df(files, "domain", "MERFISH_small")
df_mf

In [None]:
# Bar charts of gene and cell counts for each MERFISH_small sample.
plot_statistics(df_mf)

In [None]:
# Plot the first MERFISH_small file, coloured by the label recorded in row 0
# of the summary table and titled with that row's sample name.
adata = sc.read_h5ad(files[0])
domain_label, title = df_mf.loc[0, ["Domain_label", "Sample"]]
sc.pl.embedding(adata, basis="spatial", color=domain_label, size=40, title=title)

## Dataset 2 - STARmap

In [None]:
# Collect the STARmap files and order them by the integer that follows
# "STARmap" in the file name.
files = get_files("../data/domain/raw_123", "STARmap")
files.sort(key=lambda path: int(os.path.basename(path).split("STARmap")[1].split(".")[0]))
print(files)

In [None]:
# Per-file summary table for STARmap; "region" is stored in the Domain_label
# column and used below as the colour key.
df_sm = create_sample_df(files, "region", "STARmap")
df_sm

In [None]:
# Bar charts of gene and cell counts for each STARmap sample.
plot_statistics(df_sm)

In [None]:
# Plot the first STARmap file, coloured by the label recorded in row 0 of the
# summary table and titled with that row's sample name.
adata = sc.read_h5ad(files[0])
domain_label, title = df_sm.loc[0, ["Domain_label", "Sample"]]
sc.pl.embedding(adata, basis="spatial", color=domain_label, size=100, title=title)

## Dataset 3 - BaristaSeq

In [None]:
# Collect the BaristaSeq files and order them by the integer that follows
# "BaristaSeq" in the file name.
files = get_files("../data/domain/raw_123", "BaristaSeq")
files.sort(key=lambda path: int(os.path.basename(path).split("BaristaSeq")[1].split(".")[0]))
print(files)

In [None]:
# Per-file summary table for BaristaSeq; "layer" is stored in the Domain_label
# column and used below as the colour key.
df_bs = create_sample_df(files, "layer", "BaristaSeq")
df_bs

In [None]:
# Bar charts of gene and cell counts for each BaristaSeq sample.
plot_statistics(df_bs)

In [None]:
# Plot the first BaristaSeq file, coloured by the label recorded in row 0 of
# the summary table and titled with that row's sample name.
adata = sc.read_h5ad(files[0])
domain_label, title = df_bs.loc[0, ["Domain_label", "Sample"]]
sc.pl.embedding(adata, basis="spatial", color=domain_label, size=60, title=title)

## Dataset 7.1 - MERFISH Zhuang 1

In [None]:
# Collect the Zhuang-ABCA-1 files, ordered by the integer found between
# "Zhuang-ABCA-1." and the next dot of the file name.
files = get_files("../data/domain/raw_7", "Zhuang-ABCA-1")
files.sort(
    key=lambda path: int(os.path.basename(path).split("Zhuang-ABCA-1.")[1].split(".")[0])
)
# Only preview the first four paths, then report how many there are.
print(files[:4])
print(len(files))

In [None]:
# Per-file summary table for Zhuang-ABCA-1; "parcellation_division" is stored
# in the Domain_label column and used below as the colour key.
df_z1 = create_sample_df(files, "parcellation_division", "MERFISH_zhuang_1")
df_z1

In [None]:
# Descriptive statistics (pandas `describe`) of the Zhuang-ABCA-1 summary table.
df_z1.describe()

In [None]:
# Plot one hard-coded Zhuang-ABCA-1 section, coloured by the label recorded in
# row 0 of the summary table.
adata = sc.read_h5ad("../data/domain/raw_7/Zhuang-ABCA-1.079.h5ad")
color_key = df_z1.loc[0, "Domain_label"]  # key in adata.obs used for colouring
plot_merfish_zhuang(adata, color_key)

## Dataset 7.2 - MERFISH Zhuang 2

In [None]:
# Collect the Zhuang-ABCA-2 files, ordered by the integer found between
# "Zhuang-ABCA-2." and the next dot of the file name.
files = get_files("../data/domain/raw_7", "Zhuang-ABCA-2")
files.sort(
    key=lambda path: int(os.path.basename(path).split("Zhuang-ABCA-2.")[1].split(".")[0])
)
# Only preview the first four paths, then report how many there are.
print(files[:4])
print(len(files))

In [None]:
# Per-file summary table for Zhuang-ABCA-2; only the first ten rows are displayed.
df_z2 = create_sample_df(files, "parcellation_division", "MERFISH_zhuang_2")
df_z2.head(10)

In [None]:
# Descriptive statistics (pandas `describe`) of the Zhuang-ABCA-2 summary table.
df_z2.describe()

In [None]:
# Plot one hard-coded Zhuang-ABCA-2 section, coloured by the label recorded in
# row 0 of the summary table.
adata = sc.read_h5ad("../data/domain/raw_7/Zhuang-ABCA-2.037.h5ad")
color_key = df_z2.loc[0, "Domain_label"]  # key in adata.obs used for colouring
plot_merfish_zhuang(adata, color_key)

## Dataset 7.3 - MERFISH Zhuang 3

In [None]:
# Collect the Zhuang-ABCA-3 files, ordered by the integer found between
# "Zhuang-ABCA-3." and the next dot of the file name.
files = get_files("../data/domain/raw_7", "Zhuang-ABCA-3")
files.sort(
    key=lambda path: int(os.path.basename(path).split("Zhuang-ABCA-3.")[1].split(".")[0])
)
# Only preview the first four paths, then report how many there are.
print(files[:4])
print(len(files))

In [None]:
# Per-file summary table for Zhuang-ABCA-3; only the first ten rows are displayed.
df_z3 = create_sample_df(files, "parcellation_division", "MERFISH_zhuang_3")
df_z3.head(10)

In [None]:
# Descriptive statistics (pandas `describe`) of the Zhuang-ABCA-3 summary table.
df_z3.describe()

In [None]:
# Plot one hard-coded Zhuang-ABCA-3 section, coloured by the label recorded in
# row 0 of the summary table.
adata = sc.read_h5ad("../data/domain/raw_7/Zhuang-ABCA-3.010.h5ad")
color_key = df_z3.loc[0, "Domain_label"]  # key in adata.obs used for colouring
plot_merfish_zhuang(adata, color_key)

## Dataset 7.4 - MERFISH Zhuang 4

In [None]:
# Collect the Zhuang-ABCA-4 files, ordered by the integer found between
# "Zhuang-ABCA-4." and the next dot of the file name.
files = get_files("../data/domain/raw_7", "Zhuang-ABCA-4")
files.sort(
    key=lambda path: int(os.path.basename(path).split("Zhuang-ABCA-4.")[1].split(".")[0])
)
print(files)

In [None]:
# Per-file summary table for Zhuang-ABCA-4; "parcellation_division" is stored
# in the Domain_label column and used below as the colour key.
df_z4 = create_sample_df(files, "parcellation_division", "MERFISH_zhuang_4")
df_z4

In [None]:
# Plot one hard-coded Zhuang-ABCA-4 section, coloured by the label recorded in
# row 0 of the summary table.
adata = sc.read_h5ad("../data/domain/raw_7/Zhuang-ABCA-4.002.h5ad")
color_key = df_z4.loc[0, "Domain_label"]  # key in adata.obs used for colouring
plot_merfish_zhuang(adata, color_key)

## All Datasets

In [None]:
# Stack the seven per-dataset summary tables into one frame; ignore_index=True
# replaces the per-table row labels with a single new index.
df = pd.concat([df_mf, df_sm, df_bs, df_z1, df_z2, df_z3, df_z4], ignore_index=True)
df

In [None]:
# Datasets in the order they should appear along the x-axis.
new_order = [
    "MERFISH_small",
    "STARmap",
    "BaristaSeq",
    "MERFISH_zhuang_1",
    "MERFISH_zhuang_2",
    "MERFISH_zhuang_3",
    "MERFISH_zhuang_4",
]

# Number of samples (rows) per dataset, arranged in `new_order`. The index and
# the count column are named explicitly because the column names produced by a
# bare `value_counts().reset_index()` differ between pandas versions.
dataset_counts = (
    df["Dataset"]
    .value_counts()
    .reindex(new_order)
    .rename_axis("Dataset")
    .reset_index(name="count")
)


sns.set(style="whitegrid")

plt.figure(figsize=(12, 6))
sns.barplot(
    x="Dataset", y="count", data=dataset_counts, hue="Dataset", palette="Blues_d", legend=False
)

plt.xlabel("Dataset")
plt.ylabel("Number of Samples")
plt.title("Barplot of Number of Samples per Dataset", fontsize=14)
plt.tick_params(axis="x", labelsize=9)

plt.show()

In [None]:
# Distribution of per-sample gene counts for each dataset: a box plot with the
# individual samples drawn on top as black, jittered points.
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

sns.boxplot(
    data=df,
    x="Dataset",
    y="Num_genes",
    hue="Dataset",
    palette="Blues_d",
    legend=False,
)
sns.stripplot(
    data=df,
    x="Dataset",
    y="Num_genes",
    color="black",
    size=5,
    jitter=True,
)

plt.title("Boxplot of Number of Genes per Sample by Dataset", fontsize=14)
plt.xlabel("Dataset")
plt.ylabel("Number of Genes")
plt.tick_params(axis="x", labelsize=9)

plt.show()

In [None]:
# Distribution of per-sample cell counts for each dataset: a box plot with the
# individual samples drawn on top as black, jittered points.
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

sns.boxplot(
    data=df,
    x="Dataset",
    y="Num_cells",
    hue="Dataset",
    palette="Blues_d",
    legend=False,
)
sns.stripplot(
    data=df,
    x="Dataset",
    y="Num_cells",
    color="black",
    size=5,
    jitter=True,
)

plt.title("Boxplot of Number of Cells per Sample by Dataset", fontsize=14)
plt.xlabel("Dataset")
plt.ylabel("Number of Cells")
plt.tick_params(axis="x", labelsize=9)

plt.show()