In [1]:
!pip install scanpy

Collecting scanpy
  Downloading scanpy-1.11.1-py3-none-any.whl.metadata (9.9 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.11.4-py3-none-any.whl.metadata (9.3 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting scikit-learn<1.6.0,>=1.1 (from scanpy)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.11.2-py3-none-any.whl.metadata (1.9 kB)
Downloading scanpy-1.11.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading anndata-0.11.4-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.

In [2]:
import pandas as pd
import scanpy as sc
import os

In [3]:
# new loader
def load_simulated_cna_dataset(folder_path):
    # Load raw counts
    expr = pd.read_csv(os.path.join(folder_path, "counts.matrix"), sep="\t", index_col=0)
    adata = sc.AnnData(expr.T)  # transpose so cells are rows

    # Load gene metadata
    gene_order = pd.read_csv(os.path.join(folder_path, "gene_order_file.txt"), sep="\t")
    gene_order = gene_order.set_index("gene_name")
    adata.var = gene_order.loc[adata.var_names].copy()

    # Load cell metadata
    annot = pd.read_csv(os.path.join(folder_path, "annotation.txt"), sep="\t")
    annot = annot.set_index("cell_name")
    adata.obs = annot.loc[adata.obs_names].copy()

    return adata

In [4]:
import zipfile
import os

In [39]:
zip_path = "sim_mixed_cna_goldstandard.zip"  # name of uploaded file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(".")

In [40]:
# Now you can access the folder like this:
adata = load_simulated_cna_dataset("sim_mixed_cna_goldstandard")

In [41]:
print(adata)

AnnData object with n_obs × n_vars = 500 × 2000
    obs: 'cell_type', 'simulated_cnvs', 'n_genes', 'n_counts'
    var: 'chromosome', 'start', 'end'


In [42]:
adata.write("sim_mixed_cna_goldstandard.h5ad")


In [43]:
from google.colab import files
files.download("sim_mixed_cna_goldstandard.h5ad")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
print(adata)


AnnData object with n_obs × n_vars = 500 × 2000
    obs: 'cell_type', 'simulated_cnvs', 'n_genes', 'n_counts'
    var: 'chromosome', 'start', 'end'


combine all 3 datasets

In [16]:
adata_small = sc.read_h5ad("sim_small_gain_lowfreq.h5ad")
adata_medium = sc.read_h5ad("sim_medium_gain_midfreq.h5ad")
adata_large = sc.read_h5ad("sim_large_gain_highfreq.h5ad")

In [17]:
adata_small.obs["simulation_group"] = "small_gain_lowfreq"
adata_medium.obs["simulation_group"] = "medium_gain_midfreq"
adata_large.obs["simulation_group"] = "large_gain_highfreq"


In [18]:
adata_combined = adata_small.concatenate(
    adata_medium,
    adata_large,
    batch_key="source",
    batch_categories=["small", "medium", "large"],
    index_unique=None  # prevent appending _1, _2 to cell names
)


  adata_combined = adata_small.concatenate(
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [22]:
# Combine into a single 'chromosome' column
adata_combined.var["chromosome"] = adata_combined.var["chromosome-small"].combine_first(
    adata_combined.var["chromosome-medium"]).combine_first(
    adata_combined.var["chromosome-large"])

# Drop the old ones
adata_combined.var.drop(columns=["chromosome-small", "chromosome-medium", "chromosome-large"], inplace=True)


  adata_combined.var["chromosome"] = adata_combined.var["chromosome-small"].combine_first(
  adata_combined.var["chromosome-medium"]).combine_first(


In [23]:
print(adata_combined)
print(adata_combined.obs["simulation_group"].value_counts())


AnnData object with n_obs × n_vars = 1500 × 2000
    obs: 'cell_type', 'simulated_cnvs', 'n_genes', 'n_counts', 'simulation_group', 'source'
    var: 'start', 'end', 'chromosome'
simulation_group
large_gain_highfreq    500
medium_gain_midfreq    500
small_gain_lowfreq     500
Name: count, dtype: int64


In [24]:
adata_combined.write("sim_combined_cnas.h5ad")


In [25]:
from google.colab import files
files.download("sim_combined_cnas.h5ad")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [45]:
print(adata.obs)

           cell_type simulated_cnvs  n_genes  n_counts
Cell1    medium_gain           gain     1986     10212
Cell2         normal           none     1984      9958
Cell3    medium_gain           gain     1986     10230
Cell4    medium_gain           gain     1992     10176
Cell5    medium_gain           gain     1990     10367
...              ...            ...      ...       ...
Cell496   large_gain           gain     1990     10845
Cell497  medium_gain           gain     1984     10252
Cell498  medium_gain           gain     1984     10172
Cell499   large_gain           gain     1984     10873
Cell500   small_gain           gain     1989     10063

[500 rows x 4 columns]
