<b> <font size="+2.5"> Downsampling of number of nuclei
 </font> </b> <br>

Date: 2023-11-21 <br>
Author: ASF

# Set-up

In [1]:
import sys
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse

import scanpy as sc

In [2]:
# Global scanpy figure settings: `dpi` for rendered figures, `dpi_save` for saved figures.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
)

# Read in adata

In [3]:
# Load the AnnData object saved after manual cell-type curation
# (the file name carries the date stamp 20231120).
adata = sc.read(
    "../datasets/adata_celltypes_after_manual_curation_20231120.h5ad"
)

In [4]:
# Cross-tabulate nuclei: one row per final cell type, one column per sample.
cells_cross = pd.crosstab(
    index=adata.obs['celltypes_final'],
    columns=adata.obs["sample"],
)
# cells_cross.head()  # uncomment to preview the table

In [5]:
# Tally the nuclei belonging to each final cell type, most abundant first
# (sort=True / ascending=False are the value_counts defaults, spelled out here).
ct_counts = adata.obs['celltypes_final'].value_counts(sort=True, ascending=False)
ct_counts

Exc_L2-3           218679
Oligodendrocyte     93357
Astro_PP            87924
OPC                 45435
In_VIP              42250
Microglia           39154
Exc_L4-6_1          36486
Exc_L4-6_2          34813
In_SST              33791
In_PVALB_Ba         31796
Astro_FB            25410
In_RELN             24191
Exc_L4-6_3          14470
Endothelial         13567
Exc_L3-5            11815
In_LAMP5_1           9794
Exc_L5-6_1           7541
In_PVALB_Ch          6709
In_LAMP5_2           5295
Exc_L5-6_HTR2C       4139
Exc_L5-6_2           1069
Name: celltypes_final, dtype: int64

In [6]:
# Summarise the distribution of nuclei-per-celltype counts at the median and at the
# 25th, 15th, 10th and 5th percentiles.
per_50, per_25, per_15, per_10, per_5 = (
    np.percentile(ct_counts.array, q) for q in (50, 25, 15, 10, 5)
)

# Print all five values on a single line, separated by a tab and a space.
print('\t '.join(
    label + '% percentile: ' + str(value)
    for label, value in (
        ('50', per_50),
        ('25', per_25),
        ('15', per_15),
        ('10', per_10),
        ('5', per_5),
    )
))

50% percentile: 25410.0	 25% percentile: 9794.0	 15% percentile: 6709.0	 10% percentile: 5295.0	 5% percentile: 4139.0


# subsample each celltype to at most 5000 nuclei

## subsample 1

In [7]:
# Downsampling run 1 (random_state=0): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=0)
    adatas.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_1 = adatas[0].concatenate(*adatas[1:])

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [8]:
# Save downsampling run 1 to disk as an .h5ad file.
adata_downsampled_1.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_1.h5ad"
)

## subsample 2

In [11]:
# Downsampling run 2 (random_state=1): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_2 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=1)
    adatas_2.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_2 = adatas_2[0].concatenate(*adatas_2[1:])

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [12]:
# Sanity check for run 2: nuclei per cell type after downsampling, most abundant first
# (no cell type should exceed target_cells).
ct_counts_downsampled_2 = adata_downsampled_2.obs['celltypes_final'].value_counts(
    sort=True, ascending=False
)
ct_counts_downsampled_2

Exc_L2-3           5000
In_PVALB_Ch        5000
Oligodendrocyte    5000
Microglia          5000
Endothelial        5000
Astro_PP           5000
Astro_FB           5000
In_VIP             5000
In_SST             5000
In_RELN            5000
In_PVALB_Ba        5000
Exc_L3-5           5000
In_LAMP5_2         5000
In_LAMP5_1         5000
Exc_L5-6_1         5000
Exc_L4-6_3         5000
Exc_L4-6_2         5000
Exc_L4-6_1         5000
OPC                5000
Exc_L5-6_HTR2C     4139
Exc_L5-6_2         1069
Name: celltypes_final, dtype: int64

In [13]:
# Save downsampling run 2 to disk as an .h5ad file.
adata_downsampled_2.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_2.h5ad"
)

## subsample 3

In [14]:
# Downsampling run 3 (random_state=2): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_3 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=2)
    adatas_3.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_3 = adatas_3[0].concatenate(*adatas_3[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_3.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_3.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 4

In [15]:
# Downsampling run 4 (random_state=3): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_4 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=3)
    adatas_4.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_4 = adatas_4[0].concatenate(*adatas_4[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_4.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_4.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 5

In [16]:
# Downsampling run 5 (random_state=4): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_5 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=4)
    adatas_5.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_5 = adatas_5[0].concatenate(*adatas_5[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_5.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_5.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 6

In [17]:
# Downsampling run 6 (random_state=5): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_6 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=5)
    adatas_6.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_6 = adatas_6[0].concatenate(*adatas_6[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_6.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_6.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 7

In [18]:
# Downsampling run 7 (random_state=6): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_7 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=6)
    adatas_7.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_7 = adatas_7[0].concatenate(*adatas_7[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_7.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_7.h5ad"
)


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 8

In [19]:
# Downsampling run 8 (random_state=7): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_8 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=7)
    adatas_8.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_8 = adatas_8[0].concatenate(*adatas_8[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_8.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_8.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 9

In [20]:
# Downsampling run 9 (random_state=8): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_9 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=8)
    adatas_9.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_9 = adatas_9[0].concatenate(*adatas_9[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_9.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_9.h5ad"
)


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


## subsample 10

In [21]:
# Downsampling run 10 (random_state=9): cap every cell type at `target_cells` nuclei.
# sc.pp.subsample is given n_obs (an absolute count, not a fraction) and is only called
# for cell types that exceed the cap; smaller cell types are kept in full.
target_cells = 5000

adatas_10 = []
for clust in adata.obs['celltypes_final'].cat.categories:
    # Subset of `adata` holding only the nuclei of this cell type.
    dat = adata[adata.obs['celltypes_final'] == clust]
    if dat.n_obs > target_cells:
        # The return value is not used: `dat` itself is reduced to target_cells nuclei.
        sc.pp.subsample(dat, n_obs=target_cells, random_state=9)
    adatas_10.append(dat)

# Stitch the per-celltype subsets back together into one AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of anndata.concat
# in newer anndata releases — confirm against the installed version before migrating.
adata_downsampled_10 = adatas_10[0].concatenate(*adatas_10[1:])

# Save this run to disk as an .h5ad file.
adata_downsampled_10.write(
    "../datasets/adata_celltypes_after_manual_curation_downsampled_10.h5ad"
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
