In [1]:
from tqdm import tqdm
import torch
import random
import numpy as np
import pandas as pd
import collections
import anndata
from anndata import read_h5ad
import scanpy as sc
from sklearn.preprocessing import MinMaxScaler

###       Load input data from an h5ad file and divide it into training and test sets
#####     Note: the original Scaden does not appear to perform an explicit train/test split
#####     :param input_path: path to h5ad file
#####     :param batch_size: batch size to use for training
#####     :param datasets: a list of datasets to extract from the file
#####     :return: Dataset object

In [2]:
#input_path = "./pbmc_data.h5ad"
#raw_input = read_h5ad(input_path)

In [3]:
#raw_input
#raw_input.obs
#raw_input.var
#raw_input.X
#collections.Counter(raw_input.obs['ds'])

### Convert the simulated RNA data (after Harmony) to h5ad

In [21]:
# Read the three inputs of the Harmony section: the per-sample table whose
# columns are cell types ("wall"), the per-sample metadata, and the
# expression matrix.
wall = pd.read_csv(
    "./0.input_data/0819_Simulate_RNA/0819-1.Simulate_RNA_after_harmony_filter_wall_table.csv",
    index_col=0,
)
metadata = pd.read_csv("./0.input_data/0819_Simulate_RNA/0819-3.Simulate_RNA_after_harmony_filter_metadata.csv")
exp = pd.read_csv("./0.input_data/0819_Simulate_RNA/0819_Simulate_RNA_after_harmony_filter.csv")

# Discard the first CSV column; the rows are labelled with gene symbols in a later cell.
exp = exp.drop(columns=exp.columns[0])

In [22]:
# Gene list for the Harmony expression matrix; its 'x' column is used as the
# row labels of `exp` in a later cell. The commented-out line is an
# alternative gene list that is currently not used.
#gene = pd.read_csv("./0.input_data/0522_scRNA_25type_top1000_filter_genes.csv")
gene = pd.read_csv("./0.input_data/simulate_remain_cgl2_harmony_gene.csv")

In [25]:
# Sanity check: print the dimensions of the expression matrix and of the
# cell-type table, one per line.
for frame in (exp, wall):
    print(frame.shape)

(1971, 40000)
(40000, 25)


In [26]:
# Label the expression rows with gene symbols (matrix stays genes x samples).
symbol = list(gene['x'])
exp.index = symbol
#exp = exp.T

# Convert every row of the cell-type table into fractions that sum to 1.
# Vectorised replacement for the former row-wise `apply`.
wall = wall.div(wall.sum(axis=1), axis=0)

# Reduce the metadata to one donor label per sample, indexed by barcode.
# The guard keeps this cell re-runnable: after the first run `metadata` is
# already that Series and no longer has a 'barcode' column to read.
if isinstance(metadata, pd.DataFrame):
    metadata = pd.Series(metadata['donor'].values,
                         index=list(metadata['barcode']),
                         name='donor')

# Join fractions and donor on the sample index, then order the rows exactly
# like the columns of `exp`. The AnnData matrix built later takes its row
# order from `exp`, and AnnData does not align `obs` by label, so without
# this step a different ordering would silently pair samples with the wrong
# labels. A sample missing from `meta` raises a KeyError here.
meta = pd.concat([wall, metadata], axis=1)
meta = meta.loc[list(exp.columns)]

In [27]:
# log2(x + 1) transformation of the expression values.
# Layout is unchanged: genes (rows) x samples (columns).
total_set = np.log2(exp.add(1))
total_set

Unnamed: 0,Simulate_RNA_after_scRNA_harmony_1,Simulate_RNA_after_scRNA_harmony_2,Simulate_RNA_after_scRNA_harmony_3,Simulate_RNA_after_scRNA_harmony_4,Simulate_RNA_after_scRNA_harmony_5,Simulate_RNA_after_scRNA_harmony_6,Simulate_RNA_after_scRNA_harmony_7,Simulate_RNA_after_scRNA_harmony_8,Simulate_RNA_after_scRNA_harmony_9,Simulate_RNA_after_scRNA_harmony_10,...,Simulate_RNA_after_scRNA_harmony_39991,Simulate_RNA_after_scRNA_harmony_39992,Simulate_RNA_after_scRNA_harmony_39993,Simulate_RNA_after_scRNA_harmony_39994,Simulate_RNA_after_scRNA_harmony_39995,Simulate_RNA_after_scRNA_harmony_39996,Simulate_RNA_after_scRNA_harmony_39997,Simulate_RNA_after_scRNA_harmony_39998,Simulate_RNA_after_scRNA_harmony_39999,Simulate_RNA_after_scRNA_harmony_40000
SEMA3F,10.702285,10.702177,10.700756,10.699747,10.701197,10.700347,10.700405,10.699038,10.701095,10.700506,...,10.699904,10.699497,10.699836,10.699419,10.700599,10.699738,10.700006,10.699460,10.699853,10.700357
HECW1,10.701501,10.701515,10.701546,10.701515,10.701506,10.701508,10.701570,10.701468,10.701521,10.701522,...,10.701407,10.701396,10.701419,10.701436,10.701362,10.701405,10.701385,10.701438,10.701423,10.701416
CYP26B1,10.702085,10.702341,10.701524,10.700948,10.701186,10.701287,10.701296,10.700755,10.701213,10.701540,...,10.701259,10.701119,10.701042,10.701327,10.701213,10.701610,10.701187,10.700979,10.701222,10.701198
ABCB5,10.701604,10.701682,10.701713,10.701591,10.701622,10.701652,10.701642,10.701608,10.701632,10.701652,...,10.701484,10.701494,10.701478,10.701458,10.701481,10.701491,10.701489,10.701462,10.701432,10.701474
CALCR,10.702875,10.702740,10.702441,10.702346,10.702498,10.702374,10.702449,10.702620,10.702423,10.702648,...,10.702110,10.702929,10.702190,10.702598,10.701862,10.702245,10.702392,10.702697,10.702436,10.702007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LINC01144,10.701499,10.701521,10.701557,10.701550,10.701597,10.701376,10.701597,10.701507,10.701464,10.701677,...,10.700661,10.700971,10.700728,10.700918,10.700677,10.700899,10.700822,10.700825,10.700706,10.700801
FAM95C,10.701372,10.701360,10.701414,10.701394,10.701379,10.701315,10.701363,10.701304,10.701377,10.701351,...,10.701121,10.701023,10.701072,10.701145,10.701070,10.701055,10.701101,10.701152,10.701106,10.701065
LINC00032,10.701627,10.701642,10.701609,10.701620,10.701609,10.701716,10.701764,10.701599,10.701663,10.701607,...,10.701377,10.701370,10.701314,10.701367,10.701336,10.701379,10.701300,10.701507,10.701333,10.701421
OR8B3,10.701594,10.701646,10.701608,10.701656,10.701612,10.701615,10.701643,10.701646,10.701654,10.701574,...,10.701554,10.701566,10.701568,10.701557,10.701538,10.701559,10.701533,10.701570,10.701564,10.701575


In [28]:
# Min-max scale to [0, 1]. MinMaxScaler works column by column and the input
# is genes x samples, so every sample is rescaled over its own genes.
# The result is then transposed to samples x genes.
# Note: `total_set` is overwritten here, so re-running this cell without
# re-running the log2 cell first would rescale the already transposed array.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(total_set)
total_set = scaler.transform(total_set).T
total_set.shape

(40000, 1971)

In [29]:
# Assemble the AnnData object: rows (obs) are samples, columns (var) are genes.
# - X is cast to float32 explicitly instead of passing `dtype=` to the
#   constructor, an argument that newer anndata releases deprecate.
# - var is indexed by the gene symbols of `exp`. The scaler returned a bare
#   array, so without this the gene identities would be lost and the saved
#   file would only carry positional variable names.
adata = sc.AnnData(
    X=total_set.astype('float32'),
    obs=meta,
    var=pd.DataFrame(index=exp.index),
)
adata

AnnData object with n_obs × n_vars = 40000 × 1971
    obs: 'CD141+DC', 'CD4/CD8-C1-CCR7', 'CD4/CD8-C2-MKI67', 'CD8-C7-KLRD1', 'CD8-C9-SLC4A10', 'Central memory T cells', 'Circulating NK', 'Conventional dendritic cells(CD1C DC)', 'Cytotoxicity CD8T', 'DC-C4-LAMP3', 'Effector memory T cells', 'Exhausted CD8+ T (Tex) cells', 'ILCs', 'Liver-resident NK (lrNK) cell', 'Lymphoid-B', 'M-C4-GPX3', 'M1', 'Mast', 'Mono', 'Myeloid-derived suppressor cells', 'NK', 'TAM-like', 'Th0', 'Th1', 'Treg', 'donor'

In [30]:
# Save the Harmony training set as an h5ad file in the working directory.
adata.write_h5ad('./0819_simulate_harmony_10p_400cell.h5ad')

... storing 'donor' as categorical


In [26]:
# Print the scaled expression matrix stored in `adata`.
# NOTE(review): this cell's execution count (26) is lower than that of the
# cell that builds `adata` (29), so the stored output may come from an
# earlier run -- confirm by re-running the notebook in order.
print(adata.X)

[[0.40708616 0.41086468 0.4101966  ... 0.40673444 0.40753734 0.40669328]
 [0.416816   0.42551008 0.41929892 ... 0.41529727 0.4166121  0.41578513]
 [0.42765132 0.43434086 0.43002236 ... 0.42680675 0.42789    0.4270775 ]
 ...
 [0.3309121  0.32991502 0.33051074 ... 0.33116788 0.3315232  0.33089715]
 [0.343845   0.3430046  0.34332058 ... 0.3441015  0.34457627 0.34384733]
 [0.35368448 0.35309914 0.3532627  ... 0.35396788 0.35435963 0.3536556 ]]


In [27]:
# Number of samples per donor.
# NOTE(review): the stored output below adds up to 8000 samples while `adata`
# is reported with 40000 observations, and this cell's execution count (27)
# precedes the one that builds `adata` (29) -- the output looks stale; re-run
# to confirm.
donor_counts = collections.Counter(adata.obs['donor'])
donor_counts

Counter({'D20171109': 1600,
         'D20171215': 1600,
         'D20180108': 1600,
         'D20180110': 1600,
         'D20180116': 1600})

In [31]:
# Display the per-sample annotation table: one column per cell type plus 'donor'.
adata.obs

Unnamed: 0,CD141+DC,CD4/CD8-C1-CCR7,CD4/CD8-C2-MKI67,CD8-C7-KLRD1,CD8-C9-SLC4A10,Central memory T cells,Circulating NK,Conventional dendritic cells(CD1C DC),Cytotoxicity CD8T,DC-C4-LAMP3,...,M1,Mast,Mono,Myeloid-derived suppressor cells,NK,TAM-like,Th0,Th1,Treg,donor
Simulate_RNA_after_scRNA_harmony_1,0.0100,0.0425,0.0100,0.0500,0.0200,0.1275,0.0300,0.0575,0.0625,0.0100,...,0.0275,0.0150,0.0175,0.1175,0.0250,0.0525,0.0325,0.0100,0.0100,D20171109
Simulate_RNA_after_scRNA_harmony_2,0.0100,0.0525,0.0075,0.0550,0.0175,0.1100,0.0275,0.0575,0.0700,0.0100,...,0.0250,0.0150,0.0175,0.1175,0.0250,0.0550,0.0275,0.0100,0.0075,D20171109
Simulate_RNA_after_scRNA_harmony_3,0.0075,0.0500,0.0075,0.0500,0.0175,0.1275,0.0275,0.0575,0.0650,0.0100,...,0.0275,0.0150,0.0175,0.1000,0.0300,0.0550,0.0325,0.0075,0.0100,D20171109
Simulate_RNA_after_scRNA_harmony_4,0.0075,0.0475,0.0100,0.0425,0.0175,0.1175,0.0325,0.0575,0.0625,0.0100,...,0.0275,0.0175,0.0200,0.1075,0.0300,0.0575,0.0325,0.0075,0.0075,D20171109
Simulate_RNA_after_scRNA_harmony_5,0.0075,0.0500,0.0075,0.0475,0.0200,0.1275,0.0325,0.0575,0.0725,0.0075,...,0.0275,0.0150,0.0175,0.1025,0.0325,0.0525,0.0275,0.0075,0.0075,D20171109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Simulate_RNA_after_scRNA_harmony_39996,0.0100,0.0675,0.0075,0.0400,0.0225,0.1825,0.0325,0.0225,0.0500,0.0100,...,0.0275,0.0100,0.0200,0.0325,0.0175,0.0200,0.0300,0.0200,0.0725,D20180116
Simulate_RNA_after_scRNA_harmony_39997,0.0100,0.0650,0.0100,0.0400,0.0175,0.1950,0.0325,0.0200,0.0525,0.0075,...,0.0300,0.0100,0.0150,0.0300,0.0225,0.0200,0.0325,0.0175,0.0675,D20180116
Simulate_RNA_after_scRNA_harmony_39998,0.0100,0.0650,0.0100,0.0425,0.0200,0.1825,0.0300,0.0225,0.0525,0.0100,...,0.0300,0.0125,0.0175,0.0325,0.0200,0.0175,0.0325,0.0200,0.0725,D20180116
Simulate_RNA_after_scRNA_harmony_39999,0.0100,0.0675,0.0100,0.0475,0.0225,0.1650,0.0325,0.0200,0.0450,0.0100,...,0.0300,0.0100,0.0175,0.0275,0.0225,0.0200,0.0275,0.0225,0.0775,D20180116


### Convert the simulated RNA data (after SCT) to h5ad

In [4]:
# Read the three inputs of the SCT section: the per-sample table whose
# columns are cell types ("wall"), the per-sample metadata, and the
# expression matrix.
wall = pd.read_csv(
    "./0.input_data/0819_Simulate_RNA/0819-1.Simulate_RNA_after_SCT_filter_wall_table.csv",
    index_col=0,
)
metadata = pd.read_csv("./0.input_data/0819_Simulate_RNA/0819-3.Simulate_RNA_after_SCT_filter_metadata.csv")
exp = pd.read_csv("./0.input_data/0819_Simulate_RNA/0819_Simulate_RNA_after_SCT_filter.csv")

# Discard the first CSV column; the rows are labelled with gene symbols in a later cell.
exp = exp.drop(columns=exp.columns[0])

In [5]:
# Gene list for the SCT expression matrix; its 'x' column is used as the
# row labels of `exp` in a later cell. The commented-out line is an
# alternative gene list that is currently not used.
#gene = pd.read_csv("./0.input_data/0622_scRNA_after_SCT_filter_genes_cv_20to200.csv")
gene = pd.read_csv("./0.input_data/simulate_remain_cgl2_sct_gene.csv")

In [6]:
# Sanity check: print the dimensions of the expression matrix and of the
# cell-type table, one per line.
for frame in (exp, wall):
    print(frame.shape)

(2870, 40000)
(40000, 25)


In [7]:
# Label the expression rows with gene symbols (matrix stays genes x samples).
symbol = list(gene['x'])
exp.index = symbol
#exp = exp.T

# Convert every row of the cell-type table into fractions that sum to 1.
# Vectorised replacement for the former row-wise `apply`.
wall = wall.div(wall.sum(axis=1), axis=0)

# Reduce the metadata to one donor label per sample, indexed by barcode.
# The guard keeps this cell re-runnable: after the first run `metadata` is
# already that Series and no longer has a 'barcode' column to read.
if isinstance(metadata, pd.DataFrame):
    metadata = pd.Series(metadata['donor'].values,
                         index=list(metadata['barcode']),
                         name='donor')

# Join fractions and donor on the sample index, then order the rows exactly
# like the columns of `exp`. The AnnData matrix built later takes its row
# order from `exp`, and AnnData does not align `obs` by label, so without
# this step a different ordering would silently pair samples with the wrong
# labels. A sample missing from `meta` raises a KeyError here.
meta = pd.concat([wall, metadata], axis=1)
meta = meta.loc[list(exp.columns)]

In [8]:
# log2(x + 1) transformation of the expression values.
# Layout is unchanged: genes (rows) x samples (columns).
total_set = np.log2(exp.add(1))
total_set

Unnamed: 0,Simulate_RNA_after_scRNA_SCT_1,Simulate_RNA_after_scRNA_SCT_2,Simulate_RNA_after_scRNA_SCT_3,Simulate_RNA_after_scRNA_SCT_4,Simulate_RNA_after_scRNA_SCT_5,Simulate_RNA_after_scRNA_SCT_6,Simulate_RNA_after_scRNA_SCT_7,Simulate_RNA_after_scRNA_SCT_8,Simulate_RNA_after_scRNA_SCT_9,Simulate_RNA_after_scRNA_SCT_10,...,Simulate_RNA_after_scRNA_SCT_39991,Simulate_RNA_after_scRNA_SCT_39992,Simulate_RNA_after_scRNA_SCT_39993,Simulate_RNA_after_scRNA_SCT_39994,Simulate_RNA_after_scRNA_SCT_39995,Simulate_RNA_after_scRNA_SCT_39996,Simulate_RNA_after_scRNA_SCT_39997,Simulate_RNA_after_scRNA_SCT_39998,Simulate_RNA_after_scRNA_SCT_39999,Simulate_RNA_after_scRNA_SCT_40000
TSPAN6,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
RAD52,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
AOC1,0.000000,0.759707,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
ALS2,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
TFPI,0.000000,0.759707,0.000000,0.000000,0.759707,0.0,0.0,0.759707,0.000000,0.0,...,0.759707,0.759707,0.0,0.0,0.759707,0.759707,0.759707,0.759707,0.0,0.759707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LINC00891,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
EXOC3L2,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
LINC01226,1.254772,0.000000,0.759707,1.254772,0.000000,0.0,0.0,0.000000,0.759707,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.759707
HULC,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [9]:
# Min-max scale to [0, 1]. MinMaxScaler works column by column and the input
# is genes x samples, so every sample is rescaled over its own genes.
# The result is then transposed to samples x genes.
# Note: `total_set` is overwritten here, so re-running this cell without
# re-running the log2 cell first would rescale the already transposed array.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(total_set)
total_set = scaler.transform(total_set).T
total_set.shape

(40000, 2870)

In [10]:
# Assemble the AnnData object: rows (obs) are samples, columns (var) are genes.
# - X is cast to float32 explicitly instead of passing `dtype=` to the
#   constructor, an argument that newer anndata releases deprecate.
# - var is indexed by the gene symbols of `exp`. The scaler returned a bare
#   array, so without this the gene identities would be lost and the saved
#   file would only carry positional variable names.
adata = sc.AnnData(
    X=total_set.astype('float32'),
    obs=meta,
    var=pd.DataFrame(index=exp.index),
)
print(adata)

AnnData object with n_obs × n_vars = 40000 × 2870
    obs: 'CD141+DC', 'CD4/CD8-C1-CCR7', 'CD4/CD8-C2-MKI67', 'CD8-C7-KLRD1', 'CD8-C9-SLC4A10', 'Central memory T cells', 'Circulating NK', 'Conventional dendritic cells(CD1C DC)', 'Cytotoxicity CD8T', 'DC-C4-LAMP3', 'Effector memory T cells', 'Exhausted CD8+ T (Tex) cells', 'ILCs', 'Liver-resident NK (lrNK) cell', 'Lymphoid-B', 'M-C4-GPX3', 'M1', 'Mast', 'Mono', 'Myeloid-derived suppressor cells', 'NK', 'TAM-like', 'Th0', 'Th1', 'Treg', 'donor'


In [12]:
# Save the SCT training set as an h5ad file in the working directory.
# NOTE(review): the file name mentions "cv20to200", but the gene list active
# in this section is simulate_remain_cgl2_sct_gene.csv (the cv_20to200 list
# is commented out) -- confirm the name still describes the content.
adata.write_h5ad('./0819_simulate_SCT_cv20to200_10p_400cell.h5ad')

... storing 'donor' as categorical


In [11]:
# Number of samples per donor in the SCT training set.
donor_counts = collections.Counter(adata.obs['donor'])
donor_counts

Counter({'D20171109': 8000,
         'D20171215': 8000,
         'D20180108': 8000,
         'D20180110': 8000,
         'D20180116': 8000})

In [13]:
# Display the per-sample annotation table: one column per cell type plus 'donor'.
adata.obs

Unnamed: 0,CD141+DC,CD4/CD8-C1-CCR7,CD4/CD8-C2-MKI67,CD8-C7-KLRD1,CD8-C9-SLC4A10,Central memory T cells,Circulating NK,Conventional dendritic cells(CD1C DC),Cytotoxicity CD8T,DC-C4-LAMP3,...,M1,Mast,Mono,Myeloid-derived suppressor cells,NK,TAM-like,Th0,Th1,Treg,donor
Simulate_RNA_after_scRNA_SCT_1,0.0100,0.0475,0.0075,0.0500,0.0200,0.1250,0.0275,0.0575,0.0625,0.0075,...,0.0300,0.0175,0.0175,0.1075,0.0275,0.0550,0.0275,0.0075,0.0075,D20171109
Simulate_RNA_after_scRNA_SCT_2,0.0075,0.0450,0.0075,0.0450,0.0175,0.1100,0.0275,0.0625,0.0650,0.0100,...,0.0300,0.0175,0.0175,0.0975,0.0325,0.0625,0.0275,0.0075,0.0075,D20171109
Simulate_RNA_after_scRNA_SCT_3,0.0075,0.0475,0.0075,0.0525,0.0200,0.1250,0.0300,0.0575,0.0725,0.0075,...,0.0250,0.0175,0.0175,0.1100,0.0275,0.0600,0.0300,0.0050,0.0075,D20171109
Simulate_RNA_after_scRNA_SCT_4,0.0100,0.0475,0.0075,0.0450,0.0200,0.1175,0.0275,0.0625,0.0725,0.0075,...,0.0300,0.0175,0.0175,0.1075,0.0325,0.0575,0.0300,0.0100,0.0075,D20171109
Simulate_RNA_after_scRNA_SCT_5,0.0100,0.0475,0.0075,0.0475,0.0175,0.1125,0.0300,0.0600,0.0750,0.0100,...,0.0275,0.0200,0.0200,0.0975,0.0300,0.0625,0.0275,0.0075,0.0075,D20171109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Simulate_RNA_after_scRNA_SCT_39996,0.0100,0.0700,0.0100,0.0400,0.0225,0.1725,0.0350,0.0200,0.0475,0.0125,...,0.0275,0.0100,0.0175,0.0300,0.0200,0.0250,0.0300,0.0200,0.0625,D20180116
Simulate_RNA_after_scRNA_SCT_39997,0.0100,0.0775,0.0100,0.0375,0.0200,0.1775,0.0275,0.0200,0.0550,0.0100,...,0.0275,0.0100,0.0200,0.0325,0.0175,0.0175,0.0300,0.0200,0.0650,D20180116
Simulate_RNA_after_scRNA_SCT_39998,0.0100,0.0725,0.0100,0.0450,0.0200,0.1850,0.0275,0.0225,0.0475,0.0100,...,0.0275,0.0100,0.0200,0.0300,0.0200,0.0200,0.0275,0.0200,0.0725,D20180116
Simulate_RNA_after_scRNA_SCT_39999,0.0100,0.0675,0.0100,0.0400,0.0200,0.1825,0.0325,0.0175,0.0475,0.0100,...,0.0300,0.0100,0.0175,0.0325,0.0200,0.0200,0.0275,0.0200,0.0750,D20180116
