In [22]:
import pandas as pd
import json
from pathlib import Path
import mygene
import seaborn as sns

In [2]:
# Root data folder, given relative to the notebook's working directory.
data_dir = Path('../../data')
# CellO bulk RNA-seq training set, and the folder holding its
# pre-training / validation split files.
cello_dir = data_dir.joinpath('CellO_data/bulk_RNA_seq_training_set')
split_dir = cello_dir.joinpath('pretraining_validation_split')

## Loading Ewing data

In [60]:
# Read the tab-separated Ewing cell-line table; the first column is used as the row index.
df = pd.read_csv(
    data_dir.joinpath('Ewing_cell_lines/Ewing_NT_cell_lines.tsv'),
    index_col=0,
    sep='\t',
)
df.shape

(58243, 11)

#### Loading gene symbols

In [27]:
# Create a MyGene.info client and look up every row label of `df` in a single
# querymany call (the client logs its progress in chunks of 1000 terms).
# NOTE(review): no scopes/fields/species are passed, so the service defaults
# apply — confirm they match the identifier type stored in df.index.
mg = mygene.MyGeneInfo()
q_results = mg.querymany(df.index)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [42]:
# Tabulate the MyGene hits, keep only the first hit returned for each query
# term, and key the table by that query term.
q_results_df = pd.DataFrame(q_results)
q_results_df = q_results_df.drop_duplicates(subset='query', keep='first')
q_results_df = q_results_df.set_index('query')

# Lookup from query term to gene symbol (despite the variable name, the values
# are taken from the 'symbol' column).
query2gene_id = q_results_df['symbol']

In [69]:
# Relabel the rows of the Ewing table from the queried gene IDs to gene symbols.
ewing_df = df.copy()
ewing_df.index = ewing_df.index.map(query2gene_id)

# Keep a row only if (a) its ID resolved to a symbol and (b) it is the first row
# carrying that symbol. Several IDs can resolve to the same symbol; left in
# place, the repeated labels become duplicate column names after the transpose,
# which pd.read_csv silently renames ('X', 'X.1', ...) when the file is read back.
# Keeping the first occurrence makes that choice explicit.
has_symbol = ewing_df.index.notna()
first_occurrence = ~ewing_df.index.duplicated(keep='first')
ewing_df = ewing_df[has_symbol & first_occurrence].transpose()

# Persist the symbol-labelled table (original rows are now columns).
ewing_df.to_csv(data_dir / 'Ewing_cell_lines/Ewing_NT_cell_lines.csv')

In [88]:
# Reload the symbol-labelled Ewing table from disk; the first column is the row index.
ewing_df = pd.read_csv(
    data_dir.joinpath('Ewing_cell_lines/Ewing_NT_cell_lines.csv'),
    index_col=0,
)

#### Loading PLIER assets built on CellO train data

In [None]:
# Load the full expression matrix and flip it, so the CSV's columns become rows.
full_Y_df = pd.read_csv(data_dir.joinpath('mat4.csv')).T

In [95]:
# Read the list of experiment IDs assigned to the pre-training split.
with (split_dir / 'pre_training_bulk_experiments.json').open('r') as f:
    train_egs = json.load(f)

# Keep only the rows of the full matrix whose label is in that list.
in_train_split = full_Y_df.index.isin(train_egs)
train_Y_df = full_Y_df[in_train_split]
train_Y_df.shape

(3609, 55904)

In [71]:
# Load the Z matrix from the PLIER result folder.
# NOTE(review): no index_col is given, yet later cells treat Z_df.index as gene
# symbols — presumably the CSV header has one field fewer than the data rows,
# so pandas infers the index; confirm.
Z_df = pd.read_csv(data_dir.joinpath('plierResult-cello_train/Z.csv'))
Z_df.shape

(5900, 512)

In [72]:
# Number of Z row labels that do not appear among the Ewing columns.
len(Z_df.index) - Z_df.index.isin(ewing_df.columns).sum()

125

In [74]:
# The Z row labels themselves that are absent from the Ewing columns.
Z_df.loc[~Z_df.index.isin(ewing_df.columns)].index

Index(['TCEB3', 'MRE11A', 'SARS', 'ADSS', 'SKIV2L2', 'GUCY1B3', 'KARS',
       'ZNRD1', 'MPP5', 'MRVI1',
       ...
       'HIST1H3E', 'HIST1H2AK', 'HIST1H2BH', 'HIST1H4I', 'HIST1H4E',
       'HIST1H2AE', 'HIST1H4D', 'HIST1H2BF', 'HIST1H2AM', 'HIST1H3H'],
      dtype='object', length=125)

#### Keeping only genes shared between the three datasets (PLIER Z, Ewing, CellO train)

In [75]:
# Genes present in all three tables: Z rows, Ewing columns and training-matrix columns.
shared_genes = set(Z_df.index).intersection(ewing_df.columns, train_Y_df.columns)
len(shared_genes)

5775

In [97]:
# Restrict all three tables to the shared genes, in one common, reproducible order.
# `shared_genes` is a set: indexing a DataFrame with a set raises TypeError on
# pandas >= 2.0 (deprecated since 1.5), and a set has no stable order, so the
# column order of the saved CSVs could differ between runs and from Z's row order.
Z_df = Z_df[Z_df.index.isin(shared_genes)]

# Use Z's row order as the canonical gene order so that Z rows and the
# expression-matrix columns line up one-to-one.
gene_order = Z_df.index.unique().tolist()

ewing_df = ewing_df[gene_order]

train_Y_df = train_Y_df[gene_order]

In [99]:
# Save the training matrix restricted to the shared genes.
train_Y_df.to_csv(data_dir.joinpath('mat4-CellO_train-ewing_genes.csv'))

In [94]:
# Save the Ewing table as it currently stands.
# NOTE(review): this writes to the same path as the symbol-conversion cell above,
# so the earlier file is overwritten — confirm that is intended.
ewing_df.to_csv(data_dir.joinpath('Ewing_cell_lines/Ewing_NT_cell_lines.csv'))