#### 1. Convert Gene Symbols (skip this step if you use the demo data)

- Convert the gene symbols in your data to match our gene list `OS_scRNA_gene_index.19264.tsv`.
- For Python users, you can use the `main_gene_selection` function in `get_embedding.py`:
  ```python
  # X_df represents your single cell data with cells in rows and genes in columns
  gene_list_df = pd.read_csv('../OS_scRNA_gene_index.19264.tsv', header=0, delimiter='\t')
  gene_list = list(gene_list_df['gene_name'])
  X_df, to_fill_columns, var = main_gene_selection(X_df, gene_list)
  ```
- Save your data `X_df` in either `npy` or `csv` format.

In [32]:
# imports
import argparse
import random,os
import numpy as np
import pandas as pd
import argparse
import torch
from tqdm import tqdm
import scipy.sparse
from scipy.sparse import issparse
import scanpy as sc
from load import *

# Reproducibility setup: configure cuDNN first, then seed every RNG with 0.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# PyTorch generators (CUDA devices, then the default generator)
torch.cuda.manual_seed_all(0)
torch.manual_seed(0)

np.random.seed(0)  # NumPy global generator
random.seed(0)  # Python stdlib generator

# Relative path of the expression CSV that is read further down
data_path = os.path.join("..","examples/examples/SCAD/Source_exprs_resp_19264.NVP-TAE684.csv")

# functions

def main_gene_selection(X_df, gene_list):
    """
    Describe:
        Rebuild the expression table so that its columns are exactly
        `gene_list`, in that order. Genes absent from `X_df` are added as
        all-zero columns; columns of `X_df` not in `gene_list` are dropped.
    Parameters:
        X_df->`~pandas.DataFrame`: expression table whose column labels are gene symbols
        gene_list->list: wanted target genes
    Returns:
        X_df->`~pandas.DataFrame`: new table with columns `gene_list` (the input is not modified)
        to_fill_columns->list: zero padded genes, in `gene_list` order
        var->`~pandas.DataFrame`: indexed by gene; column 'mask' is 1 for zero padded genes, else 0
    """
    present = set(X_df.columns)
    # dict.fromkeys removes repeated genes while keeping gene_list order, so
    # the padding order is reproducible (a plain set difference has no
    # guaranteed order).
    to_fill_columns = [gene for gene in dict.fromkeys(gene_list) if gene not in present]

    padding_df = pd.DataFrame(np.zeros((X_df.shape[0], len(to_fill_columns))),
                              columns=to_fill_columns,
                              index=X_df.index)
    # Join the raw value arrays side by side and relabel; the zeros are
    # floats, so NumPy type promotion decides the dtype of the result.
    X_df = pd.DataFrame(np.concatenate([X_df.values, padding_df.values], axis=1),
                        index=X_df.index,
                        columns=list(X_df.columns) + to_fill_columns)
    # Select and order the columns as requested; extra input columns are dropped here.
    X_df = X_df[gene_list]

    var = pd.DataFrame(index=X_df.columns)
    padded = set(to_fill_columns)  # set lookup instead of scanning the list per gene
    var['mask'] = [1 if gene in padded else 0 for gene in var.index]
    return X_df, to_fill_columns, var



# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Read the expression table; the first CSV column becomes the row index
gexpr_feature = pd.read_csv(data_path, index_col=0)

In [35]:
# Remove the columns A1BG, A1CF and A3GALT2 from the expression table,
# then report the resulting (rows, columns) shape.

gexpr_feature = gexpr_feature.drop(columns=['A1BG', 'A1CF', 'A3GALT2'])
print(gexpr_feature.shape)

(395, 19261)


In [None]:
# Convert gene symbols: read the reference gene list (tab-separated, first
# row is the header) and align the expression table's columns with it.
gene_list_df = pd.read_csv('./OS_scRNA_gene_index.19264.tsv', sep='\t', header=0)
gene_list = gene_list_df['gene_name'].tolist()

X_df, to_fill_columns, var = main_gene_selection(gexpr_feature, gene_list)