# Preparing Data

The BEELINE framework is used to benchmark the performance of our algorithm. You can download the raw data provided by the BEELINE framework from https://doi.org/10.5281/zenodo.3378975, and use the preprocessing code in https://github.com/Murali-group/Beeline/blob/master/generateExpInputs.py to generate the datasets.


In [1]:
import pandas as pd
from statistics import mean
import numpy as np

In [2]:
# Path of the CSV file holding the expression data (edit to match your dataset).
exp_data_fname = "data.csv"

# Path of the CSV file holding the network (edit to match your dataset).
net_fname = "network.csv"

# Tag embedded in the names of the generated .npy files.
suffix = "rna"

## Uploading Expression Data

In [3]:
# Read the expression table and transpose it so that the CSV's columns become
# rows. NOTE(review): this presumes the CSV stores one gene per row, with the
# gene name in the first column, and one cell per column -- confirm against
# the files produced by the BEELINE preprocessing script.
expData = pd.read_csv(exp_data_fname).T

# After transposing, the first row holds the values of the CSV's first column;
# promote it to the column labels and keep only the remaining rows.
expData.columns = expData.iloc[0]
expData = expData.iloc[1:]
expData = expData.rename_axis(None, axis=1)

# Put the columns in sorted label order.
expData = expData.reindex(sorted(expData.columns), axis=1)

# Shuffle the rows. The shuffle is unseeded, so the row order differs between
# runs; pass random_state=<int> to `sample` if a reproducible order is needed.
expData = expData.sample(frac=1)

# Replace the original row labels with a fresh 0..n-1 index.
expData = expData.reset_index(drop=True)

## Saving Formatted Expression Data

In [4]:
# Load the network; its 'Gene1' and 'Gene2' columns hold the two endpoints of
# each edge.
network = pd.read_csv(net_fname)

# Every gene that appears in the network, in order of first appearance.
# `Series.append` was removed in pandas 2.0, so use `pd.concat` instead.
genes = pd.concat([network['Gene1'], network['Gene2']]).unique()

# Keep only the expression columns whose gene is part of the network.
expData = expData.loc[:, expData.columns.intersection(genes)]

# Genes listed under 'Gene1' are treated as TFs; all remaining network genes
# are non-TFs. A set makes the membership test O(1) per gene.
tfs = network['Gene1'].unique().tolist()
tf_lookup = set(tfs)
non_tfs = [gene for gene in genes if gene not in tf_lookup]

# Column order becomes: all TFs first, then the non-TFs. Network genes that
# are absent from the expression data end up as all-NaN columns.
expData = expData.reindex(tfs + non_tfs, axis=1)

#[number of cells, time, features, genes]
shape = [expData.shape[0], 1, 1, expData.shape[1]]
formatted_data = expData.to_numpy(dtype=np.float32)
formatted_data = np.reshape(formatted_data, shape)
np.save('train_' + suffix + '.npy', formatted_data)


## Generating and saving true edges file

In [5]:
# Build the square adjacency matrix of the network, indexed by the position of
# each gene in expData's columns: true_edges[i, j] == 1 means the network
# contains an edge from column-gene i ('Gene1') to column-gene j ('Gene2').
cols = expData.columns.tolist()
sh = len(cols)
true_edges = np.zeros((sh, sh))

# Map each gene label to its first column position once, instead of doing a
# linear `list.index` scan (and a row materialisation) for every edge.
col_position = {}
for position, gene in enumerate(cols):
    col_position.setdefault(gene, position)

# Fail with an explicit message if the network mentions a gene that has no
# expression column.
for endpoint in ('Gene1', 'Gene2'):
    for gene in network[endpoint]:
        if gene not in col_position:
            raise ValueError(f"{gene!r} is not in list of expression columns")

src_idx = np.asarray([col_position[g] for g in network['Gene1']], dtype=np.intp)
dst_idx = np.asarray([col_position[g] for g in network['Gene2']], dtype=np.intp)
true_edges[src_idx, dst_idx] = 1
np.save('true_edges.npy', true_edges)

### (Optional) Per-cell edge labels, used only to compute the accuracy of each batch as an indicator of training progress. To skip this step, set the `compute_acc` argument in the training file to `False`.

In [6]:
# Per-cell edge labels, shaped [number of cells, number of TFs, number of genes].
# all_edges[c, t, g] == 1 when the network contains the edge
# (Gene1 = TF t) -> (Gene2 = gene g) AND both genes have expression > 0 in
# cell c; every other entry is 0.
#
# The previous implementation wrote `edges[gene1][gene2] = 1`, which is chained
# assignment: it selects *column* gene1 and then *row* gene2. That stored
# TF->TF edges transposed and silently dropped every edge whose target is not
# a TF (and writes nothing at all under pandas copy-on-write). Rows are now
# indexed by Gene1 and columns by Gene2, consistent with the true-edges matrix.
gene_order = list(tfs) + list(non_tfs)
gene_position = {gene: position for position, gene in enumerate(gene_order)}

# TFs occupy the first len(tfs) positions of gene_order, so the position of a
# 'Gene1' entry is also its row index along the TF axis.
src_idx = np.asarray([gene_position[g] for g in network["Gene1"]], dtype=np.intp)
dst_idx = np.asarray([gene_position[g] for g in network["Gene2"]], dtype=np.intp)

# Boolean [cells, genes] mask of expressed genes (NaN compares as not expressed).
expressed = expData[gene_order].to_numpy(dtype=np.float64) > 0

all_edges = np.zeros([expData.shape[0], len(tfs), len(gene_order)])
# One vectorised assignment replaces the cells x edges Python loop.
all_edges[:, src_idx, dst_idx] = expressed[:, src_idx] & expressed[:, dst_idx]
np.save('edges_train_' + suffix + '.npy', all_edges)