# SINCERITIES DATA - THP1
Script to reformat the THP1 data into the format we use
- For pseudotime, normalize between 0 and 1
- Give an ID to each cell
- Reformat the TF network 
  - limit the expression data to only the TFs in the network?

Example Pseudotime format:

|Cell ID|PseudoTime|Time|
|-------|----------|----|
|E0_2|0.01|0|
|E0_4|0.01|0|
|E0_6|0.01|0|
|E0_8|0.02|0|

Example expression data file format: Cell IDs as columns and gene IDs/names as rows

|E0_2|E0_4|
|----|----|
|x_G56|0.14738218332239472|0.026161670658449|
|x_G15|0.12421452917941098|0.100690884078467|
|x_G85|0.09218533983220337|0.276959159861100|
|x_G47|0.03047679075336165|0.026051158718921|

In [3]:
import pandas as pd

In [81]:
# I replaced the spaces with '-'
input_dir = "/home/jeffl/single-cell/SINCERITIES/SINCERITIES-R-v2.0/THP1-data/"
exp_file = "%s/single_cell_kouno_data.csv" % input_dir
tf_net_file = "%s/tomaru2.csv" % input_dir
exp_df = pd.read_csv(exp_file)
tf_df = pd.read_csv(tf_net_file, header=None)

out_dir = "/home/jeffl/ModelEval/inputs/datasets/human/THP1"
tf_net_out_file = "%s/refNetwork.csv" % (out_dir)
exp_out_file = "%s/ExpressionData.csv" % (out_dir)
pseudotime_out_file = "%s/PseudoTime.csv" % (out_dir)

In [82]:
h_df = exp_df['h'].T
exp_df = exp_df.drop(columns='h').T
exp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,950,951,952,953,954,955,956,957,958,959
BCL6,8.69537,14.024167,3.953921,8.603252,3.821529,17.882448,3.900899,3.996369,11.153821,9.714884,...,15.195303,23.831861,13.368191,11.348399,16.010302,23.770548,44.812659,22.586484,30.22556,33.79703
CBFB,9.175493,8.261743,9.348385,10.164625,9.081856,13.200109,8.124713,8.256181,7.929798,10.354791,...,8.283221,9.901897,9.387798,3.889142,11.059151,9.386934,10.88807,7.999635,9.41108,10.575992
CEBPB,8.414535,12.621252,3.953921,8.825003,6.642862,10.89602,10.750074,11.906658,9.049745,3.755985,...,10.972338,12.107408,13.77674,17.778161,15.538424,4.482382,18.660747,11.985124,9.77837,12.784058
CEBPD,11.424286,11.386729,12.616369,11.643744,11.954111,13.378882,8.750219,10.534708,10.215678,11.405255,...,9.588085,8.399777,13.086464,9.7714,8.188118,9.107268,9.581101,8.666051,9.119476,11.96086
EGR2,3.78027,4.048026,3.953921,3.702583,9.71706,4.041904,3.900899,3.996369,3.947249,3.755985,...,80.334549,39.721643,11.374534,26.830324,46.90624,13.939211,26.484526,7.732172,38.22936,18.440434


In [45]:
tfs = set(exp_df.index)
print("%d tfs" % len(tfs))

45 tfs


In [38]:
# the second column contains the type of regulation
# reformat to have an edge list with the type as the third column
tf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,BCL6,1,EGR2,FOS,ID1,KLF13,KLF4,MLLT3,NFATC2,SNAI3,SP1,ND,ND,ND
1,BCL6,-1,GATA2,ND,ND,ND,ND,ND,ND,ND,ND,ND,ND,ND
2,BRCA1,-1,SPIB,ZNF217,ND,ND,ND,ND,ND,ND,ND,ND,ND,ND
3,CBFB,1,BRCA1,CEBPA,CEBPB,E2F2,LMO2,MYBL2,NFATC2,PPARG,RARG,RUNX1,ND,ND
4,CBFB,-1,ETS2,TCFL5,ZNF217,ND,ND,ND,ND,ND,ND,ND,ND,ND


In [59]:
net = {}
net_tfs = set()

for idx, line in tf_df.iterrows():
    # keep the TF if we have expression for it
    tf = line[0]
    if tf not in tfs:
        continue
    reg_type = line[1]
    reg_type = "+" if reg_type == "1" else "-"
    regulated = line[2:][line[2:] != "ND"]
    for r in regulated:
        if r not in tfs:
            continue
        net[(tf, r)] = reg_type
        net_tfs.add(tf)
        net_tfs.add(r)

print("total of %d nodes, %d edges" % (len(net_tfs), len(net)))
net_df = pd.DataFrame(pd.Series(net)).reset_index()
net_df.head()

total of 45 nodes, 335 edges


Unnamed: 0,level_0,level_1,0
0,BCL6,EGR2,-
1,BCL6,FOS,-
2,BCL6,KLF13,-
3,BCL6,NFATC2,-
4,BCL6,SNAI3,-


In [61]:
print("writing to %s" % (tf_net_out_file))
net_df.to_csv(out_file, index=False, header=["Gene1", "Gene2", "Type"])
# net_df.to_csv(out_file)

writing to /home/jeffl/ModelEval/inputs/datasets/human/THP1/refNetwork.csv


In [63]:
# now write the expression values, and limit to the tfs in the network
exp_df = exp_df.loc[exp_df.index.isin(net_tfs)]
print("limitted expression to %d TFs" % (len(exp_df.index)))


print("writing to %s" % (exp_out_file))
exp_df.to_csv(exp_out_file)

limitted expression to 45 TFs
writing to /home/jeffl/ModelEval/inputs/datasets/human/THP1/ExpressionData.csv


In [84]:
# now write the pseudotime 
h_df = pd.DataFrame(h_df)
h_df['PseudoTime'] = h_df / h_df.max()
# h_df = pd.DataFrame(h_df)
# pseudotime = h_df / h_df.max()
# h_df = pd.DataFrame({"Pseudotime": pseudotime, "Time": h_df})
h_df.columns = ['Time', 'PseudoTime']
print(h_df.head())

print("writing to %s" % (pseudotime_out_file))
h_df.to_csv(pseudotime_out_file, index_label="Cell ID", columns=["PseudoTime", "Time"])

   Time  PseudoTime
0     0         0.0
1     0         0.0
2     0         0.0
3     0         0.0
4     0         0.0
writing to /home/jeffl/ModelEval/inputs/datasets/human/THP1/PseudoTime.csv
