# Create Labels

This notebook creates labels for training. Positive samples are obtained through DisGeNET (the `DisGenNet_thresh_pos` column). Negative samples are randomly sampled from the remaining (unlabeled) genes, in equal number to the positives.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the node table; the first CSV column is used as the row index.
node_dataset = pd.read_csv('data/HPAnode_PPInetwork_labels_v3.csv', index_col=0)
# Bare last expression: rich display of the loaded frame.
node_dataset

Unnamed: 0_level_0,index,Gene,Gene synonym,Uniprot,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,...,network_126,network_127,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,NIH_pos,DisGenNet_all_pos,DisGenNet_thresh_pos,Total_pos
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,13587,TSPAN6,"T245, TM4SF6, TSPAN-6",O43657,-0.293745,-0.037880,1.127839,0.280114,-0.562910,0.680988,...,-0.082205,0.046535,0,0,0,0,0,0,0,0
ENSG00000000419,3770,DPM1,"CDGIE, MPDS",O60762,0.526855,-0.037093,1.659597,1.147797,0.504771,-1.168400,...,-0.126685,-0.039454,0,0,0,0,0,0,0,0
ENSG00000000457,11231,SCYL3,"PACE-1, PACE1",Q8IZE3,-0.113110,-0.085092,0.917932,0.107147,-0.434965,-0.383316,...,-0.019312,-0.052639,0,0,0,0,0,0,0,0
ENSG00000000460,1709,C1orf112,FLJ10706,Q9NSG2,-0.587922,-0.103505,0.741257,-0.055523,-0.323032,0.007064,...,-0.049028,-0.048218,0,0,0,0,0,0,0,0
ENSG00000000938,4750,FGR,"c-fgr, p55c-fgr, SRC2",P09769,0.872642,-0.105079,1.660303,1.024769,0.060513,-0.997900,...,-0.112261,0.020117,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284922,810,AP000812.5,0,0,-0.634371,-0.105079,0.700987,-0.087903,-0.319591,0.006010,...,0.092639,-0.014959,0,0,0,0,0,0,0,0
ENSG00000285043,621,ALDOA,0,P04075,-0.603405,0.835389,2.009425,1.142514,0.587500,-1.108236,...,0.030017,-0.069705,0,0,1,0,0,1,1,3
ENSG00000285188,121,AC008397.2,0,Q08493,-0.618888,-0.105079,1.292300,0.347360,-0.751088,-0.469017,...,0.069851,-0.100453,0,0,0,0,0,0,0,0
ENSG00000285292,56,ABCF2,"ABC28, EST133090, HUSSY-18, M-ABC1",Q9UG63,-0.422770,-0.092489,0.942309,-0.071680,-0.648776,0.007953,...,0.020679,-0.081719,0,0,0,0,0,0,0,0


In [3]:
# Seed NumPy's global RNG; pandas' `.sample()` draws from it when no
# `random_state` is given, so the negative sampling below is repeatable.
np.random.seed(314159)

# create positives
label_name = 'training_label'

# find positives
pos_label_col = 'DisGenNet_thresh_pos'
# Vectorised replacement for a row-by-row `iterrows()` loop:
# truthy entries become 1, everything else becomes <NA> (i.e. unlabeled).
is_positive = node_dataset[pos_label_col].astype(bool)
pos_labels = is_positive.astype('Int32').where(is_positive).array
node_dataset[label_name] = pos_labels

# create negatives
def sample_negatives(PU_labels, random_state=None):
    '''Randomly pick unlabeled samples to serve as negatives.

    Parameters
    ----------
    PU_labels : pandas.Series
        Positive/unlabeled labels: entries equal to 1 are positives and
        missing entries (NA) are unlabeled.
    random_state : optional
        Forwarded to ``Series.sample``. With the default ``None`` the draw
        uses NumPy's global RNG state, exactly as before.

    Returns
    -------
    pandas.Index
        Index labels (IDs) of the sampled negatives, one per positive.

    Raises
    ------
    ValueError
        If there are fewer unlabeled samples than positives, so a balanced
        draw without replacement is impossible.
    '''

    # sample same # as positives
    num_pos = int((PU_labels == 1).sum())
    unlabeled = PU_labels[PU_labels.isna()]
    if num_pos > len(unlabeled):
        raise ValueError(
            'Cannot sample %d negatives from only %d unlabeled samples'
            % (num_pos, len(unlabeled))
        )
    neg_inds = unlabeled.sample(n=num_pos, random_state=random_state).index

    # TODO: more sophisticated methods for sampling methods. (e.g.: use mutation rate, unsupervised learning, etc.)

    return neg_inds # returns ID's of negative samples

neg_label_inds = sample_negatives(node_dataset[label_name])
# Write through a single `.loc` on the frame itself. The chained form
# `df[col].loc[rows] = 0` assigns into an intermediate Series, which may be a
# copy (and always is under pandas Copy-on-Write), leaving the frame unchanged.
node_dataset.loc[neg_label_inds, label_name] = 0

# TODO: save this data for reproducibility (not now, but once this is finalized and fixed)

# Sanity check: counts of 0s and 1s (unlabeled <NA> rows are not counted).
node_dataset[label_name].value_counts()

0    191
1    191
Name: training_label, dtype: Int64

In [4]:
# Display the frame again, now including the `training_label` column.
node_dataset

Unnamed: 0_level_0,index,Gene,Gene synonym,Uniprot,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,...,network_127,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,NIH_pos,DisGenNet_all_pos,DisGenNet_thresh_pos,Total_pos,training_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,13587,TSPAN6,"T245, TM4SF6, TSPAN-6",O43657,-0.293745,-0.037880,1.127839,0.280114,-0.562910,0.680988,...,0.046535,0,0,0,0,0,0,0,0,
ENSG00000000419,3770,DPM1,"CDGIE, MPDS",O60762,0.526855,-0.037093,1.659597,1.147797,0.504771,-1.168400,...,-0.039454,0,0,0,0,0,0,0,0,
ENSG00000000457,11231,SCYL3,"PACE-1, PACE1",Q8IZE3,-0.113110,-0.085092,0.917932,0.107147,-0.434965,-0.383316,...,-0.052639,0,0,0,0,0,0,0,0,
ENSG00000000460,1709,C1orf112,FLJ10706,Q9NSG2,-0.587922,-0.103505,0.741257,-0.055523,-0.323032,0.007064,...,-0.048218,0,0,0,0,0,0,0,0,
ENSG00000000938,4750,FGR,"c-fgr, p55c-fgr, SRC2",P09769,0.872642,-0.105079,1.660303,1.024769,0.060513,-0.997900,...,0.020117,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284922,810,AP000812.5,0,0,-0.634371,-0.105079,0.700987,-0.087903,-0.319591,0.006010,...,-0.014959,0,0,0,0,0,0,0,0,
ENSG00000285043,621,ALDOA,0,P04075,-0.603405,0.835389,2.009425,1.142514,0.587500,-1.108236,...,-0.069705,0,0,1,0,0,1,1,3,1
ENSG00000285188,121,AC008397.2,0,Q08493,-0.618888,-0.105079,1.292300,0.347360,-0.751088,-0.469017,...,-0.100453,0,0,0,0,0,0,0,0,
ENSG00000285292,56,ABCF2,"ABC28, EST133090, HUSSY-18, M-ABC1",Q9UG63,-0.422770,-0.092489,0.942309,-0.071680,-0.648776,0.007953,...,-0.081719,0,0,0,0,0,0,0,0,


In [5]:
# Write the labeled frame (index included) to a new CSV alongside the input file.
node_dataset.to_csv('data/HPAnode_PPInetwork_labels_v3.1.csv')