# Create Labels

This notebook creates labels for training. Positive samples are obtained through DisGenNet via a chosen threshold of the GDA (Gene-Disease Association) score. Negative samples are randomly samples from genes with a GDA score of 0. For evaluation and reproducibility, multiple random samplings are created to run multiple trials and thoroughly evaluate our models.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_path = 'data/phys_graph/final_nodeonly_node_data_phys_v1.csv'
node_dataset = pd.read_csv(data_path, index_col=0)

In [3]:
label_col = 'gda_score_thres0.02'

# in dataset '0' is 'unlabeled'; map this to NA in order to use '0' as 'negative'
node_dataset[label_col] = node_dataset[label_col].map({0: pd.NA, 1:1})

print('# of positive labels: ', node_dataset[label_col].sum())


labels = node_dataset[['gda_score', label_col]].copy()

labels[label_col] = labels[label_col].astype('Int64')

# sample negative samples from unlabeled samples
num_neg_samples = node_dataset[label_col].sum() # num of neg samples to create

# genes which have no publications supporting gene-disease association
unlabeled_samples = node_dataset.index[node_dataset['gda_score']==0]

# sample 100 trials of labels
num_random_samplings = 100
for i in range(num_random_samplings):

    negative_samples = np.random.choice(unlabeled_samples, size=num_neg_samples, replace=False)

    labels[f'label_{i}'] = labels[label_col]
    labels.loc[negative_samples, f'label_{i}'] = 0

# of positive labels:  777


  labels[f'label_{i}'] = labels[label_col]


In [4]:
labels

Unnamed: 0_level_0,gda_score,gda_score_thres0.02,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,...,label_90,label_91,label_92,label_93,label_94,label_95,label_96,label_97,label_98,label_99
ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000175899,0.0,,,,,,,,,,...,,,,,,,,,,0
ENSG00000128274,0.0,,,,,,,,,,...,,0,,,,,,0,0,
ENSG00000094914,0.0,,,,0,,,,,,...,,,,,,0,,,,
ENSG00000081760,0.0,,,,,,,,,,...,,0,,,,,,,,
ENSG00000114771,0.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198455,0.0,,,,,,,,,,...,,,,,,,,,,
ENSG00000070476,0.0,,,,,,0,,,,...,,,,,,,,,,
ENSG00000162378,0.0,,,,,,0,,,,...,,,,,0,,,,,
ENSG00000159840,0.0,,,,,,,,0,,...,,,,,,,,,,


In [5]:
labels.to_csv('data/labels_thres0,02_trials.csv')