# "Flat" Modeling for Gene Association with Lung Adenocarcinoma

Modeling with tabular data using 'classic' machine learning methods. The models aim to predict gene association with Lung Adenocarcinoma (LUAD). Data includes 'node features' (ontological features about the genes) and network features (a feature embedding on genes' position in the Protein-Protein Interaction (PPI) network). Genes are identified by their 'Ensembl' ID.

'Node features' come from the Human Protein Atlas, and the PPI network comes from the STRING dataset (restricted to human genes).

## Data and Setup

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
data_path = 'data/HPAnode_PPInetwork_labels_v2.csv'
data = pd.read_csv(data_path, index_col=0)

In [3]:
data

Unnamed: 0_level_0,Gene,Gene synonym,Uniprot,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,...,network_127,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,NIH_pos,Any_pos,NIH_neg,NIH_label,NIH_Cancer
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,TSPAN6,"T245, TM4SF6, TSPAN-6",O43657,-0.293745,-0.037880,1.127839,0.280114,-0.562910,0.680988,-0.107089,...,0.046535,0,0,0,0,0,0,0,,1
ENSG00000000419,DPM1,"CDGIE, MPDS",O60762,0.526855,-0.037093,1.659597,1.147797,0.504771,-1.168400,0.246220,...,-0.039454,0,0,0,0,0,0,0,,1
ENSG00000000457,SCYL3,"PACE-1, PACE1",Q8IZE3,-0.113110,-0.085092,0.917932,0.107147,-0.434965,-0.383316,0.318524,...,-0.052639,0,0,0,0,0,0,0,,1
ENSG00000000460,C1orf112,FLJ10706,Q9NSG2,-0.587922,-0.103505,0.741257,-0.055523,-0.323032,0.007064,-0.198350,...,-0.048218,0,0,0,0,0,0,0,,1
ENSG00000000938,FGR,"c-fgr, p55c-fgr, SRC2",P09769,0.872642,-0.105079,1.660303,1.024769,0.060513,-0.997900,0.527449,...,0.020117,0,0,0,0,0,0,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284922,AP000812.5,,,-0.634371,-0.105079,0.700987,-0.087903,-0.319591,0.006010,-0.224073,...,-0.014959,0,0,0,0,0,0,0,,0
ENSG00000285043,ALDOA,,P04075,-0.603405,0.835389,2.009425,1.142514,0.587500,-1.108236,-0.523388,...,-0.069705,0,0,1,0,0,1,0,,0
ENSG00000285188,AC008397.2,,Q08493,-0.618888,-0.105079,1.292300,0.347360,-0.751088,-0.469017,0.258115,...,-0.100453,0,0,0,0,0,0,0,,0
ENSG00000285292,ABCF2,"ABC28, EST133090, HUSSY-18, M-ABC1",Q9UG63,-0.422770,-0.092489,0.942309,-0.071680,-0.648776,0.007953,-0.479971,...,-0.081719,0,0,0,0,0,0,0,,0


In [4]:
# craete positives
label_name = 'my_label'

# find positives
pos_label_col = 'Any_pos' #FIXME: figure out meaning of columns and determing appropriate choice of positive labels
pos_labels = pd.array([1 if row[pos_label_col] else None for id_, row in data.iterrows()], dtype='Int32')
data[label_name] = pos_labels

In [5]:
def sample_negatives(PU_labels):
    '''randomly samples from the unlabeled samples'''

    # sample same # as positives
    num_pos = (PU_labels==1).sum()
    neg_inds = PU_labels[PU_labels.isna()].sample(num_pos).index

    # TODO: more sophisticated methods for sampling methods. (e.g.: use mutation rate, unsupervised learning, etc.)

    return neg_inds # returns ID's of negative samples

neg_label_inds = sample_negatives(data[label_name])
data[label_name].loc[neg_label_inds] = 0

# TODO: save this data for reproducibility (not now, but once this is finalized and fixed)

In [6]:
data[label_name].value_counts()

0    1268
1    1268
Name: my_label, dtype: Int64

In [7]:
# use the above label we just created
label_col = label_name
data[label_col] = data[label_col].astype('Int32')

In [8]:
from sklearn.metrics import classification_report

def eval_model(model, X, y):
    preds = model.predict(X)
    clf_report = classification_report(y, preds, labels=[0, 1], target_names=['negative', 'positive'], digits=2)
    print(clf_report)

## Node-only Modeling

### Set up

In [9]:
num_node_feats = 100
node_feat_cols = ['Tissue RNA - lung [NX]', 'Single Cell Type RNA - Mucus-secreting cells [NX]'] + [f'node_{i}' for i in range(num_node_feats)]

# get subset of node features features + labels
node_data = data[node_feat_cols + [label_col]]

# restrict to data with labels
node_data_labeled = node_data[node_data[label_col].notna()]
node_data_labeled

Unnamed: 0_level_0,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,...,node_91,node_92,node_93,node_94,node_95,node_96,node_97,node_98,node_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000001036,0.397830,-0.049368,1.193657,0.416142,-0.413031,-0.666605,0.247194,0.397195,-0.874968,-0.011381,...,-0.049438,-0.019217,0.003838,-0.024580,0.005512,0.011996,0.019038,-0.009659,-0.016101,0
ENSG00000001617,0.330737,-0.090758,0.923626,0.115813,-0.237788,-0.126936,-0.438613,0.374190,-0.786831,-0.075578,...,0.005586,0.008552,-0.008633,-0.010206,0.022776,0.018299,0.026055,-0.029779,-0.050307,0
ENSG00000002016,-0.365999,-0.101459,1.163242,-0.231269,0.208343,-0.070411,-0.254480,0.821744,-0.410762,0.079198,...,-0.049879,0.001983,-0.044911,-0.020929,-0.034409,0.055112,-0.030171,-0.049135,-0.029934,0
ENSG00000002587,-0.546634,-0.096266,1.066499,0.290129,-0.545229,-0.473837,0.721052,-0.240669,-0.533160,0.339745,...,-0.022602,0.004721,0.039322,0.008252,-0.010188,-0.035582,0.035657,0.004951,0.013460,0
ENSG00000002822,-0.035695,-0.092961,1.512152,0.132940,0.379079,-0.246361,-0.444551,-0.722999,0.279000,-0.803510,...,-0.031945,-0.028437,-0.006068,0.055419,-0.046591,0.037800,-0.069330,-0.003930,0.015922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277117,-0.660176,-0.105079,0.248569,0.430044,0.020491,0.760891,0.314649,0.212613,0.146690,0.174679,...,0.019891,0.005300,0.000750,-0.041174,0.022332,0.003812,-0.044829,0.011847,0.034006,0
ENSG00000278259,-0.525990,-0.073132,1.012271,-0.024737,-0.669918,0.003448,-0.520130,-0.093473,0.182536,0.526239,...,0.118073,-0.105865,0.085523,0.060406,0.145454,-0.039257,0.129101,-0.075034,0.083078,0
ENSG00000278318,-0.649854,-0.105079,1.335599,-1.172499,0.599662,0.361298,0.138984,0.009006,-0.505264,0.982591,...,0.014641,0.000657,0.004365,-0.015424,-0.001050,-0.002493,0.006437,-0.002337,-0.006283,0
ENSG00000284194,-0.531151,-0.025605,0.621929,0.980805,0.929813,0.320361,-0.048782,-0.544191,0.307942,-0.000984,...,-0.145053,-0.034630,0.017256,-0.011496,0.010741,-0.032822,0.070829,-0.042562,0.010891,0


In [10]:
# separate features and labels
node_feats = node_data_labeled[node_feat_cols]
node_labels = node_data_labeled[label_col].astype('int32')

In [11]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(node_feats, node_labels, test_size=test_size, shuffle=True, stratify=node_labels)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

# define and train model
rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5)

rf_clf.fit(X_train, y_train)

# evaluate model
print('Training Metrics')
eval_model(rf_clf, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf, X_test, y_test)

Training Metrics
              precision    recall  f1-score   support

    negative       0.71      0.94      0.81       951
    positive       0.91      0.61      0.73       951

    accuracy                           0.77      1902
   macro avg       0.81      0.77      0.77      1902
weighted avg       0.81      0.77      0.77      1902


Testing Metrics
              precision    recall  f1-score   support

    negative       0.62      0.84      0.71       317
    positive       0.75      0.49      0.59       317

    accuracy                           0.66       634
   macro avg       0.68      0.66      0.65       634
weighted avg       0.68      0.66      0.65       634



## Network-only Modeling

### Set up

In [13]:
num_network_feats = 128
network_feat_cols = [f'network_{i}' for i in range(num_node_feats)]

# get subset of node features features + labels
network_data = data[network_feat_cols + [label_col]]

# restrict to data with labels
network_data_labeled = network_data[network_data[label_col].notna()]
network_data_labeled

Unnamed: 0_level_0,network_0,network_1,network_2,network_3,network_4,network_5,network_6,network_7,network_8,network_9,...,network_91,network_92,network_93,network_94,network_95,network_96,network_97,network_98,network_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000001036,-0.195539,-0.011342,0.012136,-0.038927,0.088972,-0.242919,-0.028783,0.031071,-0.122504,0.057561,...,-0.064782,-0.126870,0.038508,-0.072104,-0.067658,-0.269228,0.156211,-0.322330,0.157769,0
ENSG00000001617,-0.072077,0.063123,0.135294,-0.116113,0.010464,-0.150923,0.057985,-0.055697,-0.031168,0.265917,...,0.075400,0.035914,0.060360,0.014441,0.127357,0.115016,0.077610,-0.047705,0.091207,0
ENSG00000002016,0.254290,-0.216872,-0.015965,0.504287,0.119811,0.028899,-0.205150,-0.155675,0.004775,-0.112099,...,0.040691,-0.331095,0.205814,-0.174470,0.027875,-0.018708,0.130749,0.091869,0.173594,0
ENSG00000002587,-0.146225,0.059875,0.067772,-0.074139,-0.064995,-0.134350,0.030717,-0.043840,-0.160943,0.040913,...,-0.040477,-0.184431,0.028453,-0.057537,-0.000662,-0.167848,0.102259,-0.180153,0.120956,0
ENSG00000002822,-0.044328,-0.036199,-0.061789,0.121916,0.019136,-0.013479,0.006201,-0.200232,-0.049497,0.021477,...,0.090789,-0.190247,0.199716,0.006697,-0.047643,0.045322,0.014506,-0.032671,0.201487,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277117,0.015416,-0.204552,0.306276,0.151071,0.292707,0.165843,-0.014494,-0.055188,0.016094,0.183903,...,-0.195956,-0.172670,-0.035149,-0.345270,0.256509,-0.520672,0.251216,0.263975,-0.135650,0
ENSG00000278259,0.189178,-0.102453,0.146866,-0.180728,-0.029996,-0.130784,-0.059223,-0.094376,-0.029747,-0.024768,...,0.014758,-0.143367,0.232356,-0.013327,-0.062648,0.048794,0.033635,0.038317,0.083510,0
ENSG00000278318,-0.123296,-0.079331,0.051634,0.066839,0.122630,-0.128684,0.101123,-0.156109,-0.139375,0.050216,...,0.038358,-0.022871,0.031635,-0.044231,0.025973,0.058876,0.120213,-0.116427,0.143766,0
ENSG00000284194,-0.033481,-0.050076,-0.003246,0.246847,0.198195,-0.020044,-0.235567,-0.033447,0.018221,0.185741,...,-0.105263,-0.131112,0.144285,-0.248792,0.066141,0.104347,0.199547,-0.095504,0.045695,0


In [14]:
# separate features and labels
network_feats = network_data_labeled[network_feat_cols]
network_labels = network_data_labeled[label_col].astype('int32')

In [15]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(network_feats, network_labels, test_size=test_size, shuffle=True, stratify=network_labels)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# define and train model
rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5)

rf_clf.fit(X_train, y_train)

# evaluate model
print('Training Metrics')
eval_model(rf_clf, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf, X_test, y_test)

Training Metrics
              precision    recall  f1-score   support

    negative       0.74      0.84      0.79       951
    positive       0.82      0.70      0.75       951

    accuracy                           0.77      1902
   macro avg       0.78      0.77      0.77      1902
weighted avg       0.78      0.77      0.77      1902


Testing Metrics
              precision    recall  f1-score   support

    negative       0.64      0.73      0.69       317
    positive       0.69      0.60      0.64       317

    accuracy                           0.66       634
   macro avg       0.67      0.66      0.66       634
weighted avg       0.67      0.66      0.66       634



## Node + Network Modeling

### Set up

In [17]:
node_network_feat_cols = node_feat_cols + network_feat_cols

# get subset of node features features + labels
node_network_data = data[node_network_feat_cols + [label_col]]

# restrict to data with labels
node_network_data_labeled = node_network_data[node_network_data[label_col].notna()]
node_network_data_labeled

Unnamed: 0_level_0,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,...,network_91,network_92,network_93,network_94,network_95,network_96,network_97,network_98,network_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000001036,0.397830,-0.049368,1.193657,0.416142,-0.413031,-0.666605,0.247194,0.397195,-0.874968,-0.011381,...,-0.064782,-0.126870,0.038508,-0.072104,-0.067658,-0.269228,0.156211,-0.322330,0.157769,0
ENSG00000001617,0.330737,-0.090758,0.923626,0.115813,-0.237788,-0.126936,-0.438613,0.374190,-0.786831,-0.075578,...,0.075400,0.035914,0.060360,0.014441,0.127357,0.115016,0.077610,-0.047705,0.091207,0
ENSG00000002016,-0.365999,-0.101459,1.163242,-0.231269,0.208343,-0.070411,-0.254480,0.821744,-0.410762,0.079198,...,0.040691,-0.331095,0.205814,-0.174470,0.027875,-0.018708,0.130749,0.091869,0.173594,0
ENSG00000002587,-0.546634,-0.096266,1.066499,0.290129,-0.545229,-0.473837,0.721052,-0.240669,-0.533160,0.339745,...,-0.040477,-0.184431,0.028453,-0.057537,-0.000662,-0.167848,0.102259,-0.180153,0.120956,0
ENSG00000002822,-0.035695,-0.092961,1.512152,0.132940,0.379079,-0.246361,-0.444551,-0.722999,0.279000,-0.803510,...,0.090789,-0.190247,0.199716,0.006697,-0.047643,0.045322,0.014506,-0.032671,0.201487,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000277117,-0.660176,-0.105079,0.248569,0.430044,0.020491,0.760891,0.314649,0.212613,0.146690,0.174679,...,-0.195956,-0.172670,-0.035149,-0.345270,0.256509,-0.520672,0.251216,0.263975,-0.135650,0
ENSG00000278259,-0.525990,-0.073132,1.012271,-0.024737,-0.669918,0.003448,-0.520130,-0.093473,0.182536,0.526239,...,0.014758,-0.143367,0.232356,-0.013327,-0.062648,0.048794,0.033635,0.038317,0.083510,0
ENSG00000278318,-0.649854,-0.105079,1.335599,-1.172499,0.599662,0.361298,0.138984,0.009006,-0.505264,0.982591,...,0.038358,-0.022871,0.031635,-0.044231,0.025973,0.058876,0.120213,-0.116427,0.143766,0
ENSG00000284194,-0.531151,-0.025605,0.621929,0.980805,0.929813,0.320361,-0.048782,-0.544191,0.307942,-0.000984,...,-0.105263,-0.131112,0.144285,-0.248792,0.066141,0.104347,0.199547,-0.095504,0.045695,0


In [18]:
# separate features and labels
node_network_feats = node_network_data_labeled[node_network_feat_cols]
node_network_labels = node_network_data_labeled[label_col].astype('int32')

In [19]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(node_network_feats, node_network_labels, test_size=test_size, shuffle=True, stratify=node_network_labels)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# define and train model
rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5)

rf_clf.fit(X_train, y_train)

# evaluate model
print('Training Metrics')
eval_model(rf_clf, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf, X_test, y_test)

Training Metrics
              precision    recall  f1-score   support

    negative       0.73      0.93      0.82       951
    positive       0.90      0.66      0.76       951

    accuracy                           0.79      1902
   macro avg       0.82      0.79      0.79      1902
weighted avg       0.82      0.79      0.79      1902


Testing Metrics
              precision    recall  f1-score   support

    negative       0.69      0.82      0.75       317
    positive       0.77      0.63      0.69       317

    accuracy                           0.72       634
   macro avg       0.73      0.72      0.72       634
weighted avg       0.73      0.72      0.72       634

