# "Flat" Modeling for Gene Association with Lung Adenocarcinoma

Modeling with tabular data using 'classic' machine learning methods. The models aim to predict gene association with Lung Adenocarcinoma (LUAD). Data includes 'node features' (ontological features about the genes) and network features (a feature embedding on genes' position in the Protein-Protein Interaction (PPI) network). Genes are identified by their 'Ensembl' ID.

'Node features' come from the Human Protein Atlas, and the PPI network comes from the STRING dataset (restricted to human genes).

## Data and Setup

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
data_path = 'data/HPAnode_PPInetwork_labels_v3.csv'
data = pd.read_csv(data_path, index_col=0)

In [3]:
data

Unnamed: 0_level_0,index,Gene,Gene synonym,Uniprot,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,...,network_126,network_127,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,NIH_pos,DisGenNet_all_pos,DisGenNet_thresh_pos,Total_pos
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,13587,TSPAN6,"T245, TM4SF6, TSPAN-6",O43657,-0.293745,-0.037880,1.127839,0.280114,-0.562910,0.680988,...,-0.082205,0.046535,0,0,0,0,0,0,0,0
ENSG00000000419,3770,DPM1,"CDGIE, MPDS",O60762,0.526855,-0.037093,1.659597,1.147797,0.504771,-1.168400,...,-0.126685,-0.039454,0,0,0,0,0,0,0,0
ENSG00000000457,11231,SCYL3,"PACE-1, PACE1",Q8IZE3,-0.113110,-0.085092,0.917932,0.107147,-0.434965,-0.383316,...,-0.019312,-0.052639,0,0,0,0,0,0,0,0
ENSG00000000460,1709,C1orf112,FLJ10706,Q9NSG2,-0.587922,-0.103505,0.741257,-0.055523,-0.323032,0.007064,...,-0.049028,-0.048218,0,0,0,0,0,0,0,0
ENSG00000000938,4750,FGR,"c-fgr, p55c-fgr, SRC2",P09769,0.872642,-0.105079,1.660303,1.024769,0.060513,-0.997900,...,-0.112261,0.020117,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284922,810,AP000812.5,0,0,-0.634371,-0.105079,0.700987,-0.087903,-0.319591,0.006010,...,0.092639,-0.014959,0,0,0,0,0,0,0,0
ENSG00000285043,621,ALDOA,0,P04075,-0.603405,0.835389,2.009425,1.142514,0.587500,-1.108236,...,0.030017,-0.069705,0,0,1,0,0,1,1,3
ENSG00000285188,121,AC008397.2,0,Q08493,-0.618888,-0.105079,1.292300,0.347360,-0.751088,-0.469017,...,0.069851,-0.100453,0,0,0,0,0,0,0,0
ENSG00000285292,56,ABCF2,"ABC28, EST133090, HUSSY-18, M-ABC1",Q9UG63,-0.422770,-0.092489,0.942309,-0.071680,-0.648776,0.007953,...,0.020679,-0.081719,0,0,0,0,0,0,0,0


In [4]:
# create positives
label_name = 'my_label'

# find positives
pos_label_col = 'DisGenNet_thresh_pos' #NOTE: Genes associated with all synonyms of LUAD disease with GDA_score > 0.2
pos_labels = pd.array([1 if row[pos_label_col] else None for id_, row in data.iterrows()], dtype='Int32')
data[label_name] = pos_labels

In [5]:
# create negatives
def sample_negatives(PU_labels):
    '''randomly samples from the unlabeled samples'''

    # sample same # as positives
    num_pos = (PU_labels==1).sum()
    neg_inds = PU_labels[PU_labels.isna()].sample(num_pos).index

    # TODO: more sophisticated methods for sampling methods. (e.g.: use mutation rate, unsupervised learning, etc.)

    return neg_inds # returns ID's of negative samples

neg_label_inds = sample_negatives(data[label_name])
data[label_name].loc[neg_label_inds] = 0

# TODO: save this data for reproducibility (not now, but once this is finalized and fixed)

In [6]:
data[label_name].value_counts()

1    191
0    191
Name: my_label, dtype: Int64

In [7]:
# use the above label we just created
label_col = label_name
data[label_col] = data[label_col].astype('Int32')

In [8]:
from sklearn.metrics import classification_report

def eval_model(model, X, y):
    preds = model.predict(X)
    clf_report = classification_report(y, preds, labels=[0, 1], target_names=['negative', 'positive'], digits=2)
    print(clf_report)

## Node-only Modeling

### Set up

In [9]:
num_node_feats = 100
node_feat_cols = ['Tissue RNA - lung [NX]', 'Single Cell Type RNA - Mucus-secreting cells [NX]'] + [f'node_{i}' for i in range(num_node_feats)]

# get subset of node features features + labels
node_data = data[node_feat_cols + [label_col]]

# restrict to data with labels
node_data_labeled = node_data[node_data[label_col].notna()]
node_data_labeled

Unnamed: 0_level_0,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,...,node_91,node_92,node_93,node_94,node_95,node_96,node_97,node_98,node_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000005339,0.150101,-0.069827,2.622879,0.092524,1.558535,-1.148822,0.606971,0.573626,0.106728,-0.357630,...,-0.327284,-0.087676,0.254183,-0.066311,-0.014220,-0.059492,0.095315,0.159288,-0.186821,1
ENSG00000006534,0.748778,-0.089341,0.902192,0.159079,-0.435641,-0.410976,0.345636,-0.247802,-0.428201,0.254421,...,-0.002932,-0.036781,-0.002733,0.009001,-0.022897,0.003437,-0.006250,-0.003819,0.029561,0
ENSG00000007312,-0.655015,-0.105079,1.480835,0.846645,0.805566,0.485862,-0.194441,0.923558,1.438937,-0.492804,...,-0.031383,-0.038140,-0.029342,-0.007463,-0.029591,-0.034695,-0.038255,0.000011,0.000772,0
ENSG00000007520,0.088169,-0.056293,1.100148,0.054271,-0.761169,-0.140970,-0.102144,-0.007949,0.279435,0.606258,...,0.327088,-0.138941,0.142827,-0.262890,-0.390360,-0.098290,-0.249308,-0.193768,0.025886,0
ENSG00000010810,0.578465,-0.101302,1.677973,0.581332,-0.799185,-0.650435,0.386856,0.852724,0.152394,0.765213,...,-0.085160,0.043194,0.094463,-0.135246,0.052647,0.171408,0.179968,-0.194070,0.171541,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000269095,-0.665337,-0.105079,0.690082,-0.092586,-0.314090,0.006767,-0.213380,-0.234725,-0.411117,0.127340,...,0.002987,0.001183,0.005107,-0.002537,-0.004068,0.001942,0.000797,0.000638,-0.004410,0
ENSG00000275163,-0.644693,-0.105079,0.156730,0.354903,0.093585,0.653698,0.369548,0.067753,0.047865,0.084707,...,-0.001112,-0.001918,0.008036,-0.008390,0.008359,0.003466,0.012333,0.003029,-0.003975,0
ENSG00000275895,-0.381482,-0.102089,1.071540,-0.431914,-0.415929,0.096122,-0.030220,-0.295607,-0.076375,-0.654858,...,-0.096775,0.020389,-0.000086,0.065344,-0.012173,0.036081,0.027791,0.010751,-0.005335,0
ENSG00000276409,0.640397,-0.103505,0.437030,0.302569,0.424052,-0.277284,-0.332591,1.518940,-0.449915,-0.439091,...,-0.005886,-0.056655,-0.098032,0.097823,0.012923,-0.074887,-0.041497,-0.130827,-0.107135,1


In [10]:
# separate features and labels
node_feats = node_data_labeled[node_feat_cols]
node_labels = node_data_labeled[label_col].astype('int32')

In [11]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(node_feats, node_labels, test_size=test_size, shuffle=True, stratify=node_labels, random_state=360)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# parameter grid to search over
parameters = {
                'n_estimators':[20, 50, 100, 150, 200],
                'criterion':['gini'],
                'max_depth':[1, 2, 3, 4, 5]
             }

# base random forest model
rf = RandomForestClassifier(n_jobs=-1)

# perform a gridsearch with 5-fold crossvalidation to find the best model
rf_clf = GridSearchCV(rf, parameters, n_jobs=-1, refit=True, cv=5, return_train_score=True)

rf_clf.fit(X_train, y_train)


# show choice of parameters that yielded the best performance
print('Best Parameters')
print(rf_clf.best_params_)
print('\n')

# evaluate model
print('Training Metrics')
eval_model(rf_clf.best_estimator_, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf.best_estimator_, X_test, y_test)

Best Parameters
{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}


Training Metrics
              precision    recall  f1-score   support

    negative       0.97      1.00      0.98       143
    positive       1.00      0.97      0.98       143

    accuracy                           0.98       286
   macro avg       0.98      0.98      0.98       286
weighted avg       0.98      0.98      0.98       286


Testing Metrics
              precision    recall  f1-score   support

    negative       0.64      0.67      0.65        48
    positive       0.65      0.62      0.64        48

    accuracy                           0.65        96
   macro avg       0.65      0.65      0.65        96
weighted avg       0.65      0.65      0.65        96



## Network-only Modeling

### Set up

In [13]:
num_network_feats = 128
network_feat_cols = [f'network_{i}' for i in range(num_node_feats)]

# get subset of node features features + labels
network_data = data[network_feat_cols + [label_col]]

# restrict to data with labels
network_data_labeled = network_data[network_data[label_col].notna()]
network_data_labeled

Unnamed: 0_level_0,network_0,network_1,network_2,network_3,network_4,network_5,network_6,network_7,network_8,network_9,...,network_91,network_92,network_93,network_94,network_95,network_96,network_97,network_98,network_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000005339,-0.063438,-0.123012,-0.025039,0.447147,0.187767,-0.070443,-0.044440,-0.194156,0.062328,0.117881,...,0.191070,-0.010945,0.051817,-0.096172,0.099404,0.052227,0.107331,-0.030173,0.059622,1
ENSG00000006534,-0.035242,0.064876,0.040202,0.082499,0.252994,-0.233564,-0.180584,-0.113003,-0.129792,0.065249,...,-0.099884,0.047175,0.118440,-0.021755,-0.011008,-0.241016,0.029578,-0.283478,0.041169,0
ENSG00000007312,-0.098109,-0.085026,0.310362,0.172199,0.249988,0.150812,-0.117433,-0.093251,-0.021089,0.172613,...,-0.104160,-0.119484,-0.006960,-0.234720,0.234587,-0.303881,0.260561,0.104567,-0.057288,0
ENSG00000007520,-0.101301,-0.147959,0.232277,0.318382,0.259074,0.003932,-0.116673,0.086732,0.005244,0.106439,...,-0.053229,-0.259643,0.120694,0.018940,-0.110807,-0.103595,0.121609,-0.164909,0.073710,0
ENSG00000010810,0.031518,-0.010374,0.257410,-0.026564,0.118352,0.005907,0.084970,-0.128120,0.005545,0.196510,...,-0.062661,-0.004803,0.012089,-0.027547,0.172744,0.049692,0.270915,0.095332,0.001077,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000269095,-0.058508,-0.131115,-0.001548,0.366281,0.165283,-0.120703,0.027367,-0.141815,-0.075961,0.056650,...,0.114281,-0.115354,0.090206,-0.052724,0.070604,0.050085,0.024046,-0.021835,0.167415,0
ENSG00000275163,-0.034133,0.019414,0.035749,-0.021617,0.065612,-0.244060,0.073659,-0.082991,-0.155493,0.007917,...,-0.037778,-0.103085,0.035907,-0.119050,0.003325,-0.092242,0.065556,-0.177275,0.103259,0
ENSG00000275895,-0.088792,0.004608,0.055495,0.216077,0.220475,0.171754,-0.061360,-0.113663,0.194315,0.084847,...,0.064851,-0.222000,0.063197,-0.049756,0.059245,-0.040826,0.089133,-0.004569,0.072815,0
ENSG00000276409,-0.055234,-0.121879,0.256238,0.099107,0.259746,0.120346,0.005804,-0.035196,-0.001159,0.269637,...,-0.163702,-0.118985,-0.078946,-0.330696,0.245002,-0.551165,0.222941,0.124248,-0.117245,1


In [14]:
# separate features and labels
network_feats = network_data_labeled[network_feat_cols]
network_labels = network_data_labeled[label_col].astype('int32')

In [15]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(network_feats, network_labels, test_size=test_size, shuffle=True, stratify=network_labels)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# parameter grid to search over
parameters = {
                'n_estimators':[20, 50, 100, 150, 200],
                'criterion':['gini'],
                'max_depth':[1, 2, 3, 4, 5]
             }

# base random forest model
rf = RandomForestClassifier(n_jobs=-1)

# perform a gridsearch with 5-fold crossvalidation to find the best model
rf_clf = GridSearchCV(rf, parameters, n_jobs=-1, refit=True, cv=5, return_train_score=True)

rf_clf.fit(X_train, y_train)


# show choice of parameters that yielded the best performance
print('Best Parameters')
print(rf_clf.best_params_)
print('\n')

# evaluate model
print('Training Metrics')
eval_model(rf_clf.best_estimator_, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf.best_estimator_, X_test, y_test)

Best Parameters
{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}


Training Metrics
              precision    recall  f1-score   support

    negative       0.97      0.96      0.96       143
    positive       0.96      0.97      0.97       143

    accuracy                           0.97       286
   macro avg       0.97      0.97      0.97       286
weighted avg       0.97      0.97      0.97       286


Testing Metrics
              precision    recall  f1-score   support

    negative       0.75      0.75      0.75        48
    positive       0.75      0.75      0.75        48

    accuracy                           0.75        96
   macro avg       0.75      0.75      0.75        96
weighted avg       0.75      0.75      0.75        96



## Node + Network Modeling

### Set up

In [17]:
node_network_feat_cols = node_feat_cols + network_feat_cols

# get subset of node features features + labels
node_network_data = data[node_network_feat_cols + [label_col]]

# restrict to data with labels
node_network_data_labeled = node_network_data[node_network_data[label_col].notna()]
node_network_data_labeled

Unnamed: 0_level_0,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,...,network_91,network_92,network_93,network_94,network_95,network_96,network_97,network_98,network_99,my_label
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000005339,0.150101,-0.069827,2.622879,0.092524,1.558535,-1.148822,0.606971,0.573626,0.106728,-0.357630,...,0.191070,-0.010945,0.051817,-0.096172,0.099404,0.052227,0.107331,-0.030173,0.059622,1
ENSG00000006534,0.748778,-0.089341,0.902192,0.159079,-0.435641,-0.410976,0.345636,-0.247802,-0.428201,0.254421,...,-0.099884,0.047175,0.118440,-0.021755,-0.011008,-0.241016,0.029578,-0.283478,0.041169,0
ENSG00000007312,-0.655015,-0.105079,1.480835,0.846645,0.805566,0.485862,-0.194441,0.923558,1.438937,-0.492804,...,-0.104160,-0.119484,-0.006960,-0.234720,0.234587,-0.303881,0.260561,0.104567,-0.057288,0
ENSG00000007520,0.088169,-0.056293,1.100148,0.054271,-0.761169,-0.140970,-0.102144,-0.007949,0.279435,0.606258,...,-0.053229,-0.259643,0.120694,0.018940,-0.110807,-0.103595,0.121609,-0.164909,0.073710,0
ENSG00000010810,0.578465,-0.101302,1.677973,0.581332,-0.799185,-0.650435,0.386856,0.852724,0.152394,0.765213,...,-0.062661,-0.004803,0.012089,-0.027547,0.172744,0.049692,0.270915,0.095332,0.001077,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000269095,-0.665337,-0.105079,0.690082,-0.092586,-0.314090,0.006767,-0.213380,-0.234725,-0.411117,0.127340,...,0.114281,-0.115354,0.090206,-0.052724,0.070604,0.050085,0.024046,-0.021835,0.167415,0
ENSG00000275163,-0.644693,-0.105079,0.156730,0.354903,0.093585,0.653698,0.369548,0.067753,0.047865,0.084707,...,-0.037778,-0.103085,0.035907,-0.119050,0.003325,-0.092242,0.065556,-0.177275,0.103259,0
ENSG00000275895,-0.381482,-0.102089,1.071540,-0.431914,-0.415929,0.096122,-0.030220,-0.295607,-0.076375,-0.654858,...,0.064851,-0.222000,0.063197,-0.049756,0.059245,-0.040826,0.089133,-0.004569,0.072815,0
ENSG00000276409,0.640397,-0.103505,0.437030,0.302569,0.424052,-0.277284,-0.332591,1.518940,-0.449915,-0.439091,...,-0.163702,-0.118985,-0.078946,-0.330696,0.245002,-0.551165,0.222941,0.124248,-0.117245,1


In [18]:
# separate features and labels
node_network_feats = node_network_data_labeled[node_network_feat_cols]
node_network_labels = node_network_data_labeled[label_col].astype('int32')

In [19]:
# create train-test split

from sklearn.model_selection import train_test_split
test_size = 0.25

X_train, X_test, y_train, y_test = train_test_split(node_network_feats, node_network_labels, test_size=test_size, shuffle=True, stratify=node_network_labels)
# NOTE: train test split is shuffled and stratified across labels

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# # parameter grid to search over
# parameters = {
#                 'n_estimators':[20, 50, 100, 150, 200],
#                 'criterion':['gini', 'entropy'],
#                 'max_depth':[1, 2, 3, 4, 5]
#              }

# # base random forest model
# rf = RandomForestClassifier(n_jobs=-1)

# # perform a gridsearch with 5-fold crossvalidation to find the best model
# rf_clf = GridSearchCV(rf, parameters, n_jobs=-1, refit=True, cv=5, return_train_score=True)

rf_clf = RandomForestClassifier(n_estimators=150, max_depth=3, criterion='gini', n_jobs=-1)
rf_clf.fit(X_train, y_train)


# # show choice of parameters that yielded the best performance
# print('Best Parameters')
# print(rf_clf.best_params_)
# print('\n')

# evaluate model
print('Training Metrics')
eval_model(rf_clf, X_train, y_train)

print()
print('Testing Metrics')
eval_model(rf_clf, X_test, y_test)

Training Metrics
              precision    recall  f1-score   support

    negative       0.91      0.87      0.89       143
    positive       0.88      0.91      0.89       143

    accuracy                           0.89       286
   macro avg       0.89      0.89      0.89       286
weighted avg       0.89      0.89      0.89       286


Testing Metrics
              precision    recall  f1-score   support

    negative       0.76      0.77      0.76        48
    positive       0.77      0.75      0.76        48

    accuracy                           0.76        96
   macro avg       0.76      0.76      0.76        96
weighted avg       0.76      0.76      0.76        96

