# Non-Graph Modeling

Some modeling done purely on node features (i.e. no network yet)

In [1]:
import pandas as pd
import numpy as np

## Data Prep

Note: Here, we use data from the Human Protein Atlas along with labels based on OMIM searches. The labels implicityly assume that if a gene does not come up as positive in a search then it is not associated with LUAD. This assumption is not correct. A more careful labeling method is required. Thus, we can't draw any conclusions from the results of this notebook; we merely hope to gain some insights on the node features.

In [2]:
HPA_data = pd.read_csv('data/HPA_Complete_v1.csv', index_col=0)
HPA_data.set_index('Ensembl', inplace=True)
HPA_data.head()

Unnamed: 0_level_0,Gene,Gene synonym,Uniprot,Disease involvement,Subcellular location,Pathology prognostics - Lung cancer,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],0,1,...,95,96,97,98,99,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,Total_pos
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000175899,A2M,"CPAMD5, FWP007, S863-7",P01023,Cancer-related genes,,unprognostic (3.65e-2),227.4,0.0,0.395646,-0.212794,...,-0.010209,0.022287,0.019173,-0.048816,-0.023244,0,0,0,0,0
ENSG00000128274,A4GALT,"A14GALT, Gb3S, P(k), P1",Q9NPC4,,Mitochondria,unprognostic (2.10e-1),14.0,0.0,0.56911,-0.864203,...,0.007013,0.023082,-0.007774,-0.00464,0.086262,0,0,0,0,0
ENSG00000094914,AAAS,,Q9NRG9,Disease mutation,"Nuclear membrane,Centrosome,Cytosol",unprognostic (1.70e-1),13.4,11.5,1.325301,-0.550087,...,0.024812,0.009532,0.01974,0.058779,-0.03386,0,0,0,0,0
ENSG00000081760,AACS,"ACSF1, FLJ12389, SUR-5",Q86V21,,Vesicles,unprognostic (3.03e-2),4.9,20.1,0.445957,-0.720734,...,-0.025242,-0.078717,0.034633,-0.020142,-0.155357,0,0,0,0,0
ENSG00000114771,AADAC,"CES5A1, DAC",P22760,,,unprognostic (2.15e-1),1.7,0.0,1.1163,-0.259656,...,0.042293,-0.00263,0.020109,-0.035264,-0.028654,0,0,0,0,0


In [3]:
label_col = 'Total_pos'

# TODO: shuffle HPA data

filt = HPA_data[label_col] == 1
pos_data = HPA_data[filt]
num_pos = len(pos_data)
neg_data = HPA_data[~filt].iloc[:num_pos]

train_test_data = pos_data.append(neg_data)
train_test_data = train_test_data.sample(frac=1)
train_test_data.head()

Unnamed: 0_level_0,Gene,Gene synonym,Uniprot,Disease involvement,Subcellular location,Pathology prognostics - Lung cancer,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],0,1,...,95,96,97,98,99,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,Total_pos
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000254692,AL136295.1,,,,Golgi apparatus,,7.0,0.0,0.182367,-0.404351,...,-0.00259,-0.00207,0.002401,0.003802,0.002711,0,0,0,0,0
ENSG00000131747,TOP2A,TOP2,P11388,"Cancer-related genes, FDA approved drug targets","Nucleoplasm,Nucleoli",prognostic unfavorable (4.99e-4),5.5,5.9,1.54594,-0.035698,...,-0.046726,-0.005839,-0.018328,-0.003894,-0.058346,0,0,1,0,1
ENSG00000165275,TRMT10B,"bA3J10.9, FLJ31455, RG9MTD3",Q6PF06,,"Nucleoplasm,Plasma membrane",prognostic favorable (3.91e-4),8.6,12.9,1.1522,-0.29954,...,0.024222,-0.006081,0.03175,-0.004874,0.025895,0,1,0,0,1
ENSG00000138160,KIF11,"Eg5, HKSP, KNSL1, TRIP5",P52732,Disease mutation,"Mitotic spindle,Cytosol",prognostic unfavorable (3.27e-4),2.8,1.9,1.361867,-0.239267,...,-0.017084,0.003806,0.00264,0.01994,-0.002632,0,0,1,0,1
ENSG00000133612,AGAP3,CENTG3,Q96P47,,,unprognostic (2.05e-2),21.4,9.1,0.808818,0.083761,...,-0.001807,-0.014019,-0.002411,6.9e-05,-0.00303,0,0,0,0,0


In [4]:
labels = train_test_data[label_col]

feature_cols = [str(i) for i in range(100)]
feat_data = train_test_data[feature_cols]
feat_data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000254692,0.182367,-0.404351,0.653869,0.045736,-0.337307,0.225691,-0.229722,-0.064069,-0.12125,0.072755,...,0.008016,-0.001336,-0.000216,0.003683,0.001125,-0.00259,-0.00207,0.002401,0.003802,0.002711
ENSG00000131747,1.54594,-0.035698,-0.394453,0.400871,0.390605,0.793767,0.128098,0.096271,0.462357,0.039422,...,0.050982,-0.046663,0.035733,-0.023324,0.007467,-0.046726,-0.005839,-0.018328,-0.003894,-0.058346
ENSG00000165275,1.1522,-0.29954,-0.769891,0.167567,-0.750713,0.029751,0.149351,0.386945,-0.038403,-0.20188,...,-0.109284,0.034729,-0.031382,-0.040707,-0.007831,0.024222,-0.006081,0.03175,-0.004874,0.025895
ENSG00000138160,1.361867,-0.239267,-0.18875,-0.097263,0.98468,-0.131822,-0.142095,0.342607,-0.102758,1.224565,...,0.007774,-0.011807,0.095089,-0.007062,0.006514,-0.017084,0.003806,0.00264,0.01994,-0.002632
ENSG00000133612,0.808818,0.083761,-0.18132,-0.52926,-0.087528,-0.115068,-0.145428,0.007115,-0.035359,-0.042402,...,-0.004531,-0.000309,-0.006272,-0.005092,0.004168,-0.001807,-0.014019,-0.002411,6.9e-05,-0.00303


In [5]:
print('n_samples: ', len(feat_data))

n_samples:  1688


In [6]:
# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feat_data, labels, test_size=0.2)

## Modeling

### Random Forest

In [7]:
# random forest
from sklearn.ensemble import RandomForestClassifier

# define RF classifier
rf_clf = RandomForestClassifier(n_estimators=100)#, max_depth=5)

# fit classifier
rf_clf.fit(X_train, y_train)

RandomForestClassifier()

In [8]:
rf_clf.score(X_train, y_train)

0.8792592592592593

In [9]:
rf_clf.score(X_test, y_test)

0.7011834319526628

### Adaboost

In [10]:
from sklearn.ensemble import AdaBoostClassifier

# define Adaboost classifier
ada_clf = AdaBoostClassifier(n_estimators=100, algorithm='SAMME.R')

# fit classifier
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [11]:
ada_clf.score(X_train, y_train)

0.82

In [12]:
ada_clf.score(X_test, y_test)

0.6715976331360947

### GradBoost

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# define GradBoost classifier

gb_clf = GradientBoostingClassifier(loss='deviance', n_estimators=100, criterion='friedman_mse')

# fit classifier
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier()

In [14]:
gb_clf.score(X_train, y_train)

0.8481481481481481

In [15]:
gb_clf.score(X_test, y_test)

0.7159763313609467

### Grid Search over Parameters

In [16]:
from sklearn.model_selection import GridSearchCV

gradboost_clf = GradientBoostingClassifier()
params = {
            'n_estimators': [20, 50, 100, 150],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'loss': ['deviance', 'exponential'],
            'criterion':['friedman_mse']
         }

# n_jobs=-1 => use all processors
gridsearch_clf = GridSearchCV(gradboost_clf, param_grid=params, refit=True, cv=5, n_jobs=-1, verbose=4)

gridsearch_clf.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse'],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2],
                         'loss': ['deviance', 'exponential'],
                         'n_estimators': [20, 50, 100, 150]},
             verbose=4)

In [18]:
gridsearch_clf.best_params_

{'criterion': 'friedman_mse',
 'learning_rate': 0.1,
 'loss': 'exponential',
 'n_estimators': 100}

In [19]:
gridsearch_clf.best_score_

0.6725925925925924

In [20]:
gridsearch_clf.score(X_test, y_test)

0.7130177514792899

## Conclusion

It appears the node features have some notable correlation with the labels (whether or not a gene shows up in an OMIM search for LUAD).