In [1]:
import torch
from community import community_louvain
from torch_geometric.data import Data, InMemoryDataset
import random
import networkx as nx
import math
import pandas as pd
from torch_geometric.utils import from_networkx
import numpy as np


In [2]:
# read and process input data
with open("../data/HMIN_edgelist.csv", 'r') as data:
    G = nx.parse_edgelist(data, delimiter=',', create_using=nx.Graph(), nodetype=int)
# autism df
autism_df = pd.read_csv('../data/labeled_genes.csv')
autism_df = autism_df.drop_duplicates(subset='entrez_id', keep="last")
autism_nodes = autism_df['entrez_id'].to_list()

G = G.subgraph(autism_nodes)


In [3]:
autism_df['label'][autism_df['confidence']==0.75] = 2
autism_df['label'][autism_df['confidence'] == 0.5] = 3
# y label
y = torch.tensor(autism_df['label'].to_list())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  autism_df['label'][autism_df['confidence'] == 0.5] = 3


In [4]:
autism_df['label'].value_counts()

0    1102
3     491
2     163
1      79
Name: label, dtype: int64

In [5]:
# Hand engineered features
# feature: node degree
degrees = torch.tensor([val for (node, val) in G.degree()], dtype=torch.float)
# closeness
closeness = torch.tensor([val for (node, val) in nx.closeness_centrality(G).items()], dtype=torch.float)
#  Betweenness
betweenness =  torch.tensor([val for (node, val) in nx.betweenness_centrality(G).items()], dtype=torch.float)
# feature: eigenvector_centrality
ec = torch.tensor([val for (node, val) in nx.eigenvector_centrality(G).items()], dtype=torch.float)
# feature: page rank
pr = torch.tensor([val for (node, val) in nx.pagerank(G, alpha=0.9).items()], dtype=torch.float)
# feature: coreness
coreness = torch.tensor([val for (node, val) in nx.core_number(G).items()], dtype=torch.float)

x = torch.stack((degrees, closeness, betweenness, pr, ec, coreness)).t()

In [6]:
x.shape

torch.Size([1835, 6])

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [8]:
X = x.numpy()
y = y.numpy()

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=0)

Random Forest

In [22]:
# parameter tuning
parameters = {'max_depth':[i for i in range(3,15)],
              'n_estimators':[i*50 for i in range(1,3)]}
model_rf = RandomForestClassifier()
gs_rf = GridSearchCV(model_rf, parameters)
gs_rf.fit(X_train, y_train)
print(gs_rf.best_params_)
print(gs_rf.best_score_)

{'max_depth': 3, 'n_estimators': 50}
0.6049035313784217


In [23]:
model_rf = RandomForestClassifier(**gs_rf.best_params_)
model_rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=3, n_estimators=50)

In [24]:
y_pred = model_rf.predict(X_test)

In [25]:
accuracy_score(y_test, y_pred)

0.5912806539509536

In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.60      0.74       363
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.01      0.25      0.02         4

    accuracy                           0.59       367
   macro avg       0.25      0.21      0.19       367
weighted avg       0.98      0.59      0.73       367



  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# 5 fold cross validation
all_accuracies = cross_val_score(estimator=model_rf, X=X_train, y=y_train, cv=5, n_jobs=-1)
print(all_accuracies.mean())

0.6035406654129228


SVM


In [28]:
from sklearn.svm import SVC

# parameter tuning
parameters = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
model_SVM = SVC(kernel='rbf')
gs_SVM = GridSearchCV(model_SVM, parameters)
gs_SVM.fit(X_train, y_train)
print(gs_SVM.best_params_)
print(gs_SVM.best_score_)

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.6035383436651112


In [29]:
model_SVM = SVC(**gs_SVM.best_params_)
model_SVM.fit(X_train,y_train)

SVC(C=1, gamma=0.01)

In [30]:
y_pred = model_SVM.predict(X_test)

In [31]:
accuracy_score(y_test, y_pred)

0.5912806539509536

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.60      0.74       362
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.01      0.20      0.02         5

    accuracy                           0.59       367
   macro avg       0.25      0.20      0.19       367
weighted avg       0.97      0.59      0.73       367



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# 5 fold cross validation
all_accuracies = cross_val_score(estimator=model_SVM, X=X_train, y=y_train, cv=5, n_jobs=-1)
print(all_accuracies.mean())

0.6035383436651112
