In [1]:
import torch
import random
import networkx as nx
import math
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# seed for reproducibility
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


In [2]:
# read and process input data
with open("../data/HMIN_edgelist.csv", 'r') as data:
    G = nx.parse_edgelist(data, delimiter=',', create_using=nx.Graph(), nodetype=int)
# autism df
autism_df = pd.read_csv('../data/labeled_genes.csv')
autism_df = autism_df.drop_duplicates(subset='entrez_id', keep="last")
autism_nodes = autism_df['entrez_id'].to_list()

G = G.subgraph(autism_nodes)


In [3]:
autism_df['label'][autism_df['confidence'] == 0.5] = 0
# y label
y = torch.tensor(autism_df['label'].to_list()).numpy()
# autism_df['label'].value_counts()

In [4]:
# Hand engineered features
# feature: node degree
degrees = torch.tensor([val for (node, val) in G.degree()], dtype=torch.float)
# closeness
closeness = torch.tensor([val for (node, val) in nx.closeness_centrality(G).items()], dtype=torch.float)
#  Betweenness
betweenness =  torch.tensor([val for (node, val) in nx.betweenness_centrality(G).items()], dtype=torch.float)
# feature: eigenvector_centrality
ec = torch.tensor([val for (node, val) in nx.eigenvector_centrality(G).items()], dtype=torch.float)
# feature: page rank
pr = torch.tensor([val for (node, val) in nx.pagerank(G, alpha=0.9).items()], dtype=torch.float)
# feature: coreness
coreness = torch.tensor([val for (node, val) in nx.core_number(G).items()], dtype=torch.float)

X = torch.stack((degrees, closeness, betweenness, pr, ec, coreness)).t().numpy()

In [5]:
X.shape

(1835, 6)

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score,  recall_score, precision_score

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.3, random_state=0)

In [14]:
X_test,X_val,y_test,y_val =train_test_split(X_test, y_test, test_size=0.5, random_state=0)

### Random Forest

Hand parameter tuning the hyperparameters based on the performance on the validation set:
max_depth: 5 - 25
n_estimators: 50 - 150

In [30]:
model_rf = RandomForestClassifier(max_depth=15, n_estimators=100)
model_rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=15)

In [31]:
y_pred = model_rf.predict(X_val)

In [32]:
accuracy_score(y_val, y_pred)

0.8586956521739131

In [33]:
y_pred = model_rf.predict(X_test)

In [34]:
accuracy_score(y_test, y_pred)

0.8436363636363636

In [35]:
f1 = f1_score(y_pred, y_test, average='weighted')
precision = precision_score(y_pred, y_test, average='weighted')
recall = recall_score(y_pred, y_test,average='weighted')
print('f1: ', f1, 'precision: ', precision, 'recall: ', recall )

f1:  0.9052035144342835 precision:  0.9764642166344294 recall:  0.8436363636363636


### SVM


Hand parameter tuning the hyperparameters based on the performance on the validation set:
C: 0.1, 1, 10, 100, 1000
gamma: 1, 0.1, 0.01, 0.001, 0.0001
kernel: rbf

In [49]:
model_SVM = SVC(C=100, gamma=0.001)
model_SVM.fit(X_train,y_train)

SVC(C=100, gamma=0.001)

In [50]:
y_pred = model_SVM.predict(X_test)

In [51]:
accuracy_score(y_test, y_pred)

0.850909090909091

In [52]:
f1 = f1_score(y_pred, y_test, average='weighted')
precision = precision_score(y_pred, y_test, average='weighted')
recall = recall_score(y_pred, y_test,average='weighted')
print('f1: ', f1, 'precision: ', precision, 'recall: ', recall )

f1:  0.916106447579925 precision:  0.9921237911025145 recall:  0.850909090909091


### LGP


This is an evolutionary learning algorithm, Hyperparamter details can be found in this [website](https://smile-mib.readthedocs.io/en/latest/?badge=latest).
Hand parameter tuning the hyperparameters based on the performance on the validation set:
tournamentSize: 4, 6, 8
min_prog_ini_length: 5, 10, 15
max_prog_ini_length: 20, 30, 40
pCrossover: 0.6, 0.7, 0.8
pMacro: 0.6, 0.7, 0.8
pMicro: 0.6, 0.7, 0.8


In [55]:
from linear_genetic_programming.lgp_classifier import LGPClassifier

In [58]:
lgp = LGPClassifier(numberOfInput = X_train.shape[1], numberOfVariable = 200, populationSize = 100,
                        fitnessThreshold = 1.0, max_prog_ini_length = 20, min_prog_ini_length = 5,
                        maxGeneration = 5, tournamentSize = 4, showGenerationStat=True,
                        isRandomSampling=True, maxProgLength = 500)

In [59]:
lgp.fit(X_train, y_train)

Gen|Best Indv|  CE  |Pop Avg|Ran Sampling|AvgProgLen|AvgEffProgLen
--- --------- ------ ------- ------------ ---------- -------------
  0|     0.89|    89|   0.73|         929|     18.18|         0.96
  1|     0.89|    72|   0.83|         970|     19.57|         1.04
  2|     0.90|    73|   0.85|        1146|     21.28|         1.05
  3|     0.90|    73|   0.87|        1181|     24.59|         1.12
  4|     0.90|    73|   0.84|        1078|     24.13|         1.10


LGPClassifier(maxGeneration=5, maxProgLength=500, max_prog_ini_length=20,
              min_prog_ini_length=5, numberOfInput=6, numberOfVariable=200,
              populationSize=100, tournamentSize=4)

In [60]:
y_pred = model_rf.predict(X_val)

In [62]:
accuracy_score(y_val, y_pred)

0.8586956521739131

In [63]:
y_pred = model_rf.predict(X_test)

In [64]:
accuracy_score(y_test, y_pred)

0.8436363636363636

In [65]:
f1 = f1_score(y_pred, y_test, average='weighted')
precision = precision_score(y_pred, y_test, average='weighted')
recall = recall_score(y_pred, y_test,average='weighted')
print('f1: ', f1, 'precision: ', precision, 'recall: ', recall )

f1:  0.9052035144342835 precision:  0.9764642166344294 recall:  0.8436363636363636
