## Proteins expression predictions (using Node2Vec approach)

#### Plan:
<ul>
    <li>Use graph structure to train Node2Vec model and get node embeddings for each protein</li>
    <li>Use any regressor model to train on node embeddings and predict protein expression</li>
    <li>Get embeddings for test data and validate regressor</li>
</ul>

In [140]:
#initial imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg
from stellargraph import StellarGraph
from tensorflow import keras
from tqdm import tqdm

In [18]:
#stellar graph imports for Node2Vec
from stellargraph.data import BiasedRandomWalk, EdgeSplitter, UniformRandomWalk, UnsupervisedSampler
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec, link_classification

### Load graph data

In [4]:
edges = pd.read_csv("https://raw.githubusercontent.com/a-milenkin/Otus_HW_protein_expression/main/edges.csv", sep=",") 
edges.head()

Unnamed: 0,node_1,node_2
0,344,50
1,344,153
2,344,532
3,344,679
4,344,986


In [6]:
Graphtype = nx.Graph()
G = nx.from_pandas_edgelist(edges, source='node_1', target='node_2', create_using=Graphtype)

In [7]:
print(nx.info(G))

Graph with 10000 nodes and 594174 edges



  print(nx.info(G))


#### Train Node2Vec model

In [120]:
batch_size = 64
epochs = 10
emb_size = 128
walk_number = 100
walk_length = 5

In [121]:
SG = StellarGraph.from_networkx(G)

In [122]:
rw = BiasedRandomWalk(SG, n=walk_number, length=walk_length, p=0.5, q=2.0)

In [123]:
unsupervised_sampler = UnsupervisedSampler(SG, nodes=list(SG.nodes()), walker=rw)

In [124]:
generator = Node2VecLinkGenerator(SG, batch_size)

In [125]:
node2vec = Node2Vec(emb_size, generator=generator)

In [126]:
x_inp, x_out = node2vec.in_out_tensors()

In [127]:
prediction = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method="dot")(x_out)

link_classification: using 'dot' method to combine node embeddings into edge embeddings


In [128]:
model = keras.Model(inputs=x_inp, outputs=prediction)

In [129]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=keras.losses.binary_crossentropy)

In [130]:
history = model.fit(
    generator.flow(unsupervised_sampler), epochs=epochs, verbose=1, shuffle=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Get node embeddings for  train data

In [27]:
train = pd.read_csv("https://raw.githubusercontent.com/a-milenkin/Otus_HW_protein_expression/main/train.csv", sep=",") 
train.head()

Unnamed: 0,target,node
0,0.251968,11142
1,0.689541,2243
2,0.678245,15514
3,0.2725,20944
4,0.248888,8721


In [43]:
train.shape

(8000, 2)

In [105]:
train_node_names = train["node"].values

In [106]:
train_node_names

array([11142,  2243, 15514, ..., 20535,  9769, 15711])

In [107]:
x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [108]:
node_gen = Node2VecNodeGenerator(SG, batch_size).flow(train_node_names)

In [109]:
node_embeddings = embedding_model.predict(node_gen, verbose=1)



In [35]:
node_embeddings.shape

(8000, 128)

### Build regressor model to predict proteins level

In [44]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [135]:
regressor = XGBRegressor(n_estimators=150, max_depth=15,  learning_rate=0.01)

In [136]:
targets = train["target"].values

In [138]:
scores = cross_val_score(estimator=regressor, X=node_embeddings, y=targets, cv=4, scoring="neg_mean_squared_error", n_jobs=-1)

In [151]:
print(f"Validation results: {scores}")

Validation results: [-0.43744031 -0.45344047 -0.43669918 -0.56970117]


In [141]:
regressor.fit(node_embeddings, targets)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=15,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=150, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Validate on test data

In [57]:
test = pd.read_csv("https://raw.githubusercontent.com/a-milenkin/Otus_HW_protein_expression/main/test.csv", sep=",")
test.head()

Unnamed: 0,target,node
0,0.279231,817
1,0.380795,9574
2,0.686527,1607
3,0.303594,4782
4,0.367374,24125


In [142]:
test_node_names = test["node"].values

In [143]:
node_gen_test = Node2VecNodeGenerator(SG, 1).flow(test_node_names)

In [144]:
test_node_embeddings = embedding_model.predict(node_gen_test, verbose=1)



In [145]:
predicts = regressor.predict(test_node_embeddings)

In [154]:
mse_test = mean_squared_error(test["target"].values, predicts)
print(f"MeanSquaredError for test data: {mse_test}")

MeanSquaredError for test data: 0.764815189247934


### Check RandomForestRegressor

In [112]:
from sklearn.ensemble import RandomForestRegressor

In [161]:
regressor_forest = RandomForestRegressor(max_depth=15, n_jobs=-1, n_estimators=150)

In [162]:
regressor_forest.fit(node_embeddings, targets)

RandomForestRegressor(max_depth=15, n_estimators=150, n_jobs=-1)

In [164]:
predicts_forest = regressor_forest.predict(test_node_embeddings)
mse_test = mean_squared_error(test["target"].values, predicts_forest)
print(f"MeanSquaredError for test data (for RandomForest): {mse_test}")

MeanSquaredError for test data (for RandomForest): 0.762426649964919


### Compare with random distribution

In [170]:
import random

random_test_values = random.choices(targets, k=test.shape[0])
mse_test = mean_squared_error(test["target"].values, random_test_values)
print(f"MeanSquaredError for test data (for Random model): {mse_test}")

MeanSquaredError for test data (for Random model): 1.2487912419337044


### Выводы. Лучшая метрика по MSE получилась 0.76242. Не идеальное решение. Однако сравнивая с рандомной моделью полученной путем рандомной дистрибуции значений из тренировочной выборки, мы имеем приросто в качестве более чем в 1.63 раза. Для улучшения качества все же нужны доп. данные, кроме связей самих белков.