# Importing Libraries

In [None]:
import networkx as nx
import shutil
import pandas as pd
import os
import numpy as np
import tqdm
from karateclub import NetMF

# Create Graph Embedding Features

Now that we have created our sorted graph, we can pass it to the KarateClub library to obtain a geometric graph embedding for each node. Specifically, we use the NetMF embedding algorithm with a 32-dimensional embedding space for each node. Nodes that do not appear in our train graph (because of the data splitting and the creation of negative edges) have no embedding, so we drop the rows that contain them from our data.

In [None]:
# Load the sorted graph that was created previously
sorted_graph = nx.read_gpickle("Data/sorted_graph.gpickle")

# Load the train graph; its nodes are used to get the base and reflection nodes.
# NOTE(review): comments='s' presumably makes networkx skip a header line that
# starts with "s" -- confirm against the CSV.
train_graph = nx.read_edgelist(
    "Data/train graph.csv",
    delimiter=",",
    nodetype=int,
    create_using=nx.DiGraph(),
    comments='s',
)

In [None]:
# Two-way lookup between node ids and their position in train_graph.nodes():
#   base_nodes:  position -> node id
#   reflection:  node id  -> position
base_nodes = {}
reflection = {}

# enumerate supplies the running position instead of a hand-maintained counter
for i, node in enumerate(train_graph.nodes()):
  base_nodes[i] = node
  reflection[node] = i

In [None]:
# Fit NetMF on the sorted graph; the three settings are spelled out by name
model = NetMF(dimensions=32, iteration=100, order=2)
model.fit(sorted_graph)

# Pull the embedding out of the fitted model and persist it to disk
embedding = model.get_embedding()
np.save("Data/NetMF_embed.npy", embedding)

In [None]:
# Load the train, test and validation data frames (in that order); per the
# original note they include only source node, destination node and the label
org_train, org_test, org_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/org_train.csv",
        "Data/org_test.csv",
        "Data/org_valid.csv",
    )
)

In [None]:
def get_embedding(df):
  """
  Look up the graph embedding of the source and destination node of every row.

  Reads the module-level ``embedding`` array and ``reflection`` mapping: each
  node id is translated through ``reflection`` and the result is used to index
  ``embedding``.

  Args:
    df: the dataframe being processed; column 0 is used as the source node
      and column 1 as the destination node.

  Returns:
    Three lists: source_embedding, destination_embedding and removed, where
    removed holds the positional indices of the rows that were skipped because
    the lookup of one of their nodes failed.
  """
  source_embedding = []
  destination_embedding = []
  # positions of the skipped rows, kept so they can be excluded later
  removed = []

  for index, row in enumerate(tqdm.tqdm(df.values)):
    try:
      # both lookups run before either append, so the two lists stay aligned
      x = embedding[reflection[row[0]]]
      y = embedding[reflection[row[1]]]
    except (KeyError, IndexError):
      # KeyError: node missing from `reflection`; IndexError: position outside
      # `embedding` (or a row with fewer than two columns). Anything else,
      # e.g. KeyboardInterrupt, propagates instead of being counted as removed.
      removed.append(index)
      continue
    source_embedding.append(x)
    destination_embedding.append(y)

  return source_embedding, destination_embedding, removed

In [None]:
# Embeddings for the train split, plus the indices of the rows that were removed
(
    train_source_embedding,
    train_destination_embedding,
    train_removed,
) = get_embedding(org_train)

100%|██████████| 16798783/16798783 [00:57<00:00, 293521.28it/s]


In [None]:
# Embeddings for the test split, plus the indices of the rows that were removed
(
    test_source_embedding,
    test_destination_embedding,
    test_removed,
) = get_embedding(org_test)

100%|██████████| 1567572/1567572 [00:09<00:00, 165041.66it/s]


In [None]:
# Embeddings for the validation split, plus the indices of the rows that were removed
(
    valid_source_embedding,
    valid_destination_embedding,
    valid_removed,
) = get_embedding(org_valid)

100%|██████████| 508683/508683 [00:01<00:00, 271355.81it/s]


In [None]:
# Convert the train result lists to NumPy arrays
train_source_embedding = np.array(train_source_embedding)
train_destination_embedding = np.array(train_destination_embedding)
# dtype=int keeps the index array integer-typed even when no row was removed;
# np.array([]) on its own would default to float64
train_removed = np.array(train_removed, dtype=int)

In [None]:
# Convert the test result lists to NumPy arrays
test_source_embedding = np.array(test_source_embedding)
test_destination_embedding = np.array(test_destination_embedding)
# dtype=int keeps the index array integer-typed even when no row was removed;
# np.array([]) on its own would default to float64
test_removed = np.array(test_removed, dtype=int)

In [None]:
# Convert the validation result lists to NumPy arrays
valid_source_embedding = np.array(valid_source_embedding)
valid_destination_embedding = np.array(valid_destination_embedding)
# dtype=int keeps the index array integer-typed even when no row was removed;
# np.array([]) on its own would default to float64
valid_removed = np.array(valid_removed, dtype=int)

In [None]:
# For each split, stack the source and destination embeddings horizontally
# into a single NumPy array (source first, destination second)
train_embedding = np.hstack([train_source_embedding, train_destination_embedding])
test_embedding = np.hstack([test_source_embedding, test_destination_embedding])
valid_embedding = np.hstack([valid_source_embedding, valid_destination_embedding])

In [None]:
# Persist the merged embedding of every split; np.save appends ".npy" to a
# file name that does not already end with it
for _path, _array in (
    ("Data/train_embedding", train_embedding),
    ("Data/test_embedding", test_embedding),
    ("Data/valid_embedding", valid_embedding),
):
  np.save(_path, _array)

In [None]:
# Persist the removed row indices of every split; np.save appends ".npy" to a
# file name that does not already end with it
for _path, _array in (
    ("Data/train_removed", train_removed),
    ("Data/test_removed", test_removed),
    ("Data/valid_removed", valid_removed),
):
  np.save(_path, _array)