# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import shutil
import os
import scipy

# Global Based Similarity Features

This is the second part of designing graph features; at this stage we design Global Similarity-Based features.

For Global Similarity-Based Approaches we will implement the following indices:

1. Katz Centrality Index

2. Page Rank

3. Random Walk With Restart

For Quasi-Local Similarity-Based Approaches:

1. Third-Order Resource Allocation Based on Common Neighbor Interactions(RACN)

In [None]:
# Load the original train / test / validation link samples, in that order.
x_train, x_test, x_valid = map(
  pd.read_csv,
  ("Data/org_train.csv", "Data/org_test.csv", "Data/org_valid.csv"),
)

In [None]:
# Build the directed training graph from the existing links only.
# NOTE(review): comments = 's' tells networkx that "s" starts a comment, presumably so that a
# textual header row beginning with "s" is skipped — confirm against the CSV file.
train_graph = nx.read_edgelist(
  "Data/train graph.csv",
  comments = 's',
  create_using = nx.DiGraph(),
  nodetype = int,
  delimiter = ",",
)

### Katz Centrality

Katz centrality computes the centrality for a node based on the centrality of its neighbors. It is a generalization of the eigenvector centrality. The Katz centrality for node `i` is

$$x_i = \alpha \sum_{j} A_{ij} x_j + \beta,$$

where `A` is the adjacency matrix of the graph G with eigenvalues $\lambda$.

The parameter $\beta$ controls the initial centrality, and the attenuation factor $\alpha$ must satisfy

$$\alpha < \frac{1}{\lambda_{max}}.$$


It works somewhat like PageRank, where the rank of a node depends on the rank of its neighbors.

In [None]:
# Katz score of every node in the train graph, computed with networkx.
# beta = 1 gives each node an initial score of 1.
katz = nx.katz.katz_centrality(
  train_graph,
  max_iter = 3000,
  alpha = 0.005,
  beta = 1,
  tol = 1e-08,
  normalized = True,
)
# Mean Katz score over all nodes; used later to impute nodes without a score
katz_average = sum(katz.values()) / len(katz)

In [None]:
# Report the mean Katz score that is used as the imputation value for unseen nodes.
print("The Average Katz Value Of All Nodes =", katz_average)

The Average Katz Value Of All Nodes = 0.0007395898700549085


In [None]:
# Attach the Katz score of the source node and of the destination node as two separate columns.
# A node without a Katz score (e.g. one that is not in the train graph) is imputed with the
# average Katz value of all nodes.
katz_columns = (
  ("source_node", "source_katz"),
  ("destination_node", "destination_katz"),
)

# Same treatment for the train, test and validation frames
for frame in (x_train, x_test, x_valid):
  for node_column, feature_column in katz_columns:
    frame[feature_column] = frame[node_column].apply(lambda node : katz.get(node, katz_average))

### Page Rank

PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. It was originally designed as an algorithm to rank web pages.

If a node has many incoming edges its rank increases, and if it has an edge coming from a high-ranking node its rank increases even more.

In [None]:
# Unweighted PageRank of every node, allowing at most 500 power iterations
# (the original author notes that this may not converge).
PageRank = nx.pagerank(
  train_graph,
  max_iter = 500,
  tol = 1e-8,
)
# Mean PageRank over all nodes; used later to impute nodes without a score
PageRank_Average = sum(PageRank.values()) / len(PageRank)

In [None]:
# Report the mean PageRank that is used as the imputation value for unseen nodes.
print("The average value of all page rank =", PageRank_Average)

The average value of all page rank = 5.490375646031004e-07


In [None]:
# Attach the PageRank of each endpoint; nodes without a PageRank get the average value.
rank_columns = (
  ("source_node", "source_rank"),
  ("destination_node", "destination_rank"),
)

for frame in (x_train, x_test, x_valid):
  for node_column, feature_column in rank_columns:
    frame[feature_column] = frame[node_column].apply(lambda node : PageRank.get(node, PageRank_Average))

In [None]:
# Rescale the PageRank columns by 1e5 so the values are not extremely small.
# Every frame is multiplied by the same factor, so relative values are unaffected.
# Note: running this cell a second time rescales the columns again.
for frame in (x_train, x_test, x_valid):
  for column in ("source_rank", "destination_rank"):
    frame[column] *= 1e5

### Random Walk With Restart

A random walk is a type of stochastic process. The simplest explanation of a random walk is literal walking: each step you take is determined probabilistically. This implies that at each index of time, you have moved in a certain direction based on a probabilistic outcome. This algorithm explores the relationship between each step that you take and its distance from the initial starting point.

Random walk with restart is exactly as a random walk but with one extra component to it. This component can be denoted as the restart probability. Essentially indicating that for every step taken in any direction there is a probability associated to going back to the initial starting position, the origin. In our example above of randomly moving in any direction, there is a chance that you would instantly teleport back to the origin after every step based on this restart probability.

In [None]:
def random_with_restart(g, alpha = 0.85, max_iter = 300, tol = 1e-8):
  """
  Random Walk With Restart score of every node, computed by power iteration.

  Each iteration applies  x <- alpha * (x W) + (1 - alpha) * p  where W is the row-normalised
  adjacency matrix and p is the uniform restart distribution.

  Parameters:
    g        : a networkx graph, or an already built SciPy sparse adjacency matrix
               (row = source node, column = destination node).
    alpha    : probability of following an edge; (1 - alpha) is the restart probability.
    max_iter : maximum number of power iterations.
    tol      : stop as soon as the L1 distance between two successive vectors is below this value.

  Returns:
    1-D numpy array with one score per node. For a graph the entries follow the order of
    g.nodes() (the default row order of the networkx adjacency conversion). An empty graph
    yields an empty array.

  Nodes without outgoing edges keep an all-zero transition row, so the probability mass that
  reaches them is not redistributed (same behaviour as before, but without dividing by zero).
  """
  # local import: a bare `import scipy` does not guarantee that the sparse submodule is loaded
  import scipy.sparse

  # Getting adjacency matrix of the graph
  if scipy.sparse.issparse(g):
    A = g.astype(float)
  else:
    # to_scipy_sparse_matrix was removed in networkx 3.0 in favour of to_scipy_sparse_array
    to_sparse = getattr(nx, "to_scipy_sparse_array", None) or nx.to_scipy_sparse_matrix
    A = to_sparse(g, dtype = float)

  N = A.shape[0]
  if N == 0:
    return np.zeros(0)

  # score vector and restart distribution, both uniform over the N nodes
  x = np.full(N, 1.0 / N)
  p = np.full(N, 1.0 / N)

  # inverse out-weight of every node; left at 0 for nodes without outgoing edges
  out_weight = np.asarray(A.sum(axis = 1)).ravel()
  has_out_edges = out_weight != 0
  inverse_out = np.zeros(N)
  inverse_out[has_out_edges] = 1.0 / out_weight[has_out_edges]

  # W is the row-normalised transition matrix; its transpose lets us write x W as W^T x
  W = scipy.sparse.diags(inverse_out) @ A
  W_transposed = W.T.tocsr()

  for _ in range(max_iter):

    x_prev = x
    x = alpha * (W_transposed @ x) + (1 - alpha) * p

    # L1 norm of the change: summing the signed differences would let them cancel out and
    # report convergence while the vector is still moving
    if np.abs(x - x_prev).sum() < tol:
      print("Convergenec before reached max iteration")
      return x

  return x

In [None]:
# Random Walk With Restart score of every node in the train graph
random_walk = random_with_restart(train_graph)
# Mean score used to impute missing nodes, scaled by 1e6 to avoid very small numbers
average_random_walk = np.mean(random_walk) * 1e6

Convergenec before reached max iteration


In [None]:
def generate_random_walk(frame, scores = None, node_order = None, default = None):
  """
  Generate the random walk features for the data frame passed to it, for example X_Train, X_Test, etc.

  Parameters:
    frame      : data frame with `source_node` and `destination_node` columns.
    scores     : vector of random walk scores, one per node; defaults to the notebook-level
                 `random_walk`.
    node_order : iterable giving the node that each position of `scores` belongs to; defaults to
                 `train_graph.nodes()`, the order networkx uses for the adjacency matrix when no
                 node list is given.
    default    : value used for nodes that have no score (already on the 1e6 scale); defaults to
                 the notebook-level `average_random_walk`.

  Returns:
    (source_random_walk, destination_random_walk): two lists aligned with the rows of `frame`,
    holding the node score multiplied by 1e6, or `default` when the node is unknown.
  """
  # Fall back to the notebook-level objects when nothing is passed explicitly
  if scores is None:
    scores = random_walk
  if node_order is None:
    node_order = train_graph.nodes()
  if default is None:
    default = average_random_walk

  # Position of every node inside the score vector. Indexing with `node - 1` is only valid when
  # node ids happen to be 1..N in matrix order, and it silently wraps around for node id 0.
  position = {node: index for index, node in enumerate(node_order)}

  def scaled_score(node):
    index = position.get(node)
    if index is None:
      return default
    return scores[index] * 1e6

  source_random_walk = [scaled_score(node) for node in frame.source_node]
  destination_random_walk = [scaled_score(node) for node in frame.destination_node]

  return source_random_walk, destination_random_walk

In [None]:
# Attach the source / destination random walk scores to every frame
for frame in (x_train, x_test, x_valid):
  frame["source_random_walk"], frame["destination_random_walk"] = generate_random_walk(frame)

In [None]:
# Cache successors, predecessors, in degree and out degree of every node in the train graph.
# These values are looked up very frequently below, so this memoization makes the code run
# much faster (the original author reports roughly a 180X speed-up).
# Every node comes from train_graph itself, so the lookups cannot fail and no imputation branch
# is needed; pairs containing nodes outside the graph are handled where the tables are read.
succs = {node: set(train_graph.successors(node)) for node in train_graph.nodes()}
preds = {node: set(train_graph.predecessors(node)) for node in train_graph.nodes()}
in_degrees = dict(train_graph.in_degree())
out_degrees = dict(train_graph.out_degree())

# Former degree imputation value; no longer used in this cell, kept in case other cells read it
avg = 5.6

### Third-Order Resource Allocation Based on Common Neighbor Interactions (RACN)

This metric was designed to detect the graph structure around the two target nodes, in which the resources of nodes are allocated to their
neighbors, so it is related to the number of neighbors of the two nodes and to the successors and predecessors of those neighbors. **This metric, like Random Walk With Restart, is not implemented in the library, so I had to implement both myself.**

In [None]:
def racn(u, v):
  """
  Third Order Resource Allocation index based on Common Neighbor interactions for the pair (u, v).

  The score is the absolute value of the sum of two terms:
    * allocation term: 1 / in-degree of every successor shared by u and v;
    * interaction term: for every successor a of u and successor b of v with a != b and an
      edge a -> b, the difference 1/k_a - 1/k_b is added whenever k_a > k_b, where k is the
      in-degree plus the out-degree of the node.

  Returns 0 straight away when u and v share no successor.
  Reads the notebook-level tables succs, preds, in_degrees and out_degrees; a node that is
  missing from them raises KeyError.
  """
  targets_u, targets_v = succs[u], succs[v]
  shared = targets_u.intersection(targets_v)

  # Nothing in common: the index is defined as zero here
  if not shared:
    return 0

  allocation = 0.0
  for common in shared:
    allocation += 1 / in_degrees[common]

  interaction = 0.0
  for a in targets_u:
    for b in targets_v:
      # only distinct neighbours that are linked a -> b interact
      if a == b or a not in preds[b]:
        continue
      k_a = in_degrees[a] + out_degrees[a]
      k_b = in_degrees[b] + out_degrees[b]
      if k_a > k_b:
        interaction += ((1 / k_a) - (1 / k_b))

  # Only the magnitude is kept, the sign is not of interest
  return np.abs(allocation + interaction)

In [None]:
def generate_racn(frame):
  """
  Generate the RACN feature for the data frame passed to it, for example X_Train, X_Test, etc.

  Parameters:
    frame : data frame with `source_node` and `destination_node` columns.

  Returns:
    list with one RACN value per row of `frame`. If a node of the pair is not in the lookup
    tables built from the train graph, the value is imputed with zero.
  """
  results = []
  for source, destination in zip(frame.source_node, frame.destination_node):
    try:
      results.append(racn(source, destination))
    except KeyError:
      # A node is missing from the lookup tables; any other error is a real bug and must surface
      results.append(0)
  return results

In [None]:
# Attach the RACN score of every (source, destination) pair to each frame
for frame in (x_train, x_test, x_valid):
  frame["racn"] = generate_racn(frame)

In [None]:
# Preview the first ten rows of the training frame with the features added so far.
x_train.head(10)

Unnamed: 0,source_node,destination_node,label,source_katz,source_rank,destination_rank,source_random_walk,destination_random_walk,racn
0,712635,150377,1,0.000736,0.062449,0.029567,0.273411,0.965121,0.0
1,1627794,174448,0,0.000725,0.021017,0.062142,0.082356,0.762205,0.0
2,1304062,391971,0,0.000721,0.011936,0.08829,0.549038,0.869534,0.0
3,485532,637180,0,0.000721,0.011936,0.056763,0.309079,0.26396,0.0
4,583251,1021767,1,0.000736,0.033136,0.015527,0.456601,0.436576,0.0
5,481131,836938,1,0.001028,0.457588,0.06677,0.788137,0.232027,0.050088
6,587779,334336,1,0.00082,0.226426,0.115005,0.293321,1.790829,0.462079
7,1491051,773032,0,0.000783,0.134629,0.011936,0.549038,0.146073,0.0
8,189069,272812,1,0.000721,0.011936,0.066705,0.675744,0.750177,0.0
9,1416904,1007057,1,0.000971,0.367171,0.01733,0.082356,0.630529,0.0


In [None]:
# Remove the columns that are not features (node identifiers and the label), in place.
non_feature_columns = ["source_node", "destination_node", "label"]
for frame in (x_train, x_test, x_valid):
  frame.drop(columns = non_feature_columns, inplace = True)

In [None]:
# Preview the training frame after the identifier and label columns were dropped.
x_train.head()

Unnamed: 0,source_katz,source_rank,destination_rank,source_random_walk,destination_random_walk,racn
0,0.000736,0.062449,0.029567,0.273411,0.965121,0.0
1,0.000725,0.021017,0.062142,0.082356,0.762205,0.0
2,0.000721,0.011936,0.08829,0.549038,0.869534,0.0
3,0.000721,0.011936,0.056763,0.309079,0.26396,0.0
4,0.000736,0.033136,0.015527,0.456601,0.436576,0.0


In [None]:
# Persist the feature frames of this stage, without the index column
outputs = (
  (x_train, "Data/x_train_2.csv"),
  (x_test, "Data/x_test_2.csv"),
  (x_valid, "Data/x_valid_2.csv"),
)
for frame, path in outputs:
  frame.to_csv(path, index = False)