# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import shutil
import os
import scipy

# General Features

It is now time to design general features derived from the structure of the graph itself. We shall implement and create the following features:

1.   Shortest Path
2.   Communities
3.   Mutual Following
4.   Edges Weighting




In [None]:
# Load the train / test / validation splits of candidate (source, destination) node pairs.
x_train, x_test, x_valid = (
  pd.read_csv(csv_path)
  for csv_path in ("Data/org_train.csv", "Data/org_test.csv", "Data/org_valid.csv")
)

# Build the directed training graph from the edge list of existing links only.
# NOTE(review): comments = 's' presumably serves to skip a textual header row that
# starts with "s" (e.g. "source_node,...") -- confirm against the CSV file.
train_graph = nx.read_edgelist(
  "Data/train graph.csv",
  delimiter = ",",
  nodetype = int,
  create_using = nx.DiGraph(),
  comments = 's',
)

In [None]:
# Preview the first rows of the training pairs before any feature is added.
x_train.head()

Unnamed: 0,source_node,destination_node,label
0,712635,150377,1
1,1627794,174448,0
2,1304062,391971,0
3,485532,637180,0
4,583251,1021767,1


### Shortest Path

This feature is the length of the shortest path between two nodes. If there is an edge between them (one follows the other, or both do), the shortest path is trivially equal to 1, so computing it directly is meaningless. In that case we temporarily remove the direct edge and compute the shortest path again, to find how far apart the two nodes are without the edge that links them.


In [None]:
def shortest_path(u, v):
  """
  Compute the length of the shortest directed path from node u to node v in train_graph,
  ignoring the direct edge u -> v when it exists (that edge would always give a length of 1).
  The direct edge is removed only for the duration of the search and is always restored,
  so train_graph is left unchanged by this call.
  Args: u is the source node
        v is the target node
  Returns: the number of edges on the shortest path from u to v, or -1 when there is no such
           path or when one of the nodes is not part of the graph.
  """
  # If there is a direct edge from u to v, hide it temporarily (remembering its attributes)
  had_direct_edge = train_graph.has_edge(u, v)
  if had_direct_edge:
    edge_attributes = dict(train_graph.get_edge_data(u, v))
    train_graph.remove_edge(u, v)
  try:
    return nx.shortest_path_length(train_graph, source = u, target = v)
  except (nx.NetworkXNoPath, nx.NodeNotFound):
    # v is unreachable from u, or a node is missing from the graph
    return -1
  finally:
    # Recreate the removed edge on every exit path, including "no path" and interruptions;
    # otherwise the graph would silently lose edges as rows are processed.
    if had_direct_edge:
      train_graph.add_edge(u, v, **edge_attributes)

In [None]:
# Attach the shortest-path feature to every split with the same row-wise call.
# NOTE(review): the outputs recorded in this notebook show this column as
# "shortest_path_length", not "shortest_path" -- confirm which name downstream code expects.
for split_frame in (x_train, x_test, x_valid):
  split_frame["shortest_path"] = split_frame.apply(
    lambda row : shortest_path(row.source_node, row.destination_node),
    axis = 1,
  )

In [None]:
# # Manually generate shortest path series for each data frame we have
# def generate_shortest_lengths(frame):
#   lengths = []
#   for source, destination in zip(frame.source_node, frame.destination_node):
#     lengths.append(shortest_path(source, destination))
#   return lengths

In [None]:
# Sanity check: the training frame should now carry the shortest-path feature column.
x_train.head()

Unnamed: 0,source_node,destination_node,label,shortest_path_length
0,712635,150377,1,-1
1,1627794,174448,0,-1
2,1304062,391971,0,-1
3,485532,637180,0,8
4,583251,1021767,1,-1


### Detecting Communities

We define a community in a directed graph as a part of the graph in which every node is reachable from every other node in the underlying undirected graph.

A strongly connected component of a directed graph is a subset of nodes in which every node is reachable from every other node when edge directions are respected.

A weakly connected component is a subset of nodes that are all reachable from each other once edge directions are ignored (i.e. in the underlying undirected graph); hence we shall use this concept to define communities in our graph.

In [None]:
# Getting the weakly connected components of the graph.
# Materialised as a list so that detect_communites can scan it repeatedly; it is computed
# once here, so it is not refreshed when edges are temporarily removed later on.
weakly_connected_components = list(nx.weakly_connected_components(train_graph))

def _component_containing(node):
  """
  Return the entry of weakly_connected_components that holds node,
  or an empty tuple when node is in none of them.
  """
  for component in weakly_connected_components:
    if node in component:
      return component
  return ()

def detect_communites(u, v):
  """
  Given two nodes u and v this function will return 1 if those nodes are in the same community, otherwise it will return 0.
  The decision is taken as follows:
    * the reverse edge v -> u exists: 1
    * u and v do not share a weakly connected component: 0
    * they share a component and there is no direct edge u -> v: 1
    * only the direct edge u -> v exists: the edge is removed temporarily and the result is 1
      only if shortest_path still finds a path from u to v (the edge is not their only link).
  Args: u is the source node
        v is the target node
  """
  # First case: an edge from target to source puts both nodes in the same community.
  if train_graph.has_edge(v, u):
    return 1

  # In every remaining case the nodes must at least share a weakly connected component.
  if v not in _component_containing(u):
    return 0

  # Third case: no direct edge at all between u and v, sharing the component is enough.
  if not train_graph.has_edge(u, v):
    return 1

  # Second case: only the edge u -> v exists. Remove it to see whether u and v are related
  # through this link alone.
  edge_attributes = dict(train_graph.get_edge_data(u, v))
  train_graph.remove_edge(u, v)
  try:
    return 0 if shortest_path(u, v) == -1 else 1
  finally:
    # Always put the edge back, even if the path search raises or is interrupted,
    # so the graph is never left with a missing edge.
    train_graph.add_edge(u, v, **edge_attributes)

In [None]:
def generate_community(frame):
  """
  Compute the same-community flag for every (source_node, destination_node) pair of a frame.
  Args: frame is an object exposing iterable source_node and destination_node attributes
        (the data frames of this notebook)
  Returns: a list with one detect_communites result per pair, in row order
  """
  return [
    detect_communites(source, destination)
    for source, destination in zip(frame.source_node, frame.destination_node)
  ]

In [None]:
# Attach the same-community flag to every split.
for split_frame in (x_train, x_test, x_valid):
  split_frame["same_community"] = generate_community(split_frame)

In [None]:
# Sanity check: the training frame should now also carry the same_community column.
x_train.head()

Unnamed: 0,source_node,destination_node,label,shortest_path_length,same_community
0,712635,150377,1,-1,1
1,1627794,174448,0,-1,1
2,1304062,391971,0,-1,0
3,485532,637180,0,8,1
4,583251,1021767,1,-1,0


### Mutual Following?

Suppose we try to predict whether there is an edge from u to v and we know that there is an edge from v to u; in this case there is a bigger chance that there is an edge from u to v.

In [None]:
def mutual_following(u, v):
  """
  Flag whether the reverse link v -> u is present in train_graph.
  Args: u is the source node
        v is the target node
  Returns: 1 when train_graph has the edge (v, u), otherwise 0
  """
  return 1 if train_graph.has_edge(v, u) else 0

In [None]:
# Attach the reverse-edge flag to every split with the same row-wise call.
for split_frame in (x_train, x_test, x_valid):
  split_frame["mutual_following"] = split_frame.apply(
    lambda row : mutual_following(row["source_node"], row["destination_node"]),
    axis = 1,
  )

In [None]:
# Sanity check: the training frame should now also carry the mutual_following column.
x_train.head()

Unnamed: 0,source_node,destination_node,label,shortest_path_length,same_community,mutual_following
0,712635,150377,1,-1,1,1
1,1627794,174448,0,-1,1,0
2,1304062,391971,0,-1,0,0
3,485532,637180,0,8,1,0
4,583251,1021767,1,-1,0,0


### Edge Weighting

For this part we will implement handcrafted features related to the incoming and outgoing edges. We will add the following features:

<ol>
<li>Weight Features
    <ul>
        <li>weight of incoming edges</li>
        <li>weight of outgoing edges</li>
        <li>weight of incoming edges + weight of outgoing edges</li>
        <li>weight of incoming edges * weight of outgoing edges</li>
    </ul>
</li>
</ol>


Explanation:

An edge weight value is calculated between nodes. Edge weight decreases as the neighbor count goes up. Intuitively, consider one million people following a celebrity on a social network then chances are most of them never met each other or the celebrity. On the other hand, if a user has 30 contacts in his / her social network, the chances are higher that many of them know each other. 
**credit** - Graph-based Features for Supervised Link Prediction William Cukierski, Benjamin Hamner, Bo Yang


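\begin{equation}
W = \frac{1}{\sqrt{1+|X|}}
\end{equation}

where $|X|$ is the number of incoming edges of the node (for the incoming weight) or the number of outgoing edges of the node (for the outgoing weight).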


**Note**: Some of these features may be linearly related to some of the features from the first part (Local Similarity Features); we therefore let the model choose the most useful features and discard the others.

In [None]:
# W = 1 / sqrt(1 + degree), computed per node for incoming and outgoing edges separately.
weight_in = {
  node: 1 / np.sqrt(1 + train_graph.in_degree(node))
  for node in train_graph.nodes()
}
weight_out = {
  node: 1 / np.sqrt(1 + train_graph.out_degree(node))
  for node in train_graph.nodes()
}

# Mean weights over all graph nodes.
node_count = train_graph.number_of_nodes()
weight_in_average = sum(weight_in.values()) / node_count
weight_out_average = sum(weight_out.values()) / node_count

In [None]:
# Look the weights up with a vectorised Series.map on the node-id columns themselves.
# A row-wise apply(axis = 1) hands each row over as a single-dtype Series, so once a float
# column exists the node ids are upcast to float before being used as dictionary keys.
# Nodes missing from the graph come back as NaN from map and are filled with the average weight.
x_train["source_weight_out"] = x_train["source_node"].map(weight_out).fillna(weight_out_average)
x_train["destination_weight_in"] = x_train["destination_node"].map(weight_in).fillna(weight_in_average)

In [None]:
# Look the weights up with a vectorised Series.map on the node-id columns themselves.
# A row-wise apply(axis = 1) hands each row over as a single-dtype Series, so once a float
# column exists the node ids are upcast to float before being used as dictionary keys.
# Nodes missing from the graph come back as NaN from map and are filled with the average weight.
x_test["source_weight_out"] = x_test["source_node"].map(weight_out).fillna(weight_out_average)
x_test["destination_weight_in"] = x_test["destination_node"].map(weight_in).fillna(weight_in_average)

In [None]:
# Look the weights up with a vectorised Series.map on the node-id columns themselves.
# A row-wise apply(axis = 1) hands each row over as a single-dtype Series, so once a float
# column exists the node ids are upcast to float before being used as dictionary keys.
# Nodes missing from the graph come back as NaN from map and are filled with the average weight.
x_valid["source_weight_out"] = x_valid["source_node"].map(weight_out).fillna(weight_out_average)
x_valid["destination_weight_in"] = x_valid["destination_node"].map(weight_in).fillna(weight_in_average)

In [None]:
# Combine the two node weights of each training pair into a product and a sum feature.
source_weight, destination_weight = x_train["source_weight_out"], x_train["destination_weight_in"]
x_train["weight_multiplication"] = source_weight.mul(destination_weight)
x_train["weight_addition"] = source_weight.add(destination_weight)

In [None]:
# Combine the two node weights of each test pair into a product and a sum feature.
source_weight, destination_weight = x_test["source_weight_out"], x_test["destination_weight_in"]
x_test["weight_multiplication"] = source_weight.mul(destination_weight)
x_test["weight_addition"] = source_weight.add(destination_weight)

In [None]:
# Combine the two node weights of each validation pair into a product and a sum feature.
source_weight, destination_weight = x_valid["source_weight_out"], x_valid["destination_weight_in"]
x_valid["weight_multiplication"] = source_weight.mul(destination_weight)
x_valid["weight_addition"] = source_weight.add(destination_weight)

In [None]:
# Preview the first ten training rows with all the features created in this notebook.
x_train.head(10)

Unnamed: 0,source_node,destination_node,label,shortest_path_length,same_community,mutual_following,source_weight_out,destination_weight_in,weight_multiplication,weight_addition
0,712635,150377,1,-1,1,1,0.5,0.707107,0.353553,1.207107
1,1627794,174448,0,-1,1,0,0.707107,0.377964,0.267261,1.085071
2,1304062,391971,0,-1,0,0,0.707107,0.316228,0.223607,1.023335
3,485532,637180,0,8,1,0,0.707107,0.377964,0.267261,1.085071
4,583251,1021767,1,-1,0,0,0.333333,0.707107,0.235702,1.04044
5,481131,836938,1,2,1,1,0.116248,0.333333,0.038749,0.449581
6,587779,334336,1,2,1,1,0.19245,0.25,0.048113,0.44245
7,1491051,773032,0,-1,1,0,0.242536,1.0,0.242536,1.242536
8,189069,272812,1,-1,0,0,0.57735,0.377964,0.218218,0.955315
9,1416904,1007057,1,-1,1,1,0.130189,0.707107,0.092057,0.837296


In [None]:
# The three splits are expected to end up with the same number of columns.
for caption, split_frame in (
  ("Number of columns in train data =", x_train),
  ("Number of columns in test data =", x_test),
  ("Number of columns in valid data =", x_valid),
):
  print(caption, len(split_frame.columns))

Number of columns in train data = 10
Number of columns in test data = 10
Number of columns in valid data = 10


In [None]:
# Persist each enriched split, leaving the pandas index out of the file.
for split_frame, csv_path in (
  (x_train, "Data/x_train_3.csv"),
  (x_test, "Data/x_test_3.csv"),
  (x_valid, "Data/x_valid_3.csv"),
):
  split_frame.to_csv(csv_path, index = False)