# Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import shutil
import os

# Creating Features

In [None]:
# Load the train / test / validation splits, in that order
x_train, x_test, x_valid = (
  pd.read_csv(csv_path)
  for csv_path in ("Data/x_train.csv", "Data/x_test.csv", "Data/x_valid.csv")
)

We build `x_train_positive` and use it for feature creation. Including the negative samples in graph building would be meaningless: they contain no edges, so they would only add a bunch of scattered, disconnected nodes. The train graph is built from `x_train` only, so there is no data leakage.

In [None]:
# Keep only the rows labelled 1 and drop the label column before writing,
# so the file holds just the remaining columns with no header and no index.
is_positive = x_train["label"] == 1
x_train_positive = x_train.loc[is_positive].drop(columns=["label"])
x_train_positive.to_csv("Data/train graph.csv", header=None, index=False)

In [None]:
# Read the training graph from the existing links only, as a directed graph with integer node ids.
# NOTE(review): comments='s' presumably exists to skip a header line starting with 's';
# the file written above has no header — confirm it is still needed.
train_graph = nx.read_edgelist(
  "Data/train graph.csv",
  comments="s",
  create_using=nx.DiGraph(),
  nodetype=int,
  delimiter=",",
)

In [None]:
# Average in-degree and out-degree per node of the train graph; both values are
# reused later as arguments to some feature functions.
nnodes = train_graph.number_of_nodes()
in_degree = sum(degree for _, degree in train_graph.in_degree()) / float(nnodes)
out_degree = sum(degree for _, degree in train_graph.out_degree()) / float(nnodes)

print("Average in degree: %8.4f" % in_degree)
print("Average out degree: %8.4f" % out_degree)

### Creating Degrees Features

In [None]:
# Same degree features for every split; get_degree returns an (outdegree, indegree) pair of columns.
for frame in (x_train, x_test, x_valid):
  frame["source_outdegree"], frame["source_indegree"] = get_degree(frame["source_node"])
  frame["destination_outdegree"], frame["destination_indegree"] = get_degree(frame["destination_node"])

### Creating Intersection Features

In [None]:
# Same intersection features for every split; get_intersection yields two columns per call.
for frame in (x_train, x_test, x_valid):
  frame["outcoming_intersection"], frame["incoming_intersection"] = get_intersection(
    frame["source_node"], frame["destination_node"]
  )

### Creating Jaccard Features

In [None]:
# Row-wise Jaccard feature (outcoming variant), computed identically for each split
for frame in (x_train, x_test, x_valid):
  frame["jaccard_for_outcoming"] = frame.apply(
    lambda row: jaccard_for_outcoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Row-wise Jaccard feature (incoming variant), computed identically for each split
for frame in (x_train, x_test, x_valid):
  frame["jaccard_for_incoming"] = frame.apply(
    lambda row: jaccard_for_incoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Rearrange columns in datasets
# x_train = x_train[['source_node', 'destination_node', 'source_outdegree', 'source_indegree',
#        'destination_outdegree', 'destination_indegree',
#        'outcoming_intersection', 'incoming_intersection','jaccard_for_outcoming', 'jaccard_for_incoming', 'label']]

# x_test = x_test[['source_node', 'destination_node', 'source_outdegree', 'source_indegree',
#        'destination_outdegree', 'destination_indegree',
#        'outcoming_intersection', 'incoming_intersection','jaccard_for_outcoming', 'jaccard_for_incoming', 'label']]

# x_valid = x_valid[['source_node', 'destination_node', 'source_outdegree', 'source_indegree',
#        'destination_outdegree', 'destination_indegree',
#        'outcoming_intersection', 'incoming_intersection','jaccard_for_outcoming', 'jaccard_for_incoming', 'label']]

In [None]:
# Preview the first five rows of the training frame with the features added so far
x_train.head(n=5)

Unnamed: 0,source_node,destination_node,source_outdegree,source_indegree,destination_outdegree,destination_indegree,outcoming_intersection,incoming_intersection,jaccard_for_outcoming,jaccard_for_incoming,label
0,712635,150377,3,4,1,1,0,0,0.0,0.0,1
1,1627794,174448,1,1,12,6,0,0,0.0,0.0,0
2,1304062,391971,1,0,10,9,0,0,0.0,0.0,0
3,485532,637180,1,0,9,6,0,0,0.0,0.0,0
4,583251,1021767,8,4,0,1,0,0,0.0,0.0,1


### Creating Salton Features

In [None]:
def salton_for_incoming(source, destination):
  """
  Calculate the Salton index (also known as cosine similarity) over incoming edges, defined as:
  number of common predecessors / square root of (in-degree of source times in-degree of destination)

  Predecessors and in-degrees are read from the module-level `train_graph`.

  Parameters
  ----------
  source, destination : int or iterable of int
      Node ids. Either two scalars (a single pair, as passed by a row-wise `apply`)
      or two aligned iterables such as DataFrame columns.

  Returns
  -------
  float or list of float
      One score for scalar input, otherwise one score per (source, destination) pair.
      The score is 0.0 when either node is absent from `train_graph` or has no incoming edges.
  """
  def pair_score(s, d):
    # Nodes that never appear in the train graph have no neighbourhood to compare.
    if s not in train_graph or d not in train_graph:
      return 0.0

    s_in, d_in = train_graph.in_degree(s), train_graph.in_degree(d)
    # A zero in-degree would make the denominator zero.
    if s_in == 0 or d_in == 0:
      return 0.0

    common = set(train_graph.predecessors(s)) & set(train_graph.predecessors(d))
    return len(common) / (s_in * d_in) ** 0.5

  # Scalar pair: return a single value instead of trying to iterate over the ids.
  if np.isscalar(source) and np.isscalar(destination):
    return pair_score(source, destination)

  return [pair_score(s, d) for s, d in zip(source, destination)]

In [None]:
# Column-wise call: whole source/destination columns are passed in one go for each split.
# The intermediate name is kept because it stays visible in the notebook namespace.
for frame in (x_train, x_test, x_valid):
  salton_for_outcoming1 = salton_for_outcoming(frame["source_node"], frame["destination_node"])
  frame["salton_for_outcoming"] = salton_for_outcoming1

In [None]:
# Column-wise call: whole source/destination columns are passed in one go for each split.
# The intermediate name is kept because it stays visible in the notebook namespace.
for frame in (x_train, x_test, x_valid):
  salton_for_incoming1 = salton_for_incoming(frame["source_node"], frame["destination_node"])
  frame["salton_for_incoming"] = salton_for_incoming1

In [None]:
# Row-wise variant: one (source, destination) pair is passed per row.
# NOTE(review): this overwrites the "salton_for_outcoming" column already assigned by the
# column-wise cell above; keep only one of the two cells.
for frame in (x_train, x_test, x_valid):
  frame["salton_for_outcoming"] = frame.apply(
    lambda row: salton_for_outcoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Row-wise variant: one (source, destination) pair is passed per row.
# NOTE(review): this overwrites the "salton_for_incoming" column already assigned by the
# column-wise cell above; keep only one of the two cells.
for frame in (x_train, x_test, x_valid):
  frame["salton_for_incoming"] = frame.apply(
    lambda row: salton_for_incoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Preview the first ten rows of the training frame
x_train.head(n=10)

Unnamed: 0,source_node,destination_node,label,salton_for_outcoming,salton_for_incoming,source_outdegree,source_indegree,destination_outdegree,destination_indegree,outcoming_intersection,incoming_intersection,jaccard_for_outcoming,jaccard_for_incoming,sorensen_for_outcoming,sorensen_for_incoming,outcoming_preferential_attachment
0,712635,150377,1,0.0,0.0,3,4,1,1,0,0,0.0,0.0,0.0,0.0,3
1,1627794,174448,0,0.0,0.0,1,1,12,6,0,0,0.0,0.0,0.0,0.0,12
2,1304062,391971,0,0.0,0.0,1,0,10,9,0,0,0.0,0.0,0.0,0.0,10
3,485532,637180,0,0.0,0.0,1,0,9,6,0,0,0.0,0.0,0.0,0.0,9
4,583251,1021767,1,0.0,0.0,8,4,0,1,0,0,0.0,0.0,0.0,0.0,0
5,481131,836938,1,0.111035,0.084515,73,70,10,8,3,2,0.0375,0.026316,0.036145,0.025641,730
6,587779,334336,1,0.202548,0.455733,26,26,15,15,4,9,0.108108,0.28125,0.097561,0.219512,390
7,1491051,773032,0,0.0,0.0,16,16,1,0,0,0,0.0,0.0,0.0,0.0,16
8,189069,272812,1,0.0,0.0,2,0,5,6,0,0,0.0,0.0,0.0,0.0,10
9,1416904,1007057,1,0.0,0.0,58,59,1,1,0,0,0.0,0.0,0.0,0.0,58


### Creating Sorensen Features

In [None]:
# Row-wise Sorensen feature (outcoming variant), computed identically for each split
for frame in (x_train, x_test, x_valid):
  frame["sorensen_for_outcoming"] = frame.apply(
    lambda row: sorensen_for_outcoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Row-wise Sorensen feature (incoming variant), computed identically for each split
for frame in (x_train, x_test, x_valid):
  frame["sorensen_for_incoming"] = frame.apply(
    lambda row: sorensen_for_incoming(row["source_node"], row["destination_node"]),
    axis=1,
  )

### Creating Preferential Attachment Features

In [None]:
# Row-wise preferential attachment feature (outcoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["outcoming_preferential_attachment"] = frame.apply(
    lambda row: outcoming_preferential_attachment(row["source_node"], row["destination_node"]),
    axis=1,
  )

In [None]:
# Row-wise preferential attachment feature (incoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["incoming_preferential_attachment"] = frame.apply(
    lambda row: incoming_preferential_attachment(row["source_node"], row["destination_node"]),
    axis=1,
  )

### Creating Adamic Adar Index Features

In [None]:
# Column-wise Adamic-Adar feature, computed identically for each split
for frame in (x_train, x_test, x_valid):
  frame["adamic_adar"] = adamic_adar(frame["source_node"], frame["destination_node"])

### Creating Hub Promoted Features

In [None]:
# Column-wise hub promoted feature (outcoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["outcoming_hup_promoted"] = outcoming_hup_promoted(frame["source_node"], frame["destination_node"])

In [None]:
# Column-wise hub promoted feature (incoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["incoming_hup_promoted"] = incoming_hup_promoted(frame["source_node"], frame["destination_node"])

### Creating Hub Depressed Features

In [None]:
# Column-wise hub depressed feature (outcoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["outcoming_hup_depressed"] = outcoming_hup_depressed(frame["source_node"], frame["destination_node"])

In [None]:
# Column-wise hub depressed feature (incoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["incoming_hup_depressed"] = incoming_hup_depressed(frame["source_node"], frame["destination_node"])

### Creating Leicht Holme Newman Features

In [None]:
# Column-wise Leicht-Holme-Newman feature (outcoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["outcoming_leicht"] = outcoming_leicht(frame["source_node"], frame["destination_node"])

In [None]:
# Column-wise Leicht-Holme-Newman feature (incoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["incoming_leicht"] = incoming_leicht(frame["source_node"], frame["destination_node"])

### Creating Local Affinity Structure Features

In [None]:
# Column-wise local affinity feature (outcoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["outcoming_affinity"] = outcoming_local_affinity(frame["source_node"], frame["destination_node"])

In [None]:
# Column-wise local affinity feature (incoming variant) for each split
for frame in (x_train, x_test, x_valid):
  frame["incoming_affinity"] = incoming_local_affinity(frame["source_node"], frame["destination_node"])

### Creating Car Based Index Features

In [None]:
# Column-wise CAR-based index feature for each split
for frame in (x_train, x_test, x_valid):
  frame["car_index"] = car_based_index(frame["source_node"], frame["destination_node"])

### Creating Individual Attraction Features

In [None]:
# Column-wise individual attraction feature for each split
for frame in (x_train, x_test, x_valid):
  frame["individual_attraction"] = individual_attraction(frame["source_node"], frame["destination_node"])

### Creating Functional Similarity Weight Features

In [None]:
# Functional similarity weight (outcoming variant); the average out-degree of the
# train graph is passed as the third argument for every split.
for frame in (x_train, x_test, x_valid):
  frame["out_func_sim_weight"] = out_func_sim_weight(
    frame["source_node"], frame["destination_node"], out_degree
  )

In [None]:
# Functional similarity weight (incoming variant); the average in-degree of the
# train graph is passed as the third argument for every split.
for frame in (x_train, x_test, x_valid):
  frame["in_func_sim_weight"] = in_func_sim_weight(
    frame["source_node"], frame["destination_node"], in_degree
  )

In [5]:
# Display the first 10 observations of the resulting dataframe
x_train.head(n=10)

Unnamed: 0,source_node,destination_node,label,salton_for_outcoming,salton_for_incoming,source_outdegree,source_indegree,destination_outdegree,destination_indegree,outcoming_intersection,incoming_intersection,jaccard_for_outcoming,jaccard_for_incoming,sorensen_for_outcoming,sorensen_for_incoming,outcoming_preferential_attachment,incoming_preferential_attachment,adamic_adar,outcoming_hup_promoted,incoming_hup_promoted,outcoming_hup_depressed,incoming_hup_depressed,outcoming_leicht,incoming_leicht,car_index,individual_attraction,out_func_sim_weight,in_func_sim_weight
0,712635,150377,1,0.0,0.0,3,4,1,1,0,0,0.0,0.0,0.0,0.0,3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1627794,174448,0,0.0,0.0,1,1,12,6,0,0,0.0,0.0,0.0,0.0,12,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1304062,391971,0,0.0,0.0,1,0,10,9,0,0,0.0,0.0,0.0,0.0,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,485532,637180,0,0.0,0.0,1,0,9,6,0,0,0.0,0.0,0.0,0.0,9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,583251,1021767,1,0.0,0.0,8,4,0,1,0,0,0.0,0.0,0.0,0.0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,481131,836938,1,0.111035,0.084515,73,70,10,8,3,2,0.0375,0.026316,0.036145,0.025641,730,560,2.431317,0.3,0.25,0.041096,0.028571,0.00411,0.003571,111.0,0.311717,0.007347,0.00346
6,587779,334336,1,0.202548,0.455733,26,26,15,15,4,9,0.108108,0.28125,0.097561,0.219512,390,390,3.529007,0.266667,0.6,0.153846,0.346154,0.010256,0.023077,132.0,0.451417,0.132231,1.121107
7,1491051,773032,0,0.0,0.0,16,16,1,0,0,0,0.0,0.0,0.0,0.0,16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,189069,272812,1,0.0,0.0,2,0,5,6,0,0,0.0,0.0,0.0,0.0,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1416904,1007057,1,0.0,0.0,58,59,1,1,0,0,0.0,0.0,0.0,0.0,58,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# NOTE(review): in the visible notebook `org_train` is first assigned in a later cell,
# so on a fresh kernel (Restart & Run All) this cell raises NameError; consider moving
# it below that assignment.
org_train.head(n=5)

Unnamed: 0,source_node,destination_node,label
0,712635,150377,1
1,1627794,174448,0
2,1304062,391971,0
3,485532,637180,0
4,583251,1021767,1


In [None]:
# Copy the identifier/label columns into their own frames, then remove them from the
# feature frames in place. Re-running this cell fails because the columns are gone.
id_columns = ["source_node", "destination_node", "label"]

org_train = x_train[id_columns].copy()
org_test = x_test[id_columns].copy()
org_valid = x_valid[id_columns].copy()

for frame in (x_train, x_test, x_valid):
  frame.drop(columns=id_columns, inplace=True)

In [None]:
# Save the first part of the features to the data folder (no index column)
feature_outputs = (
  (x_train, "Data/x_train_1.csv"),
  (x_test, "Data/x_test_1.csv"),
  (x_valid, "Data/x_valid_1.csv"),
)
for frame, csv_path in feature_outputs:
  frame.to_csv(csv_path, index=False)

In [None]:
# Save the original columns as separate data frames so they can be used to build more features
original_outputs = (
  (org_train, "Data/org_train.csv"),
  (org_test, "Data/org_test.csv"),
  (org_valid, "Data/org_valid.csv"),
)
for frame, csv_path in original_outputs:
  frame.to_csv(csv_path, index=False)