# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import shutil
import os
import random
from sklearn.model_selection import train_test_split

# Reading Data

In [None]:
# Load the raw edge list as a DataFrame; index_col = False keeps pandas from using the first column as the index
# NOTE(review): train_data is not referenced again in the cells visible here — confirm it is still needed
train_data = pd.read_csv("Data/train.csv", index_col = False)

In [None]:
# Reading the whole data using networkx as a directed graph (nx.DiGraph) with integer node ids
# comments = 's' tells networkx to treat 's' as the comment marker — presumably a trick to skip
# a CSV header row that begins with "source_node"; TODO confirm against Data/train.csv
train_graph = nx.read_edgelist("Data/train.csv", delimiter = ',', nodetype = int, comments = 's', create_using = nx.DiGraph())

# Generating Negative Pairs [Missing Links]

What we originally have in our dataset is just pairs of nodes that are connected by an edge, so it is all label-1 data. We have about 1.8 million nodes, so the whole graph could reach about 1.7 trillion edges given n * (n - 1) / 2, where n is the number of nodes (this formula counts unordered pairs; since the graph is directed, the number of ordered pairs is twice that). We don't need them all: we have about 9 million edges labeled as 1, so we need to construct the same number labeled as 0. To do that, we generate two random numbers as our nodes and check whether an edge exists between them; if not, we add the pair and label it as 0.

This would go in two parts:

1. Generating Positive Dataset by giving label 1 to existing edges.

2. Generating Negative Dataset by giving label 0 to each row in the negative (missing-link) dataset.

### Generating Positive Dataset

In [None]:
# All directed edges of the train graph, as returned by edges() on the DiGraph
# (iterated below as (source, destination) pairs; this is not a dictionary)
edges = train_graph.edges()

In [None]:
def generate_positive_labels(edges):
  """
  Build the positive-label lookup for the edges that already exist in the train graph.

  Each item of `edges` is read at positions 0 and 1 (source, destination).
  Returns a dict mapping every (source, destination) tuple to the label 1.
  """
  return {(pair[0], pair[1]): 1 for pair in edges}

# Lookup of every existing (source, destination) edge -> label 1
positive_edges = generate_positive_labels(edges)

## Generating (Negative / Missing) (Links / Edges)

### Parallel Generator

This is the parallel version of the code that generates negative links: the more CPU cores you have, the faster the program runs. It requires more memory because it uses two queues to store and extract pairs, and there is also an overhead from passing messages between processes.

In [None]:
# number_of_edges, number_of_nodes = train_graph.number_of_edges(), train_graph.number_of_nodes()

# q = mp.Queue()
# while q.qsize() < number_of_edges:
#   a, b = random.randint(1, number_of_nodes), random.randint(1, number_of_nodes)
#   is_exist = positive_edges.get((a, b), -1)
  
#   if a == b or is_exist != -1:
#     continue
#   else:
#     q.put((a, b))

# def process(q, missing_edges, missing_count):
  
#   while missing_count.value < number_of_edges:
#     a, b = q.get()

#     try:
#       if nx.shortest_path_length(train_graph, source = a, target = b) > 2:
#         missing_edges.put((a, b))
#         missing_count.value += 1
#       else:
#         continue
#     except:
#       missing_edges.put((a, b))
#       missing_count.value += 1

#     To check progress in code [use tqdm instead if you like]
#     if missing_count.value % 10000 == 0 and missing_count.value != 0:
#       print(missing_count.value)
  
# if __name__ == '__main__':
#   missing_edges = mp.Queue(maxsize = number_of_edges + 1)
#   missing_count = mp.Value('i', 0, lock = True)
#   # q = mp.Queue()
#   p1 = mp.Process(target = process, args = (q, missing_edges, missing_count))
#   p2 = mp.Process(target = process, args = (q, missing_edges, missing_count))
#   p3 = mp.Process(target = process, args = (q, missing_edges, missing_count))
#   p4 = mp.Process(target = process, args = (q, missing_edges, missing_count))
#   .
#   .
#   .
#   .
#   p1.start(); p2.start(); p3.start(); p4.start().....
#   p1.join(); p2.join(); p3.join() p4.join().....

### Sequential Generator

In [None]:
# Collect missing (negative) edges in a set so duplicate pairs are dropped automatically
missing_edges = set()
positive_eges_len = len(positive_edges)
nodes_num = train_graph.number_of_nodes()

# Keep sampling until there are as many negative pairs as positive edges (balanced classes)
while len(missing_edges) < positive_eges_len:

  # Draw two random node ids to act as source and destination
  # NOTE(review): assumes node ids are exactly 1..nodes_num — confirm against the data
  a, b = random.randint(1, nodes_num), random.randint(1, nodes_num)

  # Skip self-loops and pairs that already exist as real (positive) edges
  if a == b or (a, b) in positive_edges:
    continue

  try:
    # Keep the pair only when b is more than 2 hops away from a: nodes that are closer
    # have a high probability of being linked, so they make poor negative examples
    if nx.shortest_path_length(train_graph, source = a, target = b) > 2:
      missing_edges.add((a, b))
  except (nx.NetworkXNoPath, nx.NodeNotFound):
    # No path at all (or an id that is not in the graph): treat the pair as a negative link.
    # Only these networkx errors are caught, so Ctrl-C and genuine bugs are no longer
    # swallowed and recorded as negative edges the way the former bare `except:` did.
    missing_edges.add((a, b))

In [None]:
# Take the keys only; the values are all ones and are re-created as the label column below.
# Both collections are materialised as lists so pandas receives plain sequences of (source, destination) tuples.
positive = pd.DataFrame(list(positive_edges), columns = ['source_node' , 'destination_node'])
negative = pd.DataFrame(list(missing_edges), columns = ['source_node' , 'destination_node'])

# Add ones label to positive data frame
positive["label"] = np.ones(len(positive), dtype = int)

# Add zeros label to negative data frame, sized by the negative frame itself
# (this used len(positive), which only works when both frames have the same number of rows)
negative["label"] = np.zeros(len(negative), dtype = int)

# Persist both frames without the index column
positive.to_csv("Data/positive.csv", index = False)
negative.to_csv("Data/negative.csv", index = False)

# positive = pd.read_csv("Data/positive.csv")
# negative = pd.read_csv("Data/negative.csv")

In [None]:
# Preview the first rows of the positive (label 1) frame
positive.head()

Unnamed: 0,source_node,destination_node,label
0,1,690569,1
1,1,315892,1
2,1,189226,1
3,690569,663369,1
4,690569,603627,1


In [None]:
# Preview the first rows of the negative (label 0) frame
negative.head()

Unnamed: 0,source_node,destination_node,label
0,551833,106315,0
1,1829481,1485071,0
2,890210,351061,0
3,568593,1311358,0
4,1653933,1115066,0


In [None]:
# Checking shape compatibility: both frames should have the same number of rows and columns
print("Number of rows in positive data frame", positive.shape[0])
print("Number of rows in negative data frame", negative.shape[0])

print("=============================================")

print("Number of columns in positive data frame", positive.shape[1])
# Report the negative frame's own column count (this line used to re-print positive.shape[1])
print("Number of columns in negative data frame", negative.shape[1])

Number of rows in positive data frame 9437519
Number of rows in negative data frame 9437519
Number of columns in positive data frame 3
Number of columns in negative data frame 3


# Train & Test Split

In [None]:
# Reload the positive / negative frames from the CSV files under Data/
positive = pd.read_csv("Data/positive.csv")
negative = pd.read_csv("Data/negative.csv")

In [None]:
# Merge the positive and negative data frames into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is deprecated and was removed in pandas 2.0.
data = pd.concat([positive, negative], ignore_index = True)

In [None]:
# Label column of the merged frame, passed as the stratify target when splitting
label = data["label"]

In [None]:
# Not shuffled yet: the positive rows come first, followed by the negative rows
data.head()

Unnamed: 0,source_node,destination_node,label
0,1,690569,1
1,1,315892,1
2,1,189226,1
3,690569,663369,1
4,690569,603627,1


In [None]:
# Stratify on the label so both splits stay balanced; test_size = 0.11 holds out about 2 million rows
# NOTE(review): no random_state is passed, so the split differs from run to run
x_train, x_test = train_test_split(data, shuffle = True, stratify = label, test_size = 0.11)

In [None]:
# Total number of nodes in the full train graph
print("Number of nodes in graph =", train_graph.number_of_nodes())

Number of nodes in graph = 1862220


In [None]:
# Every node id that appears in the train split, as either a source or a destination
train_source_nodes = set(x_train["source_node"].values)
train_destination_nodes = set(x_train["destination_node"].values)
train_nodes = train_source_nodes | train_destination_nodes

print("Number of unique nodes in train data =", len(train_nodes))

Number of unique nodes in train data = 1862212


In [None]:
# Every node id that appears in the test split, as either a source or a destination
test_source_nodes = set(x_test.source_node.values)
test_destination_nodes = set(x_test.destination_node.values)
test_nodes = test_source_nodes.union(test_destination_nodes)

# The message now names the test split (it previously said "train data")
print("Number of unique nodes in test data =", len(test_nodes))

Number of unique nodes in train data = 1536243


In [None]:
# Size of the union: nodes that appear in the train split, the test split, or both
print("Number of nodes present in train data and test data =", len(train_nodes.union(test_nodes)))

Number of nodes present in train data and test data = 1862220


In [None]:
# Nodes that appear in only one of the two splits, as a count and as a share of all graph nodes
total_nodes = train_graph.number_of_nodes()  # replaces the hard-coded 1862220
train_only_nodes = train_nodes - test_nodes
test_only_nodes = test_nodes - train_nodes

print("Number of nodes present in train data and not in test data =", len(train_only_nodes))
print("Percentage of nodes present in train data and not in test data =", (len(train_only_nodes) / total_nodes) * 100 , '%')
print("=====================================================================================")
print("Number of nodes present in test data and not in train data =", len(test_only_nodes))
# Message corrected: it previously read "in test data and not in test data"
print("Percentage of nodes present in test data and not in train data =", (len(test_only_nodes) / total_nodes) * 100 , '%')

Number of nodes present in train data and not in test data = 325977
Percentage of nodes present in train data and not in test data = 17.50475239230596 %
Number of nodes present in test data and not in train data = 8
Percentage of nodes present in test data and not in test data = 0.0004295947847193135 %


Almost all nodes are present in the train data and the majority of them are present in the test data; this test data will later be split into test and validation data as well. About 17% of the nodes appear in the train data but not in the test data, which is OK, and only 8 nodes appear in the test data but not in the train data, which is really good: we should not run into the cold-start problem later.

In [None]:
# Stratify the second split on the labels of the held-out rows (this rebinds `label`)
label = x_test["label"]
# test_size = 0.245 keeps about 500K rows for validation and about 1.5 million rows as test data
x_t, x_valid = train_test_split(x_test, shuffle = True, stratify = label, test_size = 0.245)
# The larger part becomes the new test set; x_valid holds the remaining 24.5%
x_test = x_t

In [None]:
# Row and column counts of the three splits (train / test / validation)
print("Number of rows in train data =", x_train.shape[0])
print("Number of rows in test data =", x_test.shape[0])
print("Number of rows in validation data =", x_valid.shape[0])
print("===========================================")
print("Number of columns in train data =", x_train.shape[1])
print("Number of columns in test data =", x_test.shape[1])
print("Number of columns in validation data =", x_valid.shape[1])

Number of rows in train data = 16798783
Number of rows in test data = 1567572
Number of rows in validation data = 508683
Number of columns in train data = 3
Number of columns in test data = 3
Number of columns in validation data = 3


In [None]:
# Row counts of the test and validation splits only
print("Number of rows in test data =", x_test.shape[0])
print("Number of rows in validation data =", x_valid.shape[0])

Number of rows in test data = 1567572
Number of rows in validation data = 508683


In [None]:
# Check that the train split is balanced: count the rows per label value (0 / 1)
x_train["label"].value_counts()

0    8399392
1    8399391
Name: label, dtype: int64

In [None]:
# Check that the test split is balanced: count the rows per label value (0 / 1)
x_test["label"].value_counts()

1    783786
0    783786
Name: label, dtype: int64

In [None]:
# Check that the validation split is balanced: count the rows per label value (0 / 1)
x_valid["label"].value_counts()

1    254342
0    254341
Name: label, dtype: int64

In [None]:
# Persist the three splits as CSV files under Data/, without writing the index column
x_train.to_csv("Data/x_train.csv", index = False)
x_test.to_csv("Data/x_test.csv", index = False)
x_valid.to_csv("Data/x_valid.csv", index = False)