# Preprocess Facebook Network

This file converts the data in the `/data/` folder to .gml files.
Specifically, `/data/facebook_combined.txt` is iterated over to generate two .gml files, which are stored as `/preprocessed_data/full_network.gml` and `/preprocessed_data/removed_links_network.gml`. The `removed_links_network` can be used to test the link prediction task. While iteratively writing edges to the `full_network`, each edge has an 80% chance of also being written to the `removed_links_network`, so roughly 20% of the links are left out of it.

Additionally, this file outputs JSON files containing dictionaries that map each node to its neighboring nodes. Again, there is one file for the full network and one file for the network with links removed.


## Imports

In [1]:
import json
import numpy as np
import random
from pathlib import Path

In [2]:
# Load the edge list as an integer array; each row holds the two node ids
# of one edge.
graph_data_path = Path("data/facebook_combined.txt")
graph_data = np.loadtxt(fname=graph_data_path, dtype=int)
graph_data

array([[   0,    1],
       [   0,    2],
       [   0,    3],
       ...,
       [4027, 4032],
       [4027, 4038],
       [4031, 4038]])

## Write to files

In [3]:
# Create or over-write files
gml_full_path = Path() / "preprocessed_data/full_network.gml"
gml_removed_links_path = Path() / "preprocessed_data/removed_links_network.gml"

# Make sure the output directory exists: open(..., "w") creates the file
# itself but raises FileNotFoundError when the parent directory is missing.
for output_path in (gml_full_path, gml_removed_links_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Open files. Both handles stay open while the nodes and edges are written
# and are closed once the final "]" has been added.
gml_full = open(gml_full_path, "w")
gml_removed_links = open(gml_removed_links_path, "w")

# Add default start to .gml
gml_full.write("graph [\n")
gml_removed_links.write("graph [\n")

8

In [4]:
# Initiate the link dictionaries: every node id from 0 up to and including
# the largest id in graph_data starts out with its own empty list.
node_ids = range(0, graph_data.max() + 1)
all_links = {node_id: [] for node_id in node_ids}
removed_links = {node_id: [] for node_id in node_ids}

In [5]:
# Write one node entry per node id to both .gml files.
# Node ids run from 0 up to and including the largest id in graph_data.
# Example (the entry is written without the indentation shown here):
# node [
#   id 1
#   label "1"
# ]

node_count = graph_data.max() + 1
for node_id in range(node_count):
    # Both files get the exact same node entry, so build it only once
    node_entry = f'node [\nid {node_id}\nlabel "{node_id}"\n]\n'
    gml_full.write(node_entry)
    gml_removed_links.write(node_entry)

In [6]:
# Write every edge to gml_full and record it in the all_links dictionary.
# With a probability of 80%, the edge is additionally written to
# gml_removed_links and recorded in the removed_links dictionary.
# Example (the entry is written without the indentation shown here):
# edge [
#   source 1
#   target 2
# ]

# tolist() yields plain Python ints, so no per-edge int() conversion is needed
for source, target in graph_data.tolist():
    edge_entry = f"edge [\nsource {source}\ntarget {target}\n]\n"
    gml_full.write(edge_entry)
    all_links[source].append(target)
    # One random draw per edge decides whether it is kept in the reduced network
    if random.random() < 0.8:
        gml_removed_links.write(edge_entry)
        removed_links[source].append(target)

In [7]:
open_gml_files = (gml_full, gml_removed_links)

# Terminate the "graph [" section that was opened at the start of each file
for gml_file in open_gml_files:
    gml_file.write("]")

# Close the .gml files so that all buffered content is written to disk
for gml_file in open_gml_files:
    gml_file.close()

In [8]:
# Save both link dictionaries as JSON, one file per dictionary.
# Note: JSON object keys are always strings, so json.dump writes the
# integer node ids used as keys as strings.
json_outputs = (
    ('preprocessed_data/all_links.json', all_links),
    ('preprocessed_data/removed_links.json', removed_links),
)
for json_path, links in json_outputs:
    with open(json_path, 'w') as fp:
        json.dump(links, fp)