In [2]:
import random
import networkx as nx
import numpy as np
from pathlib import Path
import csv


# Load the Facebook graph from its GML file.
facebook_gml_file = Path() / "preprocessed_data/facebook_network.gml"
graph = nx.read_gml(facebook_gml_file)

# Seed the stdlib RNG so the held-out edge set is identical on every run.
random.seed(42)

# Randomly remove 30% of the links from the graph.
# random.sample() needs a real sequence; an edge view is not one (TypeError on
# Python 3.11+, DeprecationWarning on 3.9/3.10), so materialise it as a list.
links_to_remove = random.sample(list(graph.edges()), int(0.3 * graph.number_of_edges()))
graph.remove_edges_from(links_to_remove)

# Node representations, one vector per CSV row, in file order.
# NOTE(review): the vectors are read from a precomputed file; nothing in this
# cell shows they were trained on the pruned graph above -- confirm.
# NOTE(review): every column of a row is kept as a feature. If the first column
# is a node id rather than an embedding dimension, it should be dropped -- verify
# against the CSV header.
with open('Splitter/output/facebook_embedding_Splitter.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # skip the header row (tolerates an empty file)
    # Blank lines come back from csv.reader as empty rows; skip them so every
    # embedding has the same length.
    node_embeddings = [np.array(row, dtype=np.float32) for row in csv_reader if row]


In [3]:
import json

# Load the persona JSON file (a JSON object, so every key is a string).
with open('Splitter/output/facebook_personas_splitter.json', 'r') as file:
    connections = json.load(file)

# Every (key, value) entry whose value is not 0 is treated as a positive pair.
# NOTE(review): this uses the persona file's key -> value entries as "links";
# confirm these are really graph edges and not a persona -> original-node mapping.
positive_examples = [(source, target) for source, target in connections.items() if target != 0]

# One negative per positive keeps the two classes balanced, so 50% accuracy is
# the chance baseline for the classifier trained later.
num_negative_examples = len(positive_examples)
persona_ids = set(connections.keys())

# random.sample() requires a sequence (sets raise TypeError on Python 3.11+).
# Sorting also makes the draw independent of set iteration order.
persona_id_pool = sorted(persona_ids)

# Set of positive pairs for O(1) lookup. Both ends are normalised to str so a
# sampled (str, str) pair can actually match a positive whose value is an int.
positive_pair_keys = {(str(source), str(target)) for source, target in positive_examples}

# Now that positives are really rejected, make sure at least one admissible
# ordered pair exists; otherwise the sampling loop below would never finish.
blocked_pairs = sum(
    1
    for source, target in positive_pair_keys
    if source != target and source in persona_ids and target in persona_ids
)
available_pairs = len(persona_id_pool) * (len(persona_id_pool) - 1) - blocked_pairs
if num_negative_examples > 0 and available_pairs <= 0:
    raise ValueError("No candidate pair is available to use as a negative example")

# Dedicated seeded generator so the negatives are reproducible.
negative_rng = random.Random(42)

negative_examples = []

while len(negative_examples) < num_negative_examples:
    # Compare as a tuple: a list never equals a tuple, so the original
    # list-vs-tuple membership test accepted every sampled pair.
    random_pair = tuple(negative_rng.sample(persona_id_pool, 2))
    if random_pair not in positive_pair_keys:
        negative_examples.append(random_pair)

In [4]:
def extract_features(source_node, target_node, embeddings=None):
    """Build the feature vector for a node pair by concatenating their embeddings.

    Parameters
    ----------
    source_node, target_node
        Node identifiers; each is converted with ``int()`` and used as a
        positional index into the embedding table.
    embeddings : sequence of 1-D arrays, optional
        Embedding table to index. When omitted, the notebook-level
        ``node_embeddings`` is used, which keeps existing two-argument calls
        working unchanged.

    Returns
    -------
    numpy.ndarray
        The source embedding followed by the target embedding.

    Raises
    ------
    ValueError
        If a node identifier cannot be converted to ``int``.
    IndexError
        If a converted index is outside the embedding table.
    """
    if embeddings is None:
        # Fall back to the table loaded earlier in the notebook.
        embeddings = node_embeddings
    # NOTE(review): positional lookup assumes row i of the table belongs to
    # node id i -- confirm against the embedding file's ordering.
    source_embedding = embeddings[int(source_node)]
    target_embedding = embeddings[int(target_node)]
    return np.concatenate((source_embedding, target_embedding))

# Accumulate one feature vector and one class label per example pair:
# all positives first (label 1), then all negatives (label 0).
features = []
labels = []

for class_label, example_group in ((1, positive_examples), (0, negative_examples)):
    for source_node, target_node in example_group:
        features.append(extract_features(source_node, target_node))
        labels.append(class_label)

# Turn the accumulated Python lists into numpy arrays for scikit-learn.
features = np.array(features)
labels = np.array(labels)

In [12]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Hold out 30% of the examples for evaluation (fixed seed for repeatability).
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(features, labels, test_size=0.3, random_state=42)

# Shuffle the training rows and their labels together.
train_features, train_labels = shuffle(
    train_features,
    train_labels,
    random_state=42,
)

# Fit a liblinear logistic regression; fit() returns the estimator itself.
# Alternative kept for reference:
#   LogisticRegression(solver='lbfgs', max_iter=1000, C=1.0)
logreg_model = LogisticRegression(solver='liblinear', max_iter=1000).fit(
    train_features, train_labels
)

# Mean accuracy on the held-out split.
accuracy = logreg_model.score(test_features, test_labels)
print("Accuracy:", accuracy)


Accuracy: 0.4878640776699029
