<a href="https://colab.research.google.com/github/AmiriHayes/sandbox/blob/main/homophily_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

STEP ONE: SYNTHETIC DATASET AND FEATURE ENGINEERING

In [4]:
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import os
import csv

In [36]:
# Configuration for the synthetic graph.
#   p_same : probability that any single neighbor draw lands in the node's own class.
#   K      : upper bound on the number of neighbor draws a single node makes in the
#            edge-construction loop (a node's final degree can be larger, because it
#            also receives the undirected edges initiated by other nodes).

n_nodes, n_classes = 1000, 7
p_same = 0.8
K = 20

# A single seeded generator drives every random draw so the dataset is reproducible.
seed = 42
rng = np.random.default_rng(seed)

# Directory that receives the generated CSV files.
os.makedirs("data", exist_ok=True)

# Class label of every node, drawn uniformly from [0, n_classes).
labels = rng.integers(low=0, high=n_classes, size=n_nodes)

# Lookup helpers: node ids grouped by class, and the plain list of all node ids.
class_nodes = {c: np.flatnonzero(labels == c).tolist() for c in range(n_classes)}
all_nodes = list(range(n_nodes))

def sample_from_pool(pool, m, excludes=None, rng=None):
    """Draw ``m`` items from ``pool``, avoiding ``excludes`` whenever possible.

    Parameters
    ----------
    pool : sequence of hashable items
        Candidate items to sample from.
    m : int
        Number of items to return.
    excludes : iterable, optional
        Items that should not be drawn. ``None`` (the default) excludes nothing.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, a fresh unseeded generator is
        created; pass one explicitly for reproducible results.

    Returns
    -------
    list
        A list of length ``m``:

        * at least ``m`` eligible items -> ``m`` distinct eligible items;
        * fewer than ``m`` eligible items -> every eligible item once, padded
          with repeats drawn with replacement from the eligible items;
        * no eligible items at all -> ``m`` draws with replacement from
          ``pool`` itself, i.e. ``excludes`` is ignored in that case (callers
          that must not receive an excluded item have to filter the result).

    Raises
    ------
    ValueError
        Propagated from ``rng.choice`` when ``pool`` is empty and ``m > 0``.
    """
    if rng is None:
        # The previous default (None) crashed on the first rng.choice call.
        rng = np.random.default_rng()
    # Avoid a shared mutable default; .difference() also accepts non-set iterables.
    blocked = set() if excludes is None else excludes
    pool_list = list(set(pool).difference(blocked))

    if not pool_list:
        # Everything was excluded: fall back to the unfiltered pool.
        return list(rng.choice(pool, size=m, replace=True))
    if len(pool_list) >= m:
        return list(rng.choice(pool_list, size=m, replace=False))

    # Too few eligible items: take them all, then pad with repeated draws.
    shortfall = m - len(pool_list)
    return pool_list + list(rng.choice(pool_list, size=shortfall, replace=True))

# Build the undirected edge set.
# Every node i makes k ~ Uniform{0..K} neighbor draws; Binomial(k, p_same) of them
# come from i's own class and the remainder from the other classes. Each edge is
# stored once as a sorted (low, high) pair, so a node's final degree can exceed K:
# it also keeps the edges that other nodes initiated towards it.

# The "every node outside class c" pool depends only on the class, so build it once
# per class instead of rescanning all nodes on every loop iteration. Contents and
# order match the per-node list comprehension this replaces.
other_class_nodes = {
    c: [x for x in all_nodes if labels[x] != c] for c in range(n_classes)
}

edges_set = set()
for i in tqdm(range(n_nodes), desc="constructing edges"):
    k = int(rng.integers(0, K + 1))  # uniform integer in [0, K]
    if k == 0:
        continue
    n_same = int(rng.binomial(k, p_same))
    n_diff = k - n_same

    same_pool = class_nodes[labels[i]]
    other_pool = other_class_nodes[labels[i]]
    excludes = {i}  # never pick the node itself

    same_choices = []
    if n_same > 0:
        same_choices = sample_from_pool(same_pool, n_same, excludes=excludes, rng=rng)
        excludes = excludes.union(same_choices)

    diff_choices = []
    if n_diff > 0:
        diff_choices = sample_from_pool(other_pool, n_diff, excludes=excludes, rng=rng)

    for j in same_choices + diff_choices:
        # sample_from_pool may ignore `excludes` when nothing else is eligible,
        # so self-loops still have to be filtered here.
        if j == i:
            continue
        a, b = (i, j) if i < j else (j, i)
        edges_set.add((a, b))

# Materialise the edge list (sorted for a stable file layout) and the node/label
# table, then write both to the data directory as CSV.
edges = sorted(edges_set)
edges_df = pd.DataFrame(edges, columns=["sourceNodeId", "targetNodeId"])
nodes_df = pd.DataFrame({"nodeId": np.arange(n_nodes), "label": labels})

for _frame, _csv_path in (
    (edges_df, "data/simulated_edges_new.csv"),
    (nodes_df, "data/simulated_nodes_new.csv"),
):
    _frame.to_csv(_csv_path, index=False)

# Undirected graph over every node id; nodes are added first so that nodes
# without any edge are still present in G.
G = nx.Graph()
G.add_nodes_from(range(n_nodes))
G.add_edges_from(edges)

# Per-node homophily: the share of a node's neighbors that carry the node's own
# label. Nodes with no neighbors are assigned 0.0.
homophily = []
for node in range(n_nodes):
    neighbor_labels = [labels[nb] for nb in G.neighbors(node)]
    if neighbor_labels:
        same_count = sum(1 for lbl in neighbor_labels if lbl == labels[node])
        homophily.append(same_count / len(neighbor_labels))
    else:
        homophily.append(0.0)

# Report summary statistics, then store the homophily column next to the labels.
print(f"Generated graph: n_nodes={n_nodes}, n_edges={G.number_of_edges()}")
print(f"Homophily mean: {np.mean(homophily):.4f}, std: {np.std(homophily):.4f}")
nodes_df["homophily"] = homophily
nodes_df.to_csv("data/simulated_nodes_with_homophily.csv", index=False)


constructing edges: 100%|██████████| 1000/1000 [00:00<00:00, 1760.00it/s]


Generated graph: n_nodes=1000, n_edges=9992
Homophily mean: 0.7893, std: 0.1043
