Walk Length 10

In [6]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=30, walk_length=10, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec with a fixed seed
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2]
q_values = [0.25, 0.5, 1, 2]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('googleplus.txt')
 # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.922388
1     0  0.25  0.50  0.925014
2     0  0.25  1.00  0.914294
3     0  0.25  2.00  0.912074
4     0  0.50  0.25  0.922596
5     0  0.50  0.50  0.919881
6     0  0.50  1.00  0.930306
7     0  0.50  2.00  0.929529
8     0  1.00  0.25  0.935148
9     0  1.00  0.50  0.930390
10    0  1.00  1.00  0.930619
11    0  1.00  2.00  0.926846
12    0  2.00  0.25  0.918795
13    0  2.00  0.50  0.925504
14    0  2.00  1.00  0.930200
15    0  2.00  2.00  0.929573
Best combination of p and q: (1.0, 0.25)


Walk Length 15

In [7]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=30, walk_length=15, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec with a fixed seed
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2]
q_values = [0.25, 0.5, 1, 2]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('googleplus.txt')
 # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.897465
1     0  0.25  0.50  0.895152
2     0  0.25  1.00  0.892169
3     0  0.25  2.00  0.885028
4     0  0.50  0.25  0.906468
5     0  0.50  0.50  0.911050
6     0  0.50  1.00  0.904915
7     0  0.50  2.00  0.913270
8     0  1.00  0.25  0.910582
9     0  1.00  0.50  0.918725
10    0  1.00  1.00  0.906835
11    0  1.00  2.00  0.919338
12    0  2.00  0.25  0.911332
13    0  2.00  0.50  0.924449
14    0  2.00  1.00  0.918107
15    0  2.00  2.00  0.926938
Best combination of p and q: (2.0, 2.0)


Walk Length 5

In [5]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=30, walk_length=5, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec with a fixed seed
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2]
q_values = [0.25, 0.5, 1, 2]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('googleplus.txt')
 # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.925071
1     0  0.25  0.50  0.929225
2     0  0.25  1.00  0.923443
3     0  0.25  2.00  0.916712
4     0  0.50  0.25  0.928545
5     0  0.50  0.50  0.925001
6     0  0.50  1.00  0.919965
7     0  0.50  2.00  0.932018
8     0  1.00  0.25  0.917427
9     0  1.00  0.50  0.927561
10    0  1.00  1.00  0.928655
11    0  1.00  2.00  0.928854
12    0  2.00  0.25  0.930134
13    0  2.00  0.50  0.935130
14    0  2.00  1.00  0.933969
15    0  2.00  2.00  0.928541
Best combination of p and q: (2.0, 0.5)


Multi Scale walk length

In [2]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

def generate_node2vec_embeddings_multiscale(graph, dimensions=128, num_walks=10, walk_lengths=[5, 10, 15], p=1.0, q=1.0, seed=None):
    def node2vec_walk(node, length):
        walk = [node]
        for _ in range(length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            for length in walk_lengths:
                walks.append(node2vec_walk(node, length))

    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function


# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2]
q_values = [0.25, 0.5, 1, 2]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('googleplus.txt')
# Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    # Define the list of walk lengths for multi-scale sampling
    walk_lengths = [5, 10, 15]  # Adjust this list based on the lengths you want to explore

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings with multi-scale sampling
            _, embedding_function = generate_node2vec_embeddings_multiscale(G_train, p=p, q=q, walk_lengths=walk_lengths, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.926903
1     0  0.25  0.50  0.920367
2     0  0.25  1.00  0.910017
3     0  0.25  2.00  0.908777
4     0  0.50  0.25  0.920689
5     0  0.50  0.50  0.923143
6     0  0.50  1.00  0.926435
7     0  0.50  2.00  0.923946
8     0  1.00  0.25  0.926960
9     0  1.00  0.50  0.930650
10    0  1.00  1.00  0.926515
11    0  1.00  2.00  0.926846
12    0  2.00  0.25  0.928042
13    0  2.00  0.50  0.934190
14    0  2.00  1.00  0.924921
15    0  2.00  2.00  0.923231
Best combination of p and q: (2.0, 0.5)
