Walk Length 10

In [4]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=10, walk_length=10, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec with a fixed seed
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2,3,6]
q_values = [0.25, 0.5, 1, 2,3,6]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('blog.txt')  # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.730408
1     0  0.25  0.50  0.706372
2     0  0.25  1.00  0.668639
3     0  0.25  2.00  0.644195
4     0  0.25  3.00  0.671066
5     0  0.25  6.00  0.665873
6     0  0.50  0.25  0.735760
7     0  0.50  0.50  0.702381
8     0  0.50  1.00  0.707619
9     0  0.50  2.00  0.681224
10    0  0.50  3.00  0.639478
11    0  0.50  6.00  0.652290
12    0  1.00  0.25  0.708458
13    0  1.00  0.50  0.702676
14    0  1.00  1.00  0.714921
15    0  1.00  2.00  0.693764
16    0  1.00  3.00  0.708050
17    0  1.00  6.00  0.658050
18    0  2.00  0.25  0.734762
19    0  2.00  0.50  0.704785
20    0  2.00  1.00  0.735283
21    0  2.00  2.00  0.694195
22    0  2.00  3.00  0.690726
23    0  2.00  6.00  0.687256
24    0  3.00  0.25  0.720295
25    0  3.00  0.50  0.706599
26    0  3.00  1.00  0.734603
27    0  3.00  2.00  0.733129
28    0  3.00  3.00  0.684626
29    0  3.00  6.00  0.695215
30    0  6.00  0.25  0.730590
31    0  6.00  0.50  0.744762
32    0  6

Walk Length 15

In [71]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=30, walk_length=15, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2,3,6]
q_values = [0.25, 0.5, 1, 2,3,4,6]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('blog.txt')  # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.714422
1     0  0.25  0.50  0.723447
2     0  0.25  1.00  0.707052
3     0  0.25  2.00  0.712018
4     0  0.25  3.00  0.693878
5     0  0.25  4.00  0.679252
6     0  0.25  6.00  0.676712
7     0  0.50  0.25  0.733401
8     0  0.50  0.50  0.713946
9     0  0.50  1.00  0.709433
10    0  0.50  2.00  0.695896
11    0  0.50  3.00  0.730612
12    0  0.50  4.00  0.716757
13    0  0.50  6.00  0.715556
14    0  1.00  0.25  0.719524
15    0  1.00  0.50  0.769365
16    0  1.00  1.00  0.726395
17    0  1.00  2.00  0.732041
18    0  1.00  3.00  0.688503
19    0  1.00  4.00  0.732948
20    0  1.00  6.00  0.697868
21    0  2.00  0.25  0.720045
22    0  2.00  0.50  0.722766
23    0  2.00  1.00  0.719206
24    0  2.00  2.00  0.725556
25    0  2.00  3.00  0.691610
26    0  2.00  4.00  0.715850
27    0  2.00  6.00  0.711156
28    0  3.00  0.25  0.752426
29    0  3.00  0.50  0.734989
30    0  3.00  1.00  0.703061
31    0  3.00  2.00  0.724558
32    0  3

Walk length 5

In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

# Function to generate Node2Vec embeddings
def generate_node2vec_embeddings(graph, dimensions=128, num_walks=30, walk_length=5, p=1.0, q=1.0, seed=None):
    # Function to perform random walk based on Node2Vec algorithm
    def node2vec_walk(node):
        walk = [node]
        for _ in range(walk_length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    # Generating random walks
    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(node))

    # Learning embeddings using Word2Vec
    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    # Function to get node embeddings
    def embedding_function(u):
        return embeddings[u]

    return model, embedding_function

# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2,3,6]
q_values = [0.25, 0.5, 1, 2,3,6]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('blog.txt')  # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings
            _, embedding_function = generate_node2vec_embeddings(G_train, p=p, q=q, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


  results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)


   seed     p     q   roc_auc
0     0  0.25  0.25  0.708345
1     0  0.25  0.50  0.712472
2     0  0.25  1.00  0.696259
3     0  0.25  2.00  0.706100
4     0  0.25  3.00  0.704558
5     0  0.25  6.00  0.749342
6     0  0.50  0.25  0.712744
7     0  0.50  0.50  0.706667
8     0  0.50  1.00  0.703946
9     0  0.50  2.00  0.737710
10    0  0.50  3.00  0.691950
11    0  0.50  6.00  0.719932
12    0  1.00  0.25  0.742766
13    0  1.00  0.50  0.754195
14    0  1.00  1.00  0.729932
15    0  1.00  2.00  0.721451
16    0  1.00  3.00  0.730794
17    0  1.00  6.00  0.704263
18    0  2.00  0.25  0.723333
19    0  2.00  0.50  0.736349
20    0  2.00  1.00  0.723810
21    0  2.00  2.00  0.715828
22    0  2.00  3.00  0.709773
23    0  2.00  6.00  0.731859
24    0  3.00  0.25  0.745669
25    0  3.00  0.50  0.754308
26    0  3.00  1.00  0.712608
27    0  3.00  2.00  0.749138
28    0  3.00  3.00  0.740680
29    0  3.00  6.00  0.721542
30    0  6.00  0.25  0.715556
31    0  6.00  0.50  0.730862
32    0  6

mutli level walks

In [2]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Function to load edge list from a file
def load_edgelist(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            edge = line.strip().split()
            edges.append((edge[0], edge[1]))
    return edges

# Function to sample a subgraph using BFS
def sample_subgraph_bfs(graph, start_node=None, number_of_nodes=1000, seed=None):
    if start_node is None:
        nodes = list(graph.nodes())
        random.Random(seed).shuffle(nodes)
        start_node = nodes[0]

    sampled_nodes = set([start_node])
    node_queue = [start_node]

    while len(sampled_nodes) < number_of_nodes and node_queue:
        current = node_queue.pop(0)
        neighbors = sorted(list(graph.neighbors(current)))
        random.Random(seed).shuffle(neighbors)

        for neighbor in neighbors:
            if len(sampled_nodes) >= number_of_nodes:
                break
            if neighbor not in sampled_nodes:
                sampled_nodes.add(neighbor)
                node_queue.append(neighbor)

    return graph.subgraph(sampled_nodes)

# Function to generate edge embeddings
def generate_edge_embeddings(edges, embedding_function):
    edge_embeddings = []
    for u, v in edges:
        u_embedding = embedding_function(u)
        v_embedding = embedding_function(v)
        edge_embedding = np.multiply(u_embedding, v_embedding)
        edge_embeddings.append(edge_embedding)
    return np.array(edge_embeddings)

# Function to split the graph into train and test sets, ensuring connectivity
def split_graph_with_connectivity(graph, fraction_to_remove=0.1, seed=None):
    edge_list = list(graph.edges())
    num_edges_to_remove = int(fraction_to_remove * graph.number_of_edges())

    while True:
        random.Random(seed).shuffle(edge_list)
        edges_to_remove = edge_list[:num_edges_to_remove]
        G_train = graph.copy()
        G_train.remove_edges_from(edges_to_remove)
        if nx.is_connected(G_train):
            break

    G_test = graph.copy()
    G_test.remove_edges_from(G_train.edges())

    return G_train, G_test, edges_to_remove

# Function to generate positive and negative samples
def generate_samples(graph, seed=None):
    positive_samples = list(graph.edges())
    negative_samples = []
    all_nodes = sorted(list(graph.nodes()))

    while len(negative_samples) < len(positive_samples):
        node_pair = random.sample(all_nodes, 2)
        if not graph.has_edge(*node_pair):
            negative_samples.append(tuple(sorted(node_pair)))

    return positive_samples, negative_samples

def generate_node2vec_embeddings_multiscale(graph, dimensions=128, num_walks=15, walk_lengths=[5,10, 15], p=1.0, q=1.0, seed=None):
    def node2vec_walk(node, length):
        walk = [node]
        for _ in range(length):
            neighbors = list(graph.neighbors(walk[-1]))
            if len(neighbors) > 0:
                if len(walk) == 1:
                    walk.append(random.choice(neighbors))
                else:
                    prev_node = walk[-2]
                    probs = []
                    for neighbor in neighbors:
                        if neighbor == prev_node:
                            probs.append(1 / p)
                        elif graph.has_edge(prev_node, neighbor):
                            probs.append(1)
                        else:
                            probs.append(1 / q)
                    probs = np.array(probs)
                    probs /= np.sum(probs)
                    walk.append(np.random.choice(neighbors, p=probs))
            else:
                break
        return walk

    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        random.seed(seed)
        random.shuffle(nodes)
        for node in nodes:
            for length in walk_lengths:
                walks.append(node2vec_walk(node, length))

    embeddings = {}
    model = Word2Vec(walks, vector_size=dimensions, window=5, min_count=1, sg=1, workers=1, seed=seed)
    for node in nodes:
        embeddings[node] = model.wv[node]

    def embedding_function(u):
        
        return embeddings[u]

    return model, embedding_function


# Grid search over p, q ∈ {0.25, 0.50, 1, 2}
p_values = [0.25, 0.5, 1, 2,6]
q_values = [0.25, 0.5, 1, 2,6]

# DataFrame to store the results
results_df = pd.DataFrame(columns=['seed', 'p', 'q', 'roc_auc'])

for seed in range(1):
    # Set the new random seed for all functions
    np.random.seed(seed)
    random.seed(seed)

    # Load the data and create the graph
    edge_list = load_edgelist('blog.txt')  # Replace with your actual file path
    G = nx.Graph()
    G.add_edges_from(edge_list)

    # Sample subgraph, split into train/test, generate samples
    subgraph = sample_subgraph_bfs(G, number_of_nodes=300, seed=seed)
    G_train, G_test, _ = split_graph_with_connectivity(subgraph, seed=seed)
    positive_samples_train, negative_samples_train = generate_samples(G_train, seed=seed)
    positive_samples_test, negative_samples_test = generate_samples(G_test, seed=seed)

    # Define the list of walk lengths for multi-scale sampling
      # Adjust this list based on the lengths you want to explore

    for p in p_values:
        for q in q_values:
            # Node2Vec embeddings with multi-scale sampling
            _, embedding_function = generate_node2vec_embeddings_multiscale(G_train, p=p, q=q, walk_lengths=walk_lengths, seed=seed)

            # Generate edge embeddings
            edge_embeddings_train = generate_edge_embeddings(positive_samples_train + negative_samples_train, embedding_function)
            edge_embeddings_test = generate_edge_embeddings(positive_samples_test + negative_samples_test, embedding_function)

            # Logistic Regression Model
            labels_train = np.array([1] * len(positive_samples_train) + [0] * len(negative_samples_train))
            lr_clf = LogisticRegressionCV(cv=10, max_iter=2000, scoring="roc_auc", random_state=seed)
            classifier = lr_clf.fit(edge_embeddings_train, labels_train)

            # Evaluate the model
            labels_test = np.array([1] * len(positive_samples_test) + [0] * len(negative_samples_test))
            test_predictions = classifier.predict_proba(edge_embeddings_test)[:, 1]
            test_roc_auc = roc_auc_score(labels_test, test_predictions)

            # Store results
            results_df = pd.concat([results_df, pd.DataFrame([{'seed': seed, 'p': p, 'q': q, 'roc_auc': test_roc_auc}])], ignore_index=True)

# Print DataFrame
print(results_df)

# Find the best combination of p and q based on average ROC AUC
best_combination = results_df.groupby(['p', 'q'])['roc_auc'].mean().idxmax()
print(f"Best combination of p and q: {best_combination}")


NameError: name 'walk_lengths' is not defined