# Mount Google Drive

In [None]:
# `google.colab` is the helper package used here to reach Google Drive.
from google.colab import drive
# Mount Drive at /content/drive so that the /content/drive/MyDrive/... paths
# used for `output_dir` and `data_dir` in the following cells resolve.
drive.mount('/content/drive')

Mounted at /content/drive


# Download the dataset

In [None]:
import os
import urllib.request

# Remote archive and the local names derived from it.
url = "https://nrvis.com/download/data/asn/mammalia-dolphin-florida-overall.zip"
dataset_zip = "mammalia-dolphin-florida-overall.zip"
filename = "mammalia-dolphin-florida-overall.edges"

# Drive folders used by this notebook; created up front so later cells can
# write into them without checking.
output_dir = "/content/drive/MyDrive/GraphLA/clustering"
data_dir = "/content/drive/MyDrive/GraphLA/data"

os.makedirs(output_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# Skip the download when the archive is already present in the working directory.
if not os.path.exists(dataset_zip):
    print(f"Downloading {dataset_zip}...")
    # Download under a temporary name and rename only after the transfer
    # succeeded. Otherwise an interrupted download would leave a truncated
    # archive behind, the existence check above would treat it as complete on
    # the next run, and the unzip step would fail on a corrupt file.
    partial_zip = dataset_zip + ".part"
    try:
        urllib.request.urlretrieve(url, partial_zip)
        os.replace(partial_zip, dataset_zip)
        print("Download completed!")
    except Exception as e:
        # Best-effort: report the problem and let the notebook continue.
        print(f"Error during download: {e}")
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure or an interrupt this removes the incomplete data.
        if os.path.exists(partial_zip):
            os.remove(partial_zip)

Downloading mammalia-dolphin-florida-overall.zip...
Download completed!


# Unzip the dataset

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the current directory;
# the context manager closes the archive once extraction is done.
with zipfile.ZipFile(dataset_zip, mode='r') as archive:
    archive.extractall(path='.')

# Define some utility functions

In [None]:
import pickle
import random
import sys
import os
from os import path

import networkx as nx
import torch

def load_graph(file_path: str) -> nx.Graph:
    """Load an undirected graph from a whitespace-separated edge list.

    Every data line has the form ``u v [w]``: two node identifiers followed
    by an optional numeric weight. Node identifiers are kept exactly as the
    strings read from the file. Each edge is stored with a ``weight``
    attribute; when the weight column is missing the weight defaults to 1.0.

    Blank lines, lines with fewer than two fields, and comment/header lines
    starting with ``#`` or ``%`` are skipped.

    Args:
        file_path: Path to the edge-list file.

    Returns:
        The graph built from the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If a weight column is present but is not a valid float.
    """
    if not path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    G = nx.Graph()
    with open(file_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            # Header/comment lines would otherwise be parsed as an edge and
            # fail when their third token is converted to a float.
            if parts[0].startswith(("#", "%")):
                continue
            u, v = parts[0], parts[1]
            # The weight column is optional: two-column (unweighted) lines
            # get a unit weight instead of raising IndexError.
            w = float(parts[2]) if len(parts) > 2 else 1.0
            G.add_edge(u, v, weight=w)

    return G


def load_cluster_info(clusters_path: str = os.path.join(data_dir, 'cluster_info.pkl')):
    """Load the pickled cluster description from disk.

    Args:
        clusters_path: Path to the pickle file. The default is computed once,
            when this function is defined, from the module-level ``data_dir``.

    Returns:
        Whatever object was pickled into the file. The main cell of this
        notebook iterates over it and reads the ``"leader"`` and
        ``"followers"`` keys of each item.

    Raises:
        FileNotFoundError: If ``clusters_path`` does not exist. The message
            explains how to generate the file.
    """
    if not os.path.exists(clusters_path):
        # Carry the explanation in the exception itself (as load_graph does)
        # so it is visible in the traceback and to callers that catch it.
        raise FileNotFoundError(
            f"File '{clusters_path}' not found. Please run 'clustering.ipynb' first to generate it."
        )

    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing. Only load files produced by this project.
    with open(clusters_path, "rb") as f:
        return pickle.load(f)

# Main

In [None]:
import pickle

# 1) Build the graph from the extracted edge list
G = load_graph(filename)

# 2) Load cluster information (see src/script/clustering.py)
cluster_info = load_cluster_info()

# 3) Create the training set
# Each node is identified by its position in the graph's node ordering.
node_list = list(G.nodes)
node_id_to_idx = dict(zip(node_list, range(len(node_list))))

# All followers of all clusters, flattened, and one leader per cluster.
followers = [member for cluster in cluster_info for member in cluster["followers"]]
leaders = [cluster["leader"] for cluster in cluster_info]


def _as_index_pair(follower, leader):
    """Translate a (follower, leader) pair of node ids into index form."""
    return node_id_to_idx[follower], node_id_to_idx[leader]


# Positive examples: follower/leader pairs that are connected in the graph.
pos_edges = [
    _as_index_pair(follower, leader)
    for leader in leaders
    for follower in followers
    if G.has_edge(follower, leader)
]

# Candidate pool: every follower/leader combination of two distinct nodes.
all_pairs = [
    _as_index_pair(follower, leader)
    for leader in leaders
    for follower in followers
    if follower != leader
]

# Negative candidates are the pool minus the positives.
possible_neg_edges = list(set(all_pairs) - set(pos_edges))

# 4) Save the training set
print("Saving training set...")

try:
    # Written in this order; a failure on the first file skips the second.
    for pkl_name, edges in (
        ('positive_edges.pkl', pos_edges),
        ('negative_edges.pkl', possible_neg_edges),
    ):
        with open(os.path.join(data_dir, pkl_name), "wb") as f:
            pickle.dump(edges, f)
except OSError as e:  # IOError is an alias of OSError in Python 3
    print(f"Some error occur: {e}")
else:
    print("Training set saved successfully")

Saving training set...
Training set saved successfully
