# Dataset generation

There are several ways to obtain data for working with multilayer networks:

1. Generate a synthetic dataset
2. Generate a subset of an existing large dataset for faster iterations
3. Download and prepare a real large dataset

## 0. Imports and paths

In [1]:
#| default_exp utils

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import networkx as nx
import csv
from pathlib import Path
import random
import string

In [4]:
#| hide
import nbdev; nbdev.nbdev_export()

In [5]:
# Root folder for the datasets, resolved relative to the current working directory;
# created if it does not exist yet.
path_data = Path("../data")
path_data.mkdir(exist_ok=True)

## 1. Generate synthetic dataset
Create a small multilayer graph for testing purposes.

In [6]:
#| export 
def obfuscate_nodes(n_nodes:int,
                    node_labels_pool:list[str]=None,
                    str_len:int=10) -> dict:
    """
    Maps nodes from range [0, n_nodes) to unique random strings of length str_len.
    All upper case letters and digits are used, starts with a letter.
    If node_labels_pool is provided, nodes selected from it if len(node_labels_pool) >= n_nodes,
    otherwise, additional unique nodes are generated.
    """
    if node_labels_pool is not None and len(node_labels_pool) > 0:
        assert len(node_labels_pool[0]) == str_len
    
    def encode(str_len):
        return ''.join(random.choices(string.ascii_uppercase, k=1)) +\
               ''.join(random.choices(string.ascii_uppercase + string.digits, k=(str_len-1)))
    
    if node_labels_pool is not None:
        if len(node_labels_pool) >= n_nodes:
            # select random nodes from node_labels_pool
            random.shuffle(node_labels_pool)
            vals = node_labels_pool[:n_nodes]
            d = dict(zip(range(n_nodes), vals))
        elif len(node_labels_pool) < n_nodes:
            # use all nodes from node_labels_pool
            random.shuffle(node_labels_pool)
            vals = node_labels_pool[:]
            # generate additional unique nodes
            for i in range(n_nodes - len(node_labels_pool)):
                x = encode(str_len)
                while x in vals: # guarantee uniqueness
                    x = encode(str_len)
                vals.append(x)
            # shuffle pre-defined and newly generated nodes
            random.shuffle(vals)
            d = dict(zip(range(n_nodes), vals))
    else:   # no node_labels_pool provided
        d = {}
        for i in range(n_nodes):
            x = encode(str_len)
            # gaurantee uniqueness
            while x in d.values():
                x = encode(str_len)
            
            d[i] = x
    
    return d

In [7]:
# No pool given: all three labels are freshly generated.
d = obfuscate_nodes(n_nodes=3)
d

{0: 'PAZN0O9IIH', 1: 'MNMUZUM48X', 2: 'VDBU4HPMCC'}

In [8]:
# Pool holds more labels (3) than requested (2): both labels are drawn from the pool.
node_labels_pool = ['TI672Y15ZU', 'S0SM5PZWDN','G54WMQPO11']
d = obfuscate_nodes(n_nodes=2, node_labels_pool=node_labels_pool)
d

{0: 'S0SM5PZWDN', 1: 'TI672Y15ZU'}

In [9]:
# Pool holds fewer labels (3) than requested (5): every pool label is used
# and the two missing labels are generated.
d = obfuscate_nodes(n_nodes=5, node_labels_pool=node_labels_pool)
d

{0: 'JNFJDR2ZFN',
 1: 'S0SM5PZWDN',
 2: 'UTG42RLL53',
 3: 'TI672Y15ZU',
 4: 'G54WMQPO11'}

The following is a thin wrapper around the random graph generators from the [networkx](https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.random_graphs) package.

In [10]:
#| export
def create_random_graph(generator:callable, obfuscate:bool=True, node_labels_pool:list[str]=None, **kwargs):
    """Build a graph with one of networkx's random graph generators.

    Args:
        generator (callable): networkx's random graph generator
        obfuscate (bool, optional): if true, node names are replaced by the labels
            returned from obfuscate_nodes. Defaults to True.
        node_labels_pool (list[str], optional): forwarded to obfuscate_nodes. Defaults to None.
        **kwargs: passed to generator
    Returns:
        The generated graph, relabeled when obfuscate is true.
    Example:
        >>> G = create_random_graph(nx.erdos_renyi_graph, n=10, p=0.6, directed=False)
    """
    graph = generator(**kwargs)
    if not obfuscate:
        # nothing to relabel: hand back the generator's graph untouched
        return graph

    mapping = obfuscate_nodes(graph.number_of_nodes(), node_labels_pool=node_labels_pool)
    return nx.relabel_nodes(graph, mapping)

In [11]:
node_labels_pool

['G54WMQPO11', 'S0SM5PZWDN', 'TI672Y15ZU']

In [12]:
# Five nodes but only three labels in the pool: the remaining two labels are generated.
G = create_random_graph(nx.erdos_renyi_graph, node_labels_pool=node_labels_pool, n=5, p=0.6, directed=False)

In [13]:
G.nodes()

NodeView(('SBCUX8F7ST', 'S0SM5PZWDN', 'G54WMQPO11', 'TI672Y15ZU', 'JYSAI9KPVS'))

Create a random graph and write it to disk:

In [14]:
#| export
def create_and_save_random_graph(generator:callable,
                                 label_edges:str,
                                 path_to:str|Path,
                                 obfuscate:bool=True,
                                 node_labels_pool:list[str]=None,
                                 **kwargs) -> nx.classes.graph.Graph:
    """
    Generates a random graph and dumps its edge list to path_to.

    Every edge becomes one space-delimited row: first node, second node, label_edges.

    Args:
        generator (callable): networkx's random graph generator
        label_edges (str): value written as the third column of every row
        path_to (str|Path): file the edge list is written to
        obfuscate (bool, optional): if true, obfuscates node names. Defaults to True.
        node_labels_pool (list[str], optional): forwarded to create_random_graph. Defaults to None.
        **kwargs: passed to generator

    Returns:
        The generated graph.

    Example:
        >>> create_and_save_random_graph(nx.erdos_renyi_graph, "", path_data / 'synthetic' / '1.csv', n=10, p=0.6, directed=False)
    """
    graph = create_random_graph(generator=generator, obfuscate=obfuscate, node_labels_pool=node_labels_pool, **kwargs)

    # the edge view is obtained here, before the file is opened; rows are built lazily
    rows = ([source, target, label_edges] for source, target in graph.edges())
    with open(path_to, 'w', newline='') as out_file:
        csv.writer(out_file, delimiter=' ').writerows(rows)

    return graph

Create three layers. For experiments with the CmmD algorithm we need at least several hundred nodes to see how the clustering changes across a range of $\gamma$ values:

In [15]:
# Folder that receives the synthetic layer files; created if it does not exist yet.
path_dir_to = path_data / 'synthetic'
path_dir_to.mkdir(exist_ok=True)

Leaving `label_edges` empty produces the format required by the MolTi community detection software:

In [16]:
# Labels handed out so far. The same pool is passed to every layer so that
# later layers reuse node labels of earlier ones.
node_labels_pool = []

n, p, label_edges = 300, 0.2, ""
G_1 = create_and_save_random_graph(nx.erdos_renyi_graph, label_edges, path_dir_to / '1.csv',
                                   obfuscate=True, node_labels_pool=node_labels_pool, n=n, p=p)
# Add only labels the pool does not hold yet, so the pool never contains duplicates.
known_labels = set(node_labels_pool)
node_labels_pool.extend(label for label in G_1.nodes() if label not in known_labels)

n, p, label_edges = 500, 0.2, ""
G_2 = create_and_save_random_graph(nx.erdos_renyi_graph, label_edges, path_dir_to / '2.csv',
                                   obfuscate=True, node_labels_pool=node_labels_pool, n=n, p=p)
# G_2 reuses labels from the pool; re-adding those would put them into the pool twice.
known_labels = set(node_labels_pool)
node_labels_pool.extend(label for label in G_2.nodes() if label not in known_labels)

n, p, label_edges = 400, 0.2, ""
G_3 = create_and_save_random_graph(nx.erdos_renyi_graph, label_edges, path_dir_to / '3.csv',
                                   obfuscate=True, node_labels_pool=node_labels_pool, n=n, p=p)
known_labels = set(node_labels_pool)
node_labels_pool.extend(label for label in G_3.nodes() if label not in known_labels)