# Dataset generation

There are several ways to generate data for working with multilayer networks:

1. Generate a synthetic dataset
2. Generate a subset of an existing large dataset for faster iterations
3. Download and prepare a real large dataset

## 0. Imports and paths

In [1]:
#| default_exp utils

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import networkx as nx
import csv
from pathlib import Path
import random
import string

In [4]:
#| hide
import nbdev; nbdev.nbdev_export()

In [5]:
# root directory for the data files, relative to the current working directory
path_data = Path("../data")
# exist_ok=True: do not fail when the directory is already there
path_data.mkdir(exist_ok=True)

## 1. Generate synthetic dataset
Create a small multilayer graph for testing purposes.

In [30]:
#| export 
def obfuscate_nodes(n_nodes:int,
                    node_labels_pool:list[str]=None,
                    str_len:int=10) -> dict:
    """
    Maps nodes from range [0, n_nodes) to random string labels.

    Generated labels have length str_len, consist of upper case letters and digits,
    start with a letter and are distinct from each other and from the pooled labels.
    If node_labels_pool is provided and holds at least n_nodes labels, n_nodes of them
    are picked at random. Otherwise all pooled labels are used and the missing ones
    are generated. The caller's node_labels_pool is never modified.

    Args:
        n_nodes (int): number of nodes to label.
        node_labels_pool (list[str], optional): labels to reuse. Defaults to None.
        str_len (int, optional): length of the generated labels. Defaults to 10.

    Returns:
        dict: mapping from node index to its label.

    Note:
        str_len has to be large enough to provide the requested number of distinct
        labels, otherwise the generation loop cannot terminate.
    """
    def encode():
        # the first character is always a letter, the rest are letters or digits
        return ''.join(random.choices(string.ascii_uppercase, k=1)) +\
               ''.join(random.choices(string.ascii_uppercase + string.digits, k=(str_len-1)))

    reuse_pool = node_labels_pool is not None
    labels = []
    if reuse_pool:
        # work on a copy so that shuffling does not reorder the caller's list
        labels = list(node_labels_pool)
        if labels:  # an empty pool simply means that every label is generated
            assert len(labels[0]) == str_len, "pooled labels must have length str_len"
        random.shuffle(labels)
        if len(labels) >= n_nodes:
            # enough pooled labels: take a random selection of them
            return dict(zip(range(n_nodes), labels[:n_nodes]))

    # generate the missing labels; the set makes the uniqueness check O(1)
    seen = set(labels)
    while len(labels) < n_nodes:
        candidate = encode()
        if candidate not in seen:
            seen.add(candidate)
            labels.append(candidate)

    if reuse_pool:
        # mix pre-defined and newly generated labels
        random.shuffle(labels)

    return dict(zip(range(n_nodes), labels))

In [35]:
# no label pool is given, so every label is randomly generated
d = obfuscate_nodes(n_nodes=3)
d

{0: 'TRYBYAJRMJ', 1: 'CLPEH5WCKO', 2: 'OXAJ0N6W56'}

In [38]:
# the pool holds more labels than requested, so the labels are picked from the pool
node_labels_pool = ['TI672Y15ZU', 'S0SM5PZWDN','G54WMQPO11']
d = obfuscate_nodes(n_nodes=2, node_labels_pool=node_labels_pool)
d

{0: 'G54WMQPO11', 1: 'TI672Y15ZU'}

In [39]:
# the pool holds fewer labels than requested: all of them are used and the rest is generated
d = obfuscate_nodes(n_nodes=5, node_labels_pool=node_labels_pool)
d

{0: 'AM8P8D1UV8',
 1: 'XHNLHK4GVE',
 2: 'S0SM5PZWDN',
 3: 'TI672Y15ZU',
 4: 'G54WMQPO11'}

The following is a thin wrapper around the random graph generators from the [networkx](https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.random_graphs) package.

In [8]:
#| export
def create_random_graph(generator:callable, obfuscate:bool=True, **kwargs):
    """Build a random graph with the given networkx generator.

    Args:
        generator (callable): networkx's random graph generator
        obfuscate (bool, optional): if true, the nodes of the generated graph are
            relabeled with the labels produced by obfuscate_nodes. Defaults to True.
        **kwargs: forwarded to generator unchanged
    Returns:
        The generated graph, relabeled when obfuscate is true.
    Example:
        >>> G = create_random_graph(nx.erdos_renyi_graph, n=10, p=0.6, directed=False)
    """
    graph = generator(**kwargs)
    if not obfuscate:
        # keep the node names chosen by the generator
        return graph

    node_mapping = obfuscate_nodes(graph.number_of_nodes())
    return nx.relabel_nodes(graph, node_mapping)

In [9]:
# undirected random graph on 10 nodes; node labels are obfuscated by default
G = create_random_graph(nx.erdos_renyi_graph, n=10, p=0.6, directed=False)

In [10]:
G.nodes()

NodeView(('MDVP1J58F6', 'B795PK40SH', 'G1BSY7WMCF', 'MVC97CN6T1', 'CG8QWJ8N8M', 'Z3ZZU63QTP', 'NY4QOEJ6WP', 'I8BFARR4ZZ', 'GMD9EKVQ0X', 'I4X7ZHEC29'))

Create a random graph and write it to disk:

In [11]:
#| export
def create_and_save_random_graph(generator:callable,
                                 label_edges:str,
                                 path_to:str|Path,
                                 obfuscate:bool=True,
                                 **kwargs):
    """
    Creates a random graph and saves its edge list to path_to.

    Every edge becomes one space-delimited row: first node, second node, label_edges.

    Args:
        generator (callable): networkx's random graph generator
        label_edges (str): label written in the third column of every row
        path_to (str|Path): path of the file to write; it is opened in 'w' mode,
            so an existing file is overwritten
        obfuscate (bool, optional): if true, obfuscates node names. Defaults to True.
        **kwargs: passed to generator

    Example:
        >>> create_and_save_random_graph(nx.erdos_renyi_graph, "first", path_data / 'synthetic' / '1.csv', n=10, p=0.6, directed=False)
    """
    G = create_random_graph(generator=generator, obfuscate=obfuscate, **kwargs)
    # newline='' leaves line endings to the csv writer; the explicit encoding keeps
    # the output independent of the platform default
    with open(path_to, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        writer.writerows((source, target, label_edges) for source, target in G.edges())

Create three layers. For experiments with the CmmD algorithm we need at least several hundred nodes to see how the clustering differs across a range of $\gamma$ values:

In [12]:
# directory that receives the generated synthetic layers
path_dir_to = path_data / 'synthetic'
# exist_ok=True: do not fail when the directory is already there
path_dir_to.mkdir(exist_ok=True)

In [13]:
# one entry per layer: (number of nodes, edge probability, edge label, output file name)
layer_specs = [
    (300, 0.6, "first", '1.csv'),
    (500, 0.4, "second", '2.csv'),
    (400, 0.7, "third", '3.csv'),
]

# generate the layers one after another and write each to its own file
for n, p, label_edges, file_name in layer_specs:
    create_and_save_random_graph(nx.erdos_renyi_graph, label_edges, path_dir_to / file_name, n=n, p=p)

The method above generates obfuscated graphs whose node labels differ from layer to layer. However, for multilayer community analysis we need at least some nodes to share labels across layers.

In [14]:
# define parameters of the multilayer network (one list entry per layer)
generator = nx.erdos_renyi_graph
n_nodes = [300, 500, 400]
p_edges = [0.6, 0.4, 0.7]
label_edges = ["first", "second", "third"]

# let's generate the graphs step by step and propagate node labels between them;
# the first layer is built from the first entry of each parameter list
G_1 = create_random_graph(generator=generator, obfuscate=True, n=n_nodes[0], p=p_edges[0], directed=False)

NodeView(('J1XAJ30WKZ', 'UCWIM36L4Q', 'D67YFUCGEB', 'GEQN1KPO3E', 'PJ5KUHO20B', 'H4SI4921B4', 'WQ84ODHC6Z', 'I4KADRE2P9', 'SDPGQ3YXDS', 'H6FNN7JP5D', 'CVV0O56KW0', 'SNC9VTAD3B', 'KIQL3VTXM7', 'LC7ZJUU6R7', 'BCBUH8EXFS', 'BH3O7QJUNJ', 'V0W0QWF2Q4', 'B004CWIY7Q', 'KP1MKK46G4', 'EN57S6V1KS', 'P9IG7RZJSK', 'C06PFIHIYO', 'OIPZLJ2WMX', 'BUQAYJXVP5', 'DC2V5RPCFP', 'RNODMQ39QF', 'WVUY3HHALM', 'F7KSFUZW59', 'DH9RNSHIW9', 'OI6XLDT9NI', 'MG2ZCQMNE2', 'OR9GI71DT7', 'LTDA4GJOLZ', 'OHZNL1T3C6', 'IV81III7T6', 'GO50K6S795', 'KOX79N08XZ', 'H7ZB7RAYUE', 'J4ZSAEOG7C', 'IG43OHUBK4', 'B8DR4LNXJZ', 'B2PN9376UY', 'O6B561OUQM', 'TFQJBYAD9D', 'LWCJ9GUHVT', 'PNO223DLH7', 'X0YO9ILGDT', 'GN1YMQU5R6', 'LRH199ETON', 'R0THC5E2C4', 'PV870KPFTD', 'C3L5O2M9P7', 'XHI4CS9SQO', 'DSRQFRM2IE', 'NO4JENLMH9', 'SRTA94VN76', 'CDFNV0XVC8', 'HOC4DIYZ9I', 'ZJLKBDXL3V', 'QQEKQ3PXYY', 'F68HN3PRCZ', 'DVALRIY6EK', 'JYV8PMNB8E', 'HNVE0AAARR', 'ONS81VYTWN', 'UREG6YSNQP', 'S4V0B01U99', 'FELW5SZOY9', 'YCZKJPT65A', 'CGSZTYE7GA', 'SIT60VWT3

In [17]:
nodes = list(G_1.nodes())

300