# Dataset generation

There are several ways to obtain data for working with multilayer networks:

1. Generate a synthetic dataset
2. Generate a subset of an existing large dataset for faster iteration
3. Download and prepare a real large dataset

## 0. Imports and paths

In [1]:
#| default_exp utils

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import networkx as nx
import csv
from pathlib import Path
import random
import string

In [4]:
#| hide
# Run nbdev's export step; cells marked `#| export` are presumably written to
# the module named by `#| default_exp` (`utils`) - confirm against nbdev docs.
import nbdev; nbdev.nbdev_export()

In [5]:
# Root directory for datasets, resolved relative to the current working directory.
path_data = Path("../data")
# Create the directory if it is missing; no error if it already exists.
path_data.mkdir(exist_ok=True)

## 1. Generate synthetic dataset
Create a small multilayer graph for testing purposes.

In [6]:
#| export 
def obfuscate_nodes(n_nodes:int, str_len:int=10) -> dict:
    """
    Maps nodes from range [0, n_nodes) to unique random strings of length str_len.

    Every string starts with an uppercase ASCII letter; the remaining
    characters are uppercase ASCII letters or digits. No two nodes share
    the same string.

    Raises ValueError if str_len < 1, or if n_nodes is larger than the
    number of distinct strings of that shape (26 * 36**(str_len - 1)).
    """
    if str_len < 1:
        raise ValueError(f"str_len must be at least 1, got {str_len}")

    first_chars = string.ascii_uppercase
    other_chars = string.ascii_uppercase + string.digits

    # Without this guard the redraw loop below could never terminate.
    capacity = len(first_chars) * len(other_chars) ** (str_len - 1)
    if n_nodes > capacity:
        raise ValueError(
            f"cannot create {n_nodes} unique strings of length {str_len}; "
            f"only {capacity} exist"
        )

    d = {}
    used = set()
    for i in range(n_nodes):
        # Redraw on collision so the mapping stays injective.
        while True:
            candidate = ''.join(random.choices(first_chars, k=1)) +\
                        ''.join(random.choices(other_chars, k=(str_len-1)))
            if candidate not in used:
                break
        used.add(candidate)
        d[i] = candidate
    return d

In [7]:
# Quick check: build random labels for 10 nodes and display the mapping.
d = obfuscate_nodes(n_nodes=10)
d

{0: 'ZEERP9X9YZ',
 1: 'I69P25FUZY',
 2: 'NOZRBIZW72',
 3: 'SYKL461WOA',
 4: 'BT33J1VILM',
 5: 'RSFHAXR3DU',
 6: 'GS2WXNO1O8',
 7: 'DB4VF0FH1C',
 8: 'I8XOJYKED0',
 9: 'SD11TMJSK0'}

In [8]:
#| export
def create_random_graph(n_nodes:int, prob:float, directed:bool=False, path_to:str|Path|None=None, label:str|None=None, seed:int|None=None) -> "nx.Graph | None":
    """
    Generates an Erdos-Renyi random graph and either returns it or writes its edges to a file.

    n_nodes:  number of nodes in the graph
    prob:     probability of creating each possible edge
    directed: if True, a directed graph is generated
    path_to:  if given, the edges are written to this file, one per line as
              space-separated `source target label`, and nothing is returned
    label:    value written in the third column of every row (empty if None)
    seed:     optional seed passed to networkx for reproducible graphs

    Returns the generated graph when path_to is None, otherwise None.
    Nodes without any edge do not appear in the written file.
    """
    G = nx.erdos_renyi_graph(n_nodes, prob, seed=seed, directed=directed)
    if path_to is None:
        return G

    # newline='' lets the csv module control line endings itself;
    # explicit encoding keeps the output identical across platforms.
    with open(path_to, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        writer.writerows((source, target, label) for source, target in G.edges())
    return None

Create three layers. For experiments with the CmmD algorithm we need at least several hundred nodes to see differences in the clustering across a range of $\gamma$ values:

In [9]:
# Output directory for the synthetic layer files, located under `path_data`.
path_dir_to = path_data / 'synthetic'
# Create the directory if it is missing; no error if it already exists.
path_dir_to.mkdir(exist_ok=True)

In [10]:
# One entry per layer: (number of nodes, edge probability, layer label).
layer_specs = [
    (300, 0.6, "first"),
    (500, 0.4, "second"),
    (400, 0.7, "third"),
]

# Each layer is written to `<label>.csv` inside the synthetic data directory.
for n_nodes, prob, label in layer_specs:
    create_random_graph(n_nodes=n_nodes, prob=prob, path_to=path_dir_to / f"{label}.csv", label=label)