## Import and cleaning

In [1]:
import pandas as pd
import json
import re
import pyomo.environ as pyo


# Load the graph snapshot. The encoding is stated explicitly (JSON is exchanged as UTF-8)
# so that any non-ASCII text in the file is not decoded with the platform's default encoding.
with open("../data/network_graph_2024_06_12.json", encoding="utf-8") as f:
    d = json.load(f)

# One row per entry of the snapshot's top-level "nodes" and "edges" lists.
nodes = pd.DataFrame(d["nodes"])
channels = pd.DataFrame(d["edges"])

## Channels

In [2]:
## Clean the raw channel table in one pass:
##   1. cast capacity to int and parse the update time (given in seconds) into timestamps
##   2. drop channels that are unused (no update time after the epoch)
##   3. drop channels where either node has no policy registered for this
##      describegraph query (aka not reachable now)
##   4. keep only the identifier, peer, capacity and policy columns
channels = (
    channels
    .assign(
        capacity=lambda df: df["capacity"].astype(int),
        last_update=lambda df: pd.to_datetime(df["last_update"], unit="s"),
    )
    .loc[lambda df: df["last_update"] > "1970-01-01"]
    .loc[lambda df: df["node1_policy"].notna() & df["node2_policy"].notna()]
    .filter(items=[
        "channel_id",
        "node1_pub",
        "node2_pub",
        "capacity",
        "node1_policy",
        "node2_policy",
    ])
)



From this data we only need the information that is strictly related to path finding over channels, namely:

- channel peers
- channel id
- capacity
- nodes policy:
    - fee base msat
    - fee rate milli msat

In [3]:
# Flatten the two fee fields of each node's policy entry into integer columns
# named "<node>_<field>".
for side in ("node1", "node2"):
    policies = channels[f"{side}_policy"]
    for fee_field in ("fee_base_msat", "fee_rate_milli_msat"):
        # key=fee_field binds the current field name to the lambda at definition time
        channels[f"{side}_{fee_field}"] = policies.apply(
            lambda policy, key=fee_field: policy[key]
        ).astype(int)

# The raw policy columns are dropped here; only the flattened fee columns remain.
channels = channels.filter(items=[
    "channel_id",
    "node1_pub",
    "node2_pub",
    "capacity",
    "node1_fee_base_msat",
    "node1_fee_rate_milli_msat",
    "node2_fee_base_msat",
    "node2_fee_rate_milli_msat",
])

## Nodes

Set a feature that flags the presence of an onion address and/or a clearnet address.
The feature works like Linux permission bits:
- 1 for onion
- 2 for clearnet
- 3 for both onion and clearnet

In [4]:
# Reduce each node's list of address records to the list of their 'addr' values.
# NOTE(review): the source column is selected by position (4th column) - presumably the raw
# "addresses" field; confirm against the column order of the snapshot.
nodes['addresses'] = nodes.iloc[:, 3].apply(
    lambda records: [record['addr'] for record in records]
)

In [5]:
# Matches a host that ends in ".onion", optionally followed by ":port", ignoring case.
_ONION_ADDRESS = re.compile(r"\.onion(?::\d+)?$", re.IGNORECASE)


def allocate_code(addresses):
    """
    Score the kinds of addresses used by a node.

    :param addresses: list of strings with IP or onion addresses
    :return: score for the kind of addresses used by the node:
             1 if only onion, 2 if only clearnet, 3 if both onion and clearnet,
             0 if the list is empty.
             The score is independent from the number of addresses of each kind.
    """
    onion_flag, clearnet_flag = 1, 2
    code = 0
    for element in addresses:
        # The ".onion" suffix is anchored to the end of the host so that a clearnet hostname
        # that merely contains ".onion" as an inner label (e.g. "relay.onion.example.com")
        # is not counted as an onion address.
        code |= onion_flag if _ONION_ADDRESS.search(element) else clearnet_flag
    return code


# Replace each node's address list with the numeric code returned by allocate_code.
nodes["addresses"] = nodes["addresses"].map(allocate_code)

Remove nodes that were never updated (i.e. nodes with no last update time data)

In [6]:
# Parse the update time (given in seconds) into timestamps, then keep only the nodes
# whose update time lies after the epoch.
nodes = (
    nodes
    .assign(last_update=lambda df: pd.to_datetime(df["last_update"], unit="s"))
    .loc[lambda df: df["last_update"] > "1970-01-01"]
)

**Features to be analysed in order to understand the meaning of the properties is_known and is_required for nodes.**

Analyse the "features" column

- https://github.com/lightning/bolts/blob/master/09-features.md
- https://github.com/lightningnetwork/lnd/blob/master/lnrpc/lightning.proto
- https://github.com/lightningnetwork/lnd/blob/master/lnrpc/lightning_grpc.pb.go

In [7]:
# Keep only the node key, its alias and the address code column.
nodes = nodes.filter(items=["pub_key", "alias", "addresses"])


### Elaborate dataset to manage directed edges and fees

In [8]:
# Display the dtype of every channel column before building the directed edges.
channels.dtypes

channel_id                   object
node1_pub                    object
node2_pub                    object
capacity                      int64
node1_fee_base_msat           int64
node1_fee_rate_milli_msat     int64
node2_fee_base_msat           int64
node2_fee_rate_milli_msat     int64
dtype: object

In [9]:
# Qualify both peer columns with the channel id, giving "<channel_id>-<pub>".
# Note: the columns are overwritten, so running this cell a second time prefixes them again.
for endpoint in ("node1_pub", "node2_pub"):
    channels[endpoint] = channels["channel_id"] + "-" + channels[endpoint]

# channels1 is a copy of the table with the two peer columns swapped (reverse direction).
channels1 = channels.assign(
    node1_pub=channels["node2_pub"],
    node2_pub=channels["node1_pub"],
)

In [10]:
# In the reversed table, the node2_* fee columns take the node1_* fee values of the
# original table.
for fee_field in ("fee_base_msat", "fee_rate_milli_msat"):
    channels1[f"node2_{fee_field}"] = channels[f"node1_{fee_field}"]

In [11]:
# Both direction tables keep the same columns: ids, capacity and the node2_* fee pair.
edge_columns = [
    "channel_id",
    "node1_pub",
    "node2_pub",
    "capacity",
    "node2_fee_base_msat",
    "node2_fee_rate_milli_msat",
]
channels, channels1 = (frame.filter(items=edge_columns) for frame in (channels, channels1))

In [12]:
# Stack both directions into a single edge table. ignore_index=True gives the result a fresh
# 0..n-1 index; without it every row label would appear twice (once per direction), which
# makes label-based lookups such as .loc return two rows.
channels = pd.concat([channels, channels1], ignore_index=True)

In [17]:
# Drop the node2_ prefix from the fee columns of the stacked table.
channels = channels.rename(
    columns={
        "node2_fee_base_msat": "fee_base_msat",
        "node2_fee_rate_milli_msat": "fee_rate_milli_msat",
    }
)

In [18]:
# Display the dtypes of the final edge table after the fee columns were renamed.
channels.dtypes

channel_id             object
node1_pub              object
node2_pub              object
capacity                int64
fee_base_msat           int64
fee_rate_milli_msat     int64
dtype: object