In [1]:
# --- Notebook-wide setup: warning policy, plotting defaults, imports ---

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries below;
# consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

import seaborn as sns
# Apply seaborn's "whitegrid" style to all figures.
sns.set_theme(style="whitegrid")

import matplotlib.pyplot as plt
# Default figure size (width, height) in inches.
plt.rcParams['figure.figsize'] = (12, 6)

import os
import psycopg2
import itertools
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
from datetime import timedelta
from dotenv import load_dotenv
# NOTE(review): matplotlib.pyplot is already imported above; this repeat is redundant.
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.patches import Patch

In [2]:
# Read database connection settings and the data directory from the
# environment. load_dotenv() first merges values from a local .env file
# into the process environment, so os.getenv() sees them.
load_dotenv()

DB_HOST = os.getenv("DB_HOST")
# int(None) raises an opaque TypeError when DB_PORT is unset, so fall back to
# PostgreSQL's default port (5432) for a missing or empty value.
DB_PORT = int(os.getenv("DB_PORT") or 5432)
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_NAME = os.getenv("DB_NAME")

# Every CSV read/write in this notebook is built with os.path.join(DATA_FOLDER, ...),
# which fails with an unhelpful TypeError if DATA_FOLDER is None. Fail here
# instead, with a message that says what to fix.
DATA_FOLDER = os.getenv("DATA_FOLDER")
if not DATA_FOLDER:
    raise RuntimeError(
        "DATA_FOLDER is not set; define it in the environment or in the .env file"
    )

In [14]:
# Locations of the three CSV inputs inside DATA_FOLDER.
events_relays_csv = os.path.join(DATA_FOLDER, 'events_relays.csv')
events_csv = os.path.join(DATA_FOLDER, 'events.csv')
relay_stats_csv = os.path.join(DATA_FOLDER, 'relay_stats.csv')

events_relays = pl.read_csv(events_relays_csv)

# Only the id and pubkey columns are read from events.csv; `id` is renamed
# to `event_id` so it can serve as the join key against events_relays.
events = pl.read_csv(events_csv, columns=['id', 'pubkey']).rename({'id': 'event_id'})

# Left join: every events_relays row is kept and gains a `pubkey` column
# (null where events.csv has no row with that event_id).
events_relays = events_relays.join(events, how='left', on='event_id')

relay_stats = pl.read_csv(relay_stats_csv)

In [None]:
# Count, for every unordered pair of relays, how many events appear on both,
# attach each relay's num_events from relay_stats, and write the result to
# events_share.csv. Each pair is stored once, with relay_url1 < relay_url2.

# One row per event holding the distinct relays it was seen on. Null relay
# URLs are dropped (they cannot be ordered against strings), and events seen
# on a single relay are filtered out because they contribute to no pair.
event_relay_lists = (
    events_relays
    .group_by('event_id')
    .agg(pl.col('relay_url').drop_nulls().unique().alias('relay_urls'))
    .filter(pl.col('relay_urls').list.len() > 1)
)

events_share_dict = {}
for row in tqdm(event_relay_lists.iter_rows(named=True), desc="Writing pairs", total=event_relay_lists.height):
    # The list is unique, so combinations() over the sorted URLs yields each
    # unordered pair exactly once, already ordered as (smaller, larger).
    # This avoids walking all k*k ordered pairs and discarding half of them.
    for pair in itertools.combinations(sorted(row['relay_urls']), 2):
        events_share_dict[pair] = events_share_dict.get(pair, 0) + 1

events_share = pl.DataFrame(
    {
        'relay_url1': [pair[0] for pair in events_share_dict],
        'relay_url2': [pair[1] for pair in events_share_dict],
        'shared_event_count': list(events_share_dict.values()),
    }
)

# Attach each relay's total event count, once for each side of the pair.
for side in ('relay_url1', 'relay_url2'):
    events_share = events_share.join(
        relay_stats.select(['relay_url', 'num_events']).rename(
            {'relay_url': side, 'num_events': f'{side}_event_count'}
        ),
        on=side,
        how='left',
    )

# group_by() and unique() do not guarantee any output order, so sort by the
# pair to make the CSV identical from run to run.
events_share = events_share.sort(['relay_url1', 'relay_url2'])
events_share.write_csv(os.path.join(DATA_FOLDER, 'events_share.csv'))

Writing pairs: 100%|██████████| 1841336/1841336 [00:07<00:00, 232326.35it/s]


In [None]:
# Count, for every unordered pair of relays, how many pubkeys have events on
# both, attach each relay's num_pubkeys from relay_stats, and write the result
# to pubkeys_share.csv. Each pair is stored once, with relay_url1 < relay_url2.

# One row per pubkey holding the distinct relays it was seen on.
# `pubkey` comes from a left join, so it is null for events with no match in
# events.csv. group_by() keeps null as its own group, which would count all
# such events as one extra "shared pubkey" for every relay pair they touch,
# so null pubkeys are removed before grouping. Null relay URLs are dropped as
# well (they cannot be ordered against strings), and pubkeys seen on a single
# relay are filtered out because they contribute to no pair.
pubkey_relay_lists = (
    events_relays
    .filter(pl.col('pubkey').is_not_null())
    .group_by('pubkey')
    .agg(pl.col('relay_url').drop_nulls().unique().alias('relay_urls'))
    .filter(pl.col('relay_urls').list.len() > 1)
)

pubkeys_share_dict = {}
for row in tqdm(pubkey_relay_lists.iter_rows(named=True), desc="Writing pairs", total=pubkey_relay_lists.height):
    # The list is unique, so combinations() over the sorted URLs yields each
    # unordered pair exactly once, already ordered as (smaller, larger).
    # This avoids walking all k*k ordered pairs and discarding half of them.
    for pair in itertools.combinations(sorted(row['relay_urls']), 2):
        pubkeys_share_dict[pair] = pubkeys_share_dict.get(pair, 0) + 1

pubkeys_share = pl.DataFrame(
    {
        'relay_url1': [pair[0] for pair in pubkeys_share_dict],
        'relay_url2': [pair[1] for pair in pubkeys_share_dict],
        'shared_pubkey_count': list(pubkeys_share_dict.values()),
    }
)

# Attach each relay's total pubkey count, once for each side of the pair.
for side in ('relay_url1', 'relay_url2'):
    pubkeys_share = pubkeys_share.join(
        relay_stats.select(['relay_url', 'num_pubkeys']).rename(
            {'relay_url': side, 'num_pubkeys': f'{side}_pubkey_count'}
        ),
        on=side,
        how='left',
    )

# group_by() and unique() do not guarantee any output order, so sort by the
# pair to make the CSV identical from run to run.
pubkeys_share = pubkeys_share.sort(['relay_url1', 'relay_url2'])
pubkeys_share.write_csv(os.path.join(DATA_FOLDER, 'pubkeys_share.csv'))

Writing pairs: 100%|██████████| 452079/452079 [00:02<00:00, 163588.61it/s]
