# Reducing the graph size while keeping all donors and politicians in the same component

In [20]:
import pandas as pd
from typing import Dict
from DonorIdeo.config import ASSETS_DIR, DATABASE_PATH, TEMPORARY_DATA_DIR
from DonorIdeo.littlesis_graph_utils import build_littlesis_graph, collect_donations_to
import json
from DonorIdeo.config import ASSETS_DIR, DATABASE_PATH, TEMPORARY_DATA_DIR
from DonorIdeo.littlesis_graph_utils import build_littlesis_graph, collect_donations_to
import json
import networkx as nx
from typing import List, Set, Tuple
from tqdm import tqdm
from typing import Set

In [21]:
# Step 1: gather the donations received by the politicians that are in the graph.
# Only the second value returned by the graph builder is used from here on.
_, G_largest = build_littlesis_graph()

# Politician ids come from the "littlesis" column of the database CSV; the
# nullable Int64 dtype tolerates rows without an id, which dropna() then removes.
littlesis_ids: List[int] = (
    pd.read_csv(DATABASE_PATH, dtype={"littlesis": "Int64"})["littlesis"]
    .dropna()
    .values.tolist()
)
# Outer keys are treated as politician ids and inner keys as donor ids by the
# loops below.
donations_to_politician: Dict[int, Dict[int, int]] = collect_donations_to(
    graph=G_largest, politicians=littlesis_ids, save_total_to_database=False
)

# Ids of every node (politician or donor) that is required to be in the graph.
nodes_that_needs_to_be_there = set()

# Politicians, as keyed in the collected donations, must be graph nodes.
for politician in donations_to_politician:
    assert politician in G_largest
    nodes_that_needs_to_be_there.add(politician)

# The same politicians again, this time taken from the database ids; kept as a
# separate pass because the two id sources are not assumed to agree.
for littlesis_id in littlesis_ids:
    assert littlesis_id in G_largest
    nodes_that_needs_to_be_there.add(littlesis_id)

# Every donor recorded for a politician must be a graph node as well.
for littlesis_id, per_donor in donations_to_politician.items():
    for donor_id in per_donor:
        assert (
            donor_id in G_largest
        ), f"{donor_id} (donated to {littlesis_id}) not in graph"
        nodes_that_needs_to_be_there.add(donor_id)

Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-entities.json
Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-relationships.json


## Asserting that all nodes are indeed present from the start

In [22]:
# Sanity check before reducing anything: each required node must already be a
# node of the graph; the first missing one aborts the cell.
for node in nodes_that_needs_to_be_there:
    if node in G_largest:
        continue
    raise ValueError(f"Node {node} not in graph.")
print("All node_ids are in the graph from the start.")

All node_ids are in the graph from the start.


# Picking out the component with all the nodes in it

In [26]:
def component_with_all_nodes(
    G: nx.Graph, node_ids_to_keep: Set[int]
) -> nx.Graph | None:
    """Return the connected component of ``G`` that contains every requested node.

    Components are examined from largest to smallest and the first one whose
    members include all of ``node_ids_to_keep`` is selected.

    Args:
        G: Graph whose connected components are searched.
        node_ids_to_keep: Node ids that must all lie in the same component.

    Returns:
        ``G.subgraph`` of the matching component, or ``None`` when no single
        component contains all the requested nodes.

    Note:
        If ``node_ids_to_keep`` is empty every component qualifies, so the
        largest one is returned (``None`` if ``G`` has no nodes).
    """
    candidates = list(nx.connected_components(G))
    candidates.sort(key=len, reverse=True)

    # Each candidate is a set of nodes, so a superset test covers all ids at once.
    match = next(
        (members for members in candidates if members.issuperset(node_ids_to_keep)),
        None,
    )
    return None if match is None else G.subgraph(match)

In [27]:
# Extract the one connected component that holds every required node.
valid_component = component_with_all_nodes(
    G=G_largest, node_ids_to_keep=nodes_that_needs_to_be_there
)
# None means no single component contains them all; stop here in that case.
assert valid_component is not None, (
    "Not all node_ids are in a connected component from the start."
)

# Can we keep only the nodes we need and still have a single connected component?

In [29]:
# Induce the subgraph on just the required nodes.
test_G = valid_component.subgraph(nodes_that_needs_to_be_there)
# Last expression of the cell, so the notebook displays whether it is connected.
nx.is_connected(test_G)

False

In [43]:
# Connected components of the required-nodes subgraph, largest first.
sorted_components = list(nx.connected_components(test_G))
sorted_components.sort(key=len, reverse=True)

# Every isolated node (a component of size one) must have no recorded donations.
# NOTE(review): the lookup assumes each isolated node has an entry in
# donations_to_politician under its "id" attribute; a node without one would
# raise KeyError instead of failing the assert — confirm.
for component in sorted_components:
    if len(component) != 1:
        continue
    (node_in_comp,) = component
    node_id: int = G_largest.nodes[node_in_comp]["id"]
    assert len(donations_to_politician[node_id]) == 0

# Let's exclude them and keep only the biggest connected component