# Reducing the graph size while having all donors and politicians in the same component

In [1]:
import pandas as pd
from typing import Dict
from DonorIdeo.config import ASSETS_DIR, DATABASE_PATH, TEMPORARY_DATA_DIR
from DonorIdeo.littlesis_graph_utils import build_littlesis_graph, collect_donations_to
import json
from DonorIdeo.config import ASSETS_DIR, DATABASE_PATH, TEMPORARY_DATA_DIR
from DonorIdeo.littlesis_graph_utils import build_littlesis_graph, collect_donations_to
import json
import networkx as nx
from typing import List, Set, Tuple
from tqdm import tqdm
from typing import Set

In [2]:
# 1. collect the donations to the politicians in the graph
# Build the graph
# build_littlesis_graph() returns two values; only the second one is kept here.
_, G_largest = build_littlesis_graph()

# Collect the donations to the politicians in the graph
# The "littlesis" column is read with the nullable Int64 dtype; rows without a
# littlesis id are dropped before the ids are turned into a plain Python list.
littlesis_ids: List[int] = (
    pd.read_csv(DATABASE_PATH, dtype={"littlesis": "Int64"})["littlesis"]
    .dropna()
    .values.tolist()
)
# Presumably maps politician id -> {donor id -> donated amount}; that is how the
# later cells index it. TODO confirm against collect_donations_to.
donations_to_politician: Dict[int, Dict[int, int]] = collect_donations_to(
    politicians=littlesis_ids, graph=G_largest, save_total_to_database=False
)

Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-entities.json
Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-relationships.json


# Initial checks

In [3]:
# Accumulates every node id (politicians and their donors) that the checks below
# confirm is present in G_largest.
nodes_that_needs_to_be_there = set()
# check that the politicians are in the graph
for politician in donations_to_politician:
    assert politician in G_largest.nodes()
    nodes_that_needs_to_be_there.add(politician)

# Same check as above, but driven by the ids read from the database rather than
# by the keys of donations_to_politician (presumably the same ids — TODO confirm).
for littlesis_id in littlesis_ids:
    assert littlesis_id in G_largest.nodes()
    nodes_that_needs_to_be_there.add(littlesis_id)

# check that the donors are in the graph
for littlesis_id in donations_to_politician:
    for donor_id in donations_to_politician[littlesis_id]:
        assert (
            donor_id in G_largest.nodes()
        ), f"{donor_id} (donated to {littlesis_id}) not in graph"
        nodes_that_needs_to_be_there.add(donor_id)

# Ensure all node_ids are in the graph from the start
# This repeats the membership test done by the asserts above, but raises a
# ValueError, so it still fires when assertions are disabled.
for node in nodes_that_needs_to_be_there:
    if node not in G_largest:
        raise ValueError(f"Node {node} not in graph.")
print("All node_ids are in the graph from the start.")

All node_ids are in the graph from the start.


# Picking out the component with all the nodes in it

In [4]:
def component_with_all_nodes(
    G: nx.Graph, node_ids_to_keep: Set[int]
) -> nx.Graph | None:
    """Return the connected component of ``G`` that holds every required node.

    Components are inspected from largest to smallest. The first one that
    contains all of ``node_ids_to_keep`` is returned via ``G.subgraph(...)``.
    ``None`` is returned when no single component contains all of them.
    """
    components_by_size = sorted(nx.connected_components(G), key=len, reverse=True)
    matching_members = next(
        (
            members
            for members in components_by_size
            if not any(node not in members for node in node_ids_to_keep)
        ),
        None,
    )
    if matching_members is None:
        return None
    return G.subgraph(matching_members)

In [5]:
# Pick the connected component of the full graph that contains every required
# node, and stop the notebook here if no such component exists.
valid_component = component_with_all_nodes(G_largest, nodes_that_needs_to_be_there)
assert (
    valid_component is not None
), "Not all node_ids are in a connected component from the start."

# Can we keep only the nodes we need and still have them all in one connected component?

In [6]:
# Induce the subgraph on just the required nodes and report whether it is still in one piece.
test_G = valid_component.subgraph(nodes_that_needs_to_be_there)
print("Is the graph connected? -> " + str(nx.is_connected(test_G)))

Is the graph connected? -> False


In [7]:
# Components of the induced subgraph, largest first.
sorted_components = sorted(nx.connected_components(test_G), key=len, reverse=True)

# lets make sure that these components haven't received any donations
nodes_with_no_donations = set()
for component in sorted_components:
    # Only isolated nodes (components of size one) are inspected.
    if len(component) == 1:
        for node_in_comp in component:
            # Assumes the node's "id" attribute is the littlesis id used as a key
            # in donations_to_politician — TODO confirm. A KeyError here would
            # mean the isolated node is not one of the politicians.
            node_id: int = G_largest.nodes[node_in_comp]["id"]
            assert len(donations_to_politician[node_id]) == 0
            nodes_with_no_donations.add(node_id)

In [16]:
# Remove the nodes with no donations from nodes_that_needs_to_be_there
nodes_that_needs_to_be_there = nodes_that_needs_to_be_there - nodes_with_no_donations

# Look for the component inside the induced subgraph (required nodes only), so
# the result is the reduced graph rather than the full LittleSis component.
valid_component = component_with_all_nodes(test_G, nodes_that_needs_to_be_there)
# component_with_all_nodes returns None when no single component holds every
# node; fail with a clear message instead of crashing inside write_adjlist.
assert (
    valid_component is not None
), "Not all remaining node_ids are in one connected component of the reduced graph."

# save the valid_component as adjacency list
nx.write_adjlist(valid_component, TEMPORARY_DATA_DIR / "valid_component.adjlist")

In [9]:
# Report connectivity and size of the reduced graph ...
print(f"Is the graph connected? -> {nx.is_connected(valid_component)}")
print(f"Number of nodes in the graph: {valid_component.number_of_nodes()}")
print(f"Number of edges in the graph: {valid_component.number_of_edges()}")

# ... next to the size of the graph we started from, for comparison.
print(f"Number of nodes in G_largest: {G_largest.number_of_nodes()}")
print(f"Number of edges in G_largest: {G_largest.number_of_edges()}")

Is the graph connected? -> True
Number of nodes in the graph: 41439
Number of edges in the graph: 712705
Number of nodes in G_largest: 354538
Number of edges in G_largest: 1677720


# Generate node_attributes.csv and network.csv

In [10]:
# Collect the donations to the politicians in the graph
# Re-read the politician ids from the database (nullable Int64, missing ids dropped).
littlesis_ids: List[int] = (
    pd.read_csv(DATABASE_PATH, dtype={"littlesis": "Int64"})["littlesis"]
    .dropna()
    .values.tolist()
)

# Make sure that nodes_with_no_donations are removed from littlesis_ids
# Going through a set also removes duplicate ids and does not keep the original order.
littlesis_ids = list(set(littlesis_ids) - nodes_with_no_donations)

# Same collection step as at the top of the notebook, but run against the
# reduced graph instead of G_largest.
donations_to_politician: Dict[int, Dict[int, int]] = collect_donations_to(
    politicians=littlesis_ids, graph=valid_component, save_total_to_database=False
)

In [11]:
import numpy as np
from DonorIdeo.json_utils import save_json
from DonorIdeo.config import MINIMUM_NVD_DIR

"""BECAUSE OF TIME NEEDS 
the following code is simply copied from the nvd.py file and modified to work with the new graph

Not good practice but it works.
"""


def generate_custom_id(G_largest: nx.Graph) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Number the nodes of ``G_largest`` from 1 upwards and persist both mappings.

    Nodes are numbered in the iteration order of ``G_largest.nodes()``. Keys and
    values of both dictionaries are strings. Each mapping is written to a JSON
    file in ``MINIMUM_NVD_DIR`` through ``save_json``.

    Returns:
        ``(littlesis_id_to_my_id, my_id_to_littlesis_id)``.
    """
    my_id_to_littlesis_id = {
        str(position): str(node)
        for position, node in enumerate(G_largest.nodes(), start=1)
    }
    # Inverse lookup, built in the same node order.
    littlesis_id_to_my_id = {
        original: custom for custom, original in my_id_to_littlesis_id.items()
    }

    # Save the mappings to JSON files
    mappings_to_save = (
        (littlesis_id_to_my_id, "id-mapping-littlesis-to-mine.json"),
        (my_id_to_littlesis_id, "id-mapping-mine-to-littlesis.json"),
    )
    for mapping, filename in mappings_to_save:
        save_json(data=mapping, path=MINIMUM_NVD_DIR / filename, verbose=True)

    return littlesis_id_to_my_id, my_id_to_littlesis_id


def generate_network_csv(
    littlesis_id_to_my_id: Dict[str, str], G_largest: nx.Graph
) -> None:
    """
    Write the edge list of ``G_largest`` to ``minimum-network.csv`` in ``MINIMUM_NVD_DIR``.

    Each edge becomes one line "a,b", where both endpoints are translated from
    their LittleSis id to the custom id found in ``littlesis_id_to_my_id``.
    """
    network_path = MINIMUM_NVD_DIR / "minimum-network.csv"
    with open(network_path, "w") as handle:
        for source_node, target_node in tqdm(
            G_largest.edges(), desc="Writing network.csv"
        ):
            # The mapping is keyed by the string form of the LittleSis id.
            mapped_source = littlesis_id_to_my_id[str(source_node)]
            mapped_target = littlesis_id_to_my_id[str(target_node)]
            handle.write(f"{mapped_source},{mapped_target}\n")


def generate_node_attributes_csv(
    donation_to_politician: Dict[int, Dict[int, int]],
    my_id_to_littlesis_id: Dict[str, str],
) -> None:
    """Write ``minimum-node_attributes.csv`` to ``MINIMUM_NVD_DIR``.

    The file has a header ``donor,<politician id>,...`` followed by one row per
    entry of ``my_id_to_littlesis_id``. Each cell holds ``log(donation + 1)``
    for the donation from that node to that politician, with negative donations
    treated as 0. The cell is the literal ``0`` when the node has no recorded
    donation to the politician.

    Args:
        donation_to_politician: politician id -> {donor LittleSis id -> amount}.
        my_id_to_littlesis_id: custom node id -> LittleSis id, both as strings.
    """
    outpath = MINIMUM_NVD_DIR / "minimum-node_attributes.csv"
    # Fix the column order once so the header and every row agree. The order is
    # the iteration order of the set built from the dictionary keys.
    politician_ids: List[int] = list(set(donation_to_politician.keys()))

    with open(outpath, "w") as f:
        # Write the header
        f.write("donor," + ",".join(str(polit) for polit in politician_ids) + "\n")

        # Write the data
        for my_donor_id, littlesis_donor_id in tqdm(
            my_id_to_littlesis_id.items(), desc=outpath.name
        ):
            # Convert once per row instead of once per politician column.
            donor_key: int = int(littlesis_donor_id)
            cells: List[str] = [f"{my_donor_id}"]

            for politician in politician_ids:
                donations_received = donation_to_politician[politician]
                if donor_key in donations_received:
                    # Negative donations are not allowed
                    donation = max(donations_received[donor_key], 0)
                    logged_donation: float = np.log(donation + 1)  # +1 avoids log(0)
                    cells.append(f"{logged_donation}")
                else:
                    cells.append("0")  # np.log(1) = 0
            f.write(",".join(cells) + "\n")


def compute_distance_matrix(write_to_file: bool = True) -> np.ndarray:
    """Assemble the politician-by-politician distance matrix from the nvd CSV files.

    The politician ids, and their row/column order in the matrix, come from the
    header of ``minimum-node_attributes.csv`` (every column after the first).
    For each politician, ``MINIMUM_NVD_DIR / "nvd" / "<id>.csv"`` is read as
    ``src,dst,distance`` rows with its first line skipped. Every row is written
    symmetrically into the matrix.

    Args:
        write_to_file: when true, also save the matrix as ``distance_matrix.csv``
            in ``MINIMUM_NVD_DIR``.

    Returns:
        A square ``float`` array with one row/column per politician.

    Raises:
        KeyError: if a ``src``/``dst`` value is not one of the politician ids.
    """
    # Only the header is needed, so do not parse the data rows.
    politician_ids = pd.read_csv(
        MINIMUM_NVD_DIR / "minimum-node_attributes.csv", nrows=0
    ).columns[1:]
    politician_matrix_id = {
        int(polit_id): i for i, polit_id in enumerate(politician_ids)
    }
    distance_matrix = np.zeros((len(politician_ids), len(politician_ids)))
    for polit_id in tqdm(politician_ids, desc="Building distance matrix"):
        specific_path = MINIMUM_NVD_DIR / "nvd" / f"{polit_id}.csv"
        distances = pd.read_csv(
            specific_path, skiprows=1, names=["src", "dst", "distance"]
        )
        # itertuples keeps each column's own dtype, so the integer ids are not
        # coerced to float the way iterrows would do for mixed int/float rows.
        for src, dst, dist in distances.itertuples(index=False, name=None):
            src_matrix_id = politician_matrix_id[src]
            dst_matrix_id = politician_matrix_id[dst]

            # set value in matrix (symmetric)
            distance_matrix[src_matrix_id, dst_matrix_id] = dist
            distance_matrix[dst_matrix_id, src_matrix_id] = dist

    if write_to_file:
        outpath = MINIMUM_NVD_DIR / "distance_matrix.csv"
        print(f"Writing distance matrix to file: {outpath}")
        np.savetxt(outpath, distance_matrix, delimiter=",")

    return distance_matrix


from sklearn.manifold import TSNE


def project_and_scale(distance_matrix: np.ndarray, model: "TSNE") -> np.ndarray:
    """Embed ``distance_matrix`` with ``model`` and rescale the result to [-1, 1].

    Args:
        distance_matrix: matrix handed unchanged to ``model.fit_transform``.
        model: any object exposing ``fit_transform`` (a ``TSNE`` instance in this
            notebook).

    Returns:
        The embedding, min-max scaled with a single global minimum and maximum
        taken over all coordinates, so the smallest value maps to -1 and the
        largest to 1. If every coordinate is identical the range is zero, and an
        all-zero array (the midpoint of the target range) is returned instead of
        NaNs from a division by zero.
    """
    X_embedded = model.fit_transform(distance_matrix)

    lowest = X_embedded.min()
    value_range = X_embedded.max() - lowest
    if value_range == 0:
        # Degenerate embedding: nothing to spread out.
        return np.zeros_like(X_embedded, dtype=float)

    # Scale the values to be between -1 and 1
    return (X_embedded - lowest) / value_range * 2 - 1

In [12]:
""" 2. Build the network.csv and node_attributes.csv

Firstly I need to create a new_id for each of the nodes in the biggest connected components.
This is because the julia script requires the nodes to be numbered from 1 to n.
"""
# Number the nodes of the reduced graph; generate_custom_id also writes both
# mappings to JSON files in MINIMUM_NVD_DIR.
littlesis_id_to_my_id, my_id_to_littlesis_id = generate_custom_id(valid_component)

# Edge list of the reduced graph, expressed in the custom ids.
generate_network_csv(littlesis_id_to_my_id, valid_component)

# One row per node and one column per politician, holding log(donation + 1).
generate_node_attributes_csv(
    donation_to_politician=donations_to_politician,
    my_id_to_littlesis_id=my_id_to_littlesis_id,
)

Saving /Users/viktorduepedersen/Documents/github/DonorIdeo/data/minimum_nvd/id-mapping-littlesis-to-mine.json
Saving /Users/viktorduepedersen/Documents/github/DonorIdeo/data/minimum_nvd/id-mapping-mine-to-littlesis.json


Writing network.csv: 100%|██████████| 712705/712705 [00:02<00:00, 356141.50it/s]
minimum-node_attributes.csv: 100%|██████████| 41439/41439 [00:04<00:00, 8374.16it/s]


# Now the minimum-nvd.jl script can be run

In [13]:
import os

#  3. Compute the distance matrix
# compute_distance_matrix reads one "<politician id>.csv" per politician column
# of minimum-node_attributes.csv, so derive the required files from that header
# instead of hard-coding how many politicians there are.
expected_politician_ids = pd.read_csv(
    MINIMUM_NVD_DIR / "minimum-node_attributes.csv", nrows=0
).columns[1:]
available_nvd_files = {
    politician_file
    for politician_file in os.listdir(MINIMUM_NVD_DIR / "nvd")
    if politician_file.endswith(".csv")
}
missing_nvd_files = [
    f"{politician_id}.csv"
    for politician_id in expected_politician_ids
    if f"{politician_id}.csv" not in available_nvd_files
]
assert not missing_nvd_files, (
    f"The nvd data directory is missing {len(missing_nvd_files)} of "
    f"{len(expected_politician_ids)} files (e.g. {missing_nvd_files[:5]}). "
    "The Julia script is most likely not finished running."
)

In [14]:
from sklearn.manifold import TSNE

# Build the politician distance matrix from the nvd output (also saved to disk).
distance_matrix = compute_distance_matrix(write_to_file=True)

# Project the distance matrix to 2D and scale the values to -1 to 1
# metric="precomputed": the input is handed over as distances, not as features.
# random_state is fixed so repeated runs use the same seed.
tsne_2 = TSNE(n_components=2, metric="precomputed", init="random", random_state=42)
X_embedded_2 = project_and_scale(distance_matrix=distance_matrix, model=tsne_2)

# Project the distance matrix to 1D and scale the values to -1 to 1
# A separate model is fitted, so this is not a slice of the 2D projection.
tsne_1 = TSNE(n_components=1, metric="precomputed", init="random", random_state=42)
X_embedded_1 = project_and_scale(distance_matrix=distance_matrix, model=tsne_1)

Building distance matrix: 100%|██████████| 551/551 [00:03<00:00, 167.13it/s]


Writing distance matrix to file: /Users/viktorduepedersen/Documents/github/DonorIdeo/data/minimum_nvd/distance_matrix.csv


In [15]:
database = pd.read_csv(DATABASE_PATH, dtype={"littlesis": "Int64"})

# The politician columns of the attribute file are in the same order as the rows
# of the distance matrix, and therefore as the rows of both embeddings.
node_attributes = pd.read_csv(MINIMUM_NVD_DIR / "minimum-node_attributes.csv")
politician_ids = node_attributes.columns[1:]

# zip the politicians with their projections
# strict=True turns a length mismatch between ids and embeddings into an error
# instead of silently dropping the surplus entries.
politicians_with_projections = zip(
    politician_ids, X_embedded_1, X_embedded_2, strict=True
)

# Add the projections to the database
for littlesis_id, projection_1d, projection_2d in politicians_with_projections:
    # Select the politician's row(s) once and reuse the mask for all three columns.
    is_politician = database["littlesis"] == int(littlesis_id)
    # Each embedding row is an array; store plain scalars so the assignment does
    # not depend on how many database rows match.
    database.loc[is_politician, "minimum-projection-1d"] = projection_1d[0]
    database.loc[is_politician, "minimum-projection-2d_x"] = projection_2d[0]
    database.loc[is_politician, "minimum-projection-2d_y"] = projection_2d[1]

# Save the database
# NOTE: this overwrites the database file in place.
database.to_csv(DATABASE_PATH, index=False)