# Exploratory Data Analysis

In [31]:
import pandas as pd

from DonorIdeo.config import DATABASE_PATH, TEMPORARY_DATA_DIR, ASSETS_DIR
from DonorIdeo.json_utils import save_json, read_json
from DonorIdeo.littlesis_graph_utils import build_littlesis_graph
import networkx as nx
from typing import List

from pathlib import Path

In [28]:
# CONSTANTS
# Table loaded from the CSV at DATABASE_PATH; write_graph_info reads its
# "littlesis" and "party" columns.
DATABASE: pd.DataFrame = pd.read_csv(DATABASE_PATH)
# build_littlesis_graph() returns two graphs. G_largest is presumably the
# largest connected component of G — TODO confirm in littlesis_graph_utils.
G, G_largest = build_littlesis_graph()

Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-entities.json
Reading /Users/viktorduepedersen/Documents/github/DonorIdeo/data/sources/littlesis-relationships.json


In [29]:
def write_graph_info(G: nx.Graph, outpath: Path):
    """
    Write a plain-text summary report of a graph to ``outpath``.

    The report contains, in order:

    1. a generation timestamp,
    2. node, edge and connected-component counts,
    3. the share of nodes held by each of the three largest connected
       components,
    4. the 20 nodes with the highest PageRank,
    5. up to 200 highest-PageRank nodes whose id appears in the
       ``littlesis`` column of the module-level ``DATABASE`` frame, each
       with its ``party`` value.

    Args:
        G: Graph to summarise. Every node that ends up in a ranking must
            carry a ``"name"`` attribute.
        outpath: Destination file; it is overwritten if it already exists.

    Raises:
        KeyError: If a ranked node has no ``"name"`` attribute.
    """
    # Run all graph computations before opening the file so that a failure
    # here does not leave a truncated report behind.
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    # One pass over the components yields both their count and their sizes.
    connected_components = sorted(nx.connected_components(G), key=len, reverse=True)
    pr = nx.pagerank(G)
    top_20_pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)[0:20]

    # Map littlesis id -> party once, instead of filtering DATABASE for every
    # ranked politician. Keeping the first row per id matches a "first match
    # wins" lookup, and the dict gives O(1) membership tests below.
    known = DATABASE.dropna(subset=["littlesis"]).drop_duplicates(subset="littlesis")
    party_by_id = dict(zip(known["littlesis"].tolist(), known["party"].tolist()))

    # Keep the PageRank only for nodes that are politicians in DATABASE.
    pr_politicians = {k: v for k, v in pr.items() if k in party_by_id}
    top_200_pr_politicians = sorted(
        pr_politicians.items(), key=lambda x: x[1], reverse=True
    )[0:200]

    # Explicit encoding: entity names may contain non-ASCII characters.
    with open(outpath, "w", encoding="utf-8") as outfile:
        outfile.write(f"File generated at {pd.Timestamp.now()} \n")

        # basic info
        outfile.write("Basic info: \n")
        outfile.write(f"Number of nodes: {n_nodes} \n")
        outfile.write(f"Number of edges: {n_edges} \n")
        outfile.write(
            f"Number of connected components: {len(connected_components)} \n"
        )
        for rank, component in enumerate(connected_components[0:3], start=1):
            outfile.write(
                f"{rank}. Size of the connected component: {(len(component)/n_nodes)*100:.2f}% \n"
            )

        outfile.write("\n")

        # top 20 nodes based on pagerank
        outfile.write("Top 20 nodes based on pagerank: \n")
        for rank, (node_id, node_pr) in enumerate(top_20_pr, start=1):
            node_name: str = G.nodes[node_id]["name"]
            outfile.write(f"{rank}.\t({node_id})\t\t{node_name}\t{node_pr:.5f} \n")

        outfile.write("\n")

        # Ranking only politicians from the DATABASE
        outfile.write("Ranking only politicians from the DATABASE: \n")
        for rank, (node_id, node_pr) in enumerate(top_200_pr_politicians, start=1):
            node_name = G.nodes[node_id]["name"]
            node_party = party_by_id[node_id]
            outfile.write(
                f"{rank}.\t{node_party}\t\t({node_id})\t\t{node_name}: {node_pr:.5f} \n"
            )


# Write one text report per graph into ASSETS_DIR.
write_graph_info(G, ASSETS_DIR / "graph_info.txt")
write_graph_info(G_largest, ASSETS_DIR / "graph_largest_info.txt")

# Degree distribution of the graph and only politicians in database.csv

In [30]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def compute_degree_distribution(G: nx.Graph, outpath: Path) -> None:
    """
    Plot the degree distribution of a graph on log-log axes and save it as an image.

    Args:
        G: Graph whose degree histogram is plotted.
        outpath: Destination image file, written with ``fig.write_image``.
    """
    # degree_distribution[d] is the number of nodes with degree d.
    degree_distribution: List[int] = nx.degree_histogram(G)

    # The grid has a single subplot, so it gets exactly one title; the title
    # names the log-log scaling applied to both axes below.
    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=("Degree Distribution (log-log)",),
    )
    fig.add_trace(
        go.Scatter(x=list(range(len(degree_distribution))), y=degree_distribution),
        row=1,
        col=1,
    )
    # Make both axes logarithmic so the plot is log-log.
    fig.update_yaxes(type="log", row=1, col=1)
    fig.update_xaxes(type="log", row=1, col=1)
    fig.write_image(outpath)


# Plot the degree distribution of the full graph and of G_largest. Each graph
# is paired with its own output file, mirroring the write_graph_info calls
# above (previously both calls plotted G_largest).
compute_degree_distribution(G, ASSETS_DIR / "graph_degree_distribution.png")
compute_degree_distribution(
    G_largest, ASSETS_DIR / "graph_largest_degree_distribution.png"
)

# Donation distribution to the politicians in database.csv