In [2]:
import pandas as pd
import numpy as np
from  matplotlib import pyplot as plt
import networkx as nx
import seaborn as sns

In [3]:
def normalize_edge_weights(G, weight_key='weight'):
    """
    Return a copy of G whose edge weights are min-max scaled to [0, 1].

    Each weight w becomes (w - w_min) / (w_max - w_min), where w_min and
    w_max are the smallest and largest edge weights found in the graph.
    The input graph is never modified.

    Parameters
    ----------
    G : graph object
        Must provide ``copy()``, ``edges()`` and ``G[u][v]`` attribute access.
    weight_key : str, default 'weight'
        Name of the edge attribute that holds the weight.

    Returns
    -------
    A copy of G. The copy is returned unscaled when the graph has no edges
    or when all weights are identical (the scaling would divide by zero).

    Raises
    ------
    KeyError
        If an edge does not carry the ``weight_key`` attribute.
    """
    G_copy = G.copy()

    weights = [G_copy[u][v][weight_key] for u, v in G_copy.edges()]

    # An edgeless graph has nothing to scale; min()/max() would raise on [].
    if not weights:
        return G_copy

    w_min, w_max = min(weights), max(weights)
    span = w_max - w_min

    # All weights identical: leave them untouched instead of dividing by zero.
    if span == 0:
        return G_copy

    for u, v in G_copy.edges():
        G_copy[u][v][weight_key] = (G_copy[u][v][weight_key] - w_min) / span

    return G_copy

In [2]:
def _supplier_sector(node):
    """
    Return the sector part of a 'COUNTRY_SECTOR' node name.

    The name is split at the first underscore only. Returns None when the
    node is not a string or contains no underscore.
    """
    if not isinstance(node, str):
        return None
    _country, separator, sector = node.partition('_')
    return sector if separator else None


def sparsify_by_sector_inflow(G, threshold, weight_key='weight', min_outflow_share=0.5):
    """
    Build a sparsified DiGraph H from the directed, weighted graph G.

    Step 1 - for each buyer node v, the in-edges of v are grouped by the
    sector of the supplier, and an edge is kept only if its weight is
    >= threshold * (total inflow into v from that sector). Suppliers whose
    name cannot be parsed as 'COUNTRY_SECTOR' are ignored in this step.

    Step 2 - for each node of G with positive total outflow, if the outflow
    retained in H is below ``min_outflow_share`` of its outflow in G, the
    highest-weight missing out-edges from G are added back until the share
    is reached (or no positive-weight edge is left).

    Parameters
    ----------
    G : networkx.DiGraph
        Source graph. Node names are expected to look like 'COUNTRY_SECTOR',
        e.g. 'FRA_MANche' or 'CHN_MANche'.
    threshold : float
        Minimum share of the sector inflow an edge must carry to be kept.
    weight_key : str, default 'weight'
        Edge attribute holding the flow; missing values count as 0.0.
    min_outflow_share : float, default 0.5
        Fraction of each node's outflow in G that H must retain.

    Returns
    -------
    networkx.DiGraph
        The sparsified graph. Only nodes that are an endpoint of a retained
        edge are present; edges carry the ``weight_key`` attribute only.
    """
    H = nx.DiGraph()

    # --- 1) Keep edges >= threshold * sector inflow ---
    for buyer in G.nodes():
        # Parse each supplier once: (supplier, sector or None, weight)
        incoming = [
            (supplier, _supplier_sector(supplier), data.get(weight_key, 0.0))
            for supplier, _, data in G.in_edges(buyer, data=True)
        ]

        # 1A) Total inflow into this buyer per supplier sector
        sector_totals = {}
        for _, sector, w in incoming:
            if sector is not None:
                sector_totals[sector] = sector_totals.get(sector, 0.0) + w

        # 1B) Retain the edges that carry a large enough share of their sector
        for supplier, sector, w in incoming:
            if sector is None:
                continue
            sector_total = sector_totals[sector]
            if sector_total > 0 and w >= threshold * sector_total:
                H.add_edge(supplier, buyer, **{weight_key: w})

    # --- 2) Guarantee the minimum outflow share for every node of G ---
    # Iterate over a snapshot of G's nodes rather than H.nodes(): adding an edge
    # may insert a new node into H, which is not allowed while iterating over H,
    # and nodes that lost every edge in step 1 are not in H at all.
    for node in list(G.nodes()):
        out_edges_G = list(G.out_edges(node, data=True))
        total_out = sum(data.get(weight_key, 0.0) for _, _, data in out_edges_G)
        if total_out <= 0:
            continue

        if node in H:
            kept_out = sum(
                data.get(weight_key, 0.0)
                for _, _, data in H.out_edges(node, data=True)
            )
        else:
            kept_out = 0.0

        if kept_out / total_out >= min_outflow_share:
            continue

        # Add back the strongest missing edges first
        strongest_first = sorted(
            out_edges_G,
            key=lambda edge: edge[2].get(weight_key, 0.0),
            reverse=True,
        )
        for _, target, data in strongest_first:
            w = data.get(weight_key, 0.0)
            if w <= 0:
                continue  # Zero/negative flows cannot raise the retained share
            if H.has_edge(node, target):
                continue  # Already retained with G's weight in step 1

            H.add_edge(node, target, **{weight_key: w})
            kept_out += w
            if kept_out / total_out >= min_outflow_share:
                break

    return H

In [5]:
# -----------------------------
# EXAMPLE USAGE
# -----------------------------
# Apply the sector-based 2% inflow sparsification to the original graph G (data from 2020).
# NOTE(review): `G` is not defined in any cell shown before this one; presumably it is the
# 2020 graph (a later cell publishes it as G_2020) left over in the kernel - confirm, because
# this cell raises NameError after "Restart Kernel -> Run All".
# NOTE(review): the threshold here is 0.02 while the batch cells below use 0.023 - confirm
# the difference is intentional.
S_20 = sparsify_by_sector_inflow(G, threshold = 0.02,  weight_key='weight')

# Convert the sparsified graph S_20 back to an adjacency-matrix DataFrame if needed
dfz_sparsified = nx.to_pandas_adjacency(S_20, dtype=float)
#dfz_sparsified.to_parquet('dfz_sparsified.parquet')

# Check how many edges remain after sparsification
print(f"Original graph edges: {G.number_of_edges()}")
print(f"Sparsified graph edges: {S_20.number_of_edges()}")

Original graph edges: 4255969
Sparsified graph edges: 343382


In [13]:
# Identifiers of the adjacency matrices to load: yearly tables plus scenario labels
years = range(2017, 2021)
labels = ['eu', 'gl', 'bc']
all_identifiers = list(years) + labels

# Share-of-sector-inflow cut-off applied to every graph sparsified in this cell
SPARSIFY_THRESHOLD = 0.023

# Load each matrix and build its directed, weighted graph.
# The results are published as notebook-level names (dfz_2017, G_2017, ..., G_bc)
# because other cells refer to them by those names.
for identifier in all_identifiers:
    globals()[f"dfz_{identifier}"] = pd.read_parquet(f'dfz_{identifier}.parquet')
    globals()[f"G_{identifier}"] = nx.from_pandas_adjacency(globals()[f"dfz_{identifier}"], create_using=nx.DiGraph)

# Sparsify every loaded graph, not only the yearly ones: the clustering cell
# further down looks up S_<identifier> for each entry of all_identifiers, so
# S_eu / S_gl / S_bc must exist after a fresh "Restart Kernel -> Run All".
# NOTE(review): the scenario graphs are assumed to use the same threshold as
# the yearly graphs - confirm.
for identifier in all_identifiers:
    globals()[f"S_{identifier}"] = sparsify_by_sector_inflow(globals()[f"G_{identifier}"], threshold=SPARSIFY_THRESHOLD, weight_key='weight')
    globals()[f"dfz_s_{identifier}"] = nx.to_pandas_adjacency(globals()[f"S_{identifier}"], dtype=float)
    # index=False: the row labels are not written; the column labels carry the node names
    globals()[f"dfz_s_{identifier}"].to_parquet(f'dfz_s_{identifier}.parquet', engine="pyarrow", index=False)
    
    

In [16]:
# Weighted clustering coefficient of every node, written as one parquet
# table (columns: Node, Clustering_Coefficient) per sparsified graph.
for identifier in all_identifiers:
    sparsified_graph = globals()[f"S_{identifier}"]
    clus_coeff = nx.clustering(sparsified_graph, weight='weight')

    node_names = list(clus_coeff)
    coefficients = list(clus_coeff.values())
    df = pd.DataFrame({"Node": node_names, "Clustering_Coefficient": coefficients})

    df.to_parquet(f'clus_{identifier}.parquet', engine="pyarrow", index=False)
    print(f"Saved: clus_{identifier}.parquet")

Saved: clus_2017.parquet
Saved: clus_2018.parquet
Saved: clus_2019.parquet
Saved: clus_2020.parquet
Saved: clus_eu.parquet
Saved: clus_gl.parquet
Saved: clus_bc.parquet


In [36]:
# Compare the edge counts of the 2017 graph before and after sparsification.
# G_2017 and S_2017 are the notebook-level names published through globals()
# by the loading / sparsification loops, so those loops must have run first.
print(f"Original graph edges: {G_2017.number_of_edges()}")
print(f"Sparsified graph edges: {S_2017.number_of_edges()}")

Original graph edges: 4255969
Sparsified graph edges: 457589


In [None]:
# Same pipeline for the earlier yearly tables (2010-2016).
# Note: this rebinds the notebook-level name `years`.
years = range(2010, 2017)

# Load each yearly matrix and build its directed, weighted graph
for identifier in years:
    adjacency = pd.read_parquet(f'dfz_{identifier}.parquet')
    globals()[f"dfz_{identifier}"] = adjacency
    globals()[f"G_{identifier}"] = nx.from_pandas_adjacency(adjacency, create_using=nx.DiGraph)

# Sparsify each yearly graph and write its adjacency matrix to disk
for identifier in years:
    source_graph = globals()[f"G_{identifier}"]
    sparsified = sparsify_by_sector_inflow(source_graph, threshold=0.023, weight_key='weight')
    globals()[f"S_{identifier}"] = sparsified

    sparsified_matrix = nx.to_pandas_adjacency(sparsified, dtype=float)
    globals()[f"dfz_s_{identifier}"] = sparsified_matrix
    sparsified_matrix.to_parquet(f'dfz_s_{identifier}.parquet', engine="pyarrow", index=False)

In [6]:
# Weighted clustering coefficient of every node, written as one parquet
# table (columns: Node, Clustering_Coefficient) per yearly sparsified graph.
for identifier in years:
    sparsified_graph = globals()[f"S_{identifier}"]
    clus_coeff = nx.clustering(sparsified_graph, weight='weight')

    node_names = list(clus_coeff)
    coefficients = list(clus_coeff.values())
    df = pd.DataFrame({"Node": node_names, "Clustering_Coefficient": coefficients})

    df.to_parquet(f'clus_{identifier}.parquet', engine="pyarrow", index=False)
    print(f"Saved: clus_{identifier}.parquet")

Saved: clus_2010.parquet
Saved: clus_2011.parquet
Saved: clus_2012.parquet
Saved: clus_2013.parquet
Saved: clus_2014.parquet
Saved: clus_2015.parquet
Saved: clus_2016.parquet
