In [21]:
#PCA METHOD WITH REPORTING
import pandas as pd
import numpy as np
import math
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def reduce_pca_by_variance(data: np.ndarray, feature_names: list, variance_threshold: float):
    """
    Performs PCA on n-dimensional data, automatically selecting the minimum
    number of components to explain at least the `variance_threshold`.
    
    This modified version also prints the results of the reduction and
    the top 5 feature contributors for each component.

    Args:
        data: A (n_samples, n_features) NumPy array.
        feature_names: A list of strings corresponding to the feature columns
                       in `data`. (e.g., list(df.columns))
        variance_threshold: The target amount of variance to explain
                            (e.g., 0.95 for 95%).

    Returns:
        A tuple containing:
        - data_transformed (np.ndarray): The data projected onto the
                                         new component space.
        - fitted_pca (PCA): The fitted PCA object, which you can use
                            to inspect the number of components, etc.
        - explained_variance_list (list): A list of the variance explained
                                          by each component (e.g., [0.5, 0.3]).
    """
    
    if len(feature_names) != data.shape[1]:
        raise ValueError(f"Number of feature_names ({len(feature_names)}) does not "
                         f"match number of data columns ({data.shape[1]}).")

    # 1. Create a PCA object with the variance threshold.
    # By setting n_components to a float, PCA automatically finds
    # the components needed to explain that much variance.
    pca = PCA(n_components=variance_threshold)
    
    # 2. Create a pipeline to first scale the data, then run PCA.
    # Scaling is crucial for PCA to work correctly.
    pipeline = Pipeline([
        ('scaler', StandardScaler(with_std=False)),
        ('pca', pca)
    ])
    
    # 3. Fit the pipeline to the data and transform it
    data_transformed = pipeline.fit_transform(data)
    
    # --- Report print statements ---
    
    # Get the original and new dimensions
    original_dimensions = data.shape[1]
    # We access the fitted pca object from step 2
    new_dimensions = pca.n_components_ 
    
    print("-" * 30)
    print("PCA Dimensionality Reduction Report")
    print("-" * 30)
    print(f"Original dimensions:   {original_dimensions}")
    print(f"New dimensions:        {new_dimensions}")
    print(f"Dimensions reduced by: {original_dimensions - new_dimensions}")
    print("\nVariance explained by each remaining component:")
    
    # pca.explained_variance_ratio_ is an array like [0.5, 0.3, 0.1]
    for i, variance in enumerate(pca.explained_variance_ratio_):
        print(f"  Principal Component {i+1}: {variance * 100:.2f}%")
        
    # Print total variance explained
    total_variance = np.sum(pca.explained_variance_ratio_)
    print(f"\nTotal variance explained: {total_variance * 100:.2f}%")
    print(f"(Target threshold was {variance_threshold * 100:.0f}%)")
    
    # --- New: Top 5 Contributors per Component Report ---
    print("\nTop 5 Contributors per Component:")
    
    # pca.components_ has shape (n_components, n_features)
    for i, component in enumerate(pca.components_):
        print(f"  --- Principal Component {i+1} ---")
        
        # Get indices of the top 5 absolute loadings
        # np.argsort returns indices of smallest to largest
        # We take the last 5, and then reverse them [::-1]
        top_5_indices = np.argsort(np.abs(component))[-5:][::-1]
        
        # Print the feature name and its loading (weight)
        for j, feature_index in enumerate(top_5_indices):
            feature_name = feature_names[feature_index]
            loading = component[feature_index]
            print(f"    {j+1}. {feature_name}: {loading:.4f}")
            
    print("-" * 30)
    
    # --- End of report ---
    
    # Get the list of explained variances
    explained_variance_list = pca.explained_variance_ratio_.tolist()
    
    # Return the new data, the fitted PCA object, and the list of variances
    return data_transformed, pca, explained_variance_list


# Processing and Weighting

In [22]:

# Get data to be a numpy array probably
df=pd.read_csv('consolidated_with_cd_118_116.csv')
data=np.array(df)
income_index = df.columns.get_loc('per_capita_income')
# Transform the income dimension by logarithmic scale
data[:, income_index] = np.log(data[:, income_index].astype(np.float64))
# Normalize data by dividing by standard deviation by dimension
for i in range(2,data.shape[1]-2):
    data[:, i] = data[:, i] / np.std(data[:, i])
weights = np.array([
    0.1225,     # 1:  per_capita_income (Economic Security - Income)
    0.2/6,      # 2:  white (Cultural - Race)
    0.2/6,      # 3:  black (Cultural - Race)
    0.2/6,      # 4:  asian (Cultural - Race)
    0.2/6,      # 5:  native (Cultural - Race)
    0.2/6,      # 6:  pacific islander (Cultural - Race)
    0.2/6,      # 7:  other (Cultural - Race)
    0.05,       # 8:  Under High School (not weighted)
    0.05,       # 9:  High School (No College Degree) (not weighted)
    0.05,       # 10: College or More (Education)
    0.0,        # 11: Agriculture (not weighted)
    0.0,        # 12: Construction_and_manufacturing (not weighted)
    0.0,        # 13: trade (not weighted)
    0.0,        # 14: Transportation and warehousing (not weighted)
    0.0,        # 15: nerds (not weighted)
    0.0,      # 16: Educational services, and health care (Economic Security - Healthcare)
    0.0,        # 17: finance_inurance_and_realty (not weighted)
    0.0,        # 18: other_services (not weighted)
    0.07,       # 19: in_labor_force (Economic Security - Employment)
    0.0,        # 20: out_labor_force (not weighted)
    0.15,       # 21: avg_commute_time (Location Affordability - Transportation)
    0.15,       # 22: avg_housing_cost_burden (Location Affordability - Housing)
    0.0525,     # 23: avg_poverty_ratio (Economic Security - Poverty)
])
for i in range(data.shape[1]-4):
    data[:, i+2] = data[:, i+2] * weights[i]

In [23]:
# Perform PCA, and project onto the top N dimensions so that they explain 70% of the variance
new_data,pca,var_explained = reduce_pca_by_variance(data[:,2:-2], list(df.columns)[2:-2], variance_threshold=0.8)

------------------------------
PCA Dimensionality Reduction Report
------------------------------
Original dimensions:   23
New dimensions:        4
Dimensions reduced by: 19

Variance explained by each remaining component:
  Principal Component 1: 31.37%
  Principal Component 2: 26.43%
  Principal Component 3: 18.92%
  Principal Component 4: 6.65%

Total variance explained: 83.36%
(Target threshold was 80%)

Top 5 Contributors per Component:
  --- Principal Component 1 ---
    1. avg_housing_cost_burden: 0.7903
    2. avg_commute_time: -0.5353
    3. per_capita_income: -0.2125
    4. avg_poverty_ratio: -0.1654
    5. Under High School: 0.0741
  --- Principal Component 2 ---
    1. avg_commute_time: 0.8283
    2. avg_housing_cost_burden: 0.4448
    3. per_capita_income: -0.2472
    4. College or More: -0.1207
    5. avg_poverty_ratio: -0.1123
  --- Principal Component 3 ---
    1. per_capita_income: 0.8816
    2. avg_housing_cost_burden: 0.3618
    3. College or More: 0.1741
    4. Hig

# Network initialization

In [24]:
def create_dimension_layered_knn(data, dimension_weights, k=10):
    """
    Create multi-layer network where each dimension has its own KNN graph.
    """
    G = nx.Graph()
    
    # Add nodes
    for i in range(data.shape[0]):
        G.add_node(i)
    
    # For each dimension, create KNN graph
    for dim in range(data.shape[1]):
        dim_weight = dimension_weights[dim]
        
        if dim_weight == 0:
            continue
        
        # Get this dimension's values (1D)
        dim_data = data[:, dim].reshape(-1, 1)
        
        # Build KNN graph for THIS dimension only
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=k)
        nbrs.fit(dim_data)
        distances, indices = nbrs.kneighbors(dim_data)
        
        # Add edges weighted by dimension importance
        for i in range(len(data)):
            for j, neighbor in enumerate(indices[i]):
                if i != neighbor:
                    if G.has_edge(i, neighbor):
                        G[i][neighbor]['weight'] += dim_weight  # Accumulate
                    else:
                        G.add_edge(i, neighbor, weight=dim_weight)
    
    return G


In [25]:
# Create data topology graph
import numpy as np
import networkx as nx

def build_topology_graph(data, weights, rmax_scale, rdisc_scale, k, scaling_method='std'):
    """
    Builds a dimension-wise gap graph where connection radii are scaled 
    by the statistical spread (Variance or StdDev) of each dimension.

    Parameters:
    -----------
    data : np.ndarray (N, D)
    weights : list or np.array (D,)
        Importance weight for each dimension.
    rmax_scale : float
        The base scaling factor for the maximum radius. 
        Actual R_max[d] = rmax_scale * Variance[d]
    rdisc_scale : float
        The base scaling factor for the discounting radius.
        Actual R_disc[d] = rdisc_scale * Variance[d]
    k : int
        Max neighbors per side (1D).
    scaling_method : str
        'variance' (default) -> Scale by sigma^2.
        'std' -> Scale by sigma (standard deviation).

    Returns:
    --------
    nx.MultiGraph
    """
    N, D = data.shape
    G = nx.MultiGraph()
    G.add_nodes_from(range(N))
    
    # 1. Calculate Statistics for Scaling
    if scaling_method == 'variance':
        # Add a tiny epsilon to prevent 0-radius in constant dimensions
        metric = np.var(data, axis=0) + 1e-9
        #print("Scaling radii by Dimension Variance.")
    elif scaling_method == 'std':
        metric = np.std(data, axis=0) + 1e-9
        #print("Scaling radii by Dimension Standard Deviation.")
    else:
        raise ValueError("scaling_method must be 'variance' or 'std'")

    # Calculate the specific thresholds for each dimension
    # R_max[d] = Scale * Metric[d]
    dim_rmaxs = rmax_scale * metric
    dim_rdiscs = rdisc_scale * metric

    #print(f"Processing {D} dimensions for {N} nodes...")
    
    for d in range(D):
        # Retrieve specific thresholds for this dimension
        dim_weight = weights[d]
        current_rmax = dim_rmaxs[d]
        current_rdisc = dim_rdiscs[d]
        
        # Sort data for sliding window
        sorted_indices = np.argsort(data[:, d])
        sorted_vals = data[sorted_indices, d]
        
        # Sliding Window (Vectorized)
        for shift in range(1, k + 1):
            u_indices = sorted_indices[:-shift]
            v_indices = sorted_indices[shift:]
            
            # Calculate 1D distances
            dists = sorted_vals[shift:] - sorted_vals[:-shift]
            
            # --- GAP DETECTION ---
            # Use the variance-scaled R_max for this specific dimension
            valid_mask = dists <= current_rmax
            
            if not np.any(valid_mask):
                continue

            # Filter
            valid_u = u_indices[valid_mask]
            valid_v = v_indices[valid_mask]
            valid_dists = dists[valid_mask]
            
            # --- DISCOUNTING ---
            # Base weight
            edge_weights = np.full(valid_dists.shape, dim_weight, dtype=float)
            
            # Check against variance-scaled Discount Radius
            discount_mask = valid_dists > current_rdisc
            
            # Apply decay: w = w * (r_disc / dist)
            safe_dists = valid_dists.copy()
            safe_dists[safe_dists == 0] = 1e-9 
            
            edge_weights[discount_mask] *= (current_rdisc / safe_dists[discount_mask])
            
            # Add to graph
            edges_to_add = zip(
                valid_u, 
                valid_v, 
                [{'weight': w, 'dimension': d} for w in edge_weights]
            )
            G.add_edges_from(edges_to_add)

    #print(f"Done. Edges: {G.number_of_edges()}")
    return G

def flatten_graph_for_community_detection(G_multi):
    """
    Converts the MultiGraph into a simple Weighted Graph by summing weights.
    Required for Louvain/Leiden algorithms.
    """
    G_simple = nx.Graph()
    G_simple.add_nodes_from(G_multi.nodes)
    for u, v, data in G_multi.edges(data=True):
        w = data['weight']
        if G_simple.has_edge(u, v):
            G_simple[u][v]['weight'] += w
        else:
            G_simple.add_edge(u, v, weight=w)
            
    return G_simple

## Geographic weighting

In [26]:
import pickle
from scipy.sparse import load_npz
def add_adjacency_edges_with_weight(G, weight):
    """
    Adds edges to graph G based on a precomputed adjacency matrix,
    assigning a uniform weight to each added edge.

    Parameters:
    -----------
    G : nx.Graph
        The input graph to which edges will be added.
    weight : float
        The weight to assign to each added edge.

    Returns:
    --------
    None (modifies G in place)
    """
    # Create graph
    Geo = G.copy()

    # Load adjacency matrix
    adj_matrix = load_npz('adjacency_queen_matrix.npz')

    # Load mappings
    with open('adjacency_queen_mappings.pkl', 'rb') as f:
        mappings = pickle.load(f)
    index_to_geoid = mappings['index_to_geoid']

    # Get edges from sparse matrix
    rows, cols = adj_matrix.nonzero()

    # Add edges with your weight
    weight = 2  # Change this to your desired weight

    for i, j in zip(rows, cols):
        if i < j:  # Only add each edge once (undirected)
            geoid1 = index_to_geoid[i]
            geoid2 = index_to_geoid[j]
            Geo.add_edge(i, j, weight=weight)
            #Gring_geo.add_edge(i, j, weight=weight)
    return Geo


In [56]:
# Helper function
from typing import List
def is_contiguous(G: nx.Graph, partition: List) -> bool:
    for part in partition:
        if not nx.is_connected(G.subgraph(part)):
            return False
    return True

def generate_geopure(): # give it adj_matrix from the geoweight section
    rows, cols = load_npz('adjacency_queen_matrix.npz').nonzero()
    pure=nx.Graph()
    for i, j in zip(rows, cols):
        if i < j:  # Only add each edge once (undirected)
            pure.add_edge(int(i), int(j))
    res=0.3
    n=13
    cont=False
    while n!=14 or not cont:
        res*=(14/n)**0.5
        pure_part=nx.algorithms.community.louvain_communities(pure, weight='weight', resolution=res)
        cont=is_contiguous(pure, pure_part)
        n=len(pure_part)
        # print("iterating with resolution:", res, " got ", n, " communities","contiguous:", cont)
    return pure,pure_part


In [28]:

# for i in range(10):
#     pure,part=generate_geopure()
#     print(np.std([len(x) for x in part]))

iterating with resolution: 0.6226494259953249  got  23  communities contiguous: True
iterating with resolution: 0.48578454285164174  got  20  communities contiguous: True
iterating with resolution: 0.40643650851209834  got  17  communities contiguous: True
iterating with resolution: 0.36883512148711634  got  17  communities contiguous: True
iterating with resolution: 0.3347124187746692  got  16  communities contiguous: True
iterating with resolution: 0.31309479853830346  got  16  communities contiguous: True
iterating with resolution: 0.29287336642783557  got  16  communities contiguous: True
iterating with resolution: 0.27395794872101553  got  14  communities contiguous: True
113.43354163274167
iterating with resolution: 0.6226494259953249  got  24  communities contiguous: True
iterating with resolution: 0.4755563543407302  got  19  communities contiguous: True
iterating with resolution: 0.4082152325836951  got  19  communities contiguous: True
iterating with resolution: 0.35040994530

In [29]:
import networkx as nx
import numpy as np
import random
from sklearn.neighbors import NearestNeighbors

# --- 1. Helper Functions (Re-defined for completeness) ---

def randomize_graph(G):
    """
    Creates a random graph with the same Nodes (N), Edges (M), and 
    Weight Distribution as G, but random topology.
    """
    N = G.number_of_nodes()
    M = G.number_of_edges()
    
    # 1. Extract weights from original graph
    weights = [d['weight'] for u, v, d in G.edges(data=True)]
    
    # 2. Create random graph with same density (Erdos-Renyi variant)
    G_rand = nx.gnm_random_graph(N, M, seed=42)
    
    # 3. Assign shuffled weights to the new random edges
    random.shuffle(weights)
    for i, (u, v) in enumerate(G_rand.edges()):
        G_rand[u][v]['weight'] = weights[i]
        
    return G_rand

def get_modularity(G):
    """Runs Louvain and returns modularity score."""
    if G.number_of_edges() == 0: return 0
    try:
        part = nx.algorithms.community.louvain_communities(G, weight='weight', resolution=1)
        return nx.algorithms.community.modularity(G, part, weight='weight')
    except:
        return 0

# --- 2. Execution Loops ---

# Assumes 'new_data' and 'var_explained' are already defined in your notebook
print("--- Experiment 1: Varying K (Standard KNN) ---")
print(f"{'K':<5} | {'Edges':<10} | {'Q (Real)':<10} | {'Q (Rand)':<10} | {'Diff':<10}")
print("-" * 55)

for k in range(2, 21):
    # 1. Build Real Graph
    Gk = create_dimension_layered_knn(new_data, dimension_weights=var_explained, k=k)
    q_real = get_modularity(Gk)
    
    # 2. Build Random Graph & Measure
    Gk_rand = randomize_graph(Gk)
    q_rand = get_modularity(Gk_rand)
    
    total_weight = Gk.size(weight='weight')
    
    print(f"{k:<5} | {int(total_weight):<10} | {q_real:.4f}     | {q_rand:.4f}     | {q_real - q_rand:.4f}")

print("\n\n--- Experiment 2: Varying R_max (Topology Graph) ---")
print(f"{'R_max':<10} | {'Edges':<10} | {'Q (Real)':<10} | {'Q (Rand)':<10} | {'Diff':<10}")
print("-" * 60)

for i in range(1, 11):
    # 1. Build Real Graph
    rmax = 0.0002 * 2**(i-3)
    rdisc = rmax / 4
    
    Gmulti = build_topology_graph(new_data, var_explained, rmax, rdisc, k=10, scaling_method="std")
    Gtopo = flatten_graph_for_community_detection(Gmulti)
    q_real = get_modularity(Gtopo)
    
    # 2. Build Random Graph & Measure
    Gtopo_rand = randomize_graph(Gtopo)
    q_rand = get_modularity(Gtopo_rand)
    
    total_weight = Gtopo.size(weight='weight')
    
    print(f"{rmax:<10.5f} | {int(total_weight):<10} | {q_real:.4f}     | {q_rand:.4f}     | {q_real - q_rand:.4f}")

--- Experiment 1: Varying K (Standard KNN) ---
K     | Edges      | Q (Real)   | Q (Rand)   | Diff      
-------------------------------------------------------
2     | 5872       | 0.6108     | 0.5859     | 0.0249
3     | 11745      | 0.4645     | 0.3723     | 0.0922
4     | 17618      | 0.4149     | 0.2835     | 0.1314
5     | 23491      | 0.3947     | 0.2353     | 0.1593
6     | 29364      | 0.3824     | 0.1992     | 0.1833
7     | 35237      | 0.3763     | 0.1791     | 0.1971
8     | 41110      | 0.3730     | 0.1647     | 0.2082
9     | 46983      | 0.3709     | 0.1705     | 0.2005
10    | 52856      | 0.3695     | 0.1607     | 0.2088
11    | 58729      | 0.3686     | 0.1520     | 0.2166
12    | 64601      | 0.3675     | 0.1440     | 0.2235
13    | 70474      | 0.3668     | 0.1380     | 0.2288
14    | 76347      | 0.3657     | 0.1299     | 0.2358
15    | 82220      | 0.3650     | 0.1246     | 0.2404
16    | 88093      | 0.3645     | 0.1231     | 0.2414
17    | 93966      | 0.3642  

In [30]:
# Optimal Graph Gtopo
Gk = create_dimension_layered_knn(new_data, dimension_weights=var_explained, k=10)
r_max=0.001
r_disc=r_max/4
Gmulti=build_topology_graph(data=new_data, weights=var_explained, rmax_scale=r_max, rdisc_scale=r_disc, k=10,scaling_method="std")
Gtopo=flatten_graph_for_community_detection(Gmulti)


For topological graph, there could be some redundency with the parameters. Having the same radius parameters, while increasing k,

In [31]:
GeoPure,pure_part=generate_geopure()

iterating with resolution: 0.6226494259953249  got  22  communities contiguous: True
iterating with resolution: 0.4967024126308293  got  18  communities contiguous: True
iterating with resolution: 0.43805035314232066  got  20  communities contiguous: True
iterating with resolution: 0.36649922008331515  got  18  communities contiguous: True
iterating with resolution: 0.3232219306798603  got  17  communities contiguous: True
iterating with resolution: 0.29331912260056703  got  14  communities contiguous: True


# Grouping

In [32]:
# K means clustering function
from sklearn.cluster import KMeans

def get_kmeans_partition(data: np.ndarray, weights, n_clusters=14):
    """
    Runs K-means clustering on the input data and returns the loss
    (inertia) and a partition of the data indices by cluster.

    Args:
        data: A (n_samples, n_features) NumPy array.
        n_clusters: The number of clusters (k).

    Returns:
        A tuple containing:
        - loss (float): The inertia (Within-Cluster Sum of Squares).
        - partitions (dict): A dictionary where keys are cluster IDs (0 to k-1)
                             and values are lists of original data indices
                             belonging to that cluster.
    """
    
    # 1. Initialize and fit the K-means model
    # n_init=10 runs the algorithm 10 times and picks the best result
    # random_state=42 ensures the result is reproducible
    data=data.copy()
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    for i in range(len(weights)):
        data[:, i] = data[:, i] * weights[i]
    kmeans.fit(data)

    # 2. Get the loss (inertia)
    # .inertia_ is the WCSS (Within-Cluster Sum of Squares)
    loss = kmeans.inertia_

    # 3. Get the cluster assignment for each data point
    # .labels_ is an array like [0, 1, 1, 0, 2, ...]
    labels = kmeans.labels_

    # 4. Create the partition of indices
    partitions = {i: [] for i in range(n_clusters)}
    for index, cluster_id in enumerate(labels):
        partitions[cluster_id].append(index)

    return loss, partitions

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

import numpy as np

import numpy as np

def calculate_weighted_kmeans_loss_for_partition(data: np.ndarray, partition: list, weights: np.ndarray) -> float:
    """
    Calculates the **Weighted** K-means "loss" (Inertia, or Within-Cluster Sum of Squares)
    for a given dataset, partition, and a set of feature weights.

    Args:
        data: A (n_samples, n_features) NumPy array containing the unweighted, original data.
        partition: A list of iterables (e.g., lists, sets, or tuples), where each inner iterable 
                   contains the *indices* (row numbers) of the data points belonging to that cluster.
        weights: A (n_features,) NumPy array containing the weight for each dimension.

    Returns:
        total_loss (float): The total Weighted K-means loss for this partition.
    """

    total_loss = 0.0

    # Safety Check: Ensure the number of weights matches the number of features
    if len(weights)!= data.shape[1]:
        raise ValueError("The number of weights must match the number of features (columns) in the data.")

    # Iterate over each cluster (which is an iterable of indices)
    for indices in partition:
        
        # --- FIX: Convert the index iterable (set/list/tuple) to a NumPy array for slicing ---
        # This resolves the IndexError by providing valid index types to NumPy
        cluster_indices = np.array(list(indices), dtype=int)
        
        # 1. Get all data points belonging to this cluster
        cluster_points = data[cluster_indices, :]
        
        # Handle empty clusters (their loss is 0)
        # Check if the cluster contains any points
        if cluster_points.shape == 0:
            continue
            
        # 2. Calculate the "true" centroid (mean) for this cluster
        centroid = np.mean(cluster_points, axis=0)
        
        # 3. Calculate the sum of *weighted* squared distances from each point to the centroid
        
        # Calculate squared differences for each feature: (Point - Centroid)^2
        squared_diffs = (cluster_points - centroid) ** 2
        
        # Apply the weights: (Point - Centroid)^2 * Weight_j
        weighted_squared_diffs = squared_diffs * weights
        
        # Sum all weighted squared differences to get the total cluster loss
        # Summing twice: once over the dimensions (axis=1) and once over the points (np.sum)
        cluster_loss = np.sum(weighted_squared_diffs)
        
        # 4. Add this cluster's loss to the total
        total_loss += cluster_loss
        
    return total_loss#/data.shape[0]
# Note on Usage:
# You should pass the *unweighted* data array into this function,
# since the weighting array is now passed separately as the 'weights' argument.




k = 14

# 3. Run the function
total_loss, index_partitions = get_kmeans_partition(new_data,weights=var_explained, n_clusters=k)

index_partitions=[index_partitions[i] for i in range(k)]

In [15]:
# null partition model
import random
# Random partition of 14 parts model with no guarantee of size. 
part_random=[[] for i in range(14)]
n=len(new_data)
m=int(n/14)
for i in range(n):
    part_random[ random.choice(range(14)) ].append(i)
# Partition model with 14 equal sized parts
labels=[[i]*m for i in range(14)]
eq_labels=[]
for l in labels:
    eq_labels+=l
labels=random.shuffle(labels)
part_equal=[[] for i in range(14)]
for i in range(n):
    try:part_equal[ labels[i] ].append(i)
    except: part_equal[ random.choice(range(14)) ].append(i)


In [16]:
partGk=nx.algorithms.community.louvain_communities(Gk,weight='weight',resolution=0.7)
print("Number of Louvain communities Gk:",len(partGk))
partGtopo=nx.algorithms.community.louvain_communities(Gtopo,weight='weight',resolution=0.5)
print("Number of Louvain communities Gtopo:",len(partGtopo))

Number of Louvain communities Gk: 18
Number of Louvain communities Gtopo: 11


In [17]:
# Parse recent districting
part116=[[] for i in range(13)]
part118=[[] for i in range(14)]
for i in range(data.shape[0]):
    cd116=int(data[i,-1])-1
    cd118=int(data[i,-2])-1
    part116[cd116].append(i)
    part118[cd118].append(i)

In [18]:
print("\n 10Graph Comparisons")
#print("Louvain modularity Gk:",nx.algorithms.community.modularity(Gk, partGk, weight='weight'))
print("Louvain modularity Gtopo:",nx.algorithms.community.modularity(Gtopo, partGtopo, weight='weight'))
#print("Louvain modularity GeoPure partition on Gk:",nx.algorithms.community.modularity(Gk, pure_part, weight='weight'))
print("Louvain modularity GeoPure partition on Gtopo:",nx.algorithms.community.modularity(Gtopo, pure_part, weight='weight'))
#print("Louvain modularity Gtopo on Gk:",nx.algorithms.community.modularity(Gk, partGtopo, weight='weight'))
#print("Louvain modularity Gk on Gtopo:",nx.algorithms.community.modularity(Gtopo, partGk, weight='weight'))

print("\n K Means Comparisons")
#print("K means modularity on Gk:",nx.algorithms.community.modularity(Gk, index_partitions, weight='weight'))
print("K means modularity on Gtopo:",nx.algorithms.community.modularity(Gtopo, index_partitions, weight='weight'))

print("\nReal district modularity on Gtopo")
#print("116 partition on Gk:",nx.algorithms.community.modularity(Gk, part116, weight='weight'))
#print("118 partition on Gk:",nx.algorithms.community.modularity(Gk, part118, weight='weight'))
print("116 partition on Gtopo:",nx.algorithms.community.modularity(Gtopo, part116, weight='weight'))
print("118 partition on Gtopo:",nx.algorithms.community.modularity(Gtopo, part118, weight='weight'))

print("\n null model comparisons")
print("Random partition modularity on Gtopo:",nx.algorithms.community.modularity(Gtopo, part_random, weight='weight'))
print("Equal partition modularity on Gtopo:",nx.algorithms.community.modularity(Gtopo, part_equal, weight='weight'))


 10Graph Comparisons
Louvain modularity Gtopo: 0.4475305196073563
Louvain modularity GeoPure partition on Gtopo: 0.003154207294161239

 K Means Comparisons
K means modularity on Gtopo: 0.09685293179840138

Real district modularity on Gtopo
116 partition on Gtopo: 0.0033596436850784075
118 partition on Gtopo: 0.002097857237471587

 null model comparisons
Random partition modularity on Gtopo: -0.00043794427740562625
Equal partition modularity on Gtopo: -0.0004899222461252368


In [19]:
print("\n 10Graph Comparisons")
#print("K means loss of Gk:",calculate_weighted_kmeans_loss_for_partition(new_data, partGk, weights=var_explained))
print("K means loss of Gtopo:",calculate_weighted_kmeans_loss_for_partition(new_data, partGtopo, weights=var_explained))
print("K means loss of GeoPure:",calculate_weighted_kmeans_loss_for_partition(new_data, pure_part, weights=var_explained))

print("\n K Means")
print("loss of k means:",calculate_weighted_kmeans_loss_for_partition(new_data, index_partitions, weights=var_explained))

print("\nReal district k means loss")
print("116 partition:",calculate_weighted_kmeans_loss_for_partition(new_data, part116, weights=var_explained))
print("118 partition:",calculate_weighted_kmeans_loss_for_partition(new_data, part118, weights=var_explained))


print("\n null model k means loss")
print("Random partition k means loss:",calculate_weighted_kmeans_loss_for_partition(new_data, part_random, weights=var_explained))
print("Equal partition k means loss:",calculate_weighted_kmeans_loss_for_partition(new_data, part_equal, weights=var_explained))


 10Graph Comparisons
K means loss of Gtopo: 119.70203635650712
K means loss of GeoPure: 114.37685184972688

 K Means
loss of k means: 29.0772838818855

Real district k means loss
116 partition: 115.18201653038571
118 partition: 115.6993863980432

 null model k means loss
Random partition k means loss: 119.90356916854324
Equal partition k means loss: 119.94362779605896


In [40]:
def one_group_weighted_modularity(G, group, total_graph_weight):

    subgraph = G.subgraph(group)
    L_c = subgraph.size(weight='weight')
    S_c = sum(dict(G.degree(group, weight='weight')).values())

    term1 = L_c / total_graph_weight

    term2 = (S_c / (2 * total_graph_weight)) ** 2
    
    return term1 - term2

# normalization_factor=sum([G[u][v]["weight"] for u,v in G.edges()])

In [41]:
# mod=0

# for part in partG: # where
#     mod+=one_group_weighted_modularity(G,part,normalization_factor)
# print("Modularity calculated by one_group_modularity function:", mod)

In [42]:
# nx.algorithms.community.modularity(G, partG, weight='weight')

In [43]:
# import networkx as nx
# import numpy as np
# import random

# def get_modularity(G):
#     """Safely runs Louvain and returns modularity."""
#     if G.number_of_edges() == 0: return 0
#     try:
#         # Use resolution=1 as per your request
#         part = nx.algorithms.community.louvain_communities(G, weight='weight', resolution=1)
#         return nx.algorithms.community.modularity(G, part, weight='weight')
#     except:
#         return 0

# def randomize_graph(G):
#     """
#     Creates a random graph with the same Number of Nodes and Edges as G,
#     and the exact same list of edge weights, but randomized topology.
#     """
#     N = G.number_of_nodes()
#     M = G.number_of_edges()
    
#     # 1. Extract exact weights from original graph
#     weights = [d['weight'] for u, v, d in G.edges(data=True)]
    
#     # 2. Create random topology (Erdos-Renyi) with same density
#     G_rand = nx.gnm_random_graph(N, M, seed=42)
    
#     # 3. Assign the original shuffled weights to the new random edges
#     random.shuffle(weights)
#     for i, (u, v) in enumerate(G_rand.edges()):
#         G_rand[u][v]['weight'] = weights[i]
        
#     return G_rand

# # --- LOOP 1: Varying K (Standard KNN) ---
# print("\n--- EXPERIMENT 1: Standard KNN (Gk) ---")
# print(f"{'K':<4} | {'Edges':<10} | {'Q (Real)':<8} | {'Q (Rand)':<8} | {'Diff':<8}")
# print("-" * 50)

# for i in range(2, 20):
#     # 1. Build Real Graph
#     Gk = create_dimension_layered_knn(new_data, dimension_weights=var_explained, k=i)
    
#     # 2. Build Random Null Model
#     Gk_random = randomize_graph(Gk)
    
#     # 3. Calculate Modularities
#     q_real = get_modularity(Gk)
#     q_rand = get_modularity(Gk_random)
    
#     # Calculate sum of weights for reference (Network Mass)
#     total_weight = sum([d['weight'] for u, v, d in Gk.edges(data=True)])
    
#     print(f"{i:<4} | {int(total_weight):<10} | {q_real:.4f}   | {q_rand:.4f}   | {q_real - q_rand:.4f}")


# # --- LOOP 2: Varying Radius (Topology Graph) ---
# print("\n--- EXPERIMENT 2: Smoothed Topology (Gtopo) ---")
# print(f"{'r_max':<10} | {'Edges':<10} | {'Q (Real)':<8} | {'Q (Rand)':<8} | {'Diff':<8}")
# print("-" * 55)

# for i in range(1, 12):
#     # 1. Parameters
#     rmax = 0.00015 * 2**(i-3)
#     rdisc = rmax / 4
    
#     # 2. Build Real Graph
#     Gmulti = build_topology_graph(new_data, var_explained, rmax, rdisc, k=10, scaling_method="std")
#     Gtopo = flatten_graph_for_community_detection(Gmulti)
    
#     # 3. Build Random Null Model
#     Gtopo_random = randomize_graph(Gtopo)
    
#     # 4. Calculate Modularities
#     q_real = get_modularity(Gtopo)
#     q_rand = get_modularity(Gtopo_random)
    
#     # Calculate sum of weights for reference
#     total_weight = sum([d['weight'] for u, v, d in Gtopo.edges(data=True)])
    
#     print(f"{rmax:<10.5f} | {int(total_weight):<10} | {q_real:.4f}   | {q_rand:.4f}   | {q_real - q_rand:.4f}")

In [44]:
# for i in range(1, 12):
#     # 1. Parameters
#     rmax = 0.00015 * 2**(i-3)
#     rdisc = rmax / 4
    
#     # 2. Build Real Graph
#     Gmulti = build_topology_graph(new_data, var_explained, rmax, rdisc, k=10, scaling_method="std")
#     Gtopo = flatten_graph_for_community_detection(Gmulti)
    
#     # 3. Build Random Null Model
#     Gtopo_random = randomize_graph(Gtopo)
    
#     # 4. Calculate Modularities
#     q_real = get_modularity(Gtopo)
#     q_rand = get_modularity(Gtopo_random)
    
#     # Calculate sum of weights for reference
#     total_weight = sum([d['weight'] for u, v, d in Gtopo.edges(data=True)])
    
#     print(f"{rmax:<10.5f} | {int(total_weight):<10} | {q_real:.4f}   | {q_rand:.4f}   | {q_real - q_rand:.4f}")

# Optimization

In [82]:
from typing import List, Set, Dict, Tuple
import time

# --- Helper Functions ---

def mean_variance(group,node,new_data,weights):
    n=len(group)
    var=0
    for i in range(new_data.shape[1]):
        weight=weights[i]
        dif=[new_data[node,i]-new_data[member,i] for member in group]
        var+=weight*sum(x**2 for x in dif)/n
    return var

def partition_to_map(partition: List[Set[int]]) -> Dict[int, int]:
    """Converts a list of sets to a dictionary mapping node -> community_index."""
    node_to_community = {}
    for i, community_set in enumerate(partition):
        for node in community_set:
            node_to_community[node] = i
    return node_to_community

def is_contiguous(G: nx.Graph, partition: List) -> bool:
    for part in partition:
        if not nx.is_connected(G.subgraph(part)):
            return False
    return True

# def is_contiguous(G: nx.Graph, groupA: Set, groupB: Set) -> bool:
#     """
#     Checks if the two prospective communities (groupA and groupB) are contiguous
#     in the geographical graph G.
#     """
#     # Check contiguity for groupA
#     # Only check if the group has more than one node
#     if len(groupA) > 1 and not nx.is_connected(G.subgraph(groupA)):
#         return False
#     # Check contiguity for groupB
#     if len(groupB) > 1 and not nx.is_connected(G.subgraph(groupB)):
#         return False
#     return True

def one_group_weighted_modularity(G, group, total_graph_weight):

    subgraph = G.subgraph(group)
    L_c = subgraph.size(weight='weight')
    S_c = sum(dict(G.degree(group, weight='weight')).values())

    term1 = L_c / total_graph_weight

    term2 = (S_c / (2 * total_graph_weight)) ** 2
    
    return term1 - term2
# normalization_factor=sum([G[u][v]["weight"] for u,v in G.edges()])

def balance_population(Geo: nx.Graph, unbalanced_part: List[Set[int]], tol: int):
    # Convert part to map for iterations
    best_part = [set(comm) for comm in unbalanced_part]
    node_to_community_map = partition_to_map(best_part)
    # Get indicies where each district starts
    district_indices = list(range(len(best_part)))
    swap_flag = False
    
    # Iterate through all districts
    for comm_A_idx in district_indices:
        district_A_nodes = best_part[comm_A_idx]

        # Iterate through each node in district_A
        for node_A in district_A_nodes:
            if node_A not in best_part[comm_A_idx]:
                continue

            # Iterate over all geographical neighbors of A
            for node_B in Geo.neighbors(node_A):
                comm_B_idx = node_to_community_map.get(node_B)

                district_B_nodes = best_part[comm_B_idx]

                if comm_B_idx is None or comm_A_idx == comm_B_idx:
                    continue
                
                prospective_part = [set(comm) for comm in best_part]
                # Check population equality
                if np.abs(len(district_B_nodes) - len(district_A_nodes)) < tol:
                    # Find which one is larger, simulate swap
                    if len(district_A_nodes) < len(district_B_nodes):
                        # B leaves comm_B, B added to comm_A
                        prospective_part[comm_B_idx].discard(node_B)
                        prospective_part[comm_A_idx].add(node_B)
                    else:
                        # A leaves comm_A
                        prospective_part[comm_A_idx].discard(node_A)
                        prospective_part[comm_B_idx].add(node_A)

                    # Check contiguity of two groups
                    if is_contiguous(Geo,prospective_part):
                        # Update best part
                        best_part = prospective_part
                        # node_to_community_map = partition_to_map(best_part)
                        # swap_count += 1
                        # swap_flag = True
                        # print(f"Swap minimizes var, maintains contiguity. Node {node_A} and Node {node_B}.")
                        # print(f"Swap minimizes var, maintains contiguity. Swap Count: {swap_count}.")
    return best_part

def sequential_swap_var(G: nx.Graph, Geo: nx.Graph, current_partition: List[Set[int]]) -> Tuple[List[Set[int]], float]:
    
    iteration_time = time.time()
    swap_count = 0
    swap_flag = False

    best_part = [set(comm) for comm in current_partition]
    node_to_community_map = partition_to_map(best_part)


    # 2. Iterate through each district (community)
    # We use a list of indices to ensure we iterate over all districts
    district_indices = list(range(len(best_part)))

    # 3. Iterate through each district
    for comm_A_idx in district_indices:
        # best_swap = None  # Stores: (node_A, node_B, comm_A_idx, comm_B_idx)
        
        district_A_nodes = best_part[comm_A_idx]
        
        # Identify all potential swap candidates involving a node from comm_A_idx
        # A swap candidate is a pair (A, B) where A is in comm_A_idx and B is adjacent to A
        # and B is in a different community comm_B_idx.
        
        # Iterate over all nodes A in the current district
        for node_A in district_A_nodes:

            if node_A not in best_part[comm_A_idx]:
                continue

            # Iterate over all geographical neighbors of A
            for node_B in Geo.neighbors(node_A):
                comm_B_idx = node_to_community_map.get(node_B)

                district_B_nodes = best_part[comm_B_idx]

                if comm_B_idx is None or comm_A_idx == comm_B_idx:
                    continue

                var_A_now = mean_variance(district_A_nodes,node_A,new_data,weights)
                var_B_now = mean_variance(district_B_nodes,node_B,new_data,weights)

                var_A_swap = mean_variance(district_A_nodes,node_B,new_data,weights)
                var_B_swap = mean_variance(district_B_nodes,node_A,new_data,weights)
                
                if var_A_swap + var_B_swap < var_A_now + var_B_now:
                    # print(f"Updating best swap. Swap variance {var_A_swap + var_B_swap} < Current variance {var_A_now + var_B_now}")
                    # Post-swap partition
                    # Simulate the swap for the contiguity check
                    prospective_part = [set(comm) for comm in best_part]

                    # Swap

                    # A leaves comm_A, B leaves comm_B
                    prospective_part[comm_A_idx].discard(node_A)
                    prospective_part[comm_B_idx].discard(node_B)

                    # A enters comm_B, B enters comm_A
                    prospective_part[comm_B_idx].add(node_A)
                    prospective_part[comm_A_idx].add(node_B)

                    # Check contiguity of two groups
                    
                    if is_contiguous(Geo,prospective_part[comm_A_idx], prospective_part[comm_B_idx]):
                        # Update best part
                        best_part = prospective_part
                        node_to_community_map = partition_to_map(best_part)
                        swap_count += 1
                        swap_flag = True
                        # print(f"Swap minimizes var, maintains contiguity. Node {node_A} and Node {node_B}.")
                        # print(f"Swap minimizes var, maintains contiguity. Swap Count: {swap_count}.")

    
    iteration_time = time.time() - iteration_time

    return best_part, iteration_time, swap_flag

def sequential_swap_mod(G: nx.Graph, Geo: nx.Graph, current_partition: List[Set[int]], current_mod: float) -> Tuple[List[Set[int]], float]:
    # swapping with modularity
    iteration_time = time.time()
    swap_count = 0
    swap_flag = False

    best_part = [set(comm) for comm in current_partition]
    best_mod = current_mod
    node_to_community_map = partition_to_map(best_part)


    # 2. Iterate through each district (community)
    # We use a list of indices to ensure we iterate over all districts
    district_indices = list(range(len(best_part)))

    # 3. Iterate through each district
    for comm_A_idx in district_indices:
        # best_swap = None  # Stores: (node_A, node_B, comm_A_idx, comm_B_idx)
        
        district_A_nodes = best_part[comm_A_idx]
        
        # Identify all potential swap candidates involving a node from comm_A_idx
        # A swap candidate is a pair (A, B) where A is in comm_A_idx and B is adjacent to A
        # and B is in a different community comm_B_idx.
        
        # Iterate over all nodes A in the current district
        for node_A in district_A_nodes:

            if node_A not in best_part[comm_A_idx]:
                continue

            # Iterate over all geographical neighbors of A
            for node_B in Geo.neighbors(node_A):
                comm_B_idx = node_to_community_map.get(node_B)

                district_B_nodes = best_part[comm_B_idx]

                # Skip checking same nodes
                if comm_B_idx is None or comm_A_idx == comm_B_idx:
                    continue

                # Simulate the swap for the contiguity and modularity check
                prospective_part = [set(comm) for comm in best_part]

                # A leaves comm_A, B leaves comm_B
                prospective_part[comm_A_idx].discard(node_A)
                prospective_part[comm_B_idx].discard(node_B)

                # A enters comm_B, B enters comm_A
                prospective_part[comm_B_idx].add(node_A)
                prospective_part[comm_A_idx].add(node_B)

                # Check contiguity of two groups
                if is_contiguous(Geo,prospective_part):
                    # Passed contiguity check, check if new_mod is better than current_mod
                    total_graph_weight = G.size(weight='weight')
                    new_mod = one_group_weighted_modularity(G, current_partition, total_graph_weight)
                    if new_mod > best_mod:
                        # Update best part
                        best_part = prospective_part
                        node_to_community_map = partition_to_map(best_part)
                        best_mod = new_mod
                        swap_count += 1
                        swap_flag = True
                        # print(f"Swap minimizes var, maintains contiguity. Node {node_A} and Node {node_B}.")
                        # print(f"Swap minimizes var, maintains contiguity. Swap Count: {swap_count}.")

    
    iteration_time = time.time() - iteration_time

    return best_part, best_mod, iteration_time, swap_flag


In [83]:
# --- Core Optimization Function with Sequential Swap and Delta Q ---
def optimized_sequential_swap(G: nx.Graph, Geo: nx.Graph, N_iterations: int, n_swaps: int) -> Tuple[List[Set[int]], float]:
    """
    Performs local search optimization using a sequential, targeted two-node swap.
    In each iteration, it finds the single best swap across all districts and executes it.
    """
    
    opt_best_score = 0.0
    opt_best_part = 0.0

    total_duration = time.time()
    
    # --- Main Optimization Loop ---
    for i in range(N_iterations):
        print(f"--- Iteration {i+1}/{N_iterations} ---")
        # Balance partition for population
        iter_graph, iter_unbalance_partition = generate_geopure()
        iter_initial_partition = balance_population(Geo,iter_unbalance_partition, tol=20)
        
        # Convert to map
        iter_best_part = [set(comm) for comm in iter_initial_partition] # current best partition

        # Calculate initial modularity score
        iter_init_score = nx.algorithms.community.modularity(G, iter_best_part, weight='weight')
        iter_best_score = iter_init_score
        
        iter_time = 0.0
        for i in range(n_swaps):
            # Swap
            iter_new_part, swap_time, iter_new_mod, swap_flag = sequential_swap_mod(G, Geo, iter_best_part, iter_best_score)
            iter_time = total_duration + swap_time

            # If no swap occurred, optimization finished
            if not swap_flag:
                print("\nNo local swap increased modularity while maintaining contiguity. Optimization finished.")
                break

            # If new partition has better modularity score than current best, update current best
            if iter_new_mod > iter_best_score:
                iter_best_part = iter_new_part

        # If current initial partition yielded best results, update opt best
        opt_best_part = iter_best_part
        opt_best_score = iter_best_score
        print(f"Iteration {i+1} complete. Time: {iter_time:.2f} seconds.")
        total_duration = total_duration + iter_time

    # Calculate final modularity score
    # print(f"Length: {len(best_part)}")
    actual_opt_best_score = nx.algorithms.community.modularity(G, opt_best_part, weight='weight')
    if not actual_opt_best_score == opt_best_score:
        print(f"Tracked modularity score ({actual_opt_best_score:.4f}) and final calculated modularity score ({opt_best_score}) not equal.")
        return

    print("\n--- Final Results ---")
    print(f"Optimal Modularity Score (after local swaps): {opt_best_score:.4f}")
    print(f"Optimization took: {total_duration:.2f} seconds")
    return opt_best_part, opt_best_score, total_duration

In [None]:
best_partition, final_score, duration = optimized_sequential_swap(Gtopo, GeoPure, N_iterations=1, n_swaps=10)
print(f"Final Partition Size: {len(best_partition)}")
print(f"Final Modularity Score: {final_score}")
print(f"Optimization Time: {duration}")

# Problem before going to the next problem:
# 1. Fix errors
# 2. Check if balance population works


# Problem: Stuck at local minima
# Try:
# 1. Multiple initial positions
# 2. Introduce noise via SGD
# 3. Taking uphill steps every so often, probability of uphill step decreases over iterations

--- Iteration 1/1 ---


NetworkXError: Node {3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, 3084, 3085, 3086, 3087, 3088, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3097, 3098, 3099, 3100, 3101, 3102, 3103, 3104, 3105, 3106, 3107, 3108, 3109, 3110, 3111, 3112, 3113, 3114, 3115, 3116, 3117, 3118, 3119, 3120, 3121, 3122, 3123, 3124, 3125, 3126, 3127, 3128, 3129, 3130, 3131, 3132, 3133, 3134, 3135, 3136, 3137, 3138, 3139, 3140, 3141, 3142, 3143, 3144, 3145, 3146, 3147, 3148, 3149, 3150, 3151, 3152, 3153, 3154, 3155, 3156, 3157, 3158, 3159, 3160, 3161, 3162, 3163, 3164, 3165, 3166, 3167, 3168, 3169, 3170, 3171, 3172, 3173, 3174, 3175, 3176, 3177, 3178, 3179, 3180, 3181, 3182, 3183, 3184, 3185, 3186, 5240, 5241, 5242, 5243, 5244, 5245, 5246, 5247, 5248, 5249, 5250, 5251, 5252, 5253, 5254, 5255, 5256, 1078, 1076, 1107, 1108, 3388, 1109, 2550, 1068, 1069, 1070, 1071, 1072, 1073, 1110, 1074, 1075, 3366, 3367, 3368, 3369, 3370, 3371, 3372, 3373, 3374, 3375, 3376, 3377, 3378, 3379, 1111, 1079, 3380, 3381, 3384, 3385, 1080, 3387, 3382, 3389, 3390, 1081, 3383, 3386, 3392, 1077, 1082, 3391, 1083, 1112, 1084, 1085, 1086, 1087, 1113, 1088, 1089, 1090, 1091, 1092, 2551, 1093, 1114, 1094, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 2547, 2548, 2549, 3716, 3719, 3720, 3721, 3731, 3732, 3733, 3734, 3735, 3736, 3737, 3738, 3739, 3740, 3741, 3742, 3743, 3744, 3745, 3746, 3747, 3748, 3749, 3750, 3751, 3752, 3753, 3754, 3755, 3756, 3757, 3758, 3759, 3760, 3761, 3762, 3763, 3764, 3765, 3766, 3767, 3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 2552, 2553, 2554, 2546, 5938, 5939, 5940, 5941, 5942, 5943, 5944, 5945, 5946, 5947, 5948, 5949, 5950, 5951, 5952, 5953, 5954, 5955, 5956, 5957, 5958, 5959, 5960, 5961, 5962, 5963, 5964, 5965, 5966, 5967, 5968, 5969, 5970, 5971, 5972, 5973, 5974, 5975, 5976, 5977, 5978, 5979, 5668, 5670, 5671, 5672, 7038, 7039, 7040, 7041, 7042, 7043, 7044, 5673, 5674, 5675, 5676, 5677, 3059, 3060, 3061, 3062, 3063, 3064, 3065, 3066, 3067, 3068, 3069, 3070, 3071} in sequence nbunch is not a valid node.

In [None]:
def is_contiguous(G: nx.Graph, partition: List) -> bool:
    for part in partition:
        if not nx.is_connected(G.subgraph(part)):
            return False
    return True