In [7]:
#PCA METHOD WITH REPORTING
import pandas as pd
import numpy as np
import math
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def reduce_pca_by_variance(data: np.ndarray, feature_names: list, variance_threshold: float):
    """
    Performs PCA on n-dimensional data, automatically selecting the minimum
    number of components to explain at least the `variance_threshold`.
    
    This modified version also prints the results of the reduction and
    the top 5 feature contributors for each component.

    Args:
        data: A (n_samples, n_features) NumPy array.
        feature_names: A list of strings corresponding to the feature columns
                       in `data`. (e.g., list(df.columns))
        variance_threshold: The target amount of variance to explain
                            (e.g., 0.95 for 95%).

    Returns:
        A tuple containing:
        - data_transformed (np.ndarray): The data projected onto the
                                         new component space.
        - fitted_pca (PCA): The fitted PCA object, which you can use
                            to inspect the number of components, etc.
        - explained_variance_list (list): A list of the variance explained
                                          by each component (e.g., [0.5, 0.3]).
    """
    
    if len(feature_names) != data.shape[1]:
        raise ValueError(f"Number of feature_names ({len(feature_names)}) does not "
                         f"match number of data columns ({data.shape[1]}).")

    # 1. Create a PCA object with the variance threshold.
    # By setting n_components to a float, PCA automatically finds
    # the components needed to explain that much variance.
    pca = PCA(n_components=variance_threshold)
    
    # 2. Create a pipeline to first scale the data, then run PCA.
    # Scaling is crucial for PCA to work correctly.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', pca)
    ])
    
    # 3. Fit the pipeline to the data and transform it
    data_transformed = pipeline.fit_transform(data)
    
    # --- Report print statements ---
    
    # Get the original and new dimensions
    original_dimensions = data.shape[1]
    # We access the fitted pca object from step 2
    new_dimensions = pca.n_components_ 
    
    print("-" * 30)
    print("PCA Dimensionality Reduction Report")
    print("-" * 30)
    print(f"Original dimensions:   {original_dimensions}")
    print(f"New dimensions:        {new_dimensions}")
    print(f"Dimensions reduced by: {original_dimensions - new_dimensions}")
    print("\nVariance explained by each remaining component:")
    
    # pca.explained_variance_ratio_ is an array like [0.5, 0.3, 0.1]
    for i, variance in enumerate(pca.explained_variance_ratio_):
        print(f"  Principal Component {i+1}: {variance * 100:.2f}%")
        
    # Print total variance explained
    total_variance = np.sum(pca.explained_variance_ratio_)
    print(f"\nTotal variance explained: {total_variance * 100:.2f}%")
    print(f"(Target threshold was {variance_threshold * 100:.0f}%)")
    
    # --- New: Top 5 Contributors per Component Report ---
    print("\nTop 5 Contributors per Component:")
    
    # pca.components_ has shape (n_components, n_features)
    for i, component in enumerate(pca.components_):
        print(f"  --- Principal Component {i+1} ---")
        
        # Get indices of the top 5 absolute loadings
        # np.argsort returns indices of smallest to largest
        # We take the last 5, and then reverse them [::-1]
        top_5_indices = np.argsort(np.abs(component))[-5:][::-1]
        
        # Print the feature name and its loading (weight)
        for j, feature_index in enumerate(top_5_indices):
            feature_name = feature_names[feature_index]
            loading = component[feature_index]
            print(f"    {j+1}. {feature_name}: {loading:.4f}")
            
    print("-" * 30)
    
    # --- End of report ---
    
    # Get the list of explained variances
    explained_variance_list = pca.explained_variance_ratio_.tolist()
    
    # Return the new data, the fitted PCA object, and the list of variances
    return data_transformed, pca, explained_variance_list


In [None]:
pip install networkx

Defaulting to user installation because normal site-packages is not writeable
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.8 MB/s eta 0:00:01
[?25hInstalling collected packages: networkx
Successfully installed networkx-3.2.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install sklearn

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
[31m    ERROR: Command errored out with exit status 1:
     command: /Library/Developer/CommandLineTools/usr/bin/python3 -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/kp/24jrww7s5rq2cp13x34d48yw0000gn/T/pip-install-fzv2nfqg/sklearn_98dc148dc1284f588ceea6e9d96923bd/setup.py'"'"'; __file__='"'"'/private/var/folders/kp/24jrww7s5rq2cp13x34d48yw0000gn/T/pip-install-fzv2nfqg/sklearn_98dc148dc1284f588ceea6e9d96923bd/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /private/var/folders/kp/24jrww7s5rq2cp13x34d48yw0000gn/T/pip-pip-egg-info-88gbnlbc
         cwd: /private/v

# Processing

In [9]:
# Get data to be a numpy array probably
df=pd.read_csv('consolidated_acs_data_clean.csv')
data=np.array(df)
income_index=1  # Example index for income dimension
# Transform the income dimension by logarithmic scale
data[:, income_index] = np.log(data[:, income_index].astype(np.float64))
# Normalize data by dividing by standard deviation by dimension
for i in range(1,data.shape[1]):
    data[:, i] = data[:, i] / np.std(data[:, i])


## Weighting

In [10]:
# Initilize weights, and scale data by weights
# Get column indices
income_index = 1  # per_capita_income
poverty_index = df.columns.get_loc('avg_poverty_ratio')
employment_index = df.columns.get_loc('in_labor_force')
healthcare_index = df.columns.get_loc('Educational services, and health care and social assistance')
education_index = df.columns.get_loc('College or More')
commute_index = df.columns.get_loc('avg_commute_time')
housing_index = df.columns.get_loc('avg_housing_cost_burden')
race_indices = [df.columns.get_loc(col) for col in ['white', 'black', 'asian', 'native', 'pacific islander', 'other']]

# Apply dimension weights
data[:, income_index] *= 0.1225          # Economic Security - Income
data[:, poverty_index] *= 0.0525         # Economic Security - Poverty
data[:, employment_index] *= 0.07        # Economic Security - Employment
data[:, healthcare_index] *= 0.105       # Economic Security - Healthcare
data[:, education_index] *= 0.15         # Education
data[:, commute_index] *= 0.15           # Location Affordability - Transportation
data[:, housing_index] *= 0.15           # Location Affordability - Housing

# Race: distribute 0.2 across 6 race columns
for idx in race_indices:
    data[:, idx] *= 0.2 / 6  # 0.0333 per column

weights = np.array([
    0.1225,     # 1:  per_capita_income (Economic Security - Income)
    0.2/6,      # 2:  white (Cultural - Race)
    0.2/6,      # 3:  black (Cultural - Race)
    0.2/6,      # 4:  asian (Cultural - Race)
    0.2/6,      # 5:  native (Cultural - Race)
    0.2/6,      # 6:  pacific islander (Cultural - Race)
    0.2/6,      # 7:  other (Cultural - Race)
    0.0,        # 8:  Under High School (not weighted)
    0.0,        # 9:  High School (No College Degree) (not weighted)
    0.15,       # 10: College or More (Education)
    0.0,        # 11: Agriculture (not weighted)
    0.0,        # 12: Construction_and_manufacturing (not weighted)
    0.0,        # 13: trade (not weighted)
    0.0,        # 14: Transportation and warehousing (not weighted)
    0.0,        # 15: nerds (not weighted)
    0.105,      # 16: Educational services, and health care (Economic Security - Healthcare)
    0.0,        # 17: finance_inurance_and_realty (not weighted)
    0.0,        # 18: other_services (not weighted)
    0.07,       # 19: in_labor_force (Economic Security - Employment)
    0.0,        # 20: out_labor_force (not weighted)
    0.15,       # 21: avg_commute_time (Location Affordability - Transportation)
    0.15,       # 22: avg_housing_cost_burden (Location Affordability - Housing)
    0.0525,     # 23: avg_poverty_ratio (Economic Security - Poverty)
])


In [11]:
# Perform PCA, and project onto the top N dimensions so that they explain 50% of the variance
new_data,pca,var_explained = reduce_pca_by_variance(data[:,1:], list(df.columns)[1:], variance_threshold=0.5)

------------------------------
PCA Dimensionality Reduction Report
------------------------------
Original dimensions:   23
New dimensions:        6
Dimensions reduced by: 17

Variance explained by each remaining component:
  Principal Component 1: 16.07%
  Principal Component 2: 10.11%
  Principal Component 3: 8.43%
  Principal Component 4: 6.79%
  Principal Component 5: 6.32%
  Principal Component 6: 5.67%

Total variance explained: 53.38%
(Target threshold was 50%)

Top 5 Contributors per Component:
  --- Principal Component 1 ---
    1. College or More: 0.4448
    2. avg_poverty_ratio: 0.3872
    3. Under High School: -0.3443
    4. per_capita_income: 0.2592
    5. in_labor_force: 0.2370
  --- Principal Component 2 ---
    1. out_labor_force: 0.4101
    2. in_labor_force: -0.4101
    3. white: 0.3803
    4. pacific islander: 0.3752
    5. Agriculture, forestry, fishing and hunting, and mining: 0.2772
  --- Principal Component 3 ---
    1. High School (No College Degree): 0.4196
   

As shown, the first component has a distinct identity, namely education, income and poverty, part of the most important factors typically considered as determinant of a person's sociol-economic status. The rest of the components are a mixed of race, education, occupation, and comute time.

# Network initialization

In [12]:
def create_dimension_layered_knn(data, dimension_weights, k=10):
    """
    Create multi-layer network where each dimension has its own KNN graph.
    """
    G = nx.Graph()
    
    # Add nodes
    for i in range(data.shape[0]):
        G.add_node(i)
    
    # For each dimension, create KNN graph
    for dim in range(data.shape[1]):
        dim_weight = dimension_weights[dim]
        
        if dim_weight == 0:
            continue
        
        # Get this dimension's values (1D)
        dim_data = data[:, dim].reshape(-1, 1)
        
        # Build KNN graph for THIS dimension only
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=k)
        nbrs.fit(dim_data)
        distances, indices = nbrs.kneighbors(dim_data)
        
        # Add edges weighted by dimension importance
        for i in range(len(data)):
            for j, neighbor in enumerate(indices[i]):
                if i != neighbor:
                    if G.has_edge(i, neighbor):
                        G[i][neighbor]['weight'] += dim_weight  # Accumulate
                    else:
                        G.add_edge(i, neighbor, weight=dim_weight)
    
    return G


In [13]:
from ring_network import construct_net
G = create_dimension_layered_knn(new_data, dimension_weights=var_explained, k=10)
G_uni= nx.Graph()
G_uni.add_nodes_from(G.nodes(data=True))
G_uni.add_edges_from(G.edges())

# Gring= construct_net(new_data,var_explained,0.2)
# Gring_uni= nx.Graph()
# Gring_uni.add_nodes_from(Gring.nodes(data=True))
# Gring_uni.add_edges_from(Gring.edges())

## Geographic weighting

In [14]:
import pickle
from scipy.sparse import load_npz

# Create graph
Geo = G.copy()
#Gring_geo=Gring.copy()
# Load adjacency matrix
adj_matrix = load_npz('adjacency_queen_matrix.npz')

# Load mappings
with open('adjacency_queen_mappings.pkl', 'rb') as f:
    mappings = pickle.load(f)
index_to_geoid = mappings['index_to_geoid']

# Get edges from sparse matrix
rows, cols = adj_matrix.nonzero()

# Add edges with your weight
weight = 2  # Change this to your desired weight

for i, j in zip(rows, cols):
    if i < j:  # Only add each edge once (undirected)
        geoid1 = index_to_geoid[i]
        geoid2 = index_to_geoid[j]
        Geo.add_edge(i, j, weight=weight)
        #Gring_geo.add_edge(i, j, weight=weight)


# Grouping

In [15]:
# K means clustering function
from sklearn.cluster import KMeans

def get_kmeans_partition(data: np.ndarray, weights, n_clusters=14):
    """
    Runs K-means clustering on the input data and returns the loss
    (inertia) and a partition of the data indices by cluster.

    Args:
        data: A (n_samples, n_features) NumPy array.
        n_clusters: The number of clusters (k).

    Returns:
        A tuple containing:
        - loss (float): The inertia (Within-Cluster Sum of Squares).
        - partitions (dict): A dictionary where keys are cluster IDs (0 to k-1)
                             and values are lists of original data indices
                             belonging to that cluster.
    """
    
    # 1. Initialize and fit the K-means model
    # n_init=10 runs the algorithm 10 times and picks the best result
    # random_state=42 ensures the result is reproducible
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    for i in range(len(weights)):
        data[:, i] = data[:, i] * weights[i]
    kmeans.fit(data)

    # 2. Get the loss (inertia)
    # .inertia_ is the WCSS (Within-Cluster Sum of Squares)
    loss = kmeans.inertia_

    # 3. Get the cluster assignment for each data point
    # .labels_ is an array like [0, 1, 1, 0, 2, ...]
    labels = kmeans.labels_

    # 4. Create the partition of indices
    partitions = {i: [] for i in range(n_clusters)}
    for index, cluster_id in enumerate(labels):
        partitions[cluster_id].append(index)

    return loss, partitions

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

def calculate_kmeans_loss_for_partition(data: np.ndarray, partition: list) -> float:
    """
    Calculates the K-means "loss" (Inertia, or Within-Cluster Sum of Squares)
    for a given dataset and a user-provided partition.

    Args:
        data: A (n_samples, n_features) NumPy array.
        partition: A list of lists, where each inner list contains the
                   *indices* (row numbers) of the data points belonging
                   to that cluster.
                   Example: [[0, 1, 4], [2, 3, 5]]

    Returns:
        total_loss (float): The total K-means loss (Inertia) for this partition.
    """
    
    total_loss = 0.0

    # Iterate over each cluster (which is a list of indices)
    for indices in partition:
        
        # 1. Get all data points belonging to this cluster
        # Using [indices, :] selects all rows whose index is in the list
        cluster_points = data[indices, :]
        
        # Handle empty clusters (their loss is 0)
        if cluster_points.shape[0] == 0:
            continue
            
        # 2. Calculate the "true" centroid (mean) for this cluster
        # axis=0 calculates the mean of each *column* (feature)
        centroid = np.mean(cluster_points, axis=0)
        
        # 3. Calculate the sum of squared distances from each point to the centroid
        #    - (cluster_points - centroid) uses broadcasting to get distance vectors
        #    - (** 2) squares all distances
        #    - np.sum(...) sums all squared distances into a single number
        cluster_loss = np.sum((cluster_points - centroid) ** 2)
        
        # 4. Add this cluster's loss to the total
        total_loss += cluster_loss
        
    return total_loss




k = 14

# 3. Run the function
total_loss, index_partitions = get_kmeans_partition(new_data,weights=var_explained, n_clusters=k)

index_partitions=[index_partitions[i] for i in range(k)]

In [None]:
from joblib import Parallel, delayed

NUM_RUNS = 100

def run_one_iteration(run, G, Geo, G_uni):
    partG = nx.algorithms.community.louvain_communities(G, weight='weight', resolution=0.75)
    partGeo = nx.algorithms.community.louvain_communities(Geo, weight='weight', resolution=0.8)
    partG_uni = nx.algorithms.community.louvain_communities(G_uni, weight='weight', resolution=0.95)

    return {
        "G_num_comms": len(partG),
        "Geo_num_comms": len(partGeo),
        "G_uni_num_comms": len(partG_uni),

        "cluster_size_std_Geo": np.std([len(x) for x in partGeo]),

        "G_modularity_self": nx.algorithms.community.modularity(G, partG, weight='weight'),
        "Geo_modularity_self": nx.algorithms.community.modularity(Geo, partGeo, weight='weight'),
        "G_uni_modularity_self": nx.algorithms.community.modularity(G_uni, partG_uni, weight='weight'),

        "G_modularity_GeoPartition": nx.algorithms.community.modularity(G, partGeo, weight='weight'),
        "G_modularity_GuniPartition": nx.algorithms.community.modularity(G, partG_uni, weight='weight'),
    }

results_list = Parallel(n_jobs=-1, verbose=10)(
    delayed(run_one_iteration)(run, G, Geo, G_uni)
    for run in range(NUM_RUNS)
)

results = {
    "G_num_comms": [],
    "Geo_num_comms": [],
    "G_uni_num_comms": [],
    "cluster_size_std_Geo": [],
    "G_modularity_self": [],
    "Geo_modularity_self": [],
    "G_uni_modularity_self": [],
    "G_modularity_GeoPartition": [],
    "G_modularity_GuniPartition": [],
}

for r in results_list:
    for key in results:
        results[key].append(r[key])

print("\n=== DONE ===\n")

for key in results:
    arr = np.array(results[key])
    print("{}: mean={:.4f}, std={:.4f}".format(key, arr.mean(), arr.std()))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed: 17.0min remaining:   42.6s



=== DONE ===

G_num_comms: mean=16.3500, std=1.1347
Geo_num_comms: mean=22.1800, std=1.2440
G_uni_num_comms: mean=16.7400, std=2.6669
cluster_size_std_Geo: mean=113.9136, std=14.1228
G_modularity_self: mean=0.2859, std=0.0012
Geo_modularity_self: mean=0.5183, std=0.0008
G_uni_modularity_self: mean=0.1467, std=0.0056
G_modularity_GeoPartition: mean=0.0100, std=0.0008
G_modularity_GuniPartition: mean=0.1548, std=0.0051


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 17.2min finished


In [19]:
results_list

[{'G_num_comms': 17,
  'Geo_num_comms': 23,
  'G_uni_num_comms': 16,
  'cluster_size_std_Geo': np.float64(88.8532912862588),
  'G_modularity_self': 0.2865562682786773,
  'Geo_modularity_self': 0.5183172147786919,
  'G_uni_modularity_self': 0.14615496944253792,
  'G_modularity_GeoPartition': 0.010055089876113265,
  'G_modularity_GuniPartition': 0.15456999197569918},
 {'G_num_comms': 17,
  'Geo_num_comms': 22,
  'G_uni_num_comms': 14,
  'cluster_size_std_Geo': np.float64(113.77487013563307),
  'G_modularity_self': 0.2868538434107725,
  'Geo_modularity_self': 0.5182789886666312,
  'G_uni_modularity_self': 0.1560686190732063,
  'G_modularity_GeoPartition': 0.01138045729750721,
  'G_modularity_GuniPartition': 0.16418119230486838},
 {'G_num_comms': 15,
  'Geo_num_comms': 24,
  'G_uni_num_comms': 21,
  'cluster_size_std_Geo': np.float64(102.07594361008321),
  'G_modularity_self': 0.284644987832289,
  'Geo_modularity_self': 0.5188751278070335,
  'G_uni_modularity_self': 0.1421948045983672,
  '

In [20]:
df_results = pd.DataFrame(results_list)
df_results.to_csv("louvain_results.csv", index=False)
print(df_results.head())


   G_num_comms  Geo_num_comms  G_uni_num_comms  cluster_size_std_Geo  \
0           17             23               16             88.853291   
1           17             22               14            113.774870   
2           15             24               21            102.075944   
3           18             19               12            107.310459   
4           15             25               16            100.535765   

   G_modularity_self  Geo_modularity_self  G_uni_modularity_self  \
0           0.286556             0.518317               0.146155   
1           0.286854             0.518279               0.156069   
2           0.284645             0.518875               0.142195   
3           0.287641             0.516415               0.156112   
4           0.284027             0.519459               0.147119   

   G_modularity_GeoPartition  G_modularity_GuniPartition  
0                   0.010055                    0.154570  
1                   0.011380            

In [32]:

geo_num_comms = np.array(results['Geo_num_comms'])
target = 14

diffs = np.abs(geo_num_comms - target)

min_diff = np.min(diffs)
best_indices = np.where(diffs == min_diff)[0]

print("All best runs (closest to 14 communities):", best_indices)
print("Number of communities for these runs:", geo_num_comms[best_indices])


All best runs (closest to 14 communities): [3 7]
Number of communities for these runs: [19 19]


In [34]:
geo_num_comms = np.array(results['Geo_num_comms'])
target = 14

diffs = np.abs(geo_num_comms - target)
min_diff = np.min(diffs)
best_runs = np.where(diffs == min_diff)[0]

geo_modularity_self = np.array(results['Geo_modularity_self'])
best_mod_index = best_runs[np.argmax(geo_modularity_self[best_runs])]

print("Best run index (closest to 14 AND highest modularity):", best_mod_index)
print("Number of communities:", geo_num_comms[best_mod_index])
print("Modularity:", geo_modularity_self[best_mod_index])


Best run index (closest to 14 AND highest modularity): 7
Number of communities: 19
Modularity: 0.5171688730189038


In [None]:

G_num_comms = np.array(results['G_num_comms'])
target = 14

diffs = np.abs(G_num_comms - target)

min_diff = np.min(diffs)
best_indices = np.where(diffs == min_diff)[0]

print("All best runs (closest to 14 communities):", best_indices)
print("Number of communities for these runs:", G_num_comms[best_indices])

geo_modularity_self = np.array(results['Geo_modularity_self'])
best_mod_index = best_runs[np.argmax(geo_modularity_self[best_runs])]

print("Best run index (closest to 14 AND highest modularity):", best_mod_index)
print("Number of communities:", geo_num_comms[best_mod_index])
print("Modularity:", geo_modularity_self[best_mod_index])



All best runs (closest to 14 communities): [35 41 46 76]
Number of communities for these runs: [14 14 14 14]


In [39]:
G_num_comms = np.array(results['G_num_comms'])
target = 14

diffs = np.abs(G_num_comms - target)
min_diff = np.min(diffs)
best_runs = np.where(diffs == min_diff)[0]

G_modularity_self = np.array(results['G_modularity_self'])
best_mod_index = best_runs[np.argmax(G_modularity_self[best_runs])]

print("Best run index (closest to 14 AND highest modularity):", best_mod_index)
print("Number of communities:", G_num_comms[best_mod_index])
print("Modularity:", G_modularity_self[best_mod_index])


Best run index (closest to 14 AND highest modularity): 76
Number of communities: 14
Modularity: 0.2838134678766587
