In [None]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
import networkx as nx
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def reduce_pca_by_variance(data: np.ndarray, variance_threshold: float):
    """
    Standardise `data` and project it onto just enough principal components
    to reach the requested share of explained variance.

    Args:
        data: Array of shape (n_samples, n_features).
        variance_threshold: Fraction of variance the kept components must
                            explain (for example 0.95 for 95%). It is passed
                            to PCA as a float `n_components`, which lets PCA
                            choose the component count itself.

    Returns:
        A `(projected, reducer)` tuple:
        - projected (np.ndarray): `data` expressed in the component space.
        - reducer (PCA): the PCA estimator that was fitted as the last
                         pipeline step; inspect it for the number of
                         components, explained variance, and so on.
    """
    # The estimator is created up front so the caller gets back the very
    # object the pipeline fits.
    reducer = PCA(n_components=variance_threshold)

    # Features are standardised before PCA so that no feature dominates the
    # components merely because of its scale.
    scale_then_reduce = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('pca', reducer),
    ])

    projected = scale_then_reduce.fit_transform(data)
    return projected, reducer


In [None]:
# Load the table as a float array. Forcing the dtype matters: an integer-only
# CSV would otherwise give an int64 array, and the in-place log / scaling
# assignments below would be silently truncated back to integers.
data = np.array(pd.read_csv('data.csv'), dtype=float)

# Transform the income dimension to a logarithmic scale.
# NOTE(review): `income_index` is not defined in any visible cell; it must be
# set to the income column's position before this cell runs.
income = data[:, income_index]
if np.any(income <= 0):
    # log() of a non-positive value gives -inf/NaN, which PCA cannot handle.
    raise ValueError("income column contains non-positive values; cannot take log")
data[:, income_index] = np.log(income)

# Normalize each dimension by its standard deviation. Constant columns
# (std == 0) are left unchanged instead of being divided by zero.
# Note that reduce_pca_by_variance standardises again with StandardScaler.
column_std = data.std(axis=0)
column_std[column_std == 0] = 1.0
data = data / column_std

# Perform PCA, and project onto the top N dimensions so that they explain 50% of the variance
new_data, pca = reduce_pca_by_variance(data, variance_threshold=0.5)

In [None]:
def create_network(data, weights, window_size):
    """
    Builds a graph that links data points through per-dimension marker nodes.

    Every row of `data` becomes a node keyed by its integer row index. For
    each dimension, the observed value range is cut into bins of width
    `window_size`; each bin gets a marker node, consecutive markers are
    chained together, and each data point is connected to the marker of the
    bin its value falls into. Every edge created for a dimension carries that
    dimension's weight.

    Args:
        data: A (n_samples, n_features) array of finite numbers. Values may
              be negative (e.g. PCA output); bins cover min..max.
        weights: Sequence with at least one weight per column of `data`.
        window_size: Positive bin width shared by all dimensions.

    Returns:
        nx.Graph with
        - data nodes `0..n_samples-1` (attribute `features` = the row), and
        - marker nodes named `'<dim>,<bin>'` with attributes `marker=True`,
          `dim`, and `value` (the bin's lower edge, `bin * window_size`).

    Raises:
        ValueError: if `window_size` is not positive, `data` is not 2-D or
                    contains non-finite values, or `weights` is too short.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")

    points = np.asarray(data, dtype=float)
    if points.ndim != 2:
        raise ValueError("data must be a 2-D (n_samples, n_features) array")
    if not np.all(np.isfinite(points)):
        raise ValueError("data must not contain NaN or infinite values")

    n_dims = points.shape[1]
    if len(weights) < n_dims:
        raise ValueError(
            f"expected at least {n_dims} weights (one per dimension), got {len(weights)}"
        )

    # First add each data point as a node
    G = nx.Graph()
    for i, point in enumerate(points):
        G.add_node(i, features=point)

    # Without any rows there is no value range to lay markers over.
    if points.shape[0] == 0:
        return G

    for dim in range(n_dims):
        weight = weights[dim]

        # Integer bin of every point. floor() (not int()) keeps negative
        # values in the correct bin instead of rounding them toward zero.
        bins = np.floor(points[:, dim] / window_size).astype(np.int64)
        first_bin, last_bin = int(bins.min()), int(bins.max())

        # Marker ids are built from the bin index rather than a rounded
        # float, so two distinct bins can never collapse into the same node.
        marker_ids = {b: f'{dim},{b}' for b in range(first_bin, last_bin + 1)}

        # Add one marker per bin and chain neighbouring markers together so
        # that points in nearby bins are connected through the chain.
        previous = None
        for b in range(first_bin, last_bin + 1):
            current = marker_ids[b]
            G.add_node(current, marker=True, dim=dim, value=b * window_size)
            if previous is not None:
                G.add_edge(previous, current, weight=weight)
            previous = current

        # Connect every data point to the marker of its own bin.
        for i, b in enumerate(bins):
            G.add_edge(i, marker_ids[int(b)], weight=weight)

    return G

In [None]:
import numpy as np
from sklearn.cluster import KMeans

def get_kmeans_partition(data: np.ndarray, weights, n_clusters=14):
    """
    Runs K-means clustering on a per-feature weighted copy of the input data
    and returns the loss (inertia) and a partition of the row indices by
    cluster.

    Args:
        data: A (n_samples, n_features) NumPy array. It is not modified.
        weights: Sequence of n_features multipliers; column i is scaled by
                 weights[i] before clustering.
        n_clusters: The number of clusters (k).

    Returns:
        A tuple containing:
        - loss (float): The inertia (Within-Cluster Sum of Squares) computed
                        on the weighted data.
        - partitions (dict): A dictionary where keys are cluster IDs (0 to k-1)
                             and values are lists of original data indices
                             belonging to that cluster.

    Raises:
        ValueError: if `data` is not 2-D or `weights` does not have exactly
                    one entry per feature.
    """
    features = np.asarray(data, dtype=float)
    if features.ndim != 2:
        raise ValueError("data must be a 2-D (n_samples, n_features) array")

    scale = np.asarray(weights, dtype=float)
    if scale.shape != (features.shape[1],):
        raise ValueError(
            f"weights must have one entry per feature ({features.shape[1]}), "
            f"got shape {scale.shape}"
        )

    # Broadcasting builds a new array, so the caller's data stays untouched
    # and re-running the cell does not compound the weights.
    weighted = features * scale

    # n_init=10 runs the algorithm 10 times and picks the best result;
    # random_state=42 makes the result reproducible.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(weighted)

    # .inertia_ is the WCSS (Within-Cluster Sum of Squares)
    loss = kmeans.inertia_

    # .labels_ holds one cluster id per row; group the row indices by it.
    partitions = {cluster: [] for cluster in range(n_clusters)}
    for index, cluster_id in enumerate(kmeans.labels_):
        partitions[int(cluster_id)].append(index)

    return loss, partitions

# --- Run K-means on the PCA-reduced data ---

# Number of clusters.
# NOTE(review): presumably the number of districts being compared against - confirm.
k = 14

# One weight per column of `new_data`. `weights` was never defined, so this
# cell raised NameError on a fresh kernel. Uniform weights leave the data
# unscaled and are only a neutral placeholder.
# TODO: replace with the intended per-dimension weights.
weights = np.ones(new_data.shape[1])

# Cluster the reduced data; `index_partitions` maps cluster id -> row indices.
total_loss, index_partitions = get_kmeans_partition(new_data, weights, n_clusters=k)



In [None]:
# Build the marker network on the PCA-reduced data.
# create_network looks up one weight per column (`weights[dim]`), so the empty
# list passed here before failed on the first dimension. Uniform weights are
# only a neutral placeholder.
# TODO: pass the same per-dimension weights that are used for K-means once chosen.
network_weights = np.ones(new_data.shape[1])
G = create_network(new_data, weights=network_weights, window_size=0.1)
# Give it a partition from the NC districts
# We can give it several partitions from the recent redistrictings and test the modularity for each.
# TODO: fill in the district partition; it is still an empty placeholder.
partition = []

In [None]:
# Compare the modularity of G under the hand-supplied `partition` and under
# the K-means clusters in `index_partitions`.
# NOTE(review): as written these calls are not expected to succeed - confirm
# against the networkx `modularity` documentation, which asks for a collection
# of node sets that together partition every node of G:
#   * `partition` is still the empty list assigned in the previous cell;
#   * `index_partitions` is a dict (cluster id -> list of row indices), so
#     iterating it yields the integer keys rather than the node groups;
#   * G also contains the marker nodes added by create_network, and neither
#     argument assigns those to any community.
# How marker nodes should be assigned to communities is a modelling decision
# that has to be made before this comparison is meaningful.
# Only the value of the last expression in a cell is displayed, so the first
# result would not be shown even if both calls ran.
# nx.algorithms.community.louvain_communities(G,weight='weight')
nx.algorithms.community.modularity(G, partition, weight='weight')
nx.algorithms.community.modularity(G, index_partitions, weight='weight')