In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from fairlearn.datasets import fetch_adult
from scipy.io import loadmat
from scipy.spatial.distance import cdist
from lp_solver import run_connector_and_solve_lp
from min_cost_rounding import rounding_wrapper
from clustering_utils import give_rand_centers, lloyd, comp_cost
from main_wc import load_data, normalize_data,socially_fair_kmeans# assumed helper functions



In [None]:
import networkx as nx

# Minimal min-cost-flow sanity check: ship one unit of flow from "a" to "b"
# over a single edge and print the resulting cost.
#
# networkx documents network_simplex as not guaranteed to work with
# floating-point weights/demands (round-off can cause spurious failures), so
# the fractional edge cost is scaled to an integer before solving and the
# reported cost is scaled back afterwards.
WEIGHT_SCALE = 10_000
EDGE_COST = 0.4873

G = nx.DiGraph()
G.add_node("a", demand=-1)  # negative demand: this node supplies one unit
G.add_node("b", demand=1)   # positive demand: this node consumes one unit
G.add_edge("a", "b", capacity=1, weight=round(EDGE_COST * WEIGHT_SCALE))

scaled_cost, flow = nx.network_simplex(G)
cost = scaled_cost / WEIGHT_SCALE  # back to the original cost units
print("Flow cost:", cost)


In [2]:
def test_lp_and_rounding(data_normalized, svar_all, centers, alpha_val=0.1, beta_val=0.1, lambda_param=0.5):
    """
    Run LP and min-cost rounding on given clustering centers.
    
    Parameters:
    - data_normalized: ndarray of shape (n, d), normalized input data
    - svar_all: 1D array of sensitive group labels (1 or 2)
    - centers: ndarray of shape (k, d), cluster centers from fair clustering
    - alpha_val, beta_val: slack parameters for each group
    - lambda_param: tradeoff between clustering cost and fairness
    
    Returns:
    - result: dictionary from rounding_wrapper with final assignment and cost details
    """

    # Step 1: Define alpha and beta dictionaries
    unique_groups = np.unique(svar_all)
    alpha = {h: alpha_val for h in unique_groups}
    beta = {h: beta_val for h in unique_groups}

    # Step 2: Solve LP
    lp_result = run_connector_and_solve_lp(
        df=pd.DataFrame(data_normalized),
        svar_all=svar_all,
        centers=centers,
        alpha=alpha,
        beta=beta,
        lambda_param=lambda_param
    )
    
    print("LP keys:", lp_result.keys())
    print("LP objective z:", lp_result.get("z"))
    print("LP x_frac shape:", lp_result["x_frac"].shape)
    print("LP x_frac sum:", np.sum(lp_result["x_frac"]))
    
    # Step 3: Build distance matrix
    distance_matrix = cdist(data_normalized, centers, metric="sqeuclidean")

    
    print("Calling rounding_wrapper...")
    # Step 4: Run rounding
    result = rounding_wrapper(
    lp_assignment=lp_result["x_frac"],
    distance_matrix=distance_matrix,
    color_labels=svar_all.to_numpy(), 
    num_clusters=centers.shape[0],
    num_colors=len(unique_groups),
    lp_objective=lp_result["z"],
    df=data_normalized,
    centers=centers
    )

    print("Rounding done!")
    
    # Step 5: Print result summary
    print("✅ LP objective:", lp_result["z"])
    print("🎯 Rounded objective:", result["objective"])
    print("📊 Cost ratio (rounded / LP):", result["cost_ratio"])

    return result


In [3]:
# Load the dataset, normalize it, and fit the socially fair k-means centers
# that the later cells reuse.
DATASET_NAME = "adult"
NUM_CLUSTERS = 10

# load_data returns three values; only the first two are used in this notebook.
data_all, svar_all, _ = load_data(DATASET_NAME)
data_normalized = normalize_data(data_all)
centers = socially_fair_kmeans(data_normalized, svar_all, k=NUM_CLUSTERS)
print("Centers shape:", centers.shape)

Centers shape: (10, 14)


In [4]:
# Reuse the `centers` fitted in the previous cell rather than refitting here:
# a second socially_fair_kmeans call with identical arguments only repeats the
# work, and if the fit is randomized it would yield centers other than the
# ones whose shape was just printed.
result = test_lp_and_rounding(data_normalized, svar_all, centers)

Version identifier: 22.1.1.0 | 2022-11-28 | 9160aff4d
CPXPARAM_Read_DataCheck                          1
Parallel mode: deterministic, using up to 4 threads for concurrent optimization:
 * Starting dual Simplex on 1 thread...
 * Starting Barrier on 3 threads...
Tried aggregator 1 time.
LP Presolve eliminated 80 rows and 60 columns.
Reduced LP has 45224 rows, 452221 columns, and 904442 nonzeros.
Presolve time = 0.83 sec. (322.72 ticks)
Initializing dual steep norms . . .

Iteration log . . .
Iteration:     1   Dual objective     =             0.000000
Perturbation started.
Iteration:   101   Dual objective     =             0.000000
Iteration:  1586   Dual objective     =             0.019020
Iteration:  3071   Dual objective     =             0.020108
Iteration:  4556   Dual objective     =             0.020917
Iteration:  6041   Dual objective     =             0.021591
Iteration:  7526   Dual objective     =             0.022249
Iteration:  9011   Dual objective     =             0.0

NetworkXUnfeasible: no flow satisfies all node demands

In [None]:
import matplotlib.pyplot as plt

def plot_group_distance_costs(data, centers, assignment_matrix, group_labels, group_names=None):
    """
    Plots average squared distance to assigned center for each group.

    Parameters:
    - data: array-like (n, d) of input points
    - centers: array-like (k, d) of cluster centers
    - assignment_matrix: binary (n, k) array from rounding
    - group_labels: array-like of n group ids (e.g., 1 or 2); pandas Series,
      ndarray, or list
    - group_names: optional display names for the groups. Either a dict
      mapping group id -> name, or a list indexed by 1-based group id (the
      name of group h is group_names[h - 1]). None or an empty container
      falls back to "Group <id>" labels.

    Raises:
    - ValueError: if the input shapes disagree, or if group_names is a list
      and some group id is not an integer in 1..len(group_names)
    """
    points = np.asarray(data)
    center_arr = np.asarray(centers)
    labels_arr = np.asarray(group_labels).ravel()

    n, k = assignment_matrix.shape
    if points.shape[0] != n or center_arr.shape[0] != k or labels_arr.shape[0] != n:
        raise ValueError(
            f"Shape mismatch: assignment_matrix is {(n, k)}, data has {points.shape[0]} rows, "
            f"centers has {center_arr.shape[0]} rows, group_labels has {labels_arr.shape[0]} entries"
        )

    # With one 1 per row, the product selects each point's assigned center.
    assigned_centers = assignment_matrix @ center_arr  # (n, d)
    sq_dists = np.sum((points - assigned_centers) ** 2, axis=1)  # (n,)

    unique_groups = np.unique(labels_arr)
    costs = [float(np.mean(sq_dists[labels_arr == h])) for h in unique_groups]

    # Prepare x-axis labels. Native Python ids make dict lookups and range
    # checks behave predictably regardless of the NumPy dtype of the labels.
    group_ids = unique_groups.tolist()
    if group_names is not None and len(group_names) > 0:
        if isinstance(group_names, dict):
            labels = [group_names[h] for h in group_ids]
        else:
            labels = []
            for h in group_ids:
                is_integral = isinstance(h, int) or (isinstance(h, float) and h.is_integer())
                # A bare group_names[h - 1] would wrap around to the last name
                # for id 0 and silently mislabel the bars, so reject it.
                if not is_integral or not 1 <= int(h) <= len(group_names):
                    raise ValueError(
                        f"group id {h!r} cannot index a list of {len(group_names)} names "
                        "(ids must be integers starting at 1); pass a dict {id: name} instead"
                    )
                labels.append(group_names[int(h) - 1])
    else:
        labels = [f"Group {h}" for h in group_ids]

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(labels, costs, color='skyblue', edgecolor='black')
    ax.set_ylabel("Avg. Distance Cost")
    ax.set_title("Average Distance-to-Center Cost by Group")
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()


In [None]:
# Bar chart of the per-group average cost under the rounded assignment.
# NOTE(review): the name order assumes group id 1 is Female and id 2 is Male;
# confirm against how load_data encodes the sensitive attribute.
GROUP_DISPLAY_NAMES = ['Female', 'Male']
rounded_assignment = result["assignment"]

plot_group_distance_costs(
    data_normalized,
    centers,
    rounded_assignment,
    svar_all,
    group_names=GROUP_DISPLAY_NAMES,
)
