In [1]:
import time
from itertools import product, combinations

import numpy as np
import pandas as pd
import gurobipy as gb
from sklearn.linear_model import LinearRegression


# WLS credentials
WLSACCESSID = 'ccc2c36a-db14-4956-b2e3-60adc45e9957'
WLSSECRET = '1e0e3dbf-7933-44dc-8f81-e0482ded7ac8'
LICENSEID = 2586688

# Create the Gurobi environment with parameters
env = gb.Env(empty=True)  # Start with an empty environment
env.setParam('WLSACCESSID', WLSACCESSID)
env.setParam('WLSSECRET', WLSSECRET)
env.setParam('LICENSEID', LICENSEID)
env.start() 

Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2586688
Academic license 2586688 - for non-commercial use only - registered to ru___@ucsd.edu


<gurobipy.Env, Parameter changes: WLSAccessID=(user-defined), WLSSecret=(user-defined), LicenseID=2586688>

In [2]:
df = pd.read_csv('GA_features.csv')
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1', 'county', 'tweets',
       'contribution', 'n_poll', 'frac_unem', 'frac_votes', 'total_votes',
       ...
       'frac_voted_A', 'frac_voted_B', 'frac_voted_C', 'frac_voted_D',
       'frac_voted_E', 'frac_voted_F', 'frac_voted_G', 'total_registers',
       'latitude', 'longitude'],
      dtype='object', length=125)

In [3]:
# Define Constants
SOCIAL_CATEGORIES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
BUDGET = 100
TAU_VALUES = [0.566, None]  # Define fairness constraints for optimization

# Data Preparation
df = pd.read_csv('GA_features.csv')
# X_columns = ['frpl_rate', 'calculus', 'ap_ib', 'counselors']
X_columns = ['frac_unem', 'n_poll', 'contribution', 'tweets']
count_columns = [f'registered_{category}' for category in SOCIAL_CATEGORIES]
frac_columns = [f'frac_registered_{category}' for category in SOCIAL_CATEGORIES]

X = df[X_columns]
A_frac = df[frac_columns]
y_train = df['frac_votes'].values

neighbor_distance_matrix = np.load('distance_matrix.npy')
neighbor_index_matrix = np.load('index_matrix.npy')

contribution = X['contribution'].values
n_poll = X['n_poll'].values
tweets = X['tweets'].values
n = len(X)

In [4]:
X

Unnamed: 0,frac_unem,n_poll,contribution,tweets
0,0.160193,0,235818,0
1,0.117003,0,93026,0
2,0.115665,0,173074,0
3,0.179682,0,72226,0
4,0.173954,0,927608,2
...,...,...,...,...
154,0.070532,1,4216384,0
155,0.182849,0,101176,0
156,0.206583,0,142526,0
157,0.163246,0,92444,0


In [11]:
# AP_IB = X['ap_ib'].values
# COUNSELORS = X['counselors'].values
# FRPL = np.ones_like(X['frpl_rate'].values)
# A_FRAC = df[frac_columns]
# A_MATRIX = A_FRAC.values


# NEIGHBOR_INDEX_MATRIX = np.load('neighbor_index_matrix.npy')
# NEIGHBOR_DISTANCE_MATRIX = np.load('neighbor_distance_matrix.npy')
# NUM_SCHOOLS = X.shape[0]
# # weight_df = pd.read_csv('params_7_disagg.csv', index_col=0)
# # WEIGHT_MATRIX = weight_df.values

# #possible intervention - column represents neighbours
# NUM_NEIGHBORS = NEIGHBOR_INDEX_MATRIX.shape[1]
# intervention_sample_spaces = [(0, 1)] * NUM_NEIGHBORS
# POSSIBLE_INTERVENTIONS_MATRIX = np.array(list(
#     product(*intervention_sample_spaces)
# ))
# NUM_POSSIBLE_INTERVENTIONS = POSSIBLE_INTERVENTIONS_MATRIX.shape[0]

# BUDGET = 100

# NUM_CATEGORIES = 28
# CATEGORIES = list(range(NUM_CATEGORIES))
# CATEGORY_PAIRS = list(combinations(CATEGORIES, 2))

# DEMOGRAPHIC_COUNTERFACTUALS = [0, 1]
# NUM_COUNTERFACTUALS = len(DEMOGRAPHIC_COUNTERFACTUALS)

# TOTAL_STUDENTS = df['total_students'].values
# R_COUNTS = df[count_columns].values
# R_COUNTS_TOTAL = R_COUNTS.sum(axis=0)

# CALCULUS = X['calculus']
# A_DIMENSION = A_MATRIX.shape[1]

# WHETHER_OR_NOT_CALCULUS_GIVEN_INTERFERENCE = np.max(
#     NEIGHBOR_DISTANCE_MATRIX * CALCULUS.values, axis=1)

In [47]:
# Define Constants
SOCIAL_CATEGORIES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# BUDGET = 100
# TAU_VALUES = [0.566, None]  # Define fairness constraints for optimization
BUDGET = 10
TAU_VALUES = [0.9, None]  # Define fairness constraints for optimization

# Data Preparation
df = pd.read_csv('GA_features.csv')

# Define relevant columns
X_columns = ['frac_unem', 'n_poll', 'contribution', 'tweets']
count_columns = [f'registered_{category}' for category in SOCIAL_CATEGORIES]
frac_columns = [f'frac_registered_{category}' for category in SOCIAL_CATEGORIES]

# Extract features and targets
X = df[X_columns]
A_frac = df[frac_columns]
y_train = df['frac_votes'].values

# Prepare other required matrices and values
CALCULUS = X['frac_unem'].values  # Assuming 'frac_unem' represents calculus data
COUNSELORS = X['n_poll'].values
FRPL = np.ones_like(X['contribution'].values)
A_MATRIX = A_frac.values
TOTAL_R = df['total_registers'].values
R_COUNTS = df[count_columns].values
R_COUNTS_TOTAL = R_COUNTS.sum(axis=0)

# Load neighborhood matrices
NEIGHBOR_INDEX_MATRIX = np.load('index_matrix.npy')
NEIGHBOR_DISTANCE_MATRIX = np.load('distance_matrix.npy')

# Calculate dimensions and possible interventions
NUM_SCHOOLS = X.shape[0]
NUM_NEIGHBORS = NEIGHBOR_INDEX_MATRIX.shape[1]
intervention_sample_spaces = [(0, 1)] * NUM_NEIGHBORS
POSSIBLE_INTERVENTIONS_MATRIX = np.array(list(
    product(*intervention_sample_spaces)
))
NUM_POSSIBLE_INTERVENTIONS = POSSIBLE_INTERVENTIONS_MATRIX.shape[0]

# Define demographic counterfactuals
DEMOGRAPHIC_COUNTERFACTUALS = [0, 1]
NUM_COUNTERFACTUALS = len(DEMOGRAPHIC_COUNTERFACTUALS)

# Interference effect calculation (example)
WHETHER_OR_NOT_CALCULUS_GIVEN_INTERFERENCE = np.max(
    NEIGHBOR_DISTANCE_MATRIX * CALCULUS[:, None], axis=1
)

# Additional features
contribution = X['contribution'].values
n_poll = X['n_poll'].values
tweets = X['tweets'].values
n = len(X)


In [48]:
# neighbor_index_matrix = np.load('neighbor_index_matrix.npy')
# neighbor_distance_matrix = np.load('neighbor_distance_matrix.npy')

# # Print basic information
# print("Neighbor Index Matrix:")
# print("Shape:", neighbor_index_matrix.shape)
# print("Content (First 5 Rows):\n", neighbor_index_matrix[:5])

# print("\nNeighbor Distance Matrix:")
# print("Shape:", neighbor_distance_matrix.shape)
# print("Content (First 5 Rows):\n", neighbor_distance_matrix[:5])

In [49]:
neighbor_distance_matrix.sum()

180.03710752209687

In [50]:
# Find and analyze differences
diff_indices = np.where(neighbor_distance_matrix != neighbor_distance_matrix.T)
for i, j in zip(diff_indices[0], diff_indices[1]):
    print(f"M[{i}, {j}] = {neighbor_distance_matrix[i, j]}, M[{j}, {i}] = {neighbor_distance_matrix[j, i]}")

M[3, 43] = 0.0, M[43, 3] = 0.01944789998479488
M[3, 64] = 0.0, M[64, 3] = 0.018502561336574925
M[3, 124] = 0.0, M[124, 3] = 0.016939410396047995
M[5, 67] = 0.0, M[67, 5] = 0.032337242186982404
M[7, 109] = 0.02828219683566139, M[109, 7] = 0.0
M[7, 114] = 0.0, M[114, 7] = 0.02416468780808177
M[8, 141] = 0.02599353807160201, M[141, 8] = 0.0
M[9, 76] = 0.0, M[76, 9] = 0.02736538389201286
M[10, 101] = 0.0, M[101, 10] = 0.03229806923148114
M[11, 86] = 0.0, M[86, 11] = 0.02617989520909695
M[11, 157] = 0.0, M[157, 11] = 0.02301055534989381
M[12, 23] = 0.0, M[23, 12] = 0.020616915300489147
M[12, 147] = 0.0, M[147, 12] = 0.022240357163258322
M[13, 34] = 0.02352673796726905, M[34, 13] = 0.0
M[13, 36] = 0.026657819795958757, M[36, 13] = 0.0
M[17, 84] = 0.03449541102608217, M[84, 17] = 0.0
M[17, 101] = 0.0, M[101, 17] = 0.032597171867988245
M[17, 125] = 0.03274595872522721, M[125, 17] = 0.0
M[18, 29] = 0.0, M[29, 18] = 0.028271956228138797
M[19, 97] = 0.01479858237149285, M[97, 19] = 0.0
M[20, 81] 

In [51]:
# # Calculate adjusted features for regression model
# def compute_adjusted_features(feature_values, A_frac, neighbor_distance_matrix):
#     max_neighbor_influence = np.max(neighbor_distance_matrix * feature_values.T, axis=1).reshape(n, 1)
#     return A_frac * max_neighbor_influence

# a_max_Sij_Pj = compute_adjusted_features(ap_ib, A_frac, neighbor_distance_matrix)
# a_max_Sij_Cj = compute_adjusted_features(calculus, A_frac, neighbor_distance_matrix)
# a_Fj = A_frac * counselors.reshape(n, 1)

# # Combine features for regression model
# X_train = np.concatenate((a_max_Sij_Pj, a_max_Sij_Cj, a_Fj, A_frac), axis=1)

# # Train linear regression model
# linmod = LinearRegression(fit_intercept=False).fit(X_train, y_train)
# model_weights = linmod.coef_
# param_dims = len(SOCIAL_CATEGORIES)

# # Extract regression weights
# weight_dict = {
#     'alpha': model_weights[param_dims:param_dims*2],
#     'beta': model_weights[:param_dims],
#     'gamma': model_weights[param_dims*2:param_dims*3],
#     'theta': model_weights[-param_dims:]
# }
# params = pd.DataFrame(weight_dict)

# ALPHA, BETA, GAMMA, THETA = (params['alpha'].values, params['beta'].values, 
#                              params['gamma'].values, params['theta'].values)

# ALPHA, BETA, GAMMA, THETA

In [52]:
# Define updated features
X_columns = ['frac_unem', 'n_poll', 'contribution', 'tweets']

# Extract updated features and targets
X = df[X_columns]
frac_unem = X['frac_unem'].values
n_poll = X['n_poll'].values
contribution = X['contribution'].values
tweets = X['tweets'].values

# Calculate adjusted features for regression model
def compute_adjusted_features(feature_values, A_frac, neighbor_distance_matrix):
    # Calculate maximum neighbor influence scaled by distance
    max_neighbor_influence = np.max(neighbor_distance_matrix * feature_values[:, None], axis=1).reshape(n, 1)
    return A_frac * max_neighbor_influence

# Compute adjusted features using the updated columns
a_max_Sij_frac_unem = compute_adjusted_features(frac_unem, A_frac, neighbor_distance_matrix)
a_max_Sij_n_poll = compute_adjusted_features(n_poll, A_frac, neighbor_distance_matrix)
a_max_Sij_contribution = compute_adjusted_features(contribution, A_frac, neighbor_distance_matrix)
a_max_Sij_tweets = compute_adjusted_features(tweets, A_frac, neighbor_distance_matrix)

# Combine features for regression model
X_train = np.concatenate((a_max_Sij_frac_unem, a_max_Sij_n_poll, 
                          a_max_Sij_contribution, a_max_Sij_tweets, A_frac), axis=1)

# Train linear regression model
linmod = LinearRegression(fit_intercept=False).fit(X_train, y_train)
model_weights = linmod.coef_

# Define parameter dimensions based on social categories
param_dims = len(SOCIAL_CATEGORIES)

# Extract regression weights for the updated features
# ALPHA: frac_unem
# BETA: n_poll
# GAMMA: contribution
# DELTA: tweets
# THETA: A_frac
weight_dict = {
    'alpha': model_weights[:param_dims],
    'beta': model_weights[param_dims:param_dims*2],
    'gamma': model_weights[param_dims*2:param_dims*3],
    'delta': model_weights[param_dims*3:param_dims*4],
    'theta': model_weights[param_dims*4:]
}

# Store weights in a DataFrame
params = pd.DataFrame(weight_dict, index=SOCIAL_CATEGORIES)

# Extract weight vectors
ALPHA = params['alpha'].values
BETA = params['beta'].values
GAMMA = params['gamma'].values
DELTA = params['delta'].values
THETA = params['theta'].values

# View results
print("ALPHA (frac_unem):", ALPHA)
print("BETA (n_poll):", BETA)
print("GAMMA (contribution):", GAMMA)
print("DELTA (tweets):", DELTA)
print("THETA (A_frac):", THETA)


ALPHA (frac_unem): [  0.58228814   0.80293196 -23.93764663   0.13683652  35.05485199
  -6.12617056  -9.43794082]
BETA (n_poll): [-0.15566316 -0.11416819 -1.14465912  1.05640314 39.34117556  1.90360013
 -0.14928238]
GAMMA (contribution): [-2.64688066e-09  2.45915518e-08  4.80094340e-08 -1.42543058e-07
 -6.36376245e-06  6.33192531e-08  4.45895434e-08]
DELTA (tweets): [ 0.00393742 -0.00113047  0.01636672  0.00101458  0.80113931 -0.02424243
 -0.02411203]
THETA (A_frac): [  0.59310839   0.68230219   2.56738399  -0.2678885  -18.10622622
   1.09305964   1.35503998]


In [53]:
# Optimization Helper
def calculate_expected_impact(index, intervention_array, demographic_vector):
    """
    Calculate the expected impact for a given index, intervention array, and demographic vector.
    """
    # Get nearest neighbors and distances for the given index
    nearest_neighbors = neighbor_index_matrix[index, :]
    neighbor_distances = neighbor_distance_matrix[index, nearest_neighbors]
    
    # Compute terms for each feature using the revised features and weights
    frac_unem_term = np.dot(demographic_vector, ALPHA) * np.max(neighbor_distances * intervention_array)
    n_poll_term = np.dot(demographic_vector, BETA) * np.max(neighbor_distances * n_poll[nearest_neighbors])
    contribution_term = np.dot(demographic_vector, GAMMA) * np.max(neighbor_distances * contribution[nearest_neighbors])
    tweets_term = np.dot(demographic_vector, DELTA) * np.max(neighbor_distances * tweets[nearest_neighbors])
    demographic_term = np.dot(demographic_vector, THETA)

    # Calculate total impact
    impact = frac_unem_term + n_poll_term + contribution_term + tweets_term + demographic_term
    
    # Clamp impact between 0 and 1
    return max(min(impact, 1), 0)


In [54]:
def calculate_all_possible_impacts(index, demographic_vector, POSSIBLE_INTERVENTIONS_MATRIX):
    possible_impacts = np.empty(len(POSSIBLE_INTERVENTIONS_MATRIX))
    for k, intervention_array in enumerate(POSSIBLE_INTERVENTIONS_MATRIX):
        possible_impacts[k] = calculate_expected_impact(index, intervention_array, demographic_vector)
    return possible_impacts

In [55]:
# Optimization Routine
def optimize_interventions(tau_value, A_frac, POSSIBLE_INTERVENTIONS_MATRIX):
    print(f'Running optimization for tau={tau_value}')
    model = gb.Model(env=env)

    interventions = model.addVars(n, vtype=gb.GRB.BINARY, name="interventions")
    model.addConstr(sum(interventions.values()) <= BUDGET, "budget_constraint")

    def add_auxiliary_constraints(index):
        demographic_vector = A_frac.values[index, :]
        factual_impacts = calculate_all_possible_impacts(index, demographic_vector, POSSIBLE_INTERVENTIONS_MATRIX)

        auxiliary_vars = model.addVars(
            len(factual_impacts), obj=factual_impacts, vtype=gb.GRB.CONTINUOUS
        )
        model.update()

        for j, intervention in enumerate(POSSIBLE_INTERVENTIONS_MATRIX):
            for k, neighbor in enumerate(neighbor_index_matrix[index]):
                if intervention[k] == 1:
                    model.addConstr(auxiliary_vars[j] <= interventions[neighbor])
                else:
                    model.addConstr(auxiliary_vars[j] <= 1 - interventions[neighbor])
        model.addConstr(sum(auxiliary_vars.values()) == 1)

        if tau_value is not None:
            for group_idx in range(A_frac.shape[1]):
                group_impact_diff = calculate_all_possible_impacts(index, np.eye(A_frac.shape[1])[group_idx], POSSIBLE_INTERVENTIONS_MATRIX) - factual_impacts
                model.addConstr(
                    sum(auxiliary_vars[j] * group_impact_diff[j] for j in range(len(factual_impacts))) <= tau_value
                )

    for index in range(n):
        add_auxiliary_constraints(index)

    model.setObjective(model.getObjective(), gb.GRB.MAXIMIZE)
    model.optimize()

    if model.status == gb.GRB.OPTIMAL:
        return np.array([interventions[i].X for i in range(n)]).astype(bool)
    else:
        raise RuntimeError("Optimization failed.")

# Run optimization for each tau value
for tau_value in TAU_VALUES:
    try:
        optimal_interventions = optimize_interventions(tau_value, A_frac, POSSIBLE_INTERVENTIONS_MATRIX)
        print(f"Optimal interventions: {np.where(optimal_interventions)}")
    except RuntimeError as e:
        print(f"Optimization failed for tau={tau_value}: {e}")


Running optimization for tau=0.9
Gurobi Optimizer version 12.0.0 build v12.0.0rc1 (linux64 - "Ubuntu 22.04.4 LTS")

CPU model: AMD EPYC 7662 64-Core Processor, instruction set [SSE2|AVX|AVX2]
Thread count: 128 physical cores, 256 logical processors, using up to 32 threads

Academic license 2586688 - for non-commercial use only - registered to ru___@ucsd.edu
Optimize a model with 62329 rows, 10335 columns and 200895 nonzeros
Model fingerprint: 0x71700581
Variable types: 10176 continuous, 159 integer (159 binary)
Coefficient statistics:
  Matrix range     [2e-05, 1e+00]
  Objective range  [1e-02, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [9e-01, 1e+01]
Found heuristic solution: objective 107.5259397
Presolve removed 3964 rows and 419 columns
Presolve time: 0.32s
Presolved: 58365 rows, 9916 columns, 127879 nonzeros
Variable types: 9760 continuous, 156 integer (156 binary)
Root relaxation presolved: 9916 rows, 68281 columns, 137795 nonzeros


Root relaxation: objective 1.

In [56]:
optimal_interventions

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False,

In [57]:
optimal_interventions.sum()

10

In [58]:
[ 36,  49,  76,  78,  82,  96,  99, 117, 151, 155]

NameError: name 'array' is not defined

In [None]:
array([  9,  12,  36,  45,  48,  49,  54,  61,  76,  78,  82,  83,  96,
        99, 103, 115, 117, 119, 122, 124, 127, 129, 130, 138, 139, 141,
       151, 152, 155, 157]