In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE  # For handling imbalance
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Load and preprocess data
df = pd.read_csv("income.csv")
X = df.drop(columns=["income"])
y = df["income"]

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Check class imbalance
print("Class distribution:\n", y.value_counts(normalize=True))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to build ANN
def build_ann(input_dim):
    model = Sequential([
        Dense(16, activation='relu', input_shape=(input_dim,)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Function for fairness metric (SPD)
def statistical_parity_difference(y_true, y_pred):
    pos_rate_0 = np.mean(y_pred[y_true == 0])
    pos_rate_1 = np.mean(y_pred[y_true == 1])
    return abs(pos_rate_1 - pos_rate_0)

# 1. Baseline ANN
model_base = build_ann(X_train.shape[1])
model_base.fit(X_train, y_train, epochs=50, batch_size=10, verbose=0, validation_data=(X_test, y_test))
y_pred_base = (model_base.predict(X_test) > 0.5).astype(int)
acc_base = accuracy_score(y_test, y_pred_base)
spd_base = statistical_parity_difference(y_test, y_pred_base)

# 2. Feature Selection with SMOTE
selector = SelectKBest(score_func=mutual_info_classif, k=50)
X_selected = selector.fit_transform(X, y)
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)  # Oversample minority class
X_train_sel_smote, y_train_sel_smote = smote.fit_resample(X_train_sel, y_train_sel)
X_train_sel_smote = scaler.fit_transform(X_train_sel_smote)
X_test_sel = scaler.transform(X_test_sel)
model_sel = build_ann(X_train_sel_smote.shape[1])
model_sel.fit(X_train_sel_smote, y_train_sel_smote, epochs=50, batch_size=10, verbose=0, validation_data=(X_test_sel, y_test_sel))
y_pred_sel = (model_sel.predict(X_test_sel) > 0.5).astype(int)
acc_sel = accuracy_score(y_test_sel, y_pred_sel)
spd_sel = statistical_parity_difference(y_test_sel, y_pred_sel)

# 3. Regularization with SMOTE
X_train_reg_smote, y_train_reg_smote = smote.fit_resample(X_train, y_train)
model_reg = Sequential([
    Dense(16, activation='relu', input_shape=(X_train_reg_smote.shape[1],), kernel_regularizer=l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(1, activation='sigmoid')
])
model_reg.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model_reg.fit(X_train_reg_smote, y_train_reg_smote, epochs=50, batch_size=10, verbose=0, validation_data=(X_test, y_test))
y_pred_reg = (model_reg.predict(X_test) > 0.5).astype(int)
acc_reg = accuracy_score(y_test, y_pred_reg)
spd_reg = statistical_parity_difference(y_test, y_pred_reg)

# 4. Enhanced Quantum Mitigation
def hadamard_matrix(n):
    H1 = np.array([[1, 1], [1, -1]]) / np.sqrt(2)
    H = H1
    for _ in range(n - 1):
        H = np.kron(H, H1)
    return H

def oracle_matrix(n, target_indices):
    size = 2**n
    O = np.eye(size)
    for idx in target_indices:
        O[idx, idx] = -1
    return O

def diffuser_matrix(n):
    size = 2**n
    psi = np.ones(size) / np.sqrt(size)
    D = 2 * np.outer(psi, psi) - np.eye(size)
    return D

# Larger subset and multiple iterations
n_samples = 32  # 2^5 states
n_qubits = 5
subset_indices = np.random.choice(np.where(y_train == 1)[0], n_samples // 2)
subset_indices = np.concatenate([subset_indices, np.random.choice(np.where(y_train == 0)[0], n_samples // 2)])
X_subset = X_train[subset_indices]
y_subset = y_train[subset_indices]
target_indices = [i for i, label in enumerate(y_subset) if label == 1]

# Grover's algorithm with 2 iterations
state = np.ones(2**n_qubits) / np.sqrt(2**n_qubits)
H = hadamard_matrix(n_qubits)
O = oracle_matrix(n_qubits, target_indices)
D = diffuser_matrix(n_qubits)
for _ in range(2):  # 2 iterations for better amplification
    state = H @ state
    state = O @ state
    state = H @ state
    state = D @ state

# Resample based on amplified probabilities
probs = np.abs(state) ** 2
amplified_indices = np.argsort(probs)[-len(target_indices):]
X_train_quantum = np.vstack([X_train] + [X_subset[i] for i in amplified_indices for _ in range(5)])  # Duplicate 5x
y_train_quantum = np.hstack([y_train] + [y_subset[i] for i in amplified_indices for _ in range(5)])  # Duplicate 5x

model_quantum = build_ann(X_train_quantum.shape[1])
model_quantum.fit(X_train_quantum, y_train_quantum, epochs=50, batch_size=10, verbose=0, validation_data=(X_test, y_test))
y_pred_quantum = (model_quantum.predict(X_test) > 0.5).astype(int)
acc_quantum = accuracy_score(y_test, y_pred_quantum)
spd_quantum = statistical_parity_difference(y_test, y_pred_quantum)

# Comparison
print("\nModel Comparison:")
print(f"{'Model':<20} {'Accuracy':<10} {'SPD':<10}")
print(f"{'Baseline':<20} {acc_base:<10.4f} {spd_base:<10.4f}")
print(f"{'Feature Selection':<20} {acc_sel:<10.4f} {spd_sel:<10.4f}")
print(f"{'Regularization':<20} {acc_reg:<10.4f} {spd_reg:<10.4f}")
print(f"{'Quantum Mitigation':<20} {acc_quantum:<10.4f} {spd_quantum:<10.4f}")

Class distribution:
 income
0    0.75919
1    0.24081
Name: proportion, dtype: float64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Model Comparison:
Model                Accuracy   SPD       
Baseline             0.8339     0.5173    
Feature Selection    0.7916     0.5784    
Regularization       0.7966     0.6036    
Quantum Mitigation   0.8305     0.4993    
