In [55]:
# A notebook to implement subpopulation attacks on the UCI Adult dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn import neural_network, linear_model
from sklearn.metrics import accuracy_score

## Import Data

In [9]:
# Load the UCI Adult train and test files as pandas dataframes.
# Both files are read without a header row, so one shared list of column names
# is supplied to both reads. 'captial-gain' is spelled exactly as before because
# it becomes a dataframe column name.
adult_column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'captial-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income',
]

uci_train_data = pd.read_csv('data/adult/adult.data', header=None, names=adult_column_names)
print(f'Training shape: {uci_train_data.shape}')

uci_test_data = pd.read_csv('data/adult/adult.test', header=None, names=adult_column_names)
print(f'Test shape: {uci_test_data.shape}')

# Stack the two frames row-wise (training rows first) so that preprocessing
# is applied once to the combined data.
uci_frames = [uci_train_data, uci_test_data]
uci_data = pd.concat(uci_frames, axis=0)
print(f'UCI dataset shape: {uci_data.shape}')

# Initially 15 columns
uci_data.head()

Training shape: (32561, 15)
Test shape: (16282, 15)
UCI dataset shape: (48843, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,captial-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


## Preprocess Data

In [10]:
# Paper drops the following columns, due to significant correlation with other columns:
# education
# native-country
# fnlwgt

uci_data = uci_data.drop(columns=['education', 'native-country', 'fnlwgt'])
print(f'UCI dataset shape after dropping columns: {uci_data.shape}')

# Paper then one-hot encodes all columns that are categorical values (plus education-num).
# The columns encoded here are exactly those listed in `categorical_columns`:
# workclass, education-num, marital-status, occupation, relationship, race, sex, income.
# ('education' itself was dropped above; education level is carried by 'education-num'.)
categorical_columns = ['workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'income']

# Prefix of each group of indicator columns = the part of the column name before
# the first '-' (e.g. 'marital-status' -> 'marital'); names without '-' are kept whole.
column_prefixes = [column.split('-')[0] for column in categorical_columns]

# One get_dummies call replaces every listed column by its indicator columns, appended
# after the untouched columns in the order the columns are listed. drop_first=True omits
# the first category of each column, so that category is represented by all zeros.
uci_data = pd.get_dummies(uci_data, columns=categorical_columns, prefix=column_prefixes, drop_first=True)

print(f'UCI dataset shape after one-hot encoding: {uci_data.shape}')
print(list(uci_data.columns))
uci_data.head()

UCI dataset shape after dropping columns: (48843, 12)
UCI dataset shape after one-hot encoding: (48843, 60)
['age', 'captial-gain', 'capital-loss', 'hours-per-week', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0', 'education_16.0', 'marital_ Married-AF-spouse', 'marital_ Married-civ-spouse', 'marital_ Married-spouse-absent', 'marital_ Never-married', 'marital_ Separated', 'marital_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op

Unnamed: 0,age,captial-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,relationship_ Unmarried,relationship_ Wife,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Male,income_ <=50K.,income_ >50K,income_ >50K.
0,39,2174.0,0.0,40.0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,50,0.0,0.0,13.0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
2,38,0.0,0.0,40.0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,0
3,53,0.0,0.0,40.0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
4,28,0.0,0.0,40.0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0


In [11]:
# Count the missing (null/NaN) entries in every column
uci_data.isna().sum()

age                               0
captial-gain                      1
capital-loss                      1
hours-per-week                    1
workclass_ Federal-gov            0
workclass_ Local-gov              0
workclass_ Never-worked           0
workclass_ Private                0
workclass_ Self-emp-inc           0
workclass_ Self-emp-not-inc       0
workclass_ State-gov              0
workclass_ Without-pay            0
education_2.0                     0
education_3.0                     0
education_4.0                     0
education_5.0                     0
education_6.0                     0
education_7.0                     0
education_8.0                     0
education_9.0                     0
education_10.0                    0
education_11.0                    0
education_12.0                    0
education_13.0                    0
education_14.0                    0
education_15.0                    0
education_16.0                    0
marital_ Married-AF-spouse  

In [12]:
# Drop any rows containing NaN values.
# `axis` is passed by keyword: the positional form `dropna(0)` emits a FutureWarning on
# pandas 1.x and is rejected by pandas 2.x, where these arguments are keyword-only.
print(f'UCI dataset before dropping rows containing NAN values: {uci_data.shape}')
uci_data = uci_data.dropna(axis=0)
print(f'UCI dataset after dropping rows containing NAN values: {uci_data.shape}')

UCI dataset before dropping rows containing NAN values: (48843, 60)
UCI dataset after dropping rows containing NAN values: (48842, 60)


  uci_data = uci_data.dropna(0)


In [13]:
# Sanity check: every per-column count of null/NaN values should now be zero
uci_data.isna().sum()

age                               0
captial-gain                      0
capital-loss                      0
hours-per-week                    0
workclass_ Federal-gov            0
workclass_ Local-gov              0
workclass_ Never-worked           0
workclass_ Private                0
workclass_ Self-emp-inc           0
workclass_ Self-emp-not-inc       0
workclass_ State-gov              0
workclass_ Without-pay            0
education_2.0                     0
education_3.0                     0
education_4.0                     0
education_5.0                     0
education_6.0                     0
education_7.0                     0
education_8.0                     0
education_9.0                     0
education_10.0                    0
education_11.0                    0
education_12.0                    0
education_13.0                    0
education_14.0                    0
education_15.0                    0
education_16.0                    0
marital_ Married-AF-spouse  

In [14]:
# Combine the two "income_ >50K" columns into one.
# The >50K label occurs both with and without a trailing '.', which produced two
# indicator columns. Both are popped out of the frame and their element-wise sum is
# stored back under 'income_ >50K.', which therefore becomes the last column.
print(f'UCI dataset before grouping income columns: {uci_data.shape}')
over_50k_dotted = uci_data.pop('income_ >50K.')
over_50k_plain = uci_data.pop('income_ >50K')
uci_data['income_ >50K.'] = over_50k_dotted + over_50k_plain
print(f'UCI dataset after grouping income columns: {uci_data.shape}')
print("All UCI columns names:")
print(list(uci_data.columns))
uci_data.head()

UCI dataset before grouping income columns: (48842, 60)
UCI dataset after grouping income columns: (48842, 59)
All UCI columns names:
['age', 'captial-gain', 'capital-loss', 'hours-per-week', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0', 'education_16.0', 'marital_ Married-AF-spouse', 'marital_ Married-civ-spouse', 'marital_ Married-spouse-absent', 'marital_ Never-married', 'marital_ Separated', 'marital_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners

Unnamed: 0,age,captial-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Male,income_ <=50K.,income_ >50K.
0,39,2174.0,0.0,40.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,50,0.0,0.0,13.0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
2,38,0.0,0.0,40.0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
3,53,0.0,0.0,40.0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,28,0.0,0.0,40.0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [15]:
# List every column name next to its positional index (used later to pick feature slices)
for i, column_name in enumerate(uci_data.columns):
    print(f'{i}: {column_name}')

0: age
1: captial-gain
2: capital-loss
3: hours-per-week
4: workclass_ Federal-gov
5: workclass_ Local-gov
6: workclass_ Never-worked
7: workclass_ Private
8: workclass_ Self-emp-inc
9: workclass_ Self-emp-not-inc
10: workclass_ State-gov
11: workclass_ Without-pay
12: education_2.0
13: education_3.0
14: education_4.0
15: education_5.0
16: education_6.0
17: education_7.0
18: education_8.0
19: education_9.0
20: education_10.0
21: education_11.0
22: education_12.0
23: education_13.0
24: education_14.0
25: education_15.0
26: education_16.0
27: marital_ Married-AF-spouse
28: marital_ Married-civ-spouse
29: marital_ Married-spouse-absent
30: marital_ Never-married
31: marital_ Separated
32: marital_ Widowed
33: occupation_ Adm-clerical
34: occupation_ Armed-Forces
35: occupation_ Craft-repair
36: occupation_ Exec-managerial
37: occupation_ Farming-fishing
38: occupation_ Handlers-cleaners
39: occupation_ Machine-op-inspct
40: occupation_ Other-service
41: occupation_ Priv-house-serv
42: occ

In [16]:
# Convert the data to numpy array so that it can be used properly in the models
uci_data_np = uci_data.to_numpy()

print(f'First entry in numpy data array: {uci_data_np[0]}')
print(f'Shape of entry: {uci_data_np[0].shape}')

# Target: the last column (the merged 'income_ >50K.' indicator).
# The models are trained to predict whether an individual has an income over 50K
# from the other features.
y = (uci_data_np[:, -1]).astype(np.float32) # income > 50K saved as y

print(f'Shape of label data: {y.shape}')
print(f'First 10 entries of label data: {y[:10]}')

# Features: every column except the final two (the income indicators for below and
# above 50K), so that no income information leaks into the x data.
x = uci_data_np[:, :-2].astype(np.float32)

print(f'Shape of clean x data without label data: {x.shape}')
print(f'First entry of x data: {x[0]}')

# Split the data into training and testing data, using the number of rows of the
# original training file. The count is taken from the loaded training frame instead
# of being hard-coded (it was the literal 32561).
# NOTE(review): this assumes the training rows come first in `uci_data` and that none
# of them were removed during preprocessing — confirm if the cleaning steps change.
n_train_rows = uci_train_data.shape[0]

x_train, y_train = x[:n_train_rows], y[:n_train_rows]
x_test, y_test = x[n_train_rows:], y[n_train_rows:]

print(f'Shape of x_train: {x_train.shape}, Shape of y_train: {y_train.shape}')
print(f'Shape of x_test: {x_test.shape}, Shape of y_test: {y_test.shape}')

# Split training data into two halves, one for training the model, the other for the
# attacker to use to find subpopulations. The auxiliary half is sliced first, before
# x_train / y_train are rebound to the first half.
data_split = x_train.shape[0] // 2
x_aux, y_aux = x_train[data_split:], y_train[data_split:]
x_train, y_train = x_train[:data_split], y_train[:data_split]

print(f'Shape of training data: X: {x_train.shape}, Y: {y_train.shape}')
print(f"Shape of auxiliary data: X: {x_aux.shape}, Y: {y_aux.shape}")
print(f'Shape of test data: X: {x_test.shape}, Y: {y_test.shape}')

# --- Disabled experiment, kept for reference: class-balanced resampling ---
# NOTE(review): `balanced_train` / `balanced_test` are sampled from
# range(number of negative samples) and then used directly as row indices, instead of
# being used to index `false_train_indices` / `false_test_indices`. Fix this before
# re-enabling the code below.
#
# Want to create a fair split of the samples, so that we train an unbiased model
# true_train_indices = np.where(y_train == 1)[0]
# true_test_indices = np.where(y_test == 1)[0]
# false_train_indices = np.where(y_train == 0)[0]
# false_test_indices = np.where(y_test == 0)[0]

# balanced_train = np.random.choice(false_train_indices.shape[0], true_train_indices.shape[0], replace=False)
# x_train = np.concatenate((x_train[balanced_train], x_train[true_train_indices]), axis=0)
# y_train = np.concatenate((y_train[balanced_train], y_train[true_train_indices]), axis=0)

# balanced_test = np.random.choice(false_test_indices.shape[0], true_test_indices.shape[0], replace=False)
# x_test = np.concatenate((x_test[balanced_test], x_test[true_test_indices]), axis=0)
# y_test = np.concatenate((y_test[balanced_test], y_test[true_test_indices]), axis=0)

# shuffled_train_indices = np.random.choice(x_train.shape[0], x_train.shape[0], replace=False)
# x_train = x_train[shuffled_train_indices]
# y_train = y_train[shuffled_train_indices]

# print(f'Shape of x_train: {x_train.shape}, Shape of y_train: {y_train.shape}')

# data_split = x_train.shape[0] // 2

# x_train, y_train, x_aux, y_aux = x_train[:data_split], y_train[:data_split], x_train[data_split:], y_train[data_split:]

# print(f'Shape of x_train: {x_train.shape}, Shape of y_train: {y_train.shape}')
# print(f"Shape of auxiliary data: X: {x_aux.shape}, Y: {y_aux.shape}")
# print(f'Shape of test data: X: {x_test.shape}, Y: {y_test.shape}')

First entry in numpy data array: [39 2174.0 0.0 40.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0]
Shape of entry: (59,)
Shape of label data: (48842,)
First 10 entries of label data: [0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
Shape of clean x data without label data: (48842, 57)
First entry of x data: [3.900e+01 2.174e+03 0.000e+00 4.000e+01 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 1.000e+00]
Shape of x_train: (32561, 57), Shape of y_train: (32561,)
Shap

## Train unpoisoned models

In [17]:
# Baseline (unpoisoned) logistic regression: fitted on the clean training half and
# scored on the test split. The score is the cell's displayed output.
lin_model = linear_model.LogisticRegression(max_iter=5000).fit(x_train, y_train)
lin_model.score(x_test, y_test)

0.850316319636386

In [18]:
# Baseline (unpoisoned) MLP: one hidden layer of 10 ReLU units, fixed random_state,
# fitted on the clean training half and scored on the test split.
nn_model = neural_network.MLPClassifier(
    hidden_layer_sizes=(10,),
    max_iter=3000,
    activation='relu',
    random_state=42,
).fit(x_train, y_train)
unpoisoned_accuracy = nn_model.score(x_test, y_test)
print(f'Unpoisoned model accuracy: {unpoisoned_accuracy}')

Unpoisoned model accuracy: 0.8448498249493275


# FeatureMatch

### Further data preprocessing <br>
"This filter function aims at matching some set of specific features of the data, that the adversary may be interested in targeting a priori"


In [27]:
np.random.seed(0)

# Column layout of the feature matrix (0-indexed):
# Age: 0
# Capital gain: 1
# Capital loss: 2
# Hours per week: 3
# Workclass: 4 to 11 inclusive
# Education indices: 12 to 26 inclusive
# Marital status: 27 to 32 inclusive
# Occupation: 33 to 46 inclusive
# Relationship: 47 to 51 inclusive
# Race indices: 52 to 55 inclusive
# Sex: 56
# Income: 57, 58 (removed from np array)

# therefore len of np array is 57, 0-indexed

# Paper combines race, gender and education level to create a subpopulation

# 3 different poison rates used. From the paper:
#   "If the subpopulation size is m, and the adversary uses a poisoning rate α relative
#   to the subpopulation, they add αm poisoned points, which should be small relative
#   to the entire dataset size. In label flipping attacks, these points are generated
#   by sampling αm points satisfying the filter function from D_aux and adding these
#   to the training set with a label t different from the original one c."
poison_rates = [0.5, 1, 2]

# Need to identify the unique subpopulations in the auxiliary dataset, and then
# flip their y labels to create the poisoned dataset
# Then train new models on these datasets, and test the clean test datasets on both
# the clean and poisoned models

# Columns that define a subpopulation: education (12-26), then race and sex (52-56).
# Defined once so the three feature matrices and the column-name list cannot drift apart.
subpop_feature_indices = list(range(12, 27)) + list(range(52, 57))

aux_feature = x_aux[:, subpop_feature_indices]
test_feature = x_test[:, subpop_feature_indices]
train_feature = x_train[:, subpop_feature_indices]

print(aux_feature[0:5])

print(f'Shape of aux_feature: {aux_feature.shape}')
print(f'Shape of test_feature: {test_feature.shape}')
print(f'Shape of train_feature: {train_feature.shape}')

# Names of the selected columns, in the same order as the feature matrices
column_names = list(uci_data.columns)
feature_columns = [column_names[i] for i in subpop_feature_indices]
print(feature_columns)

# Each distinct row of aux_feature is one subpopulation; the counts give how many
# auxiliary samples share that exact combination of feature values.
subpopulations, subpopulation_counts = np.unique(aux_feature, axis=0, return_counts=True)

print(f'There are {len(subpopulations)} unique subpopulations in the auxiliary data')
print(subpopulation_counts)

# print(f'There are {len(subpopulations)} unique subpopulations in the auxiliary data')

# subpop_confidence, valid_subpopulations = [], []

# linear_regression_errors, neural_net_errors = np.zeros((len(subpopulations), 3, len(poison_rates))), np.zeros((len(subpopulations), 3, len(poison_rates)))

[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]
Shape of aux_feature: (16281, 20)
Shape of test_feature: (16281, 20)
Shape of train_feature: (16280, 20)
['education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0', 'education_16.0', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White', 'sex_ Male']
There are 149 unique subpopulations in the auxiliary data
[   8   11    2    1    2    2    3    2    1   45  140    1    2    3
   17    1   44  222    2    1    8    4   19    1    2  239  516    2
    1   20   28    7   39    4

In [None]:
# Keep only the subpopulations with more than 10 members in the auxiliary data,
# stored as (feature_vector, member_count) pairs.
valid_subpopulations = list(filter(
    lambda subpop_and_count: subpop_and_count[1] > 10,
    zip(subpopulations, subpopulation_counts),
))
print(f'There are {len(valid_subpopulations)} valid subpopulations in the auxiliary data')

There are 71 valid subpopulations in the auxiliary data


In [None]:
# --- Disabled experiment (entirely commented out): label-flipping attack per subpopulation ---
# For each valid subpopulation and each poison rate, the code below samples points,
# flips their labels, appends them to the training set, retrains 3 logistic-regression
# and 3 MLP models, and prints clean vs. poisoned test accuracy.
# NOTE(review): issues to resolve before re-enabling:
#   * x_aux / y_aux are indexed with `test_subpopulation` (row indices into the test
#     set); `aux_subpopulation` is computed but never used.
#   * The poison points are drawn from x_test / y_test, whereas the attack description
#     quoted in the FeatureMatch cell samples them from the auxiliary data.
#   * Accuracy is measured on the whole test set, not on the targeted subpopulation.
#   * All three MLPs use random_state=42 on identical data, so averaging over them
#     presumably just repeats one result — confirm.
#   * The printed labels say "Linear regression" but the model is LogisticRegression.
# for i, (subpop, count) in enumerate(valid_subpopulations):
#     test_subpopulation = np.where(np.linalg.norm(test_feature - subpop, axis=1) == 0)
#     aux_subpopulation = np.where(np.linalg.norm(aux_feature - subpop, axis=1) == 0)
#     train_subpopulation = np.where(np.linalg.norm(train_feature - subpop, axis=1) == 0)

#     aux_test_x, aux_test_y = x_aux[test_subpopulation], y_aux[test_subpopulation]
#     poison_x_base, poison_y_base = x_test[test_subpopulation], y_test[test_subpopulation]

#     train_count = train_subpopulation[0].shape[0]
#     test_count = aux_test_x.shape[0]

#     for j, poison_count in enumerate([int(train_count * rate) for rate in poison_rates]):
#         poison_indices = np.random.choice(poison_x_base.shape[0], poison_count, replace=True)
#         poison_x, poison_y = poison_x_base[poison_indices], 1 - poison_y_base[poison_indices]

#         total_x, total_y = np.concatenate((x_train, poison_x), axis=0), np.concatenate((y_train, poison_y), axis=0)

#         lr_models = [linear_model.LogisticRegression(max_iter=5000) for _ in range(3)]
#         for model in lr_models:
#             model.fit(total_x, total_y)

#         nn_models = [neural_network.MLPClassifier(hidden_layer_sizes=(10,), max_iter=3000, activation='relu', random_state=42) for _ in range(3)]
#         for model in nn_models:
#             model.fit(total_x, total_y)

#         clean_lin_test_acc = lin_model.score(x_test, y_test)
#         poisoned_lin_test_acc = np.mean([model.score(x_test, y_test) for model in lr_models])
#         clean_nn_test_acc = nn_model.score(x_test, y_test)
#         poisoned_nn_test_acc = np.mean([model.score(x_test, y_test) for model in nn_models])

#         print(f'Subpopulation {i}, Poison rate {poison_rates[j]}')
#         print(f'Clean Linear regression test accuracy: {clean_lin_test_acc}')
#         print(f'Poisoned Linear regression test accuracy: {poisoned_lin_test_acc}')
#         print(f'Clean Neural network test accuracy: {clean_nn_test_acc}')
#         print(f'Poisoned Neural network test accuracy: {poisoned_nn_test_acc}')
#         print("\n\n")

Subpopulation 0, Poison rate 0.5
Clean Linear regression test accuracy: 0.8372334893395735
Poisoned Linear regression test accuracy: 0.8356734269370775
Clean Neural network test accuracy: 0.8348933957358294
Poisoned Neural network test accuracy: 0.8316432657306292
Subpopulation 0, Poison rate 1
Clean Linear regression test accuracy: 0.8372334893395735
Poisoned Linear regression test accuracy: 0.8361934477379095
Clean Neural network test accuracy: 0.8348933957358294
Poisoned Neural network test accuracy: 0.8242329693187728
Subpopulation 0, Poison rate 2
Clean Linear regression test accuracy: 0.8372334893395735
Poisoned Linear regression test accuracy: 0.8376235049401975
Clean Neural network test accuracy: 0.8348933957358294
Poisoned Neural network test accuracy: 0.8278731149245969
Subpopulation 1, Poison rate 0.5
Clean Linear regression test accuracy: 0.8372334893395735
Poisoned Linear regression test accuracy: 0.8363234529381175
Clean Neural network test accuracy: 0.8348933957358294
Po

In [68]:
# for i, (subpop, count) in enumerate(zip(subpopulations, subpopulation_counts)):
#     if count > 10 and count < 100:
#         valid_subpopulations.append((i, count))
#         print("\n\n")
#         print(f'Subpopulation {i} has count {count} occurences')

#         test_subpopulation = np.where(np.linalg.norm(test_feature - subpop, axis=1) == 0)
#         aux_subpopulation = np.where(np.linalg.norm(aux_feature - subpop, axis=1) == 0)
#         train_subpopulation = np.where(np.linalg.norm(train_feature - subpop, axis=1) == 0)

#         aux_test_x, aux_test_y = x_aux[test_subpopulation], y_aux[test_subpopulation]

#         poison_x_base, poison_y_base = x_test[test_subpopulation], y_test[test_subpopulation]

#         train_count = train_subpopulation[0].shape[0]
#         test_count = aux_test_x.shape[0]

#         #subpop_confidence.append(2*np.multiply(lin_model.predict_proba(aux_test_x), np.eye(2)[aux_test_y.astype(int)]).mean())

#         for j, poison_count in enumerate([int(train_count * rate) for rate in poison_rates]):
#             poison_indices = np.random.choice(poison_x_base.shape[0], poison_count, replace=True)

#             poison_x, poison_y = poison_x_base[poison_indices], 1 - poison_y_base[poison_indices]

#             total_x, total_y = np.concatenate((x_train, poison_x), axis=0), np.concatenate((y_train, poison_y), axis=0)

#             print(f'Subpopulation {i}, Poison rate {poison_rates[j]}')
#             print([feature for x, feature in zip(subpop, feature_columns) if x > 0.5])

#             lr_models = [linear_model.LogisticRegression(max_iter=5000) for _ in range(3)]
#             for model in lr_models:
#                 model.fit(total_x, total_y)

#             nn_models = [neural_network.MLPClassifier(hidden_layer_sizes=(10,), max_iter=3000, activation='relu', random_state=42) for _ in range(3)]
#             for model in nn_models:
#                 model.fit(total_x, total_y)

#             clean_lin_test_acc = lin_model.score(x_test, y_test)
#             print(f'Clean Linear regression test accuracy: {clean_lin_test_acc}')

#             poisoned_lin_test_acc = np.mean([model.score(x_test, y_test) for model in lr_models])
#             print(f'Poisoned Linear regression test accuracy: {poisoned_lin_test_acc}')

#             clean_nn_test_acc = nn_model.score(x_test, y_test)
#             print(f'Clean Neural network test accuracy: {clean_nn_test_acc}')

#             poisoned_nn_test_acc = np.mean([model.score(x_test, y_test) for model in nn_models])
#             print(f'Poisoned Neural network test accuracy: {poisoned_nn_test_acc}')

            