In [13]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from diffprivlib.models import LogisticRegression as DPLogisticRegression
from sklearn.naive_bayes import GaussianNB
import diffprivlib.models as models

In [2]:
np.random.seed(0)

In [3]:
def load_data_from_file(path='processed.cleveland.data'):
    data = []
    with open(path) as f:
        for line in f.readlines():
            if '?' in line: # just remove lines with missing data for simplicity
                continue
            line = line[:-1] # strip newline
            line = line.split(',')
            row = []
            for item in line:
                row.append(float(item))
            data.append(row)
    return np.array(data)

In [4]:
data = load_data_from_file()

### Attribute Information
01: age <br/>
02: sex (binary) <br/>
03: chest pain type (categorical) <br/>
04: resting blood pressure <br/>
05: serum cholestoral in mg/dl <br/>
06: fasting blood sugar > 120 mg/dl (binary) <br/>
07: resting electrocardiographic results (categorical) <br/>
08: maximum heart rate achieved  <br/>
09: exercise induced angina (binary) <br/>
10: ST depression induced by exercise relative to rest <br/>
11: the slope of the peak exercise ST segment (categorical) <br/> 
12: number of major vessels (0-3) colored by flourosopy <br/>
13: thal: 3 = normal; 6 = fixed defect; 7 = reversable defect (categorical)  <br/>
14: diagnosis of heart disease (binary-ish) <br/>

In [5]:
X = data[:,:13]
y = data[:,-1]
y[y>0]=1 # binarize
(n,d) = X.shape
# transform categorical attributes into one-hot-encoded
cat_to_one_hot = {
    2: {
        1: [1,0,0,0],
        2: [0,1,0,0],
        3: [0,0,1,0],
        4: [0,0,0,1],
    },
    6: {
        0: [1,0,0],
        1: [0,1,0],
        2: [0,0,1],
    },
    10: {
        1: [1,0,0],
        2: [0,1,0],
        3: [0,0,1],
    },
    12: {
        3: [1,0,0],
        6: [0,1,0],
        7: [0,0,1],
    },
}
new_X = []
for i in range(n):
    row = []
    for j in range(d):
        if j in cat_to_one_hot.keys():
            mapping = cat_to_one_hot[j]
            val = mapping[X[i][j]]
            for elt in val:
                row.append(elt)
        else:
            row.append(X[i][j])
    new_X.append(row)
X = np.array(new_X)
# scale each feature to [0,1]
(n,d) = X.shape
col_mins = np.min(X, axis=0)
col_maxs = np.max(X, axis=0)
X = (X - col_mins) / (col_maxs - col_mins)

In [41]:
indices = np.array(range(n))
np.random.seed(5678)
np.random.shuffle(indices)
sz = int(n * 0.2)
train_ind = indices[:sz]
validation_ind = indices[sz:2*sz]
test_ind = indices[2*sz:]

In [42]:
X_train = X[train_ind]
y_train = y[train_ind]
X_valid = X[validation_ind]
y_valid = y[validation_ind]
X_test = X[test_ind]
y_test = y[test_ind]

In [43]:
percent_positive = round(np.mean(y_train)*100,1)
percent_negative = 100-percent_positive
print('{}% of patients in the training set have heart disease and {}% do not.'.format(percent_positive, percent_negative))

49.2% of patients in the training set have heart disease and 50.8% do not.


In [44]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

(59, 22)
(59, 22)
(179, 22)


### Train Classifiers

In [45]:
gnb = GaussianNB()
preds = gnb.fit(X_train, y_train).predict(X_valid)
acc = np.mean((preds==y_valid).astype(float))*100
print('Accuracy of non-private logistic regression (on validation set): {}%'.format(acc))

Accuracy of non-private logistic regression (on validation set): 81.35593220338984%


In [111]:
gnbDP = models.GaussianNB(epsilon=100, bounds=[(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10),(1,10)])
predsDP = gnbDP.fit(X_train, y_train).predict(X_test)

predsDP = gnbDP.predict(X_valid)
acc = np.mean((predsDP==y_valid).astype(float))*100
print('Accuracy of private logistic regression (on validation set): {}%'.format(acc))

Accuracy of private logistic regression (on validation set): 77.96610169491525%


### Accuracy Breakdown by Sex

In [115]:
n = len(y_valid)
f_valid_indices = [i for i in range(n) if X_valid[i,1] == 0]
m_valid_indices = [i for i in range(n) if X_valid[i,1] == 1]

In [116]:
print('{}% of patients in the training set are women.'.format(np.mean(X_train[:,1]==0)*100))
print('{}% of patients in the validation set are women.'.format(np.mean(X_valid[:,1]==0)*100))

32.20338983050847% of patients in the training set are women.
37.28813559322034% of patients in the validation set are women.


In [117]:
f_acc = np.mean((preds==y_valid)[f_valid_indices].astype(float))*100
m_acc = np.mean((preds==y_valid)[m_valid_indices].astype(float))*100
print('Accuracy of non-private classifier on WOMEN in validation set:', f_acc)
print('Accuracy of non-private classifier on MEN in validation set:', m_acc)

Accuracy of non-private classifier on WOMEN in validation set: 86.36363636363636
Accuracy of non-private classifier on MEN in validation set: 78.37837837837837


In [118]:
f_acc_DP = np.mean((predsDP==y_valid)[f_valid_indices].astype(float))*100
m_acc_DP = np.mean((predsDP==y_valid)[m_valid_indices].astype(float))*100
print('Accuracy of private classifier on WOMEN in validation set:', f_acc_DP)
print('Accuracy of private classifier on MEN in validation set:', m_acc_DP)

Accuracy of private classifier on WOMEN in validation set: 86.36363636363636
Accuracy of private classifier on MEN in validation set: 72.97297297297297


In [119]:
print('% accuracy reduction for WOMEN in the validation set:', f_acc - f_acc_DP)
print('% accuracy reduction for MEN in the validation set:', m_acc - m_acc_DP)

% accuracy reduction for WOMEN in the validation set: 0.0
% accuracy reduction for MEN in the validation set: 5.4054054054054035


# Demographic parity

In [131]:
f_1 = np.sum((preds== 1)[f_valid_indices].astype(float))
f_all = len(f_valid_indices)

m_1 = np.sum((preds== 1)[m_valid_indices].astype(float))
m_all = len(m_valid_indices)

print('Difference in demographic parity for NON-Private classifier is', np.abs(f_1/f_all - m_1/m_all))

Difference in demographic parity for NON-Private classifier is 0.19656019656019658


In [133]:
f_1_DP = np.sum((predsDP== 1)[f_valid_indices].astype(float))
f_all = len(f_valid_indices)

m_1_DP = np.sum((predsDP== 1)[m_valid_indices].astype(float))
m_all = len(m_valid_indices)

print('Difference in demographic parity for DP (Private) classifier is', np.abs(f_1_DP/f_all - m_1_DP/m_all))

Difference in demographic parity for DP (Private) classifier is 0.014742014742014753


# Equal Opportunity

In [142]:
temp1 = (preds== 1)
temp2 = (y_valid == 1)
f_1_real1 = np.sum((temp1&temp2)[f_valid_indices].astype(float))
f_real1 = np.sum((y_valid==1)[f_valid_indices].astype(float))

m_1_real1 = np.sum((temp1&temp2)[m_valid_indices].astype(float))
m_real1 = np.sum((y_valid==1)[m_valid_indices].astype(float))

print(f_1_real1, f_real1, m_1_real1, m_real1)

print('Difference in demographic parity for NON-Private classifier is', np.abs(f_1_real1/f_real1 - m_1_real1/m_real1))

4.0 7.0 13.0 20.0
Difference in demographic parity for NON-Private classifier is 0.07857142857142863


In [143]:
temp1 = (predsDP == 1)
temp2 = (y_valid == 1)
f_1_real1 = np.sum((temp1&temp2)[f_valid_indices].astype(float))
f_real1 = np.sum((y_valid==1)[f_valid_indices].astype(float))

m_1_real1 = np.sum((temp1&temp2)[m_valid_indices].astype(float))
m_real1 = np.sum((y_valid==1)[m_valid_indices].astype(float))

print(f_1_real1, f_real1, m_1_real1, m_real1)

print('Difference in demographic parity for NON-Private classifier is', np.abs(f_1_real1/f_real1 - m_1_real1/m_real1))

6.0 7.0 12.0 20.0
Difference in demographic parity for NON-Private classifier is 0.2571428571428571
