### importing libraries:

In [1]:
import numpy as np
import tenseal as ts
import torch
from time import time
import pandas as pd
from sklearn.model_selection import train_test_split

### cleaning the dataset:

In [2]:
data=pd.read_csv("framingham.csv")
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
data.shape

(4238, 16)

##### removing rows with missing values:

In [4]:
data=data.dropna()
data.shape

(3656, 16)

##### removing unwanted columns:

In [5]:
data=data.drop(columns=["education","currentSmoker","BPMeds","diabetes","diaBP","BMI"])
data.head()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
0,1,39,0.0,0,0,195.0,106.0,80.0,77.0,0
1,0,46,0.0,0,0,250.0,121.0,95.0,76.0,0
2,1,48,20.0,0,0,245.0,127.5,75.0,70.0,0
3,0,61,30.0,0,1,225.0,150.0,65.0,103.0,1
4,0,46,23.0,0,0,285.0,130.0,85.0,85.0,0


##### balancing the data:

In [6]:
grouped=data.groupby('TenYearCHD')
grouped.head()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
0,1,39,0.0,0,0,195.0,106.0,80.0,77.0,0
1,0,46,0.0,0,0,250.0,121.0,95.0,76.0,0
2,1,48,20.0,0,0,245.0,127.5,75.0,70.0,0
3,0,61,30.0,0,1,225.0,150.0,65.0,103.0,1
4,0,46,23.0,0,0,285.0,130.0,85.0,85.0,0
5,0,43,0.0,0,1,228.0,180.0,77.0,99.0,0
6,0,63,0.0,0,0,205.0,138.0,60.0,85.0,1
15,0,38,20.0,0,1,221.0,140.0,95.0,70.0,1
17,0,46,20.0,0,0,291.0,112.0,80.0,89.0,1
25,1,47,20.0,0,0,294.0,102.0,62.0,66.0,1


In [7]:
data=grouped.apply(lambda x: x.sample(grouped.size().min(),random_state=73).reset_index(drop=True))
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,1,35,20.0,0,0,223.0,128.0,80.0,67.0,0
0,1,1,43,0.0,0,0,200.0,133.0,55.0,71.0,0
0,2,1,46,0.0,0,1,185.0,121.0,80.0,97.0,0
0,3,0,63,0.0,0,1,281.0,135.0,63.0,68.0,0
0,4,0,59,0.0,0,0,292.0,114.0,68.0,72.0,0


In [8]:
data.shape

(1114, 10)

In [9]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,1,35,20.0,0,0,223.0,128.0,80.0,67.0,0
0,1,1,43,0.0,0,0,200.0,133.0,55.0,71.0,0
0,2,1,46,0.0,0,1,185.0,121.0,80.0,97.0,0
0,3,0,63,0.0,0,1,281.0,135.0,63.0,68.0,0
0,4,0,59,0.0,0,0,292.0,114.0,68.0,72.0,0
...,...,...,...,...,...,...,...,...,...,...,...
1,552,1,65,0.0,1,1,266.0,140.0,80.0,77.0,1
1,553,1,52,0.0,1,0,202.0,136.0,83.0,67.0,1
1,554,0,57,0.0,0,1,432.0,153.0,98.0,75.0,1
1,555,1,45,20.0,0,0,264.0,118.5,75.0,90.0,1


In [10]:
y=torch.tensor(data["TenYearCHD"].values).float()
y

tensor([0., 0., 0.,  ..., 1., 1., 1.])

In [11]:
y=y.unsqueeze(1)

In [12]:
data=data.drop("TenYearCHD",axis=1)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,35,20.0,0,0,223.0,128.0,80.0,67.0
0,1,1,43,0.0,0,0,200.0,133.0,55.0,71.0
0,2,1,46,0.0,0,1,185.0,121.0,80.0,97.0
0,3,0,63,0.0,0,1,281.0,135.0,63.0,68.0
0,4,0,59,0.0,0,0,292.0,114.0,68.0,72.0


##### normalizing data:

In [13]:
data=(data-data.mean())/data.std()
x=torch.tensor(data.values).float()
x

tensor([[ 1.0436, -1.9451,  0.8790,  ..., -0.3666,  0.3485, -0.5414],
        [ 1.0436, -1.0146, -0.7572,  ..., -0.1627, -1.7064, -0.4173],
        [ 1.0436, -0.6656, -0.7572,  ..., -0.6521,  0.3485,  0.3899],
        ...,
        [-0.9574,  0.6138, -0.7572,  ...,  0.6531,  1.8280, -0.2931],
        [ 1.0436, -0.7819,  0.8790,  ..., -0.7541, -0.0625,  0.1726],
        [ 1.0436,  0.9628,  0.8790,  ...,  1.3465, -1.2955, -0.0447]])

In [14]:
x.shape

torch.Size([1114, 9])

In [15]:
y.shape

torch.Size([1114, 1])

##### dividing the dataset into training and testing:

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.3)

In [17]:
len(x_train),len(x_test),len(y_train),len(y_test)

(779, 335, 779, 335)

### Creating a neural network that facilitates Logisitic Regression Classification:

##### A class that creates a neural network for Logistic Regression:

In [18]:
class LR(torch.nn.Module):
    def __init__(self,n_features):
        super(LR,self).__init__()
        self.lr=torch.nn.Linear(n_features,1)
        
    def forward(self,x):
        out=torch.sigmoid(self.lr(x))
        return out

##### extracting number of features:

In [19]:
n_features=x_train.shape[1]

### _______________________________________________________________RISKY PART______________________________________________________________________

### Creating TenSEAL context for encrypting data:

In [20]:
poly_mod_degree=8192
coeff_mod_bit_sizes=[40,21,21,21,21,21,21,40]
ctx_training=ts.context(ts.SCHEME_TYPE.CKKS,poly_mod_degree,-1,coeff_mod_bit_sizes)
ctx_training.global_scale=2**21
ctx_training.generate_galois_keys()

In [21]:
print("Is the context private?",("Yes" if ctx_training.is_private() else "No"))

Is the context private? Yes


##### encrypting x_test,x_train and y_train:

In [22]:
t_start=time()
enc_x_test=[ts.ckks_vector(ctx_training,x.tolist()) for x in x_test]
enc_y_test=[ts.ckks_vector(ctx_training,y.tolist()) for y in y_test]
enc_x_train=[ts.ckks_vector(ctx_training,x.tolist()) for x in x_train]
enc_y_train=[ts.ckks_vector(ctx_training,y.tolist()) for y in y_train]
t_end=time()

print(f'data trained in {t_end-t_start}s')

data trained in 19.829476356506348s


### Training Logistic Regression model on encrypted data:

##### Creating a class that creates a neural network for Logistic Regression on encrypted data:

In [23]:
class EncryptedLR:
    
    def __init__(self,torch_lr):
        self.weight=torch_lr.lr.weight.data.tolist()[0]
        self.bias=torch_lr.lr.bias.data.tolist()
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0
        
    def forward(self,enc_x):
        enc_out=enc_x.dot(self.weight)+self.bias
        enc_out=EncryptedLR.sigmoid(enc_out)
        return enc_out
    
    def backward(self,enc_x,enc_out,enc_y):
        out_minus_y=(enc_out-enc_y)
        self._delta_w+=enc_x*out_minus_y
        self._delta_b+=out_minus_y
        self._count+=1
        
    def update_parameters(self):
        if self._count==0:
            raise RuntimeError("You should at least run one forward iteration")

        self.weight-=self._delta_w*(1/self._count)+self.weight*0.05
        self.bias-=self._delta_b*(1/self._count)
    
        self._delta_w=0
        self._delta_b=0
        self._count=0
    
    @staticmethod
    def sigmoid(enc_x):
        # using polynomial approximation of degree 3 which fits the function pretty well in the range [-5,5]. Source: https://eprint.iacr.org/2018/462.pdf
        return enc_x.polyval([0.5,0.197,0,-0.004])
    
    def plain_accuracy(self,x_test,y_test):
        # evaluate accuracy of the model on
        # the plain (x_test, y_test) dataset
        w=torch.tensor(self.weight)
        b=torch.tensor(self.bias)
        out=torch.sigmoid(x_test.matmul(w)+b).reshape(-1, 1)
        correct=torch.abs(y_test-out)<0.5
        return correct.float().mean()    
    
    def encrypt(self,context):
        self.weight=ts.ckks_vector(context,self.weight)
        self.bias=ts.ckks_vector(context,self.bias)
        
    def decrypt(self):
        self.weight=self.weight.decrypt()
        self.bias=self.bias.decrypt()
        
    def __call__(self,*args,**kwargs):
        return self.forward(*args,**kwargs)

##### training the model on encrypted data and testing it on plain data:

In [24]:
EPOCHS=5
eelr = EncryptedLR(LR(n_features))
accuracy = eelr.plain_accuracy(x_test, y_test)
print(f"Accuracy at epoch #0 is {accuracy}")

times = []
for epoch in range(EPOCHS):
    eelr.encrypt(ctx_training)
    
    # if you want to keep an eye on the distribution to make sure
    # the function approximation is still working fine
    # WARNING: this operation is time consuming
    # encrypted_out_distribution(eelr, enc_x_train)
    
    t_start = time()
    for enc_x, enc_y in zip(enc_x_train, enc_y_train):
        enc_out = eelr.forward(enc_x)
        eelr.backward(enc_x, enc_out, enc_y)
    eelr.update_parameters()
    t_end = time()
    times.append(t_end - t_start)
    
    eelr.decrypt()
    accuracy = eelr.plain_accuracy(x_test, y_test)
    print(f"Accuracy at epoch #{epoch + 1} is {accuracy}")


print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
print(f"Final accuracy is {accuracy}")

Accuracy at epoch #0 is 0.5283582210540771
Accuracy at epoch #1 is 0.6537313461303711
Accuracy at epoch #2 is 0.641791045665741
Accuracy at epoch #3 is 0.6328358054161072
Accuracy at epoch #4 is 0.6268656849861145
Accuracy at epoch #5 is 0.6268656849861145

Average time per epoch: 68 seconds
Final accuracy is 0.6268656849861145


##### testing the model on encrypted data:

In [25]:
def encrypted_evaluation(model, enc_x_test, y_test):
    t_start = time()
    
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.sigmoid(out)
        if torch.abs(out - y) < 0.5:
            correct += 1
    
    t_end = time()
    print(f"Evaluated test_set of {len(x_test)} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
    return correct / len(x_test)
    

encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)

Evaluated test_set of 335 entries in 23 seconds
Accuracy: 169/335 = 0.5044776119402985
