In [2]:
# Import Dependencies
import phe as paillier
from contextlib import contextmanager

In [3]:
# Create Public and Private Keys
key_length = 1024
pub_key, privkey = paillier.generate_paillier_keypair(n_length=key_length) 

In [4]:
pub_key

<PaillierPublicKey 17da1056c7>

In [6]:
privkey

<PaillierPrivateKey for <PaillierPublicKey 17da1056c7>>

## Homomorphic Encryption for Machine Learning

Logistic Regression for Spam/Not Spam e-mail Classification.

For this problem we have two users:

**USER-1**

**USER-2**

AI Inc. makes a Machine Learning model that is trained on some email data for classification between Spam/Not Spam. Now, they want to take that model, encrypt it and send to USER-1 and USER-2 who will train the model on their data, fully Homomorphically Encrypted, and send the trained, a bit better model back to AI Inc.

In this process, AI Inc. get a better trained model every time without even looking at USER-1 or USER-2 data. This way AI Inc. can serve the customers better with a smart Machine Learning model and the USER has complete control of his/her data.

In [7]:
# Import Dependencies

import time
import os.path
from zipfile import ZipFile
from urllib.request import urlopen
from contextlib import contextmanager

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
@contextmanager
def timer():
    """Helper for measuring runtime"""

    time0 =  time.perf_counter()
    yield
    print('[elapsed time: %.2f s]' % (time.perf_counter() - time0))

In [9]:
class AI_Inc:
    """
    AI Inc. Trains a Logistic Regression model on plaintext data, encrypts the model for remote use by USER-1 and USER-2,
    decrypts encrypted scores using the paillier private key.
    """

    def __init__(self):
        #self.model = LogisticRegression()  ## X, Y not encripted
        self.model = RandomForestClassifier()

    # Generate Public and Private Key Pairs
    # Public Key is used to Encrypt the Data, Private Key to Decrypt
    def generate_paillier_keypair(self, n_length):
        self.pubkey, self.privkey = paillier.generate_paillier_keypair(n_length=n_length)  # generation of pub and private

    # Train the Model
    def fit(self, X, y):
        self.model = self.model.fit(X, y)   # model train without encription

    # Make Predictions for Email "Spam/Not Spam"
    def predict(self, X):
        return self.model.predict(X)   # model predict without encripted data

    # Encypt the Coefficients for the Logistic Regression Equation
    # Weights can tell about the data, so Encrypt them
    # Equation: y = mX + b
    def encrypt_weights(self):
        coef = self.model.coef_[0, :]
        encrypted_weights = [self.pubkey.encrypt(coef[i])
                             for i in range(coef.shape[0])]
        encrypted_intercept = self.pubkey.encrypt(self.model.intercept_[0])
        return encrypted_weights, encrypted_intercept

    # Decrypt the Scores for the Model
    def decrypt_scores(self, encrypted_scores):
        return   self.privkey.decrypt(encrypted_scores)                           #[self.privkey.decrypt(s) for s in encrypted_scores]

In [12]:
# Now the USER-1 gets a trained model from AI Inc. and trains on its own data all using Homomorphic Encryption.
class User_1:
    """
    USER-1/USER-2 are given the encrypted model trained by AI Inc. and the public key.

    Scores local plaintext data with the encrypted model, but cannot decrypt
    the scores without the private key held by AI Inc..
    """

    def __init__(self, pubkey):
        self.pubkey = pubkey

    # Set Initial Values of Coefficients
    def set_weights(self, weights, intercept):
        self.weights = weights
        self.intercept = intercept

    # Compute the Prediction Scores for the Model all while being totally Encrypted.
    def encrypted_score(self, x):
        """Compute the score of `x` by multiplying with the encrypted model,
        which is a vector of `paillier.EncryptedNumber`"""
        score = self.intercept
        idx = len(x.shape) #x.nonzero()
        for i in range(len(x.shape)):
            for j in range(30):
                score += x[i][j] * self.weights[j]
                
    
        
#         for i in range(idx):
#             score += x[0, i] * self.weights[i]
        return score

    # Get the Evaluation Scores for the Model
    def encrypted_evaluate(self, X):
        for i in range(len(X.shape)):
            for j in range(30):
                result = self.encrypted_score(X[i][j])
        return result         #[self.encrypted_score(X[i, :]) for i in range(X.shape[0])]

In [15]:
# USER-1 taking the encrypted model, weights and testing performance on it's own dataset
print("USER-1: Scoring on own data with AI Inc.'s Encrypted Classifier...")

# AI Inc sends the Public Keys to perform operations
user_1 = User_1(ai_inc.pubkey)

# USER-1 sets the model Hyperparameters to AI Inc.'s Hyperparameter values
user_1.set_weights(encrypted_weights, encrypted_intercept)

with timer() as t:
    encrypted_scores = user_1.encrypted_evaluate(X_test)

USER-1: Scoring on own data with AI Inc.'s Encrypted Classifier...


NameError: name 'ai_inc' is not defined

In [13]:
import pandas as pd

In [14]:
data =  pd.read_csv("New_data.csv")

In [15]:
data = data.drop(['Unnamed: 0'], axis = 1) 

In [16]:
X = data.iloc[:,1:].values
Y = data.iloc[:,0].values

In [17]:
XL = X.tolist()
encrypted_number_X = []
for i in range(len(XL)):
    for j in range(30):
        encrypted_number_X.append(pub_key.encrypt(XL[i][j]))

In [18]:
import numpy as np
arr_X = np.reshape(encrypted_number_X,(569,30))

In [19]:
YL = Y.tolist()
enc_Y = [pub_key.encrypt(y) for y in YL]
arr_Y = np.reshape(enc_Y,(569,))

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25 , random_state = 0)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# log = LogisticRegression(random_state=0)
# log.fit(X_train,Y_train)

In [22]:
# # Get the Preprocessed Split Data
# X_train, y_train, X_test, y_test = preprocess_data()

In [23]:
# Now firstly the AI Inc. Generates the Public and Private Keys
print("AI Inc.: Generating Paillier Public Private Keypair")
ai_inc = AI_Inc()
# NOTE: using smaller keys sizes wouldn't be cryptographically safe
ai_inc.generate_paillier_keypair(n_length=1024)

AI Inc.: Generating Paillier Public Private Keypair


In [24]:
import time
print("AI Inc.: Training Initial Spam Classifier")
with timer() as t:
    ai_inc.fit(X_train, Y_train)

AI Inc.: Training Initial Spam Classifier
[elapsed time: 0.24 s]


In [25]:
print("AI Inc.'s Classification on Test Data, what it would expect the performance to be on USER-1/2's data...")
with timer() as t:
    error = np.mean(ai_inc.predict(X_test) != Y_test)
print("Error {:.3f}".format(error))

AI Inc.'s Classification on Test Data, what it would expect the performance to be on USER-1/2's data...
[elapsed time: 0.05 s]
Error 0.028


In [26]:
# LG
# AI Inc.'s Classification on Test Data, what it would expect the performance to be on USER-1/2's data...
# [elapsed time: 0.00 s]
# Error 0.056
# Random Forest



In [27]:
print("AI Inc.: Encrypting Trained Classifier before sending to USER-1/2")
with timer() as t:
    encrypted_weights, encrypted_intercept = ai_inc.encrypt_weights()  # function call with in the class

AI Inc.: Encrypting Trained Classifier before sending to USER-1/2


AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

In [None]:
# Confirming the Weights are Encrypted
print("Encrypted Weights: ", encrypted_weights)
print("Encrypted Intercept: ", encrypted_intercept)

Now, we have an encrypted trained model.

AI Inc. sends the trained model with it's weights encrypted [as weights can tell something about the data] and sends both the things to the USER-1 and USER2.

Now, USER-1 and USER-2 get the encrypted weights, the trained model and the public key to do some operations on their own dataset. This is called **Homomorphic Encryption**.

In [48]:
143*30

4290

In [57]:
X_test.shape

(143, 30)

In [53]:
x_test = np.reshape(X_test,(4290,))

In [122]:
# USER-1 taking the encrypted model, weights and testing performance on it's own dataset
print("USER-1: Scoring on own data with AI Inc.'s Encrypted Classifier...")

# AI Inc sends the Public Keys to perform operations
user_1 = User_1(ai_inc.pubkey)

# USER-1 sets the model Hyperparameters to AI Inc.'s Hyperparameter values
user_1.set_weights(encrypted_weights, encrypted_intercept)

with timer() as t:
    encrypted_scores = user_1.encrypted_evaluate(X_test)

USER-1: Scoring on own data with AI Inc.'s Encrypted Classifier...
[elapsed time: 0.00 s]


In [None]:
#IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed  

In [124]:
# Making Sure the Score is Encrypted
print(encrypted_scores)

<phe.paillier.EncryptedNumber object at 0x0000016A54251340>


In [123]:
0x0000016A53D11610  # LG

1556184372752

In [125]:
0x0000016A54251340  # RF

1556189877056

Now USER has the option to train the model on it's own data and send the trained model to AI Inc.

In [126]:
type(encrypted_scores)
encrypted_scores1 = [encrypted_scores ] #.tolist()
type(encrypted_scores1)

list

In [133]:
print("AI Inc.: Decrypting USER-1/2's scores")

with timer() as t:
    score = ai_inc.decrypt_scores(encrypted_scores1)
    print("score : {}".format(score))
    error = np.mean(np.sign(score))  != Y_test)
print("Error {:.3f} -- this is not known to AI Inc., who does not possess the ground truth labels".format(error))

AI Inc.: Decrypting USER-1/2's scores
score : [-0.23990767545189137]
[elapsed time: 0.01 s]
Error -1.000 -- this is not known to AI Inc., who does not possess the ground truth labels


In [None]:
# LG
# AI Inc.: Decrypting USER-1/2's scores
# [-0.23990767545189137]
# [elapsed time: 0.01 s]
# RF
# AI Inc.: Decrypting USER-1/2's scores
# [-0.23990767545189137]
# [elapsed time: 0.01 s]



In [132]:
Y_test

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1], dtype=int64)