In [4]:
import pickle
import pathlib
import os
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from keras.src import Sequential
from keras.src.layers import Dense, BatchNormalization, Dropout
from keras.src.callbacks import EarlyStopping

In [5]:
PROJECT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(PROJECT_PATH, "..")
MODELS_DIR = pathlib.Path(PROJECT_PATH) / "store" / "models"

DATASET_DIR = pathlib.Path(PROJECT_PATH) / "data"

In [6]:
import pandas as pd

dataset = pd.read_csv(DATASET_DIR / "bank_marketing"/ "dataset.csv")
dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,27,management,single,secondary,no,35,no,no,cellular,4,jul,255,1,-1,0,,no
1,54,blue-collar,married,primary,no,466,no,no,cellular,4,jul,297,1,-1,0,,no
2,43,blue-collar,married,secondary,no,105,no,yes,cellular,4,jul,668,2,-1,0,,no
3,31,technician,single,secondary,no,19,no,no,telephone,4,jul,65,2,-1,0,,no
4,27,technician,single,secondary,no,126,yes,yes,cellular,4,jul,436,4,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,yes
30903,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,yes
30904,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
30905,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,no


In [7]:
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Remove the bug in the dataset where the entire row has -9 values
mask = ~(X == -9).all(axis=1)
X = X[mask]
y = y[mask]

y = y.replace({"no": 0, "yes": 1}).astype(int)

  y = y.replace({"no": 0, "yes": 1}).astype(int)


In [8]:
def preprocess(X: pd.DataFrame):
    """
    The function will preprocess the data:
    1. Categorical features will be label encoded (Boy->1, Girl ->2)
    2. Numerical features will be scaled if the data is intended to be used for baseline. For cloud data set, no scaling will be preformed.

    Return pd.Dataframe
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Initialize lists to store processed columns
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("\nEncoding categorical columns...")
        # onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
        # X_categorical = pd.DataFrame(onehot_encoder.fit_transform(X[categorical_cols]),
        #                              columns=onehot_encoder.get_feature_names_out(categorical_cols))
        label_encoder = LabelEncoder()
        X_categorical = pd.DataFrame()
        for col in categorical_cols:
            X_categorical[col] = label_encoder.fit_transform(X[col])

        processed_columns.append(X_categorical)

    # Apply standard scaling to the numeric columns
    if numeric_cols:
        print("\nScaling numerical columns...")
        scaler = MinMaxScaler()
        X_numeric = X[numeric_cols]
        # X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe


    return X_processed


X = preprocess(X)


Encoding categorical columns...

Scaling numerical columns...


In [9]:
X_sample, y_sample = X.iloc[2000:3000], y.iloc[2000:3000]
y_sample.value_counts()

y
0    919
1     81
Name: count, dtype: int64

In [10]:
X_test, y_test = X.iloc[:1000], y.iloc[:1000]
y_test.value_counts()

y
0    940
1     60
Name: count, dtype: int64

In [11]:
import pandas as pd
import torch.nn as nn
from keras.src.utils import to_categorical


class DNNEmbedding(nn.Module):

    name = "dnn_embedding"

    def __init__(self, **kwargs):
        super(DNNEmbedding, self).__init__()

        X, y = kwargs.get("X"), kwargs.get("y")
        num_classes = len(set(y))
        y = to_categorical(y, num_classes=num_classes)

        model = Sequential()
        model.add(Dense(units=X.shape[1]//2, activation='tanh', name="embedding"))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
        model.add(Dense(units=num_classes, activation='softmax', name="output"))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        early_stop = EarlyStopping(patience=2, monitor="loss")

        model.fit(X, y,validation_data=[X_test, to_categorical(y_test,2)], epochs=50, batch_size=8, callbacks=[early_stop])
        self.model = model.layers[0]
        self.output_shape = (1, X.shape[1]//2)


    def forward(self, x):

        if type(x) is pd.DataFrame:
            x = x.to_numpy()

        embedding = self.model(x)
        return embedding


embedding = DNNEmbedding(X=X_sample, y=y_sample)



  from .autonotebook import tqdm as notebook_tqdm


Epoch 1/50


2024-11-25 21:41:55.878657: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5990 - loss: 0.8058 - val_accuracy: 0.8780 - val_loss: 0.4320
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7562 - loss: 0.5783 - val_accuracy: 0.9120 - val_loss: 0.3815
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8541 - loss: 0.4363 - val_accuracy: 0.9240 - val_loss: 0.3399
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8823 - loss: 0.3715 - val_accuracy: 0.9240 - val_loss: 0.3020
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9001 - loss: 0.3373 - val_accuracy: 0.9250 - val_loss: 0.2708
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9055 - loss: 0.3081 - val_accuracy: 0.9250 - val_loss: 0.2570
Epoch 7/50
[1m125/125[0m [32m━

In [38]:
import numpy as np
from keras.src.layers import Input, Dense,  Flatten
from keras.src.layers import BatchNormalization, Activation, Conv2DTranspose
from keras.src.models import Model, Sequential
from keras.src.layers import LeakyReLU, Reshape, Conv2D, UpSampling2D, ReLU

class BaseEncryptor:

    name: str

    def __init__(self, input_shape=None, output_shape=None):
        self.model = None
        self.output_shape = output_shape
        self.input_shape = input_shape

    def build_generator(self, input_shape, output_shape):
        raise NotImplementedError("Subclasses should implement this method")

    def encode(self, inputs) -> np.array:
        inputs = np.expand_dims(inputs, axis=0)
        if self.model is None:
            input_shape = inputs.shape[1:]
            output_shape = self.output_shape or (1, inputs.shape[2])
            self.model = self.build_generator(input_shape, output_shape)
        return self.model(inputs).numpy()

class DCEncryptor(BaseEncryptor):

    name = "dc"
    
    def build_generator1(self, input_shape, output_shape):
        G = Sequential()
        G.add(Reshape(target_shape=[1, 1, self.input_shape], input_shape=[self.input_shape]))
        # No weights or activations here

        # 1x1x4096
        G.add(Conv2DTranspose(filters=64, kernel_size=4))
        G.add(Activation('relu'))
        # Weights index: 0, Activations index: 1

        # 4x4x64
        G.add(Conv2D(filters=64, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 2, Activations index: 5
        G.add(UpSampling2D())
        # No weights or activations here

        # 8x8x64
        G.add(Conv2D(filters=32, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 8, Activations index: 9
        G.add(UpSampling2D())
        # No weights or activations here

        # 16x16x32
        G.add(Conv2D(filters=16, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 14, Activations index: 13
        G.add(UpSampling2D())
        # No weights or activations here

        # 32x32x16
        G.add(Conv2D(filters=8, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 20, Activations index: 17
        G.add(UpSampling2D())
        # No weights or activations here

        # 64x64x8
        G.add(Conv2D(filters=4, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 26, Activations index: 21
        G.add(UpSampling2D())
        # No weights or activations here

        # 128x128x4
        G.add(Conv2D(filters=3, kernel_size=4, padding='same'))
        G.add(Activation('sigmoid'))
        
        return G
        
    def build_generator(self, input_shape, output_shape):

        input_layer = Input(shape=input_shape)
        x = Flatten()(input_layer)
    
        x = Dense(4*4*256, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
    
        x = Reshape((4, 4, 256))(x)
    
        x = Conv2DTranspose(128, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        x = Conv2DTranspose(64, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        x = Conv2DTranspose(32, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        output_image = Conv2DTranspose(3, (4, 4), strides=(1, 1), padding='same', use_bias=False, activation='sigmoid')(x)
    
        return Model(inputs=input_layer, outputs=output_image)
        
    def build_generator_vgg224(self, input_shape, output_shape):

        input_layer = Input(shape=input_shape)
        x = Flatten()(input_layer)

        x = Dense(7*7*256, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)

        x = Reshape((7, 7, 256))(x)
        x = Conv2DTranspose(128, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        x = Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)

        x = Conv2DTranspose(32, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        x = Conv2DTranspose(16, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        output_image = Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh')(x)

        return Model(inputs=input_layer, outputs=output_image)
    
    
encoder = DCEncryptor(output_shape=(1, 32, 32, 4))

In [13]:
from keras.api.applications import ResNet152V2, VGG16, EfficientNetB7
# from keras.api.applications.vgg16 import preprocess_input
from keras.api.applications.resnet_v2 import preprocess_input
import tensorflow as tf
from keras.api.models import load_model


def pad(tensor, original, target=600):
    pad_height = (target - original) // 2
    pad_width = (target - original) // 2
    padded_tensor = tf.pad(tensor, [[pad_height, pad_height], [pad_width, pad_width], [0, 0]], mode='CONSTANT', constant_values=0)

    # If the dimensions are odd, add an extra row/column to one side
    if (600 - 224) % 2 != 0:
        padded_tensor = tf.pad(padded_tensor, [[0, 1], [0, 1], [0, 0]], mode='CONSTANT', constant_values=0)

    return padded_tensor[np.newaxis, ...]

def preprocess_image(image):
    # Assuming 'image' is your input tensor
    resized_image = tf.image.resize(image, (32, 32))
    return resized_image


class VGG16CloudModel:
    name = "vgg16"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.model = self.get_model()
        self.input_shape = (32,32,3)#(224, 224, 3)
        self.output_shape = (1,1000)

    def fit(self, X_train, y_train, **kwargs):
        pass

    def get_model(self):
        # Load the pretrained VGG16 model with ImageNet weights
        model = load_model("/Users/eden.yavin/Projects/Tabular-Cloud-ML/store/models/cifar100_vgg.keras") #VGG16(weights='imagenet')
        return model

    def predict(self, X):
        X = self.preprocess(X)
        predictions = self.model.predict(X, verbose=None)
        return predictions

    def preprocess(self, X):
        
        X = X.copy()
        # X = (X * 10000).astype(np.uint8)

        if any(s < 32 for s in X.shape[1:3]):
            # Pad the input to make its size equal to 224
            padded_X = tf.image.resize_with_crop_or_pad(X, 32, 32)

            # Ensure the input is properly preprocessed for VGG16
            X = preprocess_input(padded_X.numpy())
        else:
            # If no padding is needed, directly preprocess the input
            X = preprocess_input(X)

        return X

    
    
cloud = VGG16CloudModel()


In [39]:
X_encrypted, X_test_encrypted = [], []
X_embed, X_test_embed = [], []
for i, x in X_sample.iterrows():
    
    x_embed = embedding(x.values.reshape(1,-1))
    X_embed.append(x_embed)
    encrypted = encoder.encode(np.vstack(x_embed))

    X_encrypted.append(encrypted)
    
for i,x in X_test.iterrows():
    x_embed = embedding(x.values.reshape(1,-1))
    X_test_embed.append(x_embed)
    encrypted = encoder.encode(np.vstack(x_embed))
    X_test_encrypted.append(encrypted)

In [40]:
predictions = [
    cloud.predict(x)
    for x in X_encrypted
]
test_preds = [
    cloud.predict(x)
    for x in X_test_encrypted
]

In [23]:
# VGG Cifar100 | Encoder - Sigmoid 
g = predictions[0]
g.max()

0.96965593

In [41]:
# VGG Cifar100 | Encoder - tanh 
v = predictions[0]
v.max()

0.9696511

In [139]:
# Resnet no scalar
# p = predictions[0]
p.max()

1.0

In [109]:
# Resnet with scalar
# h = predictions[0]
h.max()

0.63243544

In [42]:

from keras.src.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, f1_score
from keras.src.models import Model
from keras.src.layers import Dense, Dropout, Input,  BatchNormalization
from keras.src.metrics import F1Score
import numpy as np

class NeuralNetworkInternalModel(BaseEstimator, ClassifierMixin):

    def __init__(self, **kwargs):
        self.batch_size = 8
        self.dropout_rate = 0.3
        self.epochs = 100
        self.model: Model = None

    def fit(self, X, y):
        y_onehot = to_categorical(y , 2)
        lr_scheduler = LearningRateScheduler(lambda epoch: 0.0001 * (0.9 ** epoch))
        early_stopping = EarlyStopping(patience=3, monitor='loss')
        self.model.fit(X, y_onehot, epochs=self.epochs, batch_size=self.batch_size, callbacks=[lr_scheduler, early_stopping])

    def predict(self, X):
        prediction = self.model.predict(X)
        return np.argmax(prediction, axis=1)


    def evaluate(self, X, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)

        pred = self.predict(X)
        return accuracy_score(y, pred), f1_score(y, pred, average='weighted')


class DenseInternalModel(NeuralNetworkInternalModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "neural_network"
        num_classes = kwargs.get("num_classes")
        input_shape = kwargs.get("input_shape")
        self.model = self.get_model(num_classes=num_classes, input_shape=input_shape)

    def get_model(self, num_classes, input_shape):
        # Build the model
        inputs = Input(shape=(input_shape,))  # Dynamic input shape

        # Define the hidden layers
        x = BatchNormalization()(inputs)
        x = Dense(units=128, activation='leaky_relu')(x)
        x = Dropout(self.dropout_rate)(x)

        # Define the output layer
        outputs = Dense(units=num_classes, activation='softmax')(x)

        # Create the model
        model = Model(inputs=inputs, outputs=outputs)

        # Compile the model with F1 Score
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy', F1Score()]
                      )

        return model
    
iim = DenseInternalModel(num_classes=2, input_shape=predictions[0].shape[1])

In [43]:
iim.fit(np.vstack(predictions), y_sample)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9018 - f1_score: 0.5085 - loss: 0.6594 - learning_rate: 1.0000e-04
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9118 - f1_score: 0.4769 - loss: 0.4453 - learning_rate: 9.0000e-05
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9049 - f1_score: 0.4750 - loss: 0.3254 - learning_rate: 8.1000e-05
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9234 - f1_score: 0.4801 - loss: 0.2734 - learning_rate: 7.2900e-05
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9140 - f1_score: 0.4775 - loss: 0.2922 - learning_rate: 6.5610e-05
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9236 - f1_score: 0.4801 - loss: 0.2696 - learning_rate: 5.9

In [44]:
iim.evaluate(np.vstack(test_preds), y_test)
# resnet

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


(0.94, 0.9109278350515464)

In [45]:
X_pred_embed, X_test_pred_embed = [], []
for embed, pred in zip(X_embed, predictions):
    X_pred_embed.append(np.hstack([embed, pred]))
    
for embed, pred in zip(X_test_embed, test_preds):
    X_test_pred_embed.append(np.hstack([embed, pred]))

In [46]:
iim = DenseInternalModel(num_classes=2, input_shape=X_pred_embed[0].shape[1])

iim.fit(np.vstack(X_pred_embed), y_sample)


Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6921 - f1_score: 0.4720 - loss: 0.6348 - learning_rate: 1.0000e-04
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9302 - f1_score: 0.4819 - loss: 0.3988 - learning_rate: 9.0000e-05
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9244 - f1_score: 0.4803 - loss: 0.2926 - learning_rate: 8.1000e-05
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9262 - f1_score: 0.4808 - loss: 0.2575 - learning_rate: 7.2900e-05
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9133 - f1_score: 0.4772 - loss: 0.2732 - learning_rate: 6.5610e-05
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9235 - f1_score: 0.4801 - loss: 0.2485 - learning_rate: 5.9

In [30]:
iim.evaluate(np.vstack(X_test_pred_embed), y_test)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


(0.94, 0.9109278350515464)

In [48]:
baseline = DenseInternalModel(num_classes=2, input_shape=X_sample.shape[1])
baseline.fit(X_sample, y_sample)
baseline.evaluate(X_test, y_test)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5807 - f1_score: 0.4430 - loss: 0.6987 - learning_rate: 1.0000e-04
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7671 - f1_score: 0.4760 - loss: 0.5389 - learning_rate: 9.0000e-05
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8928 - f1_score: 0.5261 - loss: 0.4287 - learning_rate: 8.1000e-05
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9114 - f1_score: 0.5282 - loss: 0.3774 - learning_rate: 7.2900e-05
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9261 - f1_score: 0.5431 - loss: 0.3235 - learning_rate: 6.5610e-05
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9184 - f1_score: 0.4939 - loss: 0.3105 - learning_rate: 5.9

(0.939, 0.9104280556988138)