In [1]:
import pickle
import pathlib
import os
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder

In [2]:
PROJECT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(PROJECT_PATH, "..")
MODELS_DIR = pathlib.Path(PROJECT_PATH) / "store" / "models"

DATASET_DIR = pathlib.Path(PROJECT_PATH) / "data"

In [3]:
import pandas as pd

dataset = pd.read_csv(DATASET_DIR / "bank_marketing"/ "dataset.csv")
dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,27,management,single,secondary,no,35,no,no,cellular,4,jul,255,1,-1,0,,no
1,54,blue-collar,married,primary,no,466,no,no,cellular,4,jul,297,1,-1,0,,no
2,43,blue-collar,married,secondary,no,105,no,yes,cellular,4,jul,668,2,-1,0,,no
3,31,technician,single,secondary,no,19,no,no,telephone,4,jul,65,2,-1,0,,no
4,27,technician,single,secondary,no,126,yes,yes,cellular,4,jul,436,4,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30902,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,yes
30903,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,yes
30904,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
30905,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,no


In [4]:
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Remove the bug in the dataset where the entire row has -9 values
mask = ~(X == -9).all(axis=1)
X = X[mask]
y = y[mask]

y = y.replace({"no": 0, "yes": 1}).astype(int)

  y = y.replace({"no": 0, "yes": 1}).astype(int)


In [5]:
def preprocess(X: pd.DataFrame):
    """
    The function will preprocess the data:
    1. Categorical features will be label encoded (Boy->1, Girl ->2)
    2. Numerical features will be scaled if the data is intended to be used for baseline. For cloud data set, no scaling will be preformed.

    Return pd.Dataframe
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Initialize lists to store processed columns
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("\nEncoding categorical columns...")
        # onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
        # X_categorical = pd.DataFrame(onehot_encoder.fit_transform(X[categorical_cols]),
        #                              columns=onehot_encoder.get_feature_names_out(categorical_cols))
        label_encoder = LabelEncoder()
        X_categorical = pd.DataFrame()
        for col in categorical_cols:
            X_categorical[col] = label_encoder.fit_transform(X[col])

        processed_columns.append(X_categorical)

    # Apply standard scaling to the numeric columns
    if numeric_cols:
        print("\nScaling numerical columns...")
        scaler = MinMaxScaler()
        # X_numeric = X[numeric_cols]
        X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe


    return X_processed


X = preprocess(X)


Encoding categorical columns...

Scaling numerical columns...


In [6]:
X = preprocess(X)
X_sample, y_sample = X.iloc[2000:22000], y.iloc[2000:22000]
y_sample.value_counts()


Scaling numerical columns...


y
0    18287
1     1713
Name: count, dtype: int64

In [7]:
X_test, y_test = X.iloc[:2000], y.iloc[:2000]
y_test.value_counts()

y
0    1878
1     122
Name: count, dtype: int64

In [8]:
import pandas as pd
import torch.nn as nn, tensorflow as tf
from keras.src.utils import to_categorical
from keras.src.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.src.callbacks import EarlyStopping
from keras.src import Sequential


class DNNEmbedding(nn.Module):

    name = "dnn_embedding"

    def __init__(self, **kwargs):
        super(DNNEmbedding, self).__init__()

        X, y = kwargs.get("X"), kwargs.get("y")
        num_classes = len(set(y))
        y = to_categorical(y, num_classes=num_classes)

        model = Sequential()
        model.add(Dense(units=X.shape[1]//2, activation='tanh', name="embedding"))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
        model.add(Dense(units=num_classes, activation='softmax', name="output"))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        early_stop = EarlyStopping(patience=2, monitor="loss")

        model.fit(X, y,validation_data=[X_test, to_categorical(y_test,2)], epochs=50, batch_size=8, callbacks=[early_stop])
        self.model = model.layers[0]
        self.output_shape = (1, X.shape[1]//2)


    def forward(self, x):

        if type(x) is pd.DataFrame:
            x = x.to_numpy()

        embedding = self.model(x)
        return embedding


# Code to run on CPU
with tf.device('/CPU:0'):
    
    embedding = DNNEmbedding(X=X_sample, y=y_sample)



  from .autonotebook import tqdm as notebook_tqdm


Epoch 1/50


2024-12-22 20:55:26.347344: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-12-22 20:55:26.347361: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-12-22 20:55:26.347365: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-12-22 20:55:26.347586: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-22 20:55:26.347599: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 308us/step - accuracy: 0.7992 - loss: 0.4826 - val_accuracy: 0.9410 - val_loss: 0.1740
Epoch 2/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 278us/step - accuracy: 0.9155 - loss: 0.2582 - val_accuracy: 0.9470 - val_loss: 0.1401
Epoch 3/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 271us/step - accuracy: 0.9222 - loss: 0.2243 - val_accuracy: 0.9485 - val_loss: 0.1459
Epoch 4/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 272us/step - accuracy: 0.9128 - loss: 0.2371 - val_accuracy: 0.9485 - val_loss: 0.1496
Epoch 5/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 271us/step - accuracy: 0.9167 - loss: 0.2336 - val_accuracy: 0.9480 - val_loss: 0.1528
Epoch 6/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 275us/step - accuracy: 0.9171 - loss: 0.2308 - val_accuracy: 0.9465 - val_loss: 0.1386
Epoch 7/50
[1m

In [9]:
import numpy as np
from keras.src.layers import Input, Dense,  Flatten
from keras.src.layers import BatchNormalization, Activation, Conv2DTranspose
from keras.src.models import Model, Sequential
from keras.src.layers import LeakyReLU, Reshape, Conv2D, UpSampling2D, ReLU

class BaseEncryptor:

    name: str

    def __init__(self, input_shape=None, output_shape=None):
        self.model = None
        self.output_shape = output_shape
        self.input_shape = input_shape

    def build_generator(self, input_shape, output_shape):
        raise NotImplementedError("Subclasses should implement this method")

    def encode(self, inputs) -> np.array:
        inputs = np.expand_dims(inputs, axis=0)
        if self.model is None:
            input_shape = inputs.shape[1:]
            output_shape = self.output_shape or (1, inputs.shape[2])
            self.model = self.build_generator(input_shape, output_shape)
        return self.model(inputs).numpy()

class DCEncryptor(BaseEncryptor):

    name = "dc"
    
    def build_generator1(self, input_shape, output_shape):
        G = Sequential()
        G.add(Reshape(target_shape=[1, 1, self.input_shape], input_shape=[self.input_shape]))
        # No weights or activations here

        # 1x1x4096
        G.add(Conv2DTranspose(filters=64, kernel_size=4))
        G.add(Activation('relu'))
        # Weights index: 0, Activations index: 1

        # 4x4x64
        G.add(Conv2D(filters=64, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 2, Activations index: 5
        G.add(UpSampling2D())
        # No weights or activations here

        # 8x8x64
        G.add(Conv2D(filters=32, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 8, Activations index: 9
        G.add(UpSampling2D())
        # No weights or activations here

        # 16x16x32
        G.add(Conv2D(filters=16, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 14, Activations index: 13
        G.add(UpSampling2D())
        # No weights or activations here

        # 32x32x16
        G.add(Conv2D(filters=8, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 20, Activations index: 17
        G.add(UpSampling2D())
        # No weights or activations here

        # 64x64x8
        G.add(Conv2D(filters=4, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 26, Activations index: 21
        G.add(UpSampling2D())
        # No weights or activations here

        # 128x128x4
        G.add(Conv2D(filters=3, kernel_size=4, padding='same'))
        G.add(Activation('sigmoid'))
        
        return G
        
    def build_generator(self, input_shape, output_shape):

        input_layer = Input(shape=input_shape)
        x = Flatten()(input_layer)
    
        x = Dense(4*4*256, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
    
        x = Reshape((4, 4, 256))(x)
    
        x = Conv2DTranspose(128, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        x = Conv2DTranspose(64, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        x = Conv2DTranspose(32, (4, 4), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        output_image = Conv2DTranspose(3, (4, 4), strides=(1, 1), padding='same', use_bias=False, activation='sigmoid')(x)
    
        return Model(inputs=input_layer, outputs=output_image)
        
    def build_generator_vgg224(self, input_shape, output_shape):

        input_layer = Input(shape=input_shape)
        x = Flatten()(input_layer)

        x = Dense(7*7*256, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)

        x = Reshape((7, 7, 256))(x)
        x = Conv2DTranspose(128, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        x = Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)

        x = Conv2DTranspose(32, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        x = Conv2DTranspose(16, (5, 5), strides=(2, 2), padding='same', use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

        output_image = Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh')(x)

        return Model(inputs=input_layer, outputs=output_image)
    
    
encoder = DCEncryptor(output_shape=(1, 32, 32, 4))

In [13]:
from keras.api.applications import ResNet152V2, VGG16, EfficientNetB7
# from keras.api.applications.vgg16 import preprocess_input
from keras.api.applications.resnet_v2 import preprocess_input
import tensorflow as tf
from keras.api.models import load_model


def pad(tensor, original, target=600):
    pad_height = (target - original) // 2
    pad_width = (target - original) // 2
    padded_tensor = tf.pad(tensor, [[pad_height, pad_height], [pad_width, pad_width], [0, 0]], mode='CONSTANT', constant_values=0)

    # If the dimensions are odd, add an extra row/column to one side
    if (600 - 224) % 2 != 0:
        padded_tensor = tf.pad(padded_tensor, [[0, 1], [0, 1], [0, 0]], mode='CONSTANT', constant_values=0)

    return padded_tensor[np.newaxis, ...]

def preprocess_image(image):
    # Assuming 'image' is your input tensor
    resized_image = tf.image.resize(image, (32, 32))
    return resized_image


class VGG16CloudModel:
    name = "vgg16"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.model = self.get_model()
        self.input_shape = (224, 224, 3) 
        self.output_shape = (1,1000)

    def fit(self, X_train, y_train, **kwargs):
        pass

    def get_model(self):
        # Load the pretrained VGG16 model with ImageNet weights
        # model = load_model("/Users/eden.yavin/Projects/Tabular-Cloud-ML/store/models/cifar100_vgg.keras") #
        model = VGG16(weights='imagenet')
        return model

    def predict(self, X):
        X = self.preprocess(X)
        predictions = self.model.predict(X, verbose=None)
        return predictions

    def preprocess(self, X):
        
        X = X.copy()
        # X = (X * 10000).astype(np.uint8)

        if any(s < 224 for s in X.shape[1:3]):
            # Pad the input to make its size equal to 224
            padded_X = tf.image.resize_with_crop_or_pad(X, 224, 224)

            # Ensure the input is properly preprocessed for VGG16
            X = preprocess_input(padded_X.numpy())
        else:
            # If no padding is needed, directly preprocess the input
            X = preprocess_input(X)

        return X

    
    
cloud = VGG16CloudModel()


In [16]:
from tqdm import tqdm
X_encrypted, X_test_encrypted = [], []
X_embed, X_test_embed = [], []

with tf.device('/GPU:0'):
        
    for i, x in tqdm(X_sample.iterrows(), total=len(X_sample), leave=True, position=0):
        
        x_embed = embedding(x.values.reshape(1,-1))
        X_embed.append(x_embed)
        encrypted = encoder.encode(np.vstack(x_embed))
    
        X_encrypted.append(encrypted)
        
    for i,x in tqdm(X_test.iterrows(), total=len(X_test), leave=True, position=0):
        x_embed = embedding(x.values.reshape(1,-1))
        X_test_embed.append(x_embed)
        encrypted = encoder.encode(np.vstack(x_embed))
        X_test_encrypted.append(encrypted)

100%|██████████| 20000/20000 [02:06<00:00, 158.39it/s]
100%|██████████| 2000/2000 [00:12<00:00, 163.28it/s]


In [17]:
with tf.device('/GPU:0'):

    predictions = [
        cloud.predict(x)
        for x in tqdm(X_encrypted, total=len(X_encrypted), leave=True, position=0)
    ]
    test_preds = [
        cloud.predict(x)
        for x in tqdm(X_test_encrypted, total=len(X_test_encrypted), leave=True, position=0)
]

  0%|          | 0/20000 [00:00<?, ?it/s]2024-12-22 21:01:14.019489: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
100%|██████████| 20000/20000 [19:14<00:00, 17.32it/s]
100%|██████████| 2000/2000 [02:09<00:00, 15.47it/s]


In [18]:
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

# Define the base models
base_model1 = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
base_model2 = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Define the meta-model
meta_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Create the stacking classifier
stacking_model = StackingClassifier(
    estimators=[('xgb1', base_model1), ('xgb2', base_model2)],
    final_estimator=meta_model,
    # cv=5
)

In [19]:
X_pred_embed, X_test_pred_embed = [], []
for embed, pred in zip(X_embed, predictions):
    X_pred_embed.append(np.hstack([embed, pred]))
    
for embed, pred in zip(X_test_embed, test_preds):
    X_test_pred_embed.append(np.hstack([embed, pred]))

In [21]:
X_pred_embed = np.vstack(X_pred_embed)
X_test_pred_embed = np.vstack(X_test_pred_embed)

In [23]:
from sklearn.metrics import accuracy_score, f1_score


# Fit the stacking model
stacking_model.fit(X_pred_embed, y_sample)

# Make predictions on the test set
y_pred = stacking_model.predict(X_test_pred_embed)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking Model Accuracy: {accuracy:.4f}")
print(f"Stacking Model F1 Score: {f1_score(y_test, y_pred):.4f}")

Stacking Model Accuracy: 0.9455
Stacking Model F1 Score: 0.3978


In [25]:

# Create and train the first XGBoost model on X1
model1 = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model1.fit(np.vstack(X_embed), y_sample)

# Create and train the second XGBoost model on X2
model2 = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model2.fit(np.vstack(predictions), y_sample)

# Make predictions using both models (probability of class 1)
pred1_train = model1.predict_proba(np.vstack(X_embed))[:, 1].reshape(-1, 1)
pred2_train = model2.predict_proba(np.vstack(predictions))[:, 1].reshape(-1, 1)
pred1_test = model1.predict_proba(np.vstack(X_test_embed))[:, 1].reshape(-1, 1)
pred2_test = model2.predict_proba(np.vstack(test_preds))[:, 1].reshape(-1, 1)

# Combine predictions for meta-model training
X_meta_train = np.hstack((pred1_train, pred2_train))
X_meta_test = np.hstack((pred1_test, pred2_test))

# Create and train the meta-model (stacking)
meta_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
meta_model.fit(X_meta_train, y_sample)

# Make final predictions using the meta-model
final_predictions = meta_model.predict(X_meta_test)
final_proba = meta_model.predict_proba(X_meta_test)[:, 1]

# Evaluate the stacked model
accuracy = accuracy_score(y_test, final_predictions)

In [27]:
accuracy, f1_score(y_test, final_predictions, average='weighted')

(0.9385, 0.938381641511976)