In [1]:
import pickle
import pathlib
import os
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from keras.src import Sequential
from keras.src.layers import Dense, BatchNormalization, Dropout
from keras.src.callbacks import EarlyStopping

In [2]:
PROJECT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(PROJECT_PATH, "..")
MODELS_DIR = pathlib.Path(PROJECT_PATH) / "store" / "models"

DATASET_DIR = pathlib.Path(PROJECT_PATH) / "data"

In [3]:
import pandas as pd

dataset = pd.read_csv(DATASET_DIR / "heloc"/ "dataset.csv")
dataset

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [4]:
# X, y = dataset.iloc[:, 1:], dataset.iloc[:, 0]
# 
# # Remove the bug in the dataset where the entire row has -9 values
# mask = ~(X == -9).all(axis=1)
# X = X[mask]
# y = y[mask]
# 
# y = y.replace({"no": 0, "yes": 1}).astype(int)

In [8]:
def preprocess(X: pd.DataFrame):
    """
    The function will preprocess the data:
    1. Categorical features will be label encoded (Boy->1, Girl ->2)
    2. Numerical features will be scaled if the data is intended to be used for baseline. For cloud data set, no scaling will be preformed.

    Return pd.Dataframe
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Initialize lists to store processed columns
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("\nEncoding categorical columns...")
        # onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
        # X_categorical = pd.DataFrame(onehot_encoder.fit_transform(X[categorical_cols]),
        #                              columns=onehot_encoder.get_feature_names_out(categorical_cols))
        label_encoder = LabelEncoder()
        X_categorical = pd.DataFrame()
        for col in categorical_cols:
            X_categorical[col] = label_encoder.fit_transform(X[col])

        processed_columns.append(X_categorical)

    # Apply standard scaling to the numeric columns
    if numeric_cols:
        print("\nScaling numerical columns...")
        scaler = MinMaxScaler()
        X_numeric = X[numeric_cols]
        # X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe


    return X_processed


dataset = preprocess(dataset)


Encoding categorical columns...

Scaling numerical columns...


In [9]:
from sklearn.model_selection import train_test_split
X,y = dataset.iloc[:, 1:], dataset.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7844, 23), (2615, 23), (7844,), (2615,))

In [10]:
import pandas as pd
import torch.nn as nn
from keras.src.utils import to_categorical


class DNNEmbedding(nn.Module):

    name = "dnn_embedding"

    def __init__(self, **kwargs):
        super(DNNEmbedding, self).__init__()

        X, y = kwargs.get("X"), kwargs.get("y")
        num_classes = len(set(y))
        y = to_categorical(y, num_classes=num_classes)

        model = Sequential()
        model.add(Dense(units=X.shape[1]//2, activation='tanh', name="embedding"))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
        model.add(Dense(units=num_classes, activation='softmax', name="output"))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        early_stop = EarlyStopping(patience=2, monitor="loss")

        model.fit(X, y,validation_data=[X_test, to_categorical(y_test,2)], epochs=50, batch_size=8, callbacks=[early_stop])
        self.model = model.layers[0]
        self.output_shape = (1, X.shape[1]//2)


    def forward(self, x):

        if type(x) is pd.DataFrame:
            x = x.to_numpy()

        embedding = self.model(x)
        return embedding


embedding = DNNEmbedding(X=X_train, y=y_train)



Epoch 1/50


2024-12-29 19:09:45.826179: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-12-29 19:09:45.826213: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-12-29 19:09:45.826219: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-12-29 19:09:45.826241: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-29 19:09:45.826253: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-12-29 19:09:46.196824: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.5734 - loss: 0.7618 - val_accuracy: 0.6543 - val_loss: 0.6518
Epoch 2/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6247 - loss: 0.6621 - val_accuracy: 0.6524 - val_loss: 0.6420
Epoch 3/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6392 - loss: 0.6432 - val_accuracy: 0.6662 - val_loss: 0.6183
Epoch 4/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6387 - loss: 0.6398 - val_accuracy: 0.6704 - val_loss: 0.6258
Epoch 5/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6452 - loss: 0.6430 - val_accuracy: 0.6635 - val_loss: 0.6148
Epoch 6/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.6485 - loss: 0.6393 - val_accuracy: 0.6707 - val_loss: 0.6147
Epoch 7/50
[1m981/981[0m 

In [12]:
import numpy as np
from keras.src.layers import Input, Dense,  Flatten
from keras.src.layers import BatchNormalization, Activation, Conv2DTranspose
from keras.src.models import Model, Sequential
from keras.src.layers import LeakyReLU, Reshape, Conv2D, UpSampling2D, ReLU

class BaseEncryptor:

    name: str

    def __init__(self, input_shape=None, output_shape=None):
        self.model = None
        self.output_shape = output_shape
        self.input_shape = input_shape

    def build_generator(self, input_shape, output_shape):
        raise NotImplementedError("Subclasses should implement this method")

    def encode(self, inputs) -> np.array:
        inputs = np.expand_dims(inputs, axis=0)
        if self.model is None:
            input_shape = inputs.shape[1:]
            output_shape = self.output_shape or (1, inputs.shape[2])
            self.model = self.build_generator(input_shape, output_shape)
        return self.model(inputs).numpy()

class DCEncryptor(BaseEncryptor):

    name = "dc"
        
    def build_generator(self, input_shape, output_shape):

          # Ziv's Model
        G = Sequential()

        G.add(Reshape(target_shape=[1, *input_shape[1:]], input_shape=input_shape))
        # No weights or activations here

        # 1x1x4096
        G.add(Conv2DTranspose(filters=64, kernel_size=4))
        G.add(Activation('relu'))
        # Weights index: 0, Activations index: 1

        # 4x4x64
        G.add(Conv2D(filters=64, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 2, Activations index: 5
        G.add(UpSampling2D())
        # No weights or activations here

        # 8x8x64
        G.add(Conv2D(filters=32, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 8, Activations index: 9
        G.add(UpSampling2D())
        # No weights or activations here

        # 16x16x32
        G.add(Conv2D(filters=16, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 14, Activations index: 13
        G.add(UpSampling2D())
        # No weights or activations here

        # 32x32x16
        G.add(Conv2D(filters=8, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 20, Activations index: 17
        G.add(UpSampling2D())
        # No weights or activations here

        # 64x64x8
        G.add(Conv2D(filters=4, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 26, Activations index: 21
        G.add(UpSampling2D())
        # No weights or activations here

        # 128x128x4
        G.add(Conv2D(filters=3, kernel_size=4, padding='same'))
        G.add(Activation('sigmoid'))
        # Weights index: 32, Activations index: 25

        return G
    
    
encoder = DCEncryptor(output_shape=(1, *(224, 224, 3)))

In [13]:
from tqdm import tqdm
X_encrypted, X_test_encrypted = [], []
X_embed, X_test_embed = [], []
for i, x in tqdm(X_train.iterrows(), total=len(X)):
    
    x_embed = embedding(x.values.reshape(1,-1))
    X_embed.append(x_embed)
    x_embed = np.vstack(x_embed)[np.newaxis, ...]
    encrypted = encoder.encode(x_embed)

    X_encrypted.append(encrypted)
    


  super().__init__(**kwargs)
 75%|███████▍  | 7844/10459 [02:03<00:41, 63.67it/s]


In [14]:
for i,x in tqdm(X_test.iterrows(), total=len(X_test)):
    x_embed = embedding(x.values.reshape(1,-1))
    X_embed.append(x_embed)
    x_embed = np.vstack(x_embed)[np.newaxis, ...]
    encrypted = encoder.encode(x_embed)
    
    X_test_encrypted.append(encrypted)

100%|██████████| 2615/2615 [00:40<00:00, 64.16it/s]


In [15]:
from keras.src.models import Sequential


from keras.src.layers import Conv2D, MaxPooling2D, Flatten, Dense


student_model = Sequential([
    Flatten(input_shape=(224, 224, 3)),
    Dense(512, activation='relu'),
    Dense(1000, activation='softmax'),    
])

# student_model = Sequential([
#     Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3)),
#     MaxPooling2D((2, 2)),  # Output: (64,64,64)
#     Conv2D(128, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),  # Output: (32,32,128)
#     Conv2D(256, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),  # Output: (16,16,256)
#     Conv2D(256, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),  # Output: (8,8,256)
#     Conv2D(256, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),  # Output: (4,4,256)
#     Conv2D(512, (3, 3), activation='relu', padding='same'),
# ])

# student_model = Sequential([
#     Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(128, 128, 3)),
#     MaxPooling2D((2, 2)),
#     Conv2D(128, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),
#     Conv2D(256, (3, 3), activation='relu', padding='same'),
#     MaxPooling2D((2, 2)),
#     Flatten(),
#     Dense(512, activation='relu'),
#     Dense(1000, activation='softmax')
# ])

  super().__init__(**kwargs)


In [16]:
student_model.summary()

In [17]:
import tensorflow as tf
from keras.src.losses import categorical_crossentropy as logloss, kl_divergence as KLD_Loss
from keras.src.metrics.accuracy_metrics import categorical_accuracy
NUM_CLASSES = 1000

def distillation_loss(y_true, y_pred, temperature=3.5, lambd=0.5, is_embeddings=False):
    if is_embeddings:
        y_true, y_pred = tf.nn.softmax(y_true), tf.nn.softmax(y_pred)
    # The teacher's model prediction vector is the y_true.
    # To use KL-div loss we first need to soften the outputs
    y_true_KD = tf.nn.softmax(y_true / temperature, axis=1)
    y_pred_KD = tf.nn.softmax(y_pred / temperature, axis=1)
                        
    # # Classic cross-entropy (without temperature)
    # CE_loss = logloss(y_true,y_pred)
    
    # KL-Divergence loss for softened output (with temperature)
    KL_loss = temperature**2*KLD_Loss(y_true_KD,y_pred_KD)
    return KL_loss
    # return lambd*CE_loss + (1-lambd)*KL_loss


In [18]:
from keras.src.optimizers import Adam
from keras.src.layers import Lambda, Activation
from keras.src.applications.vgg16 import preprocess_input as vgg_preprocess_input, VGG16

# teacher_model = cloud.model
input_tensor = Input(shape=(224,224,3)) #Input(shape=(128, 128, 3))
teacher_model = VGG16(weights="imagenet")#, include_top=False, input_tensor=input_tensor)
teacher_model.trainable = False
optimizer = Adam()

# Preprocess function (adjust as needed for your specific case)
def preprocess(images):
    padd_images = tf.image.resize_with_crop_or_pad(images, 224, 224)
    return vgg_preprocess_input(padd_images)

@tf.function
def train_step(images):
    with tf.GradientTape() as tape:
        teacher_preds = teacher_model(images)        
        student_preds = student_model(images, training=True)
        loss = distillation_loss(teacher_preds, student_preds)
    
    gradients = tape.gradient(loss, student_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, student_model.trainable_variables))
    return loss


In [19]:

num_epochs = 2
train_dataset = np.vstack(X_encrypted)
batch_size = 32

# Assuming X_encrypted is a numpy array, convert it to a tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(train_dataset)
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

for epoch in range(num_epochs):
    epoch_loss = []
    progress_bar = tqdm(train_dataset, desc=f"Epoch {epoch + 1}/{num_epochs}")
    
    for batch in progress_bar:
        batch = preprocess(batch)
        loss = train_step(batch)
        epoch_loss.append(loss.numpy())
        
        # Update progress bar
        
        progress_bar.set_postfix({'loss': f'{np.mean(epoch_loss):.4f}'})
    
    print(f"Epoch {epoch + 1}, Average Loss: {np.mean(epoch_loss):.4f}")

Epoch 1/2: 100%|██████████| 245/245 [01:18<00:00,  3.13it/s, loss=0.0006]2024-12-29 19:18:12.903329: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Epoch 1/2: 100%|██████████| 245/245 [01:18<00:00,  3.11it/s, loss=0.0006]


Epoch 1, Average Loss: 0.0006


Epoch 2/2: 100%|██████████| 245/245 [01:18<00:00,  3.13it/s, loss=0.0006]2024-12-29 19:19:31.086551: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Epoch 2/2: 100%|██████████| 245/245 [01:18<00:00,  3.13it/s, loss=0.0006]

Epoch 2, Average Loss: 0.0006





In [99]:
# for epoch in range(num_epochs):
#     for images in tqdm.tqdm(train_dataset, total=len(train_dataset)):
#         images = preprocess(images)
#         loss = train_step(images)
#     print(f"Epoch {epoch + 1}, Loss: {loss.numpy()}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.