In [1]:
import pickle
import pathlib
import os
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder


In [2]:
PROJECT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(PROJECT_PATH, "..")
MODELS_DIR = pathlib.Path(PROJECT_PATH) / "store" / "models"

DATASET_DIR = pathlib.Path(PROJECT_PATH) / "data"

In [3]:
import pandas as pd

dataset = pd.read_csv(DATASET_DIR / "heloc"/ "dataset.csv")
dataset

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [7]:
def preprocess(X: pd.DataFrame):
    """
    The function will preprocess the data:
    1. Categorical features will be label encoded (Boy->1, Girl ->2)
    2. Numerical features will be scaled if the data is intended to be used for baseline. For cloud data set, no scaling will be preformed.

    Return pd.Dataframe
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Initialize lists to store processed columns
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("\nEncoding categorical columns...")
        # onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
        # X_categorical = pd.DataFrame(onehot_encoder.fit_transform(X[categorical_cols]),
        #                              columns=onehot_encoder.get_feature_names_out(categorical_cols))
        label_encoder = LabelEncoder()
        X_categorical = pd.DataFrame()
        for col in categorical_cols:
            X_categorical[col] = label_encoder.fit_transform(X[col])

        processed_columns.append(X_categorical)

    # Apply standard scaling to the numeric columns
    if numeric_cols:
        print("\nScaling numerical columns...")
        scaler = MinMaxScaler()
        X_numeric = X[numeric_cols]
        # X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe


    return X_processed


dataset = preprocess(dataset)


Encoding categorical columns...

Scaling numerical columns...


In [8]:
from sklearn.model_selection import train_test_split
X,y = dataset.iloc[:, 1:], dataset.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7844, 23), (2615, 23), (7844,), (2615,))

In [9]:
import pandas as pd
import torch.nn as nn, tensorflow as tf
from keras.src.utils import to_categorical
from keras.src.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.src.callbacks import EarlyStopping
from keras.src import Sequential


class DNNEmbedding(nn.Module):

    name = "dnn_embedding"

    def __init__(self, **kwargs):
        super(DNNEmbedding, self).__init__()

        X, y = kwargs.get("X"), kwargs.get("y")
        num_classes = len(set(y))
        y = to_categorical(y, num_classes=num_classes)

        model = Sequential()
        model.add(Dense(units=X.shape[1]//2, activation='tanh', name="embedding"))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
        model.add(Dense(units=num_classes, activation='softmax', name="output"))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        early_stop = EarlyStopping(patience=2, monitor="loss")

        model.fit(X, y,validation_data=[X_test, to_categorical(y_test,2)], epochs=50, batch_size=8, callbacks=[early_stop])
        self.model = model.layers[0]
        self.classifier = model
        self.output_shape = (1, X.shape[1]//2)


    def forward(self, x):

        if type(x) is pd.DataFrame:
            x = x.to_numpy()

        embedding = self.model(x)
        return embedding


# Code to run on CPU
with tf.device('/CPU:0'):
    
    embedding = DNNEmbedding(X=X_train, y=y_train)



Epoch 1/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 366us/step - accuracy: 0.5725 - loss: 0.8027 - val_accuracy: 0.6482 - val_loss: 0.6423
Epoch 2/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300us/step - accuracy: 0.6063 - loss: 0.6738 - val_accuracy: 0.6459 - val_loss: 0.6395
Epoch 3/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302us/step - accuracy: 0.6203 - loss: 0.6535 - val_accuracy: 0.6505 - val_loss: 0.6324
Epoch 4/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301us/step - accuracy: 0.6324 - loss: 0.6461 - val_accuracy: 0.6505 - val_loss: 0.6250
Epoch 5/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299us/step - accuracy: 0.6320 - loss: 0.6512 - val_accuracy: 0.6493 - val_loss: 0.6183
Epoch 6/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299us/step - accuracy: 0.6483 - loss: 0.6351 - val_accuracy: 0.6474 - val_loss: 0.6246
Epoch 7/50
[1m9

In [14]:
import numpy as np
from keras.src.layers import Input, Dense,  Flatten
from keras.src.layers import BatchNormalization, Activation, Conv2DTranspose
from keras.src.models import Model, Sequential
from keras.src.layers import LeakyReLU, Reshape, Conv2D, UpSampling2D, ReLU

class BaseEncryptor:

    name: str

    def __init__(self, input_shape=None, output_shape=None):
        self.model = None
        self.output_shape = output_shape
        self.input_shape = input_shape

    def build_generator(self, input_shape, output_shape):
        raise NotImplementedError("Subclasses should implement this method")

    def encode(self, inputs) -> np.array:
        inputs = np.expand_dims(inputs, axis=0)
        if self.model is None:
            input_shape = inputs.shape[1:]
            output_shape = self.output_shape or (1, inputs.shape[2])
            self.model = self.build_generator(input_shape, output_shape)
        return self.model(inputs).numpy()

class DCEncryptor(BaseEncryptor):

    name = "dc"
    
    def build_generator(self, input_shape, output_shape):
        # Ziv's Model
        G = Sequential()

        G.add(Reshape(target_shape=[1, *input_shape[1:]], input_shape=input_shape))
        # No weights or activations here

        # 1x1x4096
        G.add(Conv2DTranspose(filters=64, kernel_size=4))
        G.add(Activation('relu'))
        # Weights index: 0, Activations index: 1

        # 4x4x64
        G.add(Conv2D(filters=64, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 2, Activations index: 5
        G.add(UpSampling2D())
        # No weights or activations here

        # 8x8x64
        G.add(Conv2D(filters=32, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 8, Activations index: 9
        G.add(UpSampling2D())
        # No weights or activations here

        # 16x16x32
        G.add(Conv2D(filters=16, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 14, Activations index: 13
        G.add(UpSampling2D())
        # No weights or activations here

        # 32x32x16
        G.add(Conv2D(filters=8, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 20, Activations index: 17
        G.add(UpSampling2D())
        # No weights or activations here

        # 64x64x8
        G.add(Conv2D(filters=4, kernel_size=4, padding='same'))
        G.add(BatchNormalization(momentum=0.7))
        G.add(Activation('relu'))
        # Weights index: 26, Activations index: 21
        G.add(UpSampling2D())
        # No weights or activations here

        # 128x128x4
        G.add(Conv2D(filters=3, kernel_size=4, padding='same'))
        G.add(Activation('sigmoid'))
        # Weights index: 32, Activations index: 25

        return G
    
    
encoder = DCEncryptor(output_shape=(1, 224, 224, 4))

In [15]:
from keras.api.applications import ResNet152V2, VGG16, EfficientNetB7
# from keras.api.applications.vgg16 import preprocess_input
from keras.api.applications.resnet_v2 import preprocess_input
import tensorflow as tf
from keras.api.models import load_model


def pad(tensor, original, target=600):
    pad_height = (target - original) // 2
    pad_width = (target - original) // 2
    padded_tensor = tf.pad(tensor, [[pad_height, pad_height], [pad_width, pad_width], [0, 0]], mode='CONSTANT', constant_values=0)

    # If the dimensions are odd, add an extra row/column to one side
    if (600 - 224) % 2 != 0:
        padded_tensor = tf.pad(padded_tensor, [[0, 1], [0, 1], [0, 0]], mode='CONSTANT', constant_values=0)

    return padded_tensor[np.newaxis, ...]

def preprocess_image(image):
    # Assuming 'image' is your input tensor
    resized_image = tf.image.resize(image, (224, 224))
    return resized_image


class VGG16CloudModel:
    name = "vgg16"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.model = self.get_model()
        self.input_shape = (224, 224, 3) 
        self.output_shape = (1,1000)

    def fit(self, X_train, y_train, **kwargs):
        pass

    def get_model(self):
        # Load the pretrained VGG16 model with ImageNet weights
        # model = load_model("/Users/eden.yavin/Projects/Tabular-Cloud-ML/store/models/cifar100_vgg.keras") #
        model = VGG16(weights='imagenet')
        return model

    def predict(self, X):
        X = self.preprocess(X)
        predictions = self.model.predict(X, verbose=None)
        return predictions

    def preprocess(self, X):
        
        X = X.copy()
        # X = (X * 10000).astype(np.uint8)

        if any(s < 224 for s in X.shape[1:3]):
            # Pad the input to make its size equal to 224
            padded_X = tf.image.resize_with_crop_or_pad(X, 224, 224)

            # Ensure the input is properly preprocessed for VGG16
            X = preprocess_input(padded_X.numpy())
        else:
            # If no padding is needed, directly preprocess the input
            X = preprocess_input(X)

        return X

    
    
cloud = VGG16CloudModel()


In [12]:
from tqdm import tqdm
X_encrypted, X_test_encrypted = [], []
X_embed, X_test_embed = [], []

with tf.device('/GPU:0'):
        
    for i, x in tqdm(X_train.iterrows(), total=len(X_train)):
        
        x_embed = embedding(x.values.reshape(1,-1))
        X_embed.append(x_embed)
        encrypted = encoder.encode(np.vstack(x_embed))
    
        X_encrypted.append(encrypted)
        
    for i,x in tqdm(X_test.iterrows(), total=len(X_test)):
        x_embed = embedding(x.values.reshape(1,-1))
        X_test_embed.append(x_embed)
        encrypted = encoder.encode(np.vstack(x_embed))
        X_test_encrypted.append(encrypted)

100%|██████████| 7844/7844 [00:45<00:00, 172.44it/s]
100%|██████████| 2615/2615 [00:15<00:00, 170.08it/s]


In [17]:
with tf.device('/GPU:0'):

    predictions = [
        cloud.predict(x)
        for x in tqdm(X_encrypted, total=len(X_encrypted), leave=True, position=0)
    ]
    test_preds = [
        cloud.predict(x)
        for x in tqdm(X_test_encrypted, total=len(X_test_encrypted), leave=True, position=0)
]

100%|██████████| 7844/7844 [03:59<00:00, 32.79it/s]
100%|██████████| 2615/2615 [01:20<00:00, 32.67it/s]


In [18]:

from keras.src.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, f1_score
from keras.src.models import Model
from keras.src.layers import Dense, Dropout, Input,  BatchNormalization
from keras.src.metrics import F1Score
import numpy as np

class NeuralNetworkInternalModel(BaseEstimator, ClassifierMixin):

    def __init__(self, **kwargs):
        self.batch_size = 8
        self.dropout_rate = 0.3
        self.epochs = 100
        self.model: Model = None

    def fit(self, X, y):
        y_onehot = to_categorical(y , 2)
        lr_scheduler = LearningRateScheduler(lambda epoch: 0.0001 * (0.9 ** epoch))
        early_stopping = EarlyStopping(patience=3, monitor='loss')
        self.model.fit(X, y_onehot, epochs=self.epochs, batch_size=self.batch_size, callbacks=[lr_scheduler, early_stopping])

    def predict(self, X):
        prediction = self.model.predict(X)
        return np.argmax(prediction, axis=1)


    def evaluate(self, X, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)

        pred = self.predict(X)
        return accuracy_score(y, pred), f1_score(y, pred, average='weighted')



## Using one vector

In [19]:

class DenseInternalModel(NeuralNetworkInternalModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "neural_network"
        num_classes = kwargs.get("num_classes")
        input_shape = kwargs.get("input_shape")
        self.model = self.get_model(num_classes=num_classes, input_shape=input_shape)

    def get_model(self, num_classes, input_shape):
        # Build the model
        inputs = Input(shape=(input_shape,))  # Dynamic input shape

        # Define the hidden layers
        x = BatchNormalization()(inputs)
        x = Dense(units=128, activation='leaky_relu')(x)
        x = Dropout(self.dropout_rate)(x)

        # Define the output layer
        outputs = Dense(units=num_classes, activation='softmax')(x)

        # Create the model
        model = Model(inputs=inputs, outputs=outputs)

        # Compile the model with F1 Score
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy', F1Score()]
                      )

        return model
    

### Using Embedding Vector

In [20]:
with tf.device('/CPU:0'):
    iim = DenseInternalModel(num_classes=2, input_shape=X_embed[0].shape[1])
    iim.fit(np.vstack(X_embed), y_train)
    print("--------PERFORMANCE-------")
    print(iim.evaluate(np.vstack(X_test_embed), y_test))

Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 310us/step - accuracy: 0.5528 - f1_score: 0.5463 - loss: 0.6973 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296us/step - accuracy: 0.6458 - f1_score: 0.6452 - loss: 0.6351 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298us/step - accuracy: 0.6468 - f1_score: 0.6456 - loss: 0.6374 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300us/step - accuracy: 0.6491 - f1_score: 0.6485 - loss: 0.6325 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297us/step - accuracy: 0.6500 - f1_score: 0.6495 - loss: 0.6303 - learning_rate: 6.5610e-05
Epoch 6/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297us/step - accuracy: 0.6611 - f1_score: 0.6609 - loss: 0.6288 - learni

In [29]:
from xgboost import XGBClassifier

iim = XGBClassifier()
iim.fit(np.vstack(X_embed), y_train)
print("--------PERFORMANCE-------")
pred = iim.predict(np.vstack(X_test_embed))
accuracy_score(y_test, pred), f1_score(y_test, pred, average='weighted')

--------PERFORMANCE-------


(0.665774378585086, 0.6659135091315428)

### Using Pred Vector

In [21]:
# Code to run on CPU
with tf.device('/CPU:0'):
    iim = DenseInternalModel(num_classes=2, input_shape=predictions[0].shape[1])
    iim.fit(np.vstack(predictions), y_train)
    print("--------PERFORMANCE-------")
    print(iim.evaluate(np.vstack(test_preds), y_test))

Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674us/step - accuracy: 0.5238 - f1_score: 0.3600 - loss: 0.6927 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 677us/step - accuracy: 0.5198 - f1_score: 0.3420 - loss: 0.6927 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 676us/step - accuracy: 0.5254 - f1_score: 0.3444 - loss: 0.6923 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 683us/step - accuracy: 0.5270 - f1_score: 0.3462 - loss: 0.6919 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 747us/step - accuracy: 0.5237 - f1_score: 0.3437 - loss: 0.6920 - learning_rate: 6.5610e-05
Epoch 6/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 667us/step - accuracy: 0.5252 - f1_score: 0.3443 - loss: 0.6925 - learni

In [30]:
iim = XGBClassifier()
iim.fit(np.vstack(predictions), y_train)
print("--------PERFORMANCE-------")
pred = iim.predict(np.vstack(test_preds))
accuracy_score(y_test, pred), f1_score(y_test, pred, average='weighted')

--------PERFORMANCE-------


(0.6481835564053537, 0.6482276701825916)

### Using Both Prediction and Embedding as One Vector

In [22]:
X_pred_embed, X_test_pred_embed = [], []
for embed, pred in zip(X_embed, predictions):
    X_pred_embed.append(np.hstack([embed, pred]))
    
for embed, pred in zip(X_test_embed, test_preds):
    X_test_pred_embed.append(np.hstack([embed, pred]))

In [23]:
with tf.device('/CPU:0'):
    
    iim = DenseInternalModel(num_classes=2, input_shape=X_pred_embed[0].shape[1])
    iim.fit(np.vstack(X_pred_embed), y_train)
    print("--------PERFORMANCE-------")
    print(iim.evaluate(np.vstack(X_test_pred_embed), y_test))


Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 924us/step - accuracy: 0.6020 - f1_score: 0.6019 - loss: 0.6549 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 739us/step - accuracy: 0.6658 - f1_score: 0.6657 - loss: 0.6221 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 731us/step - accuracy: 0.6486 - f1_score: 0.6484 - loss: 0.6294 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 722us/step - accuracy: 0.6541 - f1_score: 0.6540 - loss: 0.6251 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 715us/step - accuracy: 0.6551 - f1_score: 0.6548 - loss: 0.6256 - learning_rate: 6.5610e-05
--------PERFORMANCE-------
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467us/step
(0.6783938814531548, 0.6781022232248638)


In [32]:
iim = XGBClassifier(random_state=42)
iim.fit(np.vstack(X_pred_embed), y_train)
print("--------PERFORMANCE-------")
pred = iim.predict(np.vstack(X_test_pred_embed))
accuracy_score(y_test, pred), f1_score(y_test, pred, average='weighted')

--------PERFORMANCE-------


(0.6596558317399618, 0.6597263128416567)

In [34]:

# Create and train the first XGBoost model on X1
model1 = XGBClassifier(random_state=42)
model1.fit(np.vstack(X_embed), y_train)

# Create and train the second XGBoost model on X2
model2 = XGBClassifier(random_state=42)
model2.fit(np.vstack(predictions), y_train)

# Make predictions using both models (probability of class 1)
pred1_train = model1.predict_proba(np.vstack(X_embed))[:, 1].reshape(-1, 1)
pred2_train = model2.predict_proba(np.vstack(predictions))[:, 1].reshape(-1, 1)
pred1_test = model1.predict_proba(np.vstack(X_test_embed))[:, 1].reshape(-1, 1)
pred2_test = model2.predict_proba(np.vstack(test_preds))[:, 1].reshape(-1, 1)

# Combine predictions for meta-model training
X_meta_train = np.hstack((pred1_train, pred2_train))
X_meta_test = np.hstack((pred1_test, pred2_test))

# Create and train the meta-model (stacking)
meta_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
meta_model.fit(X_meta_train, y_train)

# Make final predictions using the meta-model
final_predictions = meta_model.predict(X_meta_test)

# Evaluate the stacked model
accuracy_score(y_test, final_predictions), f1_score(y_test, final_predictions, average='weighted')


(0.6588910133843212, 0.6586733545079823)

## Baseline

In [24]:
with tf.device('/CPU:0'):
    baseline = DenseInternalModel(num_classes=2, input_shape=X_train.shape[1])
    baseline.fit(X_train, y_train)
    print("--------PERFORMANCE-------")
    print(baseline.evaluate(X_test, y_test))

Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 334us/step - accuracy: 0.5915 - f1_score: 0.5883 - loss: 0.6839 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299us/step - accuracy: 0.6683 - f1_score: 0.6671 - loss: 0.6165 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301us/step - accuracy: 0.6831 - f1_score: 0.6818 - loss: 0.6129 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294us/step - accuracy: 0.6647 - f1_score: 0.6638 - loss: 0.6109 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300us/step - accuracy: 0.6837 - f1_score: 0.6818 - loss: 0.5982 - learning_rate: 6.5610e-05
Epoch 6/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298us/step - accuracy: 0.6827 - f1_score: 0.6811 - loss: 0.6012 - learni

## Using Two vectors

In [25]:
from keras.src.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.metrics import accuracy_score, f1_score
from keras.src.models import Model
from keras.src.layers import Dense, Dropout, Input,  BatchNormalization, concatenate
from keras.src.metrics import F1Score
from keras.src import regularizers

class DoubleDenseInternalModel(NeuralNetworkInternalModel):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "neural_network"
        num_classes = kwargs.get("num_classes")
        input_shape = kwargs.get("input_shape")
        self.model = self.get_model(num_classes=num_classes, input_shape=input_shape)

    def get_model(self, num_classes, input_shape):
        inputs_sub_networks = []

        input_shape_a, input_shape_b = input_shape
        input_a = Input(shape=(input_shape_a,))

        x = Dense(input_shape_a // 2, activation="relu", kernel_regularizer=regularizers.L2(0.1), bias_regularizer=regularizers.L2(0.01))(
            input_a)
        x = BatchNormalization(momentum=0.7)(x)
        x = Dropout(0.3)(x)
        # x = Dense(input_shape_a / 2, activation="relu")(x)
        x = Model(inputs=input_a, outputs=x)

        inputs_sub_networks.append(x)

        input_b = Input(shape=(input_shape_b,))
        # the second branch operates on the second input
        y = Dense(input_shape_b // 4, activation="relu", kernel_regularizer=regularizers.L2(0.1),  bias_regularizer=regularizers.L2(0.01))(
            input_b)
        y = BatchNormalization(momentum=0.7)(y)
        y = Dropout(0.3)(y)
        y = Model(inputs=input_b, outputs=y)

        inputs_sub_networks.append(y)

        combined = concatenate([k.output for k in inputs_sub_networks])

        m = Dense(num_classes, activation="softmax", kernel_regularizer=regularizers.L2(0.1),
                  bias_regularizer=regularizers.L2(0.1))(combined)

        model = Model(inputs=[k.input for k in inputs_sub_networks], outputs=m)
        # Compile the model with F1 Score
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy', F1Score()]
                      )

        return model

In [26]:
X_double_pred_embed, X_double_test_pred_embed = [np.vstack(X_embed), np.vstack(predictions)], [np.vstack(X_test_embed), np.vstack(test_preds)]


In [27]:
with tf.device('/CPU:0'):
    double_iim = DoubleDenseInternalModel(num_classes=2, input_shape=(X_double_pred_embed[0].shape[1],X_double_pred_embed[1].shape[1]))
    double_iim.fit(X_double_pred_embed, y_train)
    print("--------PERFORMANCE-------")
    print(double_iim.evaluate(X_double_test_pred_embed, y_test))



Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5084 - f1_score: 0.5058 - loss: 20.2064 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5368 - f1_score: 0.5353 - loss: 1.2766 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5454 - f1_score: 0.5451 - loss: 1.0276 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5770 - f1_score: 0.5764 - loss: 0.9072 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.5984 - f1_score: 0.5979 - loss: 0.8228 - learning_rate: 6.5610e-05
Epoch 6/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6338 - f1_score: 0.6337 - loss: 0.7675 - learning_rate: 5.

# Correlation

In [44]:
rows = []
for embed, pred, label in zip(X_embed, predictions, y_sample):
    rows.append(np.hstack([embed, pred, np.vstack([label])]))
df = pd.DataFrame(np.vstack(rows))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,2.766979e-07,5.367829e-09,...,1e-06,5e-06,5.753773e-07,1.384776e-09,2.619639e-07,1e-06,1.063482e-09,1.579175e-07,0.000266,0.0
1,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,2.766979e-07,5.367829e-09,...,1e-06,5e-06,5.753773e-07,1.384776e-09,2.619639e-07,1e-06,1.063482e-09,1.579175e-07,0.000266,0.0
2,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,2.766727e-07,5.367589e-09,...,1e-06,5e-06,5.753566e-07,1.384754e-09,2.619567e-07,1e-06,1.063456e-09,1.579101e-07,0.000266,0.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,2.766951e-07,5.367594e-09,...,1e-06,5e-06,5.753533e-07,1.384647e-09,2.619532e-07,1e-06,1.063356e-09,1.579191e-07,0.000266,0.0
4,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,2.766727e-07,5.367589e-09,...,1e-06,5e-06,5.753566e-07,1.384754e-09,2.619567e-07,1e-06,1.063456e-09,1.579101e-07,0.000266,0.0


In [61]:
rows = []
for embed, pred, label in zip(X_test_embed, test_preds, y_test):
    rows.append(np.hstack([embed, pred, np.vstack([label])]))

test_df = pd.DataFrame(np.vstack(rows))
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
0,-0.996705,-1.0,-1.0,-0.999537,-1.0,-1.0,-1.0,1.0,2.766912e-07,5.367964e-09,...,1e-06,5e-06,5.753951e-07,1.384787e-09,2.61957e-07,1e-06,1.06349e-09,1.579187e-07,0.000266,0.0
1,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,2.766979e-07,5.367829e-09,...,1e-06,5e-06,5.753773e-07,1.384776e-09,2.619639e-07,1e-06,1.063482e-09,1.579175e-07,0.000266,0.0
2,-0.99957,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,2.766907e-07,5.367943e-09,...,1e-06,5e-06,5.753918e-07,1.384787e-09,2.619565e-07,1e-06,1.063482e-09,1.579196e-07,0.000266,0.0
3,-1.0,-1.0,-1.0,1.0,-0.998985,-1.0,-1.0,-1.0,2.76672e-07,5.367792e-09,...,1e-06,5e-06,5.753728e-07,1.384619e-09,2.619551e-07,1e-06,1.063365e-09,1.579138e-07,0.000266,0.0
4,-1.0,-1.0,-1.0,-1.0,-0.999996,-1.0,-1.0,1.0,2.766909e-07,5.367933e-09,...,1e-06,5e-06,5.753923e-07,1.384779e-09,2.619559e-07,1e-06,1.063486e-09,1.579189e-07,0.000266,0.0


In [58]:
correlation_matrix = df.corr()

TOP = 10
target_correlation = correlation_matrix.iloc[-1].abs().sort_values(ascending=False)
top_target_corr = target_correlation[1:TOP]  # Exclude the target's correlation with itself

print(f"Top {TOP} feature correlations with the target:")
print(top_target_corr)

Top 10 feature correlations with the target:
6      0.003050
2      0.007783
4      0.013605
67     0.015070
11     0.016073
60     0.024851
102    0.026671
1      0.029054
0      0.035994
Name: 108, dtype: float64


In [57]:
TOP = 20

# Get the top 10 feature-feature correlations
feature_correlation = correlation_matrix.abs().unstack()
feature_correlation = feature_correlation[feature_correlation < 1.0]  # Remove self-correlations
top_feature_corr = feature_correlation.sort_values(ascending=False)[:TOP]

print(f"\nTop {TOP} feature-feature correlations:")
print(top_feature_corr)


Top 20 feature-feature correlations:
91   8      0.991189
8    91     0.991189
101  9      0.986578
9    101    0.986578
44   43     0.986427
43   44     0.986427
59   100    0.985324
100  59     0.985324
20   23     0.984703
23   20     0.984703
107  52     0.982269
52   107    0.982269
19   106    0.980771
106  19     0.980771
25   66     0.979824
66   25     0.979824
44   58     0.979658
58   44     0.979658
107  99     0.978215
99   107    0.978215
dtype: float64


# Dim Reduction

In [60]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Assuming your data is in a numpy array called 'X' with shape (n_samples, 1009)
# If not, you need to load and prepare your data first
X,y = df.iloc[:, :-1], df.iloc[:, -1]
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X_scaled)

# X_reduced now has shape (n_samples, 10)

# Print the explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance:", sum(pca.explained_variance_ratio_))

Explained variance ratio: [0.53415906 0.25490196 0.06251872 0.03584684 0.02870274 0.01985813
 0.01562191 0.01151777 0.01084    0.00854644]
Total explained variance: 0.9825135639825718


In [68]:
X_test,_ = test_df.iloc[:, :-1], test_df.iloc[:, -1]
# Standardize the data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

# Apply PCA
pca = PCA(n_components=10)
X_test_reduced = pca.fit_transform(X_test_scaled)

# X_reduced now has shape (n_samples, 10)

# Print the explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance:", sum(pca.explained_variance_ratio_))

Explained variance ratio: [0.53593826 0.2474006  0.05817848 0.04273527 0.02496998 0.01911342
 0.01823509 0.0139761  0.01179614 0.01007248]
Total explained variance: 0.9824158305038713


In [69]:
with tf.device('/CPU:0'):
    
    iim = DenseInternalModel(num_classes=2, input_shape=X_reduced.shape[1])
    iim.fit(X_reduced, y_sample)
    print("--------PERFORMANCE-------")
    print(iim.evaluate(X_test_reduced, y_test))

Epoch 1/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 285us/step - accuracy: 0.8543 - f1_score: 0.5073 - loss: 0.4010 - learning_rate: 1.0000e-04
Epoch 2/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 277us/step - accuracy: 0.9151 - f1_score: 0.5256 - loss: 0.2679 - learning_rate: 9.0000e-05
Epoch 3/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 276us/step - accuracy: 0.9144 - f1_score: 0.5435 - loss: 0.2659 - learning_rate: 8.1000e-05
Epoch 4/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 276us/step - accuracy: 0.9178 - f1_score: 0.5436 - loss: 0.2547 - learning_rate: 7.2900e-05
Epoch 5/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 275us/step - accuracy: 0.9171 - f1_score: 0.5415 - loss: 0.2559 - learning_rate: 6.5610e-05
Epoch 6/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 274us/step - accuracy: 0.9220 - f1_score: 0.5675 - loss: 0.2

In [67]:
X_test_reduced.shape, y_test.shape

((20000, 10), (2000,))