In [18]:
import pickle
import pathlib
import os
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from keras.src import Sequential
from keras.src.layers import Dense, BatchNormalization, Dropout
from keras.src.callbacks import EarlyStopping
import pandas as pd
from tqdm import tqdm


In [4]:
PROJECT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(PROJECT_PATH, "..")
MODELS_DIR = pathlib.Path(PROJECT_PATH) / "store" / "models"

DATASET_DIR = pathlib.Path(PROJECT_PATH) / "data"

In [7]:
def preprocess(X: pd.DataFrame):
    """
    The function will preprocess the data:
    1. Categorical features will be label encoded (Boy->1, Girl ->2)
    2. Numerical features will be scaled if the data is intended to be used for baseline. For cloud data set, no scaling will be preformed.

    Return pd.Dataframe
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Initialize lists to store processed columns
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("\nEncoding categorical columns...")
        # onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
        # X_categorical = pd.DataFrame(onehot_encoder.fit_transform(X[categorical_cols]),
        #                              columns=onehot_encoder.get_feature_names_out(categorical_cols))
        label_encoder = LabelEncoder()
        X_categorical = pd.DataFrame()
        for col in categorical_cols:
            X_categorical[col] = label_encoder.fit_transform(X[col])

        processed_columns.append(X_categorical)

    # Apply standard scaling to the numeric columns
    if numeric_cols:
        print("\nScaling numerical columns...")
        scaler = MinMaxScaler()
        X_numeric = X[numeric_cols]
        # X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        X_processed = X.copy()  # If there are no categorical or numeric columns, keep the original dataframe


    return X_processed




In [12]:
import pandas as pd
import torch.nn as nn, tensorflow as tf
from keras.src.utils import to_categorical
from keras.src.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.src.callbacks import EarlyStopping
from keras.src import Sequential


class DNNEmbedding(nn.Module):

    name = "dnn_embedding"

    def __init__(self, **kwargs):
        super(DNNEmbedding, self).__init__()

        X, y = kwargs.get("X"), kwargs.get("y")
        num_classes = len(set(y))
        y = to_categorical(y, num_classes=num_classes)

        model = Sequential()
        model.add(Dense(units=X.shape[1]//2, activation='tanh', name="embedding"))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
        model.add(Dense(units=num_classes, activation='softmax', name="output"))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        early_stop = EarlyStopping(patience=2, monitor="loss")

        model.fit(X, y,validation_data=[X_test, to_categorical(y_test,2)], epochs=50, batch_size=8, callbacks=[early_stop])
        self.model = model.layers[0]
        self.classifier = model
        self.output_shape = (1, X.shape[1]//2)


    def forward(self, x):

        if type(x) is pd.DataFrame:
            x = x.to_numpy()

        embedding = self.model(x)
        return embedding





  from .autonotebook import tqdm as notebook_tqdm


In [16]:

from keras.src.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, f1_score
from keras.src.models import Model
from keras.src.layers import Dense, Dropout, Input,  BatchNormalization
from keras.src.metrics import F1Score
import numpy as np

class NeuralNetworkInternalModel(BaseEstimator, ClassifierMixin):

    def __init__(self, **kwargs):
        self.batch_size = 8
        self.dropout_rate = 0.3
        self.epochs = 100
        self.model: Model = None

    def fit(self, X, y):
        y_onehot = to_categorical(y , 2)
        lr_scheduler = LearningRateScheduler(lambda epoch: 0.0001 * (0.9 ** epoch))
        early_stopping = EarlyStopping(patience=3, monitor='loss')
        self.model.fit(X, y_onehot, epochs=self.epochs, batch_size=self.batch_size, callbacks=[lr_scheduler, early_stopping])

    def predict(self, X):
        prediction = self.model.predict(X)
        return np.argmax(prediction, axis=1)


    def evaluate(self, X, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)

        pred = self.predict(X)
        return accuracy_score(y, pred), f1_score(y, pred, average='weighted')



In [17]:
class DenseInternalModel(NeuralNetworkInternalModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "neural_network"
        num_classes = kwargs.get("num_classes")
        input_shape = kwargs.get("input_shape")
        self.model = self.get_model(num_classes=num_classes, input_shape=input_shape)

    def get_model(self, num_classes, input_shape):
        # Build the model
        inputs = Input(shape=(input_shape,))  # Dynamic input shape

        # Define the hidden layers
        x = BatchNormalization()(inputs)
        x = Dense(units=128, activation='leaky_relu')(x)
        x = Dropout(self.dropout_rate)(x)

        # Define the output layer
        outputs = Dense(units=num_classes, activation='softmax')(x)

        # Create the model
        model = Model(inputs=inputs, outputs=outputs)

        # Compile the model with F1 Score
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy', F1Score()]
                      )

        return model

# HELOC

In [5]:
DATASET_NAME = "heloc"

DATASET_PATH = DATASET_DIR / DATASET_NAME / "dataset.csv"
dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80


In [10]:
dataset = preprocess(dataset)
X,y = dataset.iloc[:, 1:], dataset.iloc[:, 0]
y.head()


Scaling numerical columns...


0    0
1    0
2    0
3    0
4    0
Name: RiskPerformance, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7844, 23), (2615, 23), (7844,), (2615,))

In [13]:

# Code to run on CPU
with tf.device('/CPU:0'):
    
    embedding = DNNEmbedding(X=X_train, y=y_train)
    

Epoch 1/50


2024-12-29 18:40:00.299671: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-12-29 18:40:00.299714: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-12-29 18:40:00.299725: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-12-29 18:40:00.299972: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-29 18:40:00.299995: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 387us/step - accuracy: 0.5157 - loss: 0.7671 - val_accuracy: 0.5866 - val_loss: 0.7074
Epoch 2/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302us/step - accuracy: 0.5666 - loss: 0.6884 - val_accuracy: 0.6249 - val_loss: 0.6795
Epoch 3/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305us/step - accuracy: 0.6096 - loss: 0.6684 - val_accuracy: 0.6524 - val_loss: 0.6463
Epoch 4/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309us/step - accuracy: 0.6154 - loss: 0.6566 - val_accuracy: 0.6597 - val_loss: 0.6326
Epoch 5/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302us/step - accuracy: 0.6331 - loss: 0.6477 - val_accuracy: 0.6815 - val_loss: 0.6149
Epoch 6/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309us/step - accuracy: 0.6378 - loss: 0.6412 - val_accuracy: 0.6692 - val_loss: 0.6135
Epoch 7/50
[1m981/981[0m 

In [19]:

X_train_emb, X_test_emb = [], []

for i, x in tqdm(X_train.iterrows(), total=len(X_train)):
    X_train_emb.append(embedding(x.values.reshape(1,-1)))
    
for i, x in tqdm(X_test.iterrows(), total=len(X_test)):
    X_test_emb.append(embedding(x.values.reshape(1,-1)))


100%|██████████| 7844/7844 [00:08<00:00, 915.11it/s]
100%|██████████| 2615/2615 [00:02<00:00, 917.36it/s]


In [23]:
# Code to run on CPU
with tf.device('/CPU:0'):
    iim = DenseInternalModel(num_classes=2, input_shape=X_train_emb[0].shape[1])
    iim.fit(np.vstack(X_train_emb), y_train)
    print("--------PERFORMANCE-------")
    print(iim.evaluate(np.vstack(X_test_emb), y_test))

Epoch 1/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 295us/step - accuracy: 0.6010 - f1_score: 0.5985 - loss: 0.6677 - learning_rate: 1.0000e-04
Epoch 2/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288us/step - accuracy: 0.6725 - f1_score: 0.6707 - loss: 0.6306 - learning_rate: 9.0000e-05
Epoch 3/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288us/step - accuracy: 0.6896 - f1_score: 0.6881 - loss: 0.6128 - learning_rate: 8.1000e-05
Epoch 4/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283us/step - accuracy: 0.6734 - f1_score: 0.6708 - loss: 0.6136 - learning_rate: 7.2900e-05
Epoch 5/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283us/step - accuracy: 0.6774 - f1_score: 0.6745 - loss: 0.6158 - learning_rate: 6.5610e-05
Epoch 6/100
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285us/step - accuracy: 0.6774 - f1_score: 0.6764 - loss: 0.6160 - learni

# Gesture Phase

In [8]:
X_train, y_train, X_test, y_test = load_data("gesture_phase", 0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1777, 32), (1777,), (988, 32), (988,))

In [9]:
clf = XGBClassifier()
clf.fit(X_train, y_train)


In [12]:
preds = clf.predict(X_test)
accuracy_score(y_test, preds), f1_score(y_test, preds, average='weighted')

(0.6153846153846154, 0.5984407447266358)