_Alberto Medrano Fernández_

# AdaBoost (SMOTE)

---

## Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from time import time

from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, LabelEncoder 
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from tensorflow.keras.models import load_model, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE

---

## Dataset

In [2]:
hikari_2022 = pd.read_csv('ALLFLOWMETER_HIKARI2022.csv', sep=',')
hikari_2022

Unnamed: 0.1,Unnamed: 0,uid,originh,originp,responh,responp,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,...,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,bwd_last_window_size,attack_category,Label
0,0,Cmu9v81jToQyRF1gbk,184.0.48.168,38164,184.0.48.150,50443,0 days 00:00:00.000060,1,1,0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,64240,0,64240,0,Benign,0
1,1,CO21hl3TWkuXTOgajk,184.0.48.169,43068,184.0.48.150,50443,0 days 00:00:00.000083,1,1,0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,64240,0,64240,0,Benign,0
2,2,CBLJ6L19FP0MfYX7Oh,184.0.48.124,5678,255.255.255.255,5678,0 days 00:01:59.996602,3,0,3,...,5.999912e+07,1.199966e+08,5.999830e+07,1156.846698,0,0,0,0,Benign,0
3,3,ChTG451zJ7hUYOcqje,184.0.48.124,5678,255.255.255.255,5678,0 days 00:00:59.996909,2,0,2,...,5.999691e+07,5.999691e+07,5.999691e+07,0.000000,0,0,0,0,Benign,0
4,4,Cn9y6E2KVxzQbs5wjc,184.0.48.124,5678,255.255.255.255,5678,0 days 00:00:59.992130,2,0,2,...,5.999213e+07,5.999213e+07,5.999213e+07,0.000000,0,0,0,0,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228248,16634,Clt16PPxzrXEtpa5d,184.0.48.20,53866,184.0.48.255,1947,0 days 00:00:00.000027,2,0,2,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,XMRIGCC CryptoMiner,1
228249,53170,Cs8RA72uHDiQa5ch2k,184.0.48.20,54318,184.0.48.255,1947,0 days 00:00:00.000027,2,0,2,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,XMRIGCC CryptoMiner,1
228250,53529,Cy4dqo4YEq5YGxjUXa,184.0.48.20,65355,184.0.48.255,1947,0 days 00:00:00,2,0,2,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,XMRIGCC CryptoMiner,1
228251,86308,CFXfNV3OTG04e0UnP4,184.0.48.20,53642,184.0.48.255,1947,0 days 00:00:00.000054,2,0,2,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,XMRIGCC CryptoMiner,1


In [3]:
hikari_2022 = hikari_2022.drop(columns=['Unnamed: 0', 'uid', 'originh', 'responh', 'flow_duration', 'fwd_URG_flag_count', 
                                        'bwd_URG_flag_count', 'attack_category'])
hikari_2022

Unnamed: 0,originp,responp,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,down_up_ratio,...,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,bwd_last_window_size,Label
0,38164,50443,1,1,0,0,16644.063492,16644.063492,33288.126984,1.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,64240,0,64240,0,0
1,43068,50443,1,1,0,0,12052.597701,12052.597701,24105.195402,1.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,64240,0,64240,0,0
2,5678,5678,3,0,3,0,0.025001,0.000000,0.025001,0.0,...,5.999748e+07,5.999912e+07,1.199966e+08,5.999830e+07,1156.846698,0,0,0,0,0
3,5678,5678,2,0,2,0,0.033335,0.000000,0.033335,0.0,...,5.999691e+07,5.999691e+07,5.999691e+07,5.999691e+07,0.000000,0,0,0,0,0
4,5678,5678,2,0,2,0,0.033338,0.000000,0.033338,0.0,...,5.999213e+07,5.999213e+07,5.999213e+07,5.999213e+07,0.000000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228248,53866,1947,2,0,2,0,73584.280702,0.000000,73584.280702,0.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,1
228249,54318,1947,2,0,2,0,74235.469027,0.000000,74235.469027,0.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,1
228250,65355,1947,2,0,2,0,0.000000,0.000000,0.000000,0.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,1
228251,53642,1947,2,0,2,0,37117.734513,0.000000,37117.734513,0.0,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0,0,0,0,1


In [4]:
# Calculate Pearson correlation coefficient
corr = hikari_2022.corr(method ='pearson')

# Extract the correlation with the target variable 'Label'
corr_with_target = corr['Label']

# Select only columns with a correlation less than 0.05
relevant_features = corr_with_target[abs(corr_with_target) >= 0.05].index

# Filter the DataFrame to keep only the relevant features
hikari_2022 = hikari_2022[relevant_features]

print("Dataset after EDA:")
hikari_2022

Dataset after EDA:


Unnamed: 0,originp,responp,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,bwd_pkts_per_sec,down_up_ratio,fwd_header_size_tot,fwd_header_size_min,...,bwd_subflow_bytes,fwd_bulk_packets,bwd_bulk_packets,active.min,active.max,active.avg,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Label
0,38164,50443,1,1,0,0,16644.063492,1.0,40,40,...,0.0,0.0,0.0,60.081482,60.081482,60.081482,64240,0,64240,0
1,43068,50443,1,1,0,0,12052.597701,1.0,40,40,...,0.0,0.0,0.0,82.969666,82.969666,82.969666,64240,0,64240,0
2,5678,5678,3,0,3,0,0.000000,0.0,24,8,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0
3,5678,5678,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0
4,5678,5678,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228248,53866,1947,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,27.179718,27.179718,27.179718,0,0,0,1
228249,54318,1947,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,26.941299,26.941299,26.941299,0,0,0,1
228250,65355,1947,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,1
228251,53642,1947,2,0,2,0,0.000000,0.0,16,8,...,0.0,0.0,0.0,53.882599,53.882599,53.882599,0,0,0,1


---

## Models

### Without feature extraction

In [5]:
X = hikari_2022.iloc[:, :-1]
y = hikari_2022['Label']

# Cross-validation
def cross_validation(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Resample with SMOTE 
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        print("X_train_resampled, ", X_train_resampled.shape)
        print("y_train_resampled, ", y_train_resampled.value_counts())
        
        # Normalize data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_resampled)
        X_test = scaler.transform(X_test)

        # Train AdaBoost classifier
        ab_classifier = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)
        ab_classifier.fit(X_train, y_train_resampled)

        # Make predictions
        y_pred = ab_classifier.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        precision = precision_score(y_test, y_pred)
        precisions.append(precision)
        
        recall = recall_score(y_test, y_pred)
        recalls.append(recall)
        
        f1 = f1_score(y_test, y_pred)
        f1_scores.append(f1)
        
        print(f"Fold accuracy: {accuracy}")
        print(f"Fold precision: {precision}")
        print(f"Fold recall: {recall}")
        print(f"Fold F1-score: {f1}")
        
        # Generate classification report
        report = classification_report(y_test, y_pred)
        print(report)
    
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1 = np.mean(f1_scores)
    
    print("Mean classification report:")
    print("==========================")
    print(f"Mean accuracy: {mean_accuracy}")
    print(f"Mean precision: {mean_precision}")
    print(f"Mean recall: {mean_recall}")
    print(f"Mean F1-score: {mean_f1}")   



tiempo_inicial = time()
#
cross_validation(X, y)
#
tiempo_final = time() 
tiempo_ejecucion = tiempo_final - tiempo_inicial
print ('\nEl tiempo de ejecucion fue:', tiempo_ejecucion) # Seconds

X_train_resampled,  (343860, 43)
y_train_resampled,  Label
0    171930
1    171930
Name: count, dtype: int64
Fold accuracy: 0.9781384854658167
Fold precision: 0.7286842822119314
Fold recall: 0.999252895031752
Fold F1-score: 0.8427851291745432
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     42974
           1       0.73      1.00      0.84      2677

    accuracy                           0.98     45651
   macro avg       0.86      0.99      0.92     45651
weighted avg       0.98      0.98      0.98     45651

X_train_resampled,  (343952, 43)
y_train_resampled,  Label
0    171976
1    171976
Name: count, dtype: int64
Fold accuracy: 0.9782918227421086
Fold precision: 0.7339276066990815
Fold recall: 0.9977965479250827
Fold F1-score: 0.8457587548638132
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     42928
           1       0.73      1.00      0.85      2723

    accuracy          

---

### With DNN

In [6]:
X = hikari_2022.iloc[:, :-1]
y = hikari_2022['Label']

# Define the DNN model
def create_dnn(input_dim):
    inputs = Input(shape=(input_dim,))
    
    # Hidden layer 1
    x = Dense(input_dim * 2)(inputs)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Hidden layer 2
    x = Dense(input_dim)(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    
    # Feature layer
    n_bottleneck = round(float(input_dim) / 2.0)
    bottleneck = Dense(n_bottleneck)(x)
    bottleneck = BatchNormalization()(bottleneck)
    bottleneck = ReLU()(bottleneck)
    
    # Output layer
    outputs = Dense(1, activation='sigmoid')(bottleneck)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Model for feature extraction
    feature_extractor = Model(inputs=inputs, outputs=bottleneck)
    
    return model, feature_extractor

# Cross-validation
def cross_validation_dnn(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Resample with SMOTE 
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        print("X_train_resampled, ", X_train_resampled.shape)
        print("y_train_resampled, ", y_train_resampled.value_counts())
        
        # Normalize data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_resampled)
        X_test = scaler.transform(X_test)
        
        n_inputs = X_train.shape[1]
        dnn, feature_extractor = create_dnn(n_inputs)
        
        # Early Stopping
        early_stopping1 = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
        early_stopping2 = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        
        # Fit the DNN model
        dnn.fit(X_train, y_train_resampled, epochs=100, batch_size=32, verbose=1,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stopping1, early_stopping2])
        
        # Encode the data
        X_train_encoded = feature_extractor.predict(X_train)
        X_test_encoded = feature_extractor.predict(X_test)
        
        # Train AdaBoost classifier
        ab_classifier_encoded = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)
        ab_classifier_encoded.fit(X_train_encoded, y_train_resampled)
        
        # Make predictions
        y_pred_encoded = ab_classifier_encoded.predict(X_test_encoded)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred_encoded)
        accuracies.append(accuracy)
        
        precision = precision_score(y_test, y_pred_encoded)
        precisions.append(precision)
        
        recall = recall_score(y_test, y_pred_encoded)
        recalls.append(recall)
        
        f1 = f1_score(y_test, y_pred_encoded)
        f1_scores.append(f1)
        
        print(f"Fold accuracy: {accuracy}")
        print(f"Fold precision: {precision}")
        print(f"Fold recall: {recall}")
        print(f"Fold F1-score: {f1}")
        
        # Generate classification report
        report = classification_report(y_test, y_pred_encoded)
        print(report)
    
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1 = np.mean(f1_scores)
    
    print("Mean classification report:")
    print("==========================")
    print(f"Mean accuracy: {mean_accuracy}")
    print(f"Mean precision: {mean_precision}")
    print(f"Mean recall: {mean_recall}")
    print(f"Mean F1-score: {mean_f1}")


tiempo_inicial = time()
#
cross_validation_dnn(X, y)
#
tiempo_final = time() 
tiempo_ejecucion = tiempo_final - tiempo_inicial
print ('\nEl tiempo de ejecucion fue:', tiempo_ejecucion) # Seconds

X_train_resampled,  (343860, 43)
y_train_resampled,  Label
0    171930
1    171930
Name: count, dtype: int64
Epoch 1/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 811us/step - accuracy: 0.9840 - loss: 0.0593 - val_accuracy: 0.9809 - val_loss: 0.0545
Epoch 2/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 822us/step - accuracy: 0.9900 - loss: 0.0364 - val_accuracy: 0.9811 - val_loss: 0.0577
Epoch 3/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 792us/step - accuracy: 0.9900 - loss: 0.0364 - val_accuracy: 0.9817 - val_loss: 0.0485
Epoch 4/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 793us/step - accuracy: 0.9899 - loss: 0.0362 - val_accuracy: 0.9814 - val_loss: 0.0479
Epoch 5/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 819us/step - accuracy: 0.9901 - loss: 0.0359 - val_accuracy: 0.9812 - val_loss: 0.0500
Epoch 6/100
[1m10746/10746[0m [32m━━━━━━━━

---

### With DBN

In [7]:
def visualize_dbn(layers, file_name='dbn_structure.png'):
    # Crear un grafo dirigido
    G = nx.DiGraph()

    # Añadir nodos para cada capa RBM
    for i, n_units in enumerate(layers):
        G.add_node(f"RBM {i+1}\n{n_units} unidades", layer=i+1)

    # Añadir aristas entre nodos (de una capa a otra)
    for i in range(len(layers) - 1):
        G.add_edge(f"RBM {i+1}\n{layers[i]} unidades", f"RBM {i+2}\n{layers[i+1]} unidades")

    # Dibujar el grafo
    pos = nx.spring_layout(G)  # Posición de los nodos
    nx.draw(G, pos, with_labels=True, node_size=3000, node_color='lightblue', font_size=10, font_weight='bold')
    
    # Guardar la imagen a un archivo
    plt.title("Arquitectura de la DBN")
    plt.savefig(file_name)
    plt.show()
    
#visualize_dbn([X.shape[1]*2, X.shape[1]])

In [8]:
X = hikari_2022.iloc[:, :-1]
y = hikari_2022['Label']

# Define the DBN class
class DBN(BaseEstimator, TransformerMixin):
    def __init__(self, rbm_layers, rbm_learning_rate, rbm_n_iter):
        self.rbm_layers = rbm_layers
        self.rbm_learning_rate = rbm_learning_rate
        self.rbm_n_iter = rbm_n_iter
        self.rbms = []
        for i, n_components in enumerate(rbm_layers):
            self.rbms.append(BernoulliRBM(n_components=n_components, learning_rate=rbm_learning_rate, n_iter=rbm_n_iter, verbose=1))

    def fit(self, X, y=None):
        input_data = X
        for rbm in self.rbms:
            rbm.fit(input_data)
            input_data = rbm.transform(input_data)
        return self

    def transform(self, X):
        input_data = X
        for rbm in self.rbms:
            input_data = rbm.transform(input_data)
        return input_data

# Cross-validation
def cross_validation_dbn(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Resample with SMOTE 
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        print("X_train_resampled, ", X_train_resampled.shape)
        print("y_train_resampled, ", y_train_resampled.value_counts())
        
        # Normalize data
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train_resampled)
        X_test = scaler.transform(X_test)
        
        n_inputs = X_train.shape[1]
        dbn = DBN(rbm_layers=[n_inputs*2, n_inputs], rbm_learning_rate=0.1, rbm_n_iter=10)
        
        # Fit the DBN model
        dbn.fit(X_train)
        
        # Encode the data
        X_train_encoded = dbn.transform(X_train)
        X_test_encoded = dbn.transform(X_test)
        
        # Train AdaBoost classifier
        ab_classifier_encoded = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)
        ab_classifier_encoded.fit(X_train_encoded, y_train_resampled)
        
        # Make predictions
        y_pred_encoded = ab_classifier_encoded.predict(X_test_encoded)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred_encoded)
        accuracies.append(accuracy)
        
        precision = precision_score(y_test, y_pred_encoded)
        precisions.append(precision)
        
        recall = recall_score(y_test, y_pred_encoded)
        recalls.append(recall)
        
        f1 = f1_score(y_test, y_pred_encoded)
        f1_scores.append(f1)
        
        print(f"Fold accuracy: {accuracy}")
        print(f"Fold precision: {precision}")
        print(f"Fold recall: {recall}")
        print(f"Fold F1-score: {f1}")
        
        # Generate classification report
        report = classification_report(y_test, y_pred_encoded)
        print(report)
    
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1 = np.mean(f1_scores)
    
    print("Mean classification report:")
    print("==========================")
    print(f"Mean accuracy: {mean_accuracy}")
    print(f"Mean precision: {mean_precision}")
    print(f"Mean recall: {mean_recall}")
    print(f"Mean F1-score: {mean_f1}")
    
    
tiempo_inicial = time()
#
cross_validation_dbn(X, y)
#
tiempo_final = time() 
tiempo_ejecucion = tiempo_final - tiempo_inicial
print ('\nEl tiempo de ejecucion fue:', tiempo_ejecucion) # Seconds

X_train_resampled,  (343860, 43)
y_train_resampled,  Label
0    171930
1    171930
Name: count, dtype: int64
[BernoulliRBM] Iteration 1, pseudo-likelihood = -5.41, time = 4.79s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -5.35, time = 6.18s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -5.05, time = 6.23s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -4.87, time = 6.13s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -5.02, time = 6.36s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -4.85, time = 6.21s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -4.98, time = 6.08s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -4.88, time = 6.16s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -5.09, time = 6.04s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -4.90, time = 6.18s
[BernoulliRBM] Iteration 1, pseudo-likelihood = -4.27, time = 4.69s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -4.23, time = 5.56s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -4.61, time = 5.60s
[Berno

---

### With Autoencoder

In [9]:
X = hikari_2022.iloc[:, :-1]
y = hikari_2022['Label']

# Define the autoencoder model
def create_autoencoder(n_inputs):
    inputs = Input(shape=(n_inputs,))
    
    # Define Encoder
    e = Dense(n_inputs*2)(inputs)
    e = BatchNormalization()(e)
    e = LeakyReLU()(e)
    
    # Bottleneck
    n_bottleneck = round(float(n_inputs)/2.0)
    bottleneck = Dense(n_bottleneck)(e)
    
    # Define Decoder
    d = Dense(n_inputs*2)(bottleneck)
    d = BatchNormalization()(d)
    d = LeakyReLU()(d)
    
    # Output layer
    outputs = Dense(n_inputs, activation='linear')(d)
    
    # Define autoencoder model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')
    
    # Define encoder model
    encoder = Model(inputs=inputs, outputs=bottleneck)
    
    return model, encoder

# Cross-validation
def cross_validation_autoencoder(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Resample with SMOTE 
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        print("X_train_resampled, ", X_train_resampled.shape)
        print("y_train_resampled, ", y_train_resampled.value_counts())
        
        # Normalize data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_resampled)
        X_test = scaler.transform(X_test)
        
        n_inputs = X_train.shape[1]
        autoencoder, encoder = create_autoencoder(n_inputs)
        
        # Early Stopping
        early_stopping1 = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
        early_stopping2 = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        
        # Fit the DNN model
        autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, verbose=1,
                        validation_data=(X_test, X_test),
                        callbacks=[early_stopping1, early_stopping2])
        
        # Encode the data
        X_train_encoded = encoder.predict(X_train)
        X_test_encoded = encoder.predict(X_test)
        
        # Train AdaBoost classifier
        ab_classifier_encoded = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)
        ab_classifier_encoded.fit(X_train_encoded, y_train_resampled)
        
        # Make predictions
        y_pred_encoded = ab_classifier_encoded.predict(X_test_encoded)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred_encoded)
        accuracies.append(accuracy)
        
        precision = precision_score(y_test, y_pred_encoded)
        precisions.append(precision)
        
        recall = recall_score(y_test, y_pred_encoded)
        recalls.append(recall)
        
        f1 = f1_score(y_test, y_pred_encoded)
        f1_scores.append(f1)
        
        print(f"Fold accuracy: {accuracy}")
        print(f"Fold precision: {precision}")
        print(f"Fold recall: {recall}")
        print(f"Fold F1-score: {f1}")
        
        # Generate classification report
        report = classification_report(y_test, y_pred_encoded)
        print(report)
    
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1 = np.mean(f1_scores)
    
    print("Mean classification report:")
    print("==========================")
    print(f"Mean accuracy: {mean_accuracy}")
    print(f"Mean precision: {mean_precision}")
    print(f"Mean recall: {mean_recall}")
    print(f"Mean F1-score: {mean_f1}")


tiempo_inicial = time()
#
cross_validation_autoencoder(X, y)
#
tiempo_final = time() 
tiempo_ejecucion = tiempo_final - tiempo_inicial
print ('\nEl tiempo de ejecucion fue:', tiempo_ejecucion) # Seconds

X_train_resampled,  (343860, 43)
y_train_resampled,  Label
0    171930
1    171930
Name: count, dtype: int64
Epoch 1/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 701us/step - loss: 0.1529 - val_loss: 0.0615
Epoch 2/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 690us/step - loss: 0.0798 - val_loss: 0.1096
Epoch 3/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 693us/step - loss: 0.0729 - val_loss: 0.0500
Epoch 4/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 691us/step - loss: 0.0643 - val_loss: 0.0487
Epoch 5/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 695us/step - loss: 0.0829 - val_loss: 0.0402
Epoch 6/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 693us/step - loss: 0.0494 - val_loss: 0.0533
Epoch 7/100
[1m10746/10746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 697us/step - loss: 0.0373 - val_loss: 0.0404
Ep