In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report, roc_curve, auc)
import tensorflow as tf
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Verify TensorFlow: report the installed version, then import the Keras
# classes that the later cells rely on.
print("TensorFlow Version:", tf.__version__)
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError as import_error:
    # Show the failure together with a reinstall hint, then re-raise so the
    # notebook stops here instead of failing later on an undefined name.
    print("Keras import error:", import_error)
    print("Please reinstall TensorFlow: pip uninstall tensorflow -y; pip install tensorflow==2.18.0")
    raise
else:
    # Reached only when all three imports above succeeded.
    print("Keras imports successful!")

TensorFlow Version: 2.18.0
Keras imports successful!


In [3]:
# Set random seed for reproducibility.
# `set_random_seed` is the documented Keras helper that seeds Python's
# `random` module, NumPy, and the TensorFlow backend in a single call, so no
# random source used during model construction/training is left unseeded.
SEED = 42
tf.keras.utils.set_random_seed(SEED)


In [4]:
# Load the preprocessed train / validation / test splits.
# The directory is named once so that relocating the data is a single edit.
DATA_DIR = '../../../data/processed'
try:
    train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
    val_df = pd.read_csv(f'{DATA_DIR}/val.csv')
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
except FileNotFoundError as e:
    print("Error: Preprocessed data files not found. Ensure 'train.csv', 'val.csv', 'test.csv' exist.")
    # Also print the exception text so the reader can see which path failed.
    print("Details:", e)
    # Re-raise: nothing below can run without the data.
    raise


In [5]:
# Separate features and labels: for each split, the feature matrix is every
# column except the target, and the label vector is the target column itself.
LABEL_COLUMN = 'binary_label'

X_train, y_train = train_df.drop(columns=LABEL_COLUMN), train_df[LABEL_COLUMN]
X_val, y_val = val_df.drop(columns=LABEL_COLUMN), val_df[LABEL_COLUMN]
X_test, y_test = test_df.drop(columns=LABEL_COLUMN), test_df[LABEL_COLUMN]

print("Loaded preprocessed data:")
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")


Loaded preprocessed data:
Train shape: (2932930, 38), Validation shape: (628485, 38), Test shape: (628485, 38)


In [6]:
# Evaluation function
def evaluate_model(y_true, y_pred, y_prob, model_name, output_prefix='shallow_dnn'):
    """Print classification metrics, save diagnostic plots, and return the metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. The scikit-learn scorers are called with their
        default arguments, which target binary classification.
    y_pred : array-like
        Predicted hard labels, compared against ``y_true``.
    y_prob : array-like
        Predicted scores for the positive class; used for ROC-AUC and the
        ROC curve.
    model_name : str
        Display name used in the printed header, the plot titles, and the
        ``'Model'`` entry of the returned dict.
    output_prefix : str, optional
        Prefix of the saved image files
        (``<prefix>_confusion_matrix.png`` and ``<prefix>_roc_curve.png``).
        Defaults to ``'shallow_dnn'``.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'`` and ``'ROC-AUC'``.

    Side effects
    ------------
    Prints the metrics and a classification report, and writes two PNG files
    to the current working directory.
    """
    print(f"\n{model_name} Evaluation:")

    # Compute every metric exactly once; the same values are printed below
    # and returned to the caller.
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_prob),
    }
    for metric_name, value in metrics.items():
        if metric_name != 'Model':
            print(f"{metric_name}:", value)
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{model_name} Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'{output_prefix}_confusion_matrix.png')
    plt.close(fig)  # release the figure; it is only needed on disk

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{model_name} ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig(f'{output_prefix}_roc_curve.png')
    plt.close(fig)

    # Return metrics
    return metrics


In [7]:
# Define Shallow DNN
def create_shallow_dnn(input_dim):
    """Build and compile the shallow binary classifier.

    Architecture: Dense(64, relu) -> BatchNormalization -> Dropout(0.3)
    -> Dense(32, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss,
        and accuracy as the tracked metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object; Keras 3
        # warns against passing `input_dim`/`input_shape` to the first layer.
        tf.keras.Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        # Single sigmoid unit: one score in [0, 1] per sample.
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [8]:

# Train Shallow DNN
input_dim = X_train.shape[1]  # number of feature columns
shallow_dnn = create_shallow_dnn(input_dim)

# Stop when validation loss has not improved for 5 consecutive epochs and
# restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Write the model to disk only when validation loss improves.
checkpoint = ModelCheckpoint(
    'shallow_dnn_best.h5',
    monitor='val_loss',
    save_best_only=True,
)

fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=128,
    callbacks=[early_stopping, checkpoint],
    verbose=1,
)
history = shallow_dnn.fit(X_train, y_train, **fit_options)


Epoch 1/50
[1m22906/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.9678 - loss: 0.0807



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 4ms/step - accuracy: 0.9678 - loss: 0.0806 - val_accuracy: 0.9903 - val_loss: 0.0290
Epoch 2/50
[1m22913/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9878 - loss: 0.0346



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 7ms/step - accuracy: 0.9878 - loss: 0.0346 - val_accuracy: 0.9916 - val_loss: 0.0243
Epoch 3/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9892 - loss: 0.0303



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 8ms/step - accuracy: 0.9892 - loss: 0.0303 - val_accuracy: 0.9919 - val_loss: 0.0228
Epoch 4/50
[1m22909/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.9899 - loss: 0.0282



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 8ms/step - accuracy: 0.9899 - loss: 0.0282 - val_accuracy: 0.9920 - val_loss: 0.0224
Epoch 5/50
[1m22909/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.9907 - loss: 0.0262



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 9ms/step - accuracy: 0.9907 - loss: 0.0262 - val_accuracy: 0.9935 - val_loss: 0.0181
Epoch 6/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 7ms/step - accuracy: 0.9915 - loss: 0.0239 - val_accuracy: 0.9934 - val_loss: 0.0192
Epoch 7/50
[1m22908/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9914 - loss: 0.0239



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 7ms/step - accuracy: 0.9914 - loss: 0.0239 - val_accuracy: 0.9941 - val_loss: 0.0171
Epoch 8/50
[1m22908/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9918 - loss: 0.0228



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 6ms/step - accuracy: 0.9918 - loss: 0.0228 - val_accuracy: 0.9941 - val_loss: 0.0160
Epoch 9/50
[1m22905/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9922 - loss: 0.0217



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 7ms/step - accuracy: 0.9922 - loss: 0.0217 - val_accuracy: 0.9943 - val_loss: 0.0159
Epoch 10/50
[1m22908/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9924 - loss: 0.0211



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 6ms/step - accuracy: 0.9924 - loss: 0.0211 - val_accuracy: 0.9945 - val_loss: 0.0155
Epoch 11/50
[1m22906/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.9926 - loss: 0.0205



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 6ms/step - accuracy: 0.9926 - loss: 0.0205 - val_accuracy: 0.9946 - val_loss: 0.0145
Epoch 12/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 6ms/step - accuracy: 0.9928 - loss: 0.0201 - val_accuracy: 0.9942 - val_loss: 0.0156
Epoch 13/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 6ms/step - accuracy: 0.9928 - loss: 0.0200 - val_accuracy: 0.9946 - val_loss: 0.0151
Epoch 14/50
[1m22907/22914[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.9929 - loss: 0.0196



[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 6ms/step - accuracy: 0.9929 - loss: 0.0196 - val_accuracy: 0.9947 - val_loss: 0.0139
Epoch 15/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 5ms/step - accuracy: 0.9930 - loss: 0.0192 - val_accuracy: 0.9945 - val_loss: 0.0151
Epoch 16/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 5ms/step - accuracy: 0.9931 - loss: 0.0189 - val_accuracy: 0.9945 - val_loss: 0.0158
Epoch 17/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 5ms/step - accuracy: 0.9929 - loss: 0.0197 - val_accuracy: 0.9946 - val_loss: 0.0150
Epoch 18/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 5ms/step - accuracy: 0.9932 - loss: 0.0189 - val_accuracy: 0.9942 - val_loss: 0.0168
Epoch 19/50
[1m22914/22914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 5ms/step - accuracy: 0.9932 - loss: 0.0187 - val_accuracy: 0.9948 - val_loss: 0.014

In [9]:
# Save model
# NOTE(review): Keras documents `.h5` as a legacy format and recommends
# `.keras`; the file name is left unchanged in case other code loads it.
final_model_path = 'shallow_dnn_final.h5'
shallow_dnn.save(final_model_path)
print("Shallow DNN model trained and saved.")



Shallow DNN model trained and saved.


In [10]:
# Evaluate on test set
# Run inference once and reuse the scores for both the hard labels and the
# ROC inputs, instead of calling `predict` over the full test set twice.
shallow_prob = shallow_dnn.predict(X_test, verbose=0).flatten()
shallow_pred = (shallow_prob > 0.5).astype(int)  # 0.5 decision threshold
shallow_metrics = evaluate_model(y_test, shallow_pred, shallow_prob, "Shallow DNN")



Shallow DNN Evaluation:
Accuracy: 0.9946410813305011
Precision: 0.9925563738227726
Recall: 0.9967572762393315
F1-Score: 0.9946523894356112
ROC-AUC: 0.9998410595955298

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    314243
           1       0.99      1.00      0.99    314242

    accuracy                           0.99    628485
   macro avg       0.99      0.99      0.99    628485
weighted avg       0.99      0.99      0.99    628485



In [11]:
# Plot training history: loss on the left panel, accuracy on the right,
# each showing the training and validation curves per epoch.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Shallow DNN Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(history.history['accuracy'], label='Train Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Shallow DNN Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
fig.savefig('shallow_dnn_training_history.png')
plt.close(fig)  # the figure is only written to disk, not displayed


In [12]:
# Save metrics: build a one-row table keyed by model name, write it to CSV,
# and echo it to the cell output.
metrics_row = pd.DataFrame([shallow_metrics])
metrics_df = metrics_row.set_index('Model')
metrics_df.to_csv('shallow_dnn_metrics.csv')

print("\nShallow DNN Metrics:")
print(metrics_df)


Shallow DNN Metrics:
             Accuracy  Precision    Recall  F1-Score   ROC-AUC
Model                                                         
Shallow DNN  0.994641   0.992556  0.996757  0.994652  0.999841
