In [None]:
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import h5py
#For handling large datasets

#Memory-safe data loading
def load_data_batches(X_path, y_path, batch_size=50000):
    """Yield aligned ``(X_chunk, y_chunk)`` pairs from two ``.npy`` files.

    The feature file is opened as a read-only memory map, so only the rows of
    the chunk currently being yielded are copied into RAM. The label file is
    loaded in full.

    Args:
        X_path: Path to the ``.npy`` file holding the feature rows.
        y_path: Path to the ``.npy`` file holding one label per feature row.
        batch_size: Maximum number of rows per yielded chunk; must be positive.

    Yields:
        Tuples ``(X_chunk, y_chunk)`` covering the files front to back. The
        last chunk may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the two files do
            not contain the same number of rows. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    X = np.load(X_path, mmap_mode='r')
    y = np.load(y_path)

    # Without this check a length mismatch would silently produce chunks whose
    # rows and labels no longer line up.
    if len(X) != len(y):
        raise ValueError(
            f"Row count mismatch: {X_path} has {len(X)} rows, "
            f"{y_path} has {len(y)}"
        )

    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        # .copy() materialises the chunk in memory, detached from the
        # read-only memory map.
        yield X[start:stop].copy(), y[start:stop]

#Create memory-efficient data generator
def data_generator(X_path, y_path, normal_only=True, batch_size=50000):
    """Yield feature chunks from disk, optionally keeping only label-0 rows.

    Args:
        X_path: Path to the ``.npy`` feature file.
        y_path: Path to the ``.npy`` label file.
        normal_only: When true, keep only the rows whose label equals 0;
            when false, yield every chunk unfiltered.
        batch_size: Number of rows read from disk per chunk (the yielded
            chunk can be smaller after filtering).

    Yields:
        Feature arrays, one per on-disk chunk. With ``normal_only=True``,
        chunks that contain no label-0 rows are skipped rather than yielded
        as zero-row arrays.
    """
    for X_batch, y_batch in load_data_batches(X_path, y_path, batch_size):
        if normal_only:
            X_batch = X_batch[y_batch == 0]
            # A chunk made up entirely of non-zero labels filters down to zero
            # rows; such an array is of no use to a downstream fit/predict
            # call, so it is dropped here.
            if len(X_batch) == 0:
                continue
        yield X_batch

#Model architecture
def build_autoencoder(input_dim):
    """Build an uncompiled dense autoencoder for ``input_dim``-wide rows.

    The network is input -> 128 -> 64 -> 128 -> ``input_dim``. The three
    hidden layers use ReLU; the output layer is linear and has the same
    width as the input.

    Args:
        input_dim: Number of features per input row.

    Returns:
        A Keras ``Model`` mapping the input tensor to its reconstruction.
    """
    inputs = Input(shape=(input_dim,))

    # Hidden stack: two narrowing layers followed by one widening layer.
    hidden = inputs
    for units in (128, 64, 128):
        hidden = Dense(units, activation='relu')(hidden)

    outputs = Dense(input_dim, activation='linear')(hidden)
    return Model(inputs, outputs)

#Initialize
# The memory map exposes the array shape without reading the training matrix
# into RAM.
input_dim = np.load('X_train.npy', mmap_mode='r').shape[1]
autoencoder = build_autoencoder(input_dim)
autoencoder.compile(optimizer='adam', loss='mse')


def _reconstruction_error(model, X, predict_batch_size=1024):
    """Return the per-row mean squared reconstruction error of *model* on *X*."""
    reconstructions = model.predict(X, batch_size=predict_batch_size)
    return np.mean(np.square(X - reconstructions), axis=1)


#Train in batches
batch_size = 50000  # rows read from disk per chunk
for epoch in range(10): #Reduced epochs for demo
    print(f"Epoch {epoch+1}")
    for X_batch in data_generator('X_train.npy', 'y_train.npy',
                                  batch_size=batch_size):
        if len(X_batch) == 0:
            # Chunk contained no label-0 rows; nothing to train on.
            continue
        # One pass over this chunk; the model's weights carry over from
        # chunk to chunk and from epoch to epoch.
        autoencoder.fit(X_batch, X_batch,
                        epochs=1,
                        batch_size=1024,
                        verbose=0)

#Calculate threshold using normal samples
# Errors are collected as one array per chunk and joined once, instead of
# growing a Python list by one NumPy scalar per row.
normal_error_chunks = []
for X_batch in data_generator('X_train.npy', 'y_train.npy',
                              batch_size=batch_size):
    if len(X_batch) == 0:
        continue
    normal_error_chunks.append(_reconstruction_error(autoencoder, X_batch))
if not normal_error_chunks:
    raise ValueError("No label-0 rows found in the training data; "
                     "cannot calibrate the anomaly threshold")
normal_mse = np.concatenate(normal_error_chunks)
# 0.95 quantile: by construction about 5% of the label-0 training rows have an
# error above the threshold.
threshold = np.quantile(normal_mse, 0.95)

#Evaluate in batches
true_chunks, pred_chunks = [], []
for X_batch, y_batch in load_data_batches('X_test.npy', 'y_test.npy',
                                          batch_size):
    mse = _reconstruction_error(autoencoder, X_batch)
    true_chunks.append(y_batch)
    # Predict 1 when the reconstruction error exceeds the threshold.
    pred_chunks.append((mse > threshold).astype(int))
y_true = np.concatenate(true_chunks)
y_pred = np.concatenate(pred_chunks)

#Metrics
print("Test Performance:")
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

#Save model and threshold
autoencoder.save('autoencoder.h5')
joblib.dump(threshold, 'ae_threshold.joblib')

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━



              precision    recall  f1-score   support

           0       1.00      0.95      0.97    291834
           1       0.99      1.00      0.99   1177696

    accuracy                           0.99   1469530
   macro avg       0.99      0.98      0.98   1469530
weighted avg       0.99      0.99      0.99   1469530



['ae_threshold.joblib']