In [None]:
# Uploading dataset
from google.colab import files
import zipfile
import os


uploaded = files.upload()

zip_path = list(uploaded.keys())[0]

extract_dir = "/content/dataset_extracted"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

!ls {extract_dir}

Saving Dataset (1).zip to Dataset (1).zip
test  train


In [2]:
# Extract it
# Re-derives the archive path from the `uploaded` mapping and unpacks the
# archive into the same /content/dataset_extracted directory.
extract_dir = "/content/dataset_extracted"
zip_path = list(uploaded)[0]
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [3]:
# Check structure
# Lists the top-level entries of the extraction directory; {extract_dir} is
# filled in from the Python variable of the same name before the shell runs.
!ls {extract_dir}


test  train


In [None]:
#Installing Dependecies
!pip install tensorflow pandas scikit-learn matplotlib




In [None]:
import numpy as np
import random
import tensorflow as tf  

# Seed every random number generator used here (Python's `random`, NumPy and
# TensorFlow) with one shared value so repeated runs draw the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
#Load, Label, Preprocess, and Combine Train and Test Datasets

import os
import pandas as pd
from utils import create_windowed_dataframe

train_path = "/content/dataset_extracted/train"
test_path = "/content/dataset_extracted/test"

# Load train files.
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the concatenated frame (and therefore the windows built from it later)
# identical on every run.
train_files = sorted(f for f in os.listdir(train_path) if f.endswith('.csv'))
train_dfs = [pd.read_csv(os.path.join(train_path, f)) for f in train_files]
train_data = pd.concat(train_dfs, ignore_index=True)
# Every training row is labelled 0.
train_data['class'] = 0

# Load test files and assign labels: 0 when the file name contains "normal"
# (case-insensitive), 1 otherwise.
test_files = sorted(f for f in os.listdir(test_path) if f.endswith('.csv'))
test_dfs = []
for f in test_files:
    df = pd.read_csv(os.path.join(test_path, f))
    df['class'] = 0 if 'normal' in f.lower() else 1
    test_dfs.append(df)
test_data = pd.concat(test_dfs, ignore_index=True)
# NOTE(review): files are concatenated back to back, so windows built from
# these frames may straddle file boundaries -- confirm how
# create_windowed_dataframe handles that.

# Remove the time column as it is not needed for scaling
train_data = train_data.drop(columns=['_time'])
test_data = test_data.drop(columns=['_time'])

print("Train:", train_data.shape, "Test:", test_data.shape)

Train: (2906231, 57) Test: (58404, 57)


In [None]:
#Create Windowed Dataframes and Prepare 3D Arrays for Model Input
import numpy as np

window_size = 50
train_windowed = create_windowed_dataframe(train_data, window_size)
test_windowed = create_windowed_dataframe(test_data, window_size)

# Labels come from the 'class' column of each windowed frame.
y_train = train_windowed['class'].values
y_test = test_windowed['class'].values

# Every remaining cell of a row is placed side by side as one column of that
# row's 2D block; the blocks are then stacked into a single 3D array
# (samples, timesteps, features).
X_train = np.stack(
    [np.column_stack(row) for row in train_windowed.drop(columns='class').values]
)
X_test = np.stack(
    [np.column_stack(row) for row in test_windowed.drop(columns='class').values]
)

print(X_train.shape, X_test.shape)

(58124, 50, 56) (1168, 50, 56)


In [8]:
#Validate windowed arrays before scaling
from sklearn.preprocessing import MinMaxScaler

print("Before cleaning:")
print("NaNs in X_train:", np.count_nonzero(np.isnan(X_train)))
print("NaNs in X_test:", np.count_nonzero(np.isnan(X_test)))

# A window is kept only when none of its values is NaN: reduce over the
# feature axis first, then over the timestep axis.
mask_train = np.logical_not(np.isnan(X_train).any(axis=2).any(axis=1))
mask_test = np.logical_not(np.isnan(X_test).any(axis=2).any(axis=1))

# Apply the same mask to features and labels so they stay aligned.
X_train, y_train = X_train[mask_train], y_train[mask_train]
X_test, y_test = X_test[mask_test], y_test[mask_test]

print("After cleaning:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Normalize safely
# The scaler takes 2D input, so fold every window into rows with one column
# per feature.
X_train_flat = X_train.reshape(-1, X_train.shape[-1])
X_test_flat = X_test.reshape(-1, X_test.shape[-1])

# Fit on the training rows only; the test rows reuse the fitted scaler.
scaler = MinMaxScaler().fit(X_train_flat)
X_train_scaled = scaler.transform(X_train_flat).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape)

print("Min:", X_train_scaled.min(), "Max:", X_train_scaled.max())


Before cleaning:
NaNs in X_train: 1370
NaNs in X_test: 1666
After cleaning:
X_train shape: (58114, 50, 56)
X_test shape: (1156, 50, 56)
Min: 0.0 Max: 1.0000000000000002


In [None]:
#Scale Features Using Min-Max Normalization on Windowed Data
from sklearn.preprocessing import MinMaxScaler

# Fold the 3D window arrays into 2D (rows x features) for the scaler.
X_train_flat = X_train.reshape(-1, X_train.shape[-1])
X_test_flat = X_test.reshape(-1, X_test.shape[-1])

# The scaler is fitted on the training rows only and then applied to both
# splits; results are reshaped back to the original 3D window layout.
scaler = MinMaxScaler().fit(X_train_flat)
X_train_scaled = scaler.transform(X_train_flat).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape)

In [None]:
#Define and Compile LSTM Autoencoder Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector
from tensorflow.keras.optimizers import Adam

# Window length and feature count are read off the training array.
timesteps, n_features = X_train.shape[1], X_train.shape[2]

inputs = Input(shape=(timesteps, n_features))
# Encoder: one LSTM whose final output (64 units) summarises the window.
latent = LSTM(units=64, activation='relu')(inputs)
# Decoder: repeat the summary once per timestep and let a second LSTM emit
# n_features values at every timestep.
repeated = RepeatVector(timesteps)(latent)
reconstruction = LSTM(units=n_features, activation='relu', return_sequences=True)(repeated)

autoencoder = Model(inputs=inputs, outputs=reconstruction)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

autoencoder.summary()

In [None]:
# Print the smallest and largest value of the scaled training windows.
print(X_train_scaled.min(), X_train_scaled.max())


0.0 1.0000000000000002


In [None]:
#Train LSTM Autoencoder Model on Scaled Data

# The scaled training windows are passed as both input and target, so the
# model is trained to reproduce its input.
fit_options = dict(
    epochs=20,
    batch_size=16,
    validation_split=0.1,
    shuffle=True,
)
history = autoencoder.fit(x=X_train_scaled, y=X_train_scaled, **fit_options)


Epoch 1/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 10ms/step - loss: 0.0398 - val_loss: 0.0083
Epoch 2/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 9ms/step - loss: 0.0129 - val_loss: 0.0079
Epoch 3/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 9ms/step - loss: 0.0081 - val_loss: 0.0077
Epoch 4/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 9ms/step - loss: 0.0081 - val_loss: 0.0076
Epoch 5/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 9ms/step - loss: 0.0074 - val_loss: 0.0032
Epoch 6/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 9ms/step - loss: 0.0035 - val_loss: 0.0030
Epoch 7/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 9ms/step - loss: 0.0035 - val_loss: 0.0030
Epoch 8/20
[1m3269/3269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 9ms/step - loss: 0.0035 - val_loss: 0.0030
Epoch 9/20
[1m

In [12]:
# Predictions
from sklearn.metrics import classification_report, confusion_matrix


def _window_mse(original, reconstructed):
    """Return the mean squared error of each window (averaged over axes 1 and 2)."""
    return np.mean(np.square(original - reconstructed), axis=(1, 2))


# Reconstruct the test windows and score each one by its reconstruction error.
X_test_pred = autoencoder.predict(X_test_scaled)
reconstruction_errors = _window_mse(X_test_scaled, X_test_pred)

# The decision threshold is derived from the training errors:
# mean plus three standard deviations.
X_train_pred = autoencoder.predict(X_train_scaled)
train_errors = _window_mse(X_train_scaled, X_train_pred)
threshold = train_errors.mean() + 3 * train_errors.std()

# A window is flagged (1) when its error exceeds the threshold, else 0.
predictions = (reconstruction_errors > threshold).astype(int)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
[1m1817/1817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step
Confusion Matrix:
 [[428   3]
 [ 94 631]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.99      0.90       431
           1       1.00      0.87      0.93       725

    accuracy                           0.92      1156
   macro avg       0.91      0.93      0.91      1156
weighted avg       0.93      0.92      0.92      1156



In [13]:
# Patch utils.py in place: the fig.suptitle call that prints
# "Start: {timeStamp.iloc[r][0]} - End: {timeStamp.iloc[r][-1]}" is replaced by
# one that prints "Window Index: {timeStamp.iloc[r]}".
# NOTE(review): presumably needed because the '_time' column built later in
# this notebook holds one integer per window rather than a sequence -- confirm
# against utils.py.
!sed -i 's/fig.suptitle(f"Start: {timeStamp.iloc\[r\]\[0\]} - End: {timeStamp.iloc\[r\]\[-1\]}", fontsize=16)/fig.suptitle(f"Window Index: {timeStamp.iloc[r]}", fontsize=16)/' utils.py


In [14]:
import importlib
import utils

# Reload utils so the current contents of utils.py are used, then rebind the
# plotting helper from the reloaded module.
utils = importlib.reload(utils)
save_qualitative_results = utils.save_qualitative_results


In [None]:
# Export qualitative plots for a random subset of the test windows.
from utils import save_qualitative_results
import numpy as np
import pandas as pd

# Limit the number of samples. Sampling is without replacement, so never ask
# for more windows than exist (np.random.choice would raise ValueError).
num_samples = min(20, X_test.shape[0])
indices = np.random.choice(X_test.shape[0], size=num_samples, replace=False)

# Subset into NEW names: X_test, y_test and predictions stay untouched, so
# this cell can be re-run and the evaluation cell still sees arrays that match
# X_test_scaled.
X_test_subset = X_test[indices]
y_test_subset = np.array(y_test)[indices]
predictions_subset = np.array(predictions)[indices]

# Build DataFrame for subset: one column per feature index, each cell holding
# that feature's values across the timesteps of one window.
test_X_df = pd.DataFrame(
    {idx: list(X_test_subset[:, :, idx]) for idx in range(X_test_subset.shape[2])}
)
# '_time' is filled with the position of each window inside the subset.
test_X_df['_time'] = list(range(X_test_subset.shape[0]))

# Call the function
save_qualitative_results(test_X_df, pd.Series(y_test_subset), predictions_subset, "DatasetName")
print(f"Check qualitative_results/DatasetName/ for {num_samples} plots")




Creating DatasetName qualitative_results directory [...]
Check qualitative_results/DatasetName/ for 20 plots


In [25]:
# Recursively archive the qualitative_results/DatasetName directory into
# /content/DatasetName_results.zip.
!zip -r /content/DatasetName_results.zip /content/qualitative_results/DatasetName


  adding: content/qualitative_results/DatasetName/ (stored 0%)
  adding: content/qualitative_results/DatasetName/1753634610762800540_1_1.png (deflated 25%)
  adding: content/qualitative_results/DatasetName/1753634626954619804_0_0.png (deflated 30%)
  adding: content/qualitative_results/DatasetName/1753634598283426692_1_0.png (deflated 31%)
  adding: content/qualitative_results/DatasetName/1753634631928863890_0_0.png (deflated 27%)
  adding: content/qualitative_results/DatasetName/1753634649845258930_0_0.png (deflated 30%)
  adding: content/qualitative_results/DatasetName/1753634541200091972_1_1.png (deflated 26%)
  adding: content/qualitative_results/DatasetName/1753634592831127055_1_1.png (deflated 25%)
  adding: content/qualitative_results/DatasetName/1753634546569736113_0_0.png (deflated 29%)
  adding: content/qualitative_results/DatasetName/1753634586169086483_1_1.png (deflated 26%)
  adding: content/qualitative_results/DatasetName/1753634581301154826_0_0.png (deflated 29%)
  addin

In [26]:
from google.colab import files
# Hand the results archive to Colab's files.download helper.
files.download("/content/DatasetName_results.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>