<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Time_Based_Patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, classification_report, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

In [2]:
# === 1. Load and combine datasets ===
data_folder = '/content/datasets/'
files = glob.glob(data_folder + '*.csv')
time_cols = ['session_duration', 'bytes_per_second', 'hour', 'day_of_week', 'is_weekend', 'is_anomaly']

df_list = []
for file in files:
    df = pd.read_csv(file, low_memory=False)
    for col in time_cols:
        if col not in df.columns:
            df[col] = pd.NA
    df_list.append(df[time_cols])

combined_df = pd.concat(df_list, ignore_index=True)
combined_df = combined_df.dropna(subset=['session_duration', 'bytes_per_second'])

In [3]:
# === 2. Feature engineering ===
feature_cols = ['session_duration', 'bytes_per_second', 'hour', 'day_of_week', 'is_weekend']
combined_df[feature_cols] = combined_df[feature_cols].fillna(0).astype(float)
combined_df['is_anomaly'] = combined_df['is_anomaly'].fillna(0).astype(int)

In [4]:
# === 3. Proper train/val/test split BEFORE sequence creation ===
train_df, temp_df = train_test_split(
    combined_df, test_size=0.4, stratify=combined_df['is_anomaly'], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['is_anomaly'], random_state=42
)

In [5]:
# === 4. Scale features based on training set ===
scaler = StandardScaler()
scaler.fit(train_df[feature_cols])
for df in (train_df, val_df, test_df):
    df[feature_cols] = scaler.transform(df[feature_cols])

In [6]:
# === 5. Sequence creation ===
sequence_length = 10

def create_sequences(df, features, seq_len):
    data = df[features].values
    y = df['is_anomaly'].values
    seqs, labels = [], []
    for i in range(len(data) - seq_len + 1):
        seqs.append(data[i:i+seq_len])
        labels.append(int(y[i:i+seq_len].max()))
    return np.array(seqs), np.array(labels)

X_train, y_train = create_sequences(train_df, feature_cols, sequence_length)
X_val,   y_val   = create_sequences(val_df,   feature_cols, sequence_length)
X_test,  y_test  = create_sequences(test_df,  feature_cols, sequence_length)

In [7]:
# === 6. Build LSTM autoencoder with regularization ===
n_features = len(feature_cols)
model = Sequential([
    LSTM(64, activation='relu', input_shape=(sequence_length, n_features), return_sequences=False),
    Dropout(0.2),
    RepeatVector(sequence_length),
    LSTM(64, activation='relu', return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(n_features))
])
model.compile(optimizer='adam', loss='mse')

In [8]:
# === 7. Train with validation ===
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
]
model.fit(
    X_train, X_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, X_val),
    callbacks=callbacks,
    verbose=1
)

Epoch 1/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - loss: 0.9136 - val_loss: 0.7851 - learning_rate: 0.0010
Epoch 2/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - loss: 0.8812 - val_loss: 0.7353 - learning_rate: 0.0010
Epoch 3/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - loss: 0.7672 - val_loss: 0.6750 - learning_rate: 0.0010
Epoch 4/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - loss: 0.7497 - val_loss: 0.6310 - learning_rate: 0.0010
Epoch 5/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 22ms/step - loss: 0.6761 - val_loss: 0.7212 - learning_rate: 0.0010
Epoch 6/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - loss: 0.7431 - val_loss: 0.7702 - learning_rate: 0.0010
Epoch 7/100
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - loss: 1.0426 - val_loss: 0.

<keras.src.callbacks.history.History at 0x7c32c7e213d0>

In [9]:
# === 8. Compute reconstruction errors ===
def get_mse(seqs, recon):
    return np.mean(np.power(seqs - recon, 2), axis=(1,2))

mse_val = get_mse(X_val, model.predict(X_val))
mse_test = get_mse(X_test, model.predict(X_test))

[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


In [10]:
# === 9. Grid-search threshold on validation for best F1 ===
thresholds = np.linspace(mse_val.min(), mse_val.max(), 1000)
best_thr, best_f1 = 0, 0
for thr in thresholds:
    preds = (mse_val > thr).astype(int)
    score = f1_score(y_val, preds)
    if score > best_f1:
        best_f1, best_thr = score, thr

optimal_threshold = best_thr
print(f"Optimal threshold: {optimal_threshold:.6f}, Validation F1: {best_f1:.3f}")

Optimal threshold: 0.149783, Validation F1: 0.575


In [11]:
# === 10. Final evaluation on TEST set ===
y_pred_test = (mse_test > optimal_threshold).astype(int)
print("\n=== Classification Report on Test Set ===")
print(classification_report(y_test, y_pred_test, target_names=['Normal','Anomaly']))
auc = roc_auc_score(y_test, mse_test)
print(f"AUC-ROC on Test Set: {auc:.3f}")


=== Classification Report on Test Set ===
              precision    recall  f1-score   support

      Normal       0.70      0.02      0.04      8281
     Anomaly       0.41      0.99      0.58      5710

    accuracy                           0.42     13991
   macro avg       0.55      0.50      0.31     13991
weighted avg       0.58      0.42      0.26     13991

AUC-ROC on Test Set: 0.572


In [12]:
# === 11. Save final results ===
final_df = test_df.iloc[sequence_length-1:].copy()
final_df['lstm_recon_error'] = mse_test
final_df['lstm_anomaly'] = y_pred_test
final_df['is_anomaly_seq'] = y_test
final_df.to_csv('/content/final_lstm_results.csv', index=False)
print("Final results saved to /content/final_lstm_results.csv")

Final results saved to /content/final_lstm_results.csv
