
Project Quorum - Lightweight Log Anomaly Detection Model Training
Optimized for <1.5GB RAM usage on Google Colab
Supports both PyOD and TensorFlow Lite models



In [15]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pandas numpy scikit-learn pyod joblib tensorflow



In [16]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

In [17]:
# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG = dict(
    # LogPai LogHub 2k-line structured samples, addressed by raw-file URL
    datasets=[
        'https://raw.githubusercontent.com/logpai/loghub/master/HDFS/HDFS_2k.log_structured.csv',
        'https://raw.githubusercontent.com/logpai/loghub/master/BGL/BGL_2k.log_structured.csv',
        'https://raw.githubusercontent.com/logpai/loghub/master/Thunderbird/Thunderbird_2k.log_structured.csv',
        'https://raw.githubusercontent.com/logpai/loghub/master/Mac/Mac_2k.log_structured.csv',
        'https://raw.githubusercontent.com/logpai/loghub/master/Windows/Windows_2k.log_structured.csv',
        'https://raw.githubusercontent.com/logpai/loghub/master/Linux/Linux_2k.log_structured.csv',
    ],

    # Model settings
    max_features=1000,   # vocabulary cap, kept small for memory efficiency
    contamination=0.02,  # expected share of anomalies (2%)
    chunk_size=500,      # intended rows per processing chunk
    model_type='pyod',   # either 'pyod' or 'tflite'
)

In [18]:
# ============================================================================
# STEP 1: DATA LOADING (Memory-Efficient)
# ============================================================================

def load_dataset_from_url(url, sample_size=None):
    """Load the log text (and labels, when present) from one structured-log CSV.

    The file is fetched and parsed exactly once: a callable ``usecols`` filter
    keeps whichever of the ``Content`` / ``Label`` columns exist, instead of
    reading the file a first time only to inspect its header.

    Args:
        url: URL or local path accepted by ``pandas.read_csv``.
        sample_size: If truthy and the file has more rows than this, a random
            sample of ``sample_size`` rows (``random_state=42``) is kept.

    Returns:
        A DataFrame with a ``Content`` column and, when the file provides one,
        a ``Label`` column. ``None`` if the file cannot be read or has no
        ``Content`` column (the error is printed, not raised).
    """
    print(f"📥 Loading dataset from: {url}")

    wanted_columns = ('Content', 'Label')

    try:
        # Single read; only the wanted columns that actually exist are parsed.
        df = pd.read_csv(url, usecols=lambda name: name in wanted_columns)

        # A list-valued usecols would have raised for a missing column; keep
        # that outcome so callers never receive a frame without 'Content'.
        if 'Content' not in df.columns:
            raise ValueError("required column 'Content' not found")

        # Sample if dataset is too large
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            print(f"⚠️ Sampled {sample_size} rows for memory efficiency")

        print(f"✅ Loaded {len(df)} rows")
        return df

    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this source.
        print(f"❌ Error loading dataset: {e}")
        return None

# Label strings (compared case-insensitively, after stripping) that mean "normal".
# '-' is how LogHub's alert-tagged samples mark a non-alert line, per the LogHub
# dataset documentation -- confirm the convention before adding other sources.
_NORMAL_LABEL_TOKENS = frozenset({'normal', '-', '0', ''})


def _encode_label(value):
    """Map one raw label value to 0 (normal) or 1 (anomaly)."""
    if pd.isna(value):
        return 0  # missing label: assume normal
    if isinstance(value, (bool, int, float)):
        return int(value != 0)
    return 0 if str(value).strip().lower() in _NORMAL_LABEL_TOKENS else 1


def prepare_data(datasets_urls, sample_per_dataset=5000):
    """Load every dataset and combine them into one message/label frame.

    Sources that fail to load (``load_dataset_from_url`` returns ``None``) are
    skipped. Rows from sources without a ``Label`` column are labelled 0.
    Labelled rows are encoded with ``_encode_label``: ``'Normal'``, ``'-'``,
    zero and missing values become 0; every other value (``'Anomaly'``, an
    alert tag, a non-zero number) becomes 1. Previously any label other than
    the exact strings ``'Normal'``/``'Anomaly'`` was silently turned into 0.

    Args:
        datasets_urls: Iterable of URLs/paths passed to ``load_dataset_from_url``.
        sample_per_dataset: Per-source row cap forwarded as ``sample_size``.

    Returns:
        DataFrame with a ``message`` column (missing text replaced by ``''``)
        and an integer ``label`` column (0 = normal, 1 = anomaly).
    """
    all_messages = []
    all_labels = []

    for url in datasets_urls:
        df = load_dataset_from_url(url, sample_size=sample_per_dataset)
        if df is None:
            continue

        all_messages.extend(df['Content'].fillna('').tolist())

        # Handle labeled vs unlabeled data
        if 'Label' in df.columns:
            all_labels.extend(_encode_label(raw) for raw in df['Label'].tolist())
        else:
            all_labels.extend([0] * len(df))  # Assume normal if no labels

    return pd.DataFrame({'message': all_messages, 'label': all_labels})


In [19]:
# ============================================================================
# STEP 2: FEATURE EXTRACTION (TF-IDF with Memory Optimization)
# ============================================================================

def extract_features(messages, max_features=1000):
    """Fit a TF-IDF vectorizer on the log messages and vectorize them.

    Args:
        messages: Iterable of log message strings.
        max_features: Vocabulary size cap handed to the vectorizer.

    Returns:
        Tuple ``(X, vectorizer)``: the matrix produced by ``fit_transform`` and
        the fitted ``TfidfVectorizer``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    print(f"🔧 Extracting features (max_features={max_features})...")

    # English stop words removed, unigrams plus bigrams, document-frequency
    # bounds of max_df=0.95 and min_df=2.
    tfidf_options = {
        'max_features': max_features,
        'stop_words': 'english',
        'ngram_range': (1, 2),
        'max_df': 0.95,
        'min_df': 2,
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    X = vectorizer.fit_transform(messages)

    # The reported size counts only the stored values (X.data).
    size_mb = X.data.nbytes / 1024 / 1024
    print(f"✅ Feature matrix shape: {X.shape}")
    print(f"💾 Memory usage: ~{size_mb:.2f} MB")

    return X, vectorizer

In [20]:
# ============================================================================
# STEP 3A: TRAIN PYOD MODEL (Lightweight Anomaly Detection)
# ============================================================================

def train_pyod_model(X, contamination=0.02, n_estimators=100, random_state=42):
    """Fit a PyOD Isolation Forest on the feature matrix.

    Args:
        X: Feature matrix. Anything exposing ``toarray()`` (e.g. a sparse
            matrix) is converted to a dense array before fitting; other inputs
            are passed through unchanged.
        contamination: Expected fraction of anomalies, forwarded to ``IForest``.
        n_estimators: Number of trees, forwarded to ``IForest`` and echoed in
            the progress output (previously hard-coded to 100 in both places).
        random_state: Seed forwarded to ``IForest``.

    Returns:
        The fitted ``IForest`` model.
    """
    from pyod.models.iforest import IForest

    print("🤖 Training PyOD Isolation Forest...")
    print(f"   - Contamination: {contamination}")
    print(f"   - Estimators: {n_estimators}")

    model = IForest(
        contamination=contamination,
        n_estimators=n_estimators,
        max_samples='auto',
        random_state=random_state,
        n_jobs=1  # Single thread to save memory
    )

    # Densify only when the input offers a dense conversion.
    X_dense = X.toarray() if hasattr(X, 'toarray') else X

    model.fit(X_dense)
    print("✅ Model trained successfully!")

    return model


In [21]:
# ============================================================================
# STEP 3B: TRAIN TFLITE MODEL (Neural Network Autoencoder)
# ============================================================================

def train_tflite_model(X, epochs=5, batch_size=128, validation_split=0.1,
                       learning_rate=0.001):
    """Train a small dense autoencoder on the feature matrix.

    This function only trains and returns the Keras model; it does not perform
    the TFLite conversion itself.

    Args:
        X: Feature matrix. Inputs exposing ``toarray()`` are densified; the
            data is cast to ``float32`` either way.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``fit``.
        validation_split: Fraction of the data ``fit`` holds out for validation.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The trained ``keras.Sequential`` autoencoder.
    """
    from tensorflow import keras

    print("🤖 Training TensorFlow Lite Autoencoder...")

    # Convert sparse to dense
    if hasattr(X, 'toarray'):
        X_dense = X.toarray().astype(np.float32)
    else:
        X_dense = X.astype(np.float32)

    input_dim = X_dense.shape[1]

    # Simple autoencoder architecture (memory efficient):
    # input_dim -> 64 -> 32 -> 64 -> input_dim
    autoencoder = keras.Sequential([
        keras.layers.InputLayer(input_shape=(input_dim,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(input_dim, activation='sigmoid')
    ])

    autoencoder.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse'
    )

    print(f"   - Training for {epochs} epochs...")
    # The input is also the target: the network learns to reconstruct X.
    autoencoder.fit(
        X_dense, X_dense,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        verbose=1
    )

    print("✅ Model trained!")
    return autoencoder

def convert_to_tflite(model):
    """Serialize a Keras model into TensorFlow Lite format.

    Args:
        model: Keras model handed to ``TFLiteConverter.from_keras_model``.

    Returns:
        The converted model as returned by the converter's ``convert()``.
    """
    import tensorflow as tf

    print("📦 Converting to TensorFlow Lite...")

    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
    # Turn on the converter's default optimizations (quantization).
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    serialized = tflite_converter.convert()

    size_kb = len(serialized) / 1024
    print(f"✅ TFLite model size: {size_kb:.2f} KB")
    return serialized

In [22]:
# ============================================================================
# STEP 4: MODEL EVALUATION
# ============================================================================

def evaluate_model(model, X, y_true, model_type='pyod', threshold_percentile=98):
    """Print a classification report (and ROC-AUC when possible) for a model.

    Args:
        model: Trained model. For ``model_type == 'pyod'`` it must provide
            ``decision_function`` and ``predict``; otherwise ``predict`` is
            treated as a reconstruction of the input.
        X: Feature matrix; inputs exposing ``toarray()`` are densified once and
            the dense array is reused for every model call.
        y_true: Ground-truth labels (0 = normal, 1 = anomaly).
        model_type: ``'pyod'`` or anything else for the autoencoder path.
        threshold_percentile: Autoencoder path only. Samples whose
            reconstruction error lies above this percentile of all errors are
            predicted as anomalies.

    Returns:
        None. Results are printed.
    """
    from sklearn.metrics import classification_report, roc_auc_score

    print("\n📊 Model Evaluation:")

    # Densify a single time; converting per call would allocate the dense
    # matrix twice on the PyOD path.
    X_dense = X.toarray() if hasattr(X, 'toarray') else X

    if model_type == 'pyod':
        # Scores and labels both come from the model itself
        scores = model.decision_function(X_dense)
        predictions = model.predict(X_dense)
    else:
        # For autoencoder: per-sample mean squared reconstruction error
        reconstructed = model.predict(X_dense)
        scores = np.mean(np.square(X_dense - reconstructed), axis=1)
        threshold = np.percentile(scores, threshold_percentile)
        predictions = (scores > threshold).astype(int)

    # Fixing labels=[0, 1] keeps the two target names valid even when one
    # class is absent from both y_true and the predictions.
    print(classification_report(
        y_true,
        predictions,
        labels=[0, 1],
        target_names=['Normal', 'Anomaly'],
        zero_division=0,
    ))

    # ROC-AUC is undefined when y_true contains a single class.
    if len(np.unique(y_true)) > 1:
        auc = roc_auc_score(y_true, scores)
        print(f"ROC-AUC Score: {auc:.4f}")


In [23]:
# ============================================================================
# STEP 5: SAVE MODELS
# ============================================================================

def save_pyod_model(model, vectorizer, output_dir='./'):
    """Persist the PyOD model and the vectorizer with joblib.

    Writes ``iforest_model.pkl`` and ``tfidf_vectorizer.pkl`` into
    ``output_dir``, creating the directory (and parents) when it is missing.

    Args:
        model: Trained model to serialize.
        vectorizer: Fitted vectorizer to serialize.
        output_dir: Target directory; an empty value means the current directory.
    """
    import os

    print("\n💾 Saving PyOD model...")

    # Joining with os.path avoids doubled separators ('.//file') and keeps an
    # empty output_dir from resolving to the filesystem root ('/file').
    target_dir = output_dir or '.'
    os.makedirs(target_dir, exist_ok=True)

    model_path = os.path.join(target_dir, 'iforest_model.pkl')
    vectorizer_path = os.path.join(target_dir, 'tfidf_vectorizer.pkl')

    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    print("✅ Saved:")
    print(f"   - {model_path}")
    print(f"   - {vectorizer_path}")

def save_tflite_model(tflite_model, vectorizer, output_dir='./'):
    """Persist the TFLite model bytes and the vectorizer.

    Writes ``anomaly_detector.tflite`` (raw bytes) and ``tfidf_vectorizer.pkl``
    (joblib) into ``output_dir``, creating the directory (and parents) when it
    is missing.

    Args:
        tflite_model: Bytes-like serialized TFLite model.
        vectorizer: Fitted vectorizer to serialize.
        output_dir: Target directory; an empty value means the current directory.
    """
    import os

    print("\n💾 Saving TFLite model...")

    # Joining with os.path avoids doubled separators ('.//file') and keeps an
    # empty output_dir from resolving to the filesystem root ('/file').
    target_dir = output_dir or '.'
    os.makedirs(target_dir, exist_ok=True)

    model_path = os.path.join(target_dir, 'anomaly_detector.tflite')
    vectorizer_path = os.path.join(target_dir, 'tfidf_vectorizer.pkl')

    with open(model_path, 'wb') as f:
        f.write(tflite_model)

    joblib.dump(vectorizer, vectorizer_path)

    print("✅ Saved:")
    print(f"   - {model_path}")
    print(f"   - {vectorizer_path}")

In [24]:
# ============================================================================
# STEP 6: INFERENCE TEST
# ============================================================================

def test_inference(model, vectorizer, model_type='pyod'):
    """Test model with sample log messages"""
    print("\n🧪 Testing Inference:")

    test_logs = [
        "User root logged in successfully",
        "Connection established from 192.168.1.1",
        "CRITICAL: Segmentation fault in sshd",
        "Failed password attempt for admin",
        "System backup completed successfully"
    ]

    X_test = vectorizer.transform(test_logs)

    if model_type == 'pyod':
        X_test_dense = X_test.toarray()
        scores = model.decision_function(X_test_dense)
        predictions = model.predict(X_test_dense)
    else:
        X_test_dense = X_test.toarray()
        reconstructed = model.predict(X_test_dense)
        scores = np.mean(np.square(X_test_dense - reconstructed), axis=1)
        predictions = (scores > np.percentile(scores, 80)).astype(int)

    print("\n" + "="*70)
    for log, score, pred in zip(test_logs, scores, predictions):
        status = "🚨 ANOMALY" if pred == 1 else "✅ NORMAL"
        print(f"{status} | Score: {score:.4f} | {log[:50]}...")
    print("="*70)

In [25]:
# ============================================================================
# MAIN TRAINING PIPELINE
# ============================================================================

def main():
    """Run the whole training pipeline described by ``CONFIG``.

    Steps: load the datasets, extract features, then train, evaluate and save
    the model selected by ``CONFIG['model_type']``. The PyOD path also runs the
    sample inference test; the TFLite path skips it.

    Raises:
        ValueError: If ``CONFIG['model_type']`` is neither ``'pyod'`` nor
            ``'tflite'``. Checked before any data is downloaded, so a typo no
            longer falls through to the TFLite branch.
        RuntimeError: If no dataset rows could be loaded.
    """
    print("="*70)
    print("🎯 PROJECT QUORUM - LOG ANOMALY DETECTION TRAINING")
    print("="*70)

    # Fail fast on a misspelled model type instead of training the wrong model.
    model_type = CONFIG['model_type']
    if model_type not in ('pyod', 'tflite'):
        raise ValueError(
            f"Unknown CONFIG['model_type'] {model_type!r}; expected 'pyod' or 'tflite'"
        )

    # Step 1: Load data
    print("\n📂 STEP 1: Loading datasets...")
    df = prepare_data(CONFIG['datasets'], sample_per_dataset=5000)
    print(f"✅ Total samples: {len(df)}")
    if len(df) == 0:
        # Every source failed to load; stop here with a clear message.
        raise RuntimeError(
            "No log data could be loaded; check the dataset URLs and network access"
        )

    # Step 2: Extract features
    print("\n🔧 STEP 2: Feature extraction...")
    X, vectorizer = extract_features(df['message'], CONFIG['max_features'])

    # Step 3: Train model
    print(f"\n🤖 STEP 3: Training {CONFIG['model_type'].upper()} model...")

    if model_type == 'pyod':
        model = train_pyod_model(X, CONFIG['contamination'])

        # Evaluate
        evaluate_model(model, X, df['label'], 'pyod')

        # Save
        save_pyod_model(model, vectorizer)

        # Test
        test_inference(model, vectorizer, 'pyod')

    else:  # tflite
        model = train_tflite_model(X, epochs=5)
        tflite_model = convert_to_tflite(model)

        # Evaluate (uses the Keras model, not the converted TFLite bytes)
        evaluate_model(model, X, df['label'], 'tflite')

        # Save
        save_tflite_model(tflite_model, vectorizer)

        # Note: TFLite inference test requires TFLite interpreter
        print("\n⚠️ TFLite inference test skipped (use TFLite interpreter in production)")

    print("\n" + "="*70)
    print("✅ TRAINING COMPLETE!")
    print("📦 Download the .pkl/.tflite files and integrate into your backend")
    print("="*70)

In [26]:
# ============================================================================
# RUN TRAINING
# ============================================================================

if __name__ == "__main__":
    # Entry point: runs the full training pipeline.
    # The dependencies must already be installed, e.g. from a Colab cell:
    # !pip install pandas numpy scikit-learn pyod joblib tensorflow

    main()

🎯 PROJECT QUORUM - LOG ANOMALY DETECTION TRAINING

📂 STEP 1: Loading datasets...
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/HDFS/HDFS_2k.log_structured.csv
✅ Loaded 2000 rows
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/BGL/BGL_2k.log_structured.csv
✅ Loaded 2000 rows
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/Thunderbird/Thunderbird_2k.log_structured.csv
✅ Loaded 2000 rows
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/Mac/Mac_2k.log_structured.csv
✅ Loaded 2000 rows
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/Windows/Windows_2k.log_structured.csv
✅ Loaded 2000 rows
📥 Loading dataset from: https://raw.githubusercontent.com/logpai/loghub/master/Linux/Linux_2k.log_structured.csv
✅ Loaded 2000 rows
✅ Total samples: 12000

🔧 STEP 2: Feature extraction...
🔧 Extracting features (max_features=1000)...
✅ Feature matrix sha