# 🖥️ Server Logs Anomaly Detection Training

This notebook trains an anomaly detection model for server logs using Apache access logs data.

## Features Used:
- **response_time**: Time taken to process the request (ms)
- **request_size**: Size of the request in bytes
- **status_code_numeric**: HTTP status code as numeric value
- **hour_of_day**: Hour when request was made (0-23)
- **requests_per_minute**: Rate of requests per minute

## Anomaly Types to Detect:
- 🚨 **High response times** (potential DDoS or performance issues)
- 🚨 **Unusual status code patterns** (4xx, 5xx errors)
- 🚨 **Abnormal request sizes** (potential attacks)
- 🚨 **Traffic spikes** (unusual request rates)
- 🚨 **Off-hours activity** (requests at unusual times)


In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from datetime import datetime, timedelta
import requests
import io
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")
print("📊 Starting Server Logs Anomaly Detection Training...")


✅ Libraries imported successfully!
📊 Starting Server Logs Anomaly Detection Training...


In [6]:
# Generate synthetic server logs data (since we'll simulate realistic server logs)
def generate_server_logs_data(n_samples=50000, anomaly_rate=0.05):
    """Generate synthetic server logs data with realistic patterns"""
    
    np.random.seed(42)  # For reproducibility
    
    # Normal server behavior patterns
    normal_samples = int(n_samples * (1 - anomaly_rate))
    
    # Generate normal data
    normal_data = {
        # Response time: Normal distribution around 200ms, peak hours slower
        'response_time': np.random.gamma(2, 100),  # Gamma distribution for response times
        
        # Request size: Most requests small, some larger
        'request_size': np.random.lognormal(8, 1.5),  # Log-normal for request sizes
        
        # Status codes: Mostly 200, some 304, few 404/500
        'status_code_numeric': np.random.choice([200, 200, 200, 200, 200, 200, 304, 404, 500], 
                                               size=normal_samples, 
                                               p=[0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.15, 0.08, 0.02]),
        
        # Hour of day: Peak during business hours (9-17), low at night
        'hour_of_day': np.concatenate([
            np.random.choice(range(9, 18), size=int(normal_samples * 0.6)),  # Business hours
            np.random.choice(range(0, 24), size=int(normal_samples * 0.4))   # Other hours
        ]),
        
        # Requests per minute: Normal load 20-80 requests/min
        'requests_per_minute': np.random.normal(50, 15)
    }
    
    # Generate anomalous data
    anomaly_samples = n_samples - normal_samples
    
    anomaly_data = {
        # High response times (performance issues, DDoS)
        'response_time': np.concatenate([
            np.random.gamma(2, 100),  # Some normal
            np.random.gamma(2, 2000)  # Some very high (anomalies)
        ])[:anomaly_samples],
        
        # Unusual request sizes (potential attacks)
        'request_size': np.concatenate([
            np.random.lognormal(8, 1.5),      # Normal sizes
            np.random.lognormal(12, 2),       # Very large requests (attacks)
            np.random.lognormal(4, 1)         # Very small requests
        ])[:anomaly_samples],
        
        # More error status codes in anomalies
        'status_code_numeric': np.random.choice([200, 400, 403, 404, 500, 502, 503], 
                                               size=anomaly_samples,
                                               p=[0.3, 0.15, 0.15, 0.15, 0.1, 0.1, 0.05]),
        
        # Unusual hours (night time attacks)
        'hour_of_day': np.random.choice([0, 1, 2, 3, 4, 5, 22, 23], size=anomaly_samples),
        
        # Traffic spikes or unusual quiet periods
        'requests_per_minute': np.concatenate([
            np.random.normal(200, 50),  # Traffic spikes
            np.random.normal(5, 2)      # Unusual quiet periods
        ])[:anomaly_samples]
    }
    
    # Combine normal and anomaly data
    data = {
        'response_time': np.concatenate([normal_data['response_time'], anomaly_data['response_time']]),
        'request_size': np.concatenate([normal_data['request_size'], anomaly_data['request_size']]),
        'status_code_numeric': np.concatenate([normal_data['status_code_numeric'], anomaly_data['status_code_numeric']]),
        'hour_of_day': np.concatenate([normal_data['hour_of_day'], anomaly_data['hour_of_day']]),
        'requests_per_minute': np.concatenate([normal_data['requests_per_minute'], anomaly_data['requests_per_minute']])
    }
    
    # Create labels (0 = normal, 1 = anomaly)
    labels = np.concatenate([
        np.zeros(normal_samples),  # Normal samples
        np.ones(anomaly_samples)   # Anomaly samples
    ])
    
    # Create DataFrame
    df = pd.DataFrame(data)
    df['is_anomaly'] = labels
    
    # Add some realistic constraints
    df['response_time'] = np.clip(df['response_time'], 50, 10000)  # 50ms to 10s
    df['request_size'] = np.clip(df['request_size'], 100, 100000)  # 100B to 100KB
    df['requests_per_minute'] = np.clip(df['requests_per_minute'], 1, 500)  # 1 to 500 req/min
    
    # Shuffle the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    return df

# Generate the dataset
print("🏗️ Generating synthetic server logs dataset...")
df = generate_server_logs_data(n_samples=50000, anomaly_rate=0.05)

print(f"📊 Dataset created:")
print(f"   Total samples: {len(df)}")
print(f"   Normal samples: {len(df[df['is_anomaly'] == 0])}")
print(f"   Anomaly samples: {len(df[df['is_anomaly'] == 1])}")
print(f"   Anomaly rate: {df['is_anomaly'].mean():.2%}")

df.head()


🏗️ Generating synthetic server logs dataset...


ValueError: probabilities do not sum to 1

In [None]:
# Exploratory Data Analysis
print("📈 Performing Exploratory Data Analysis...")

# Basic statistics
print("\n📊 Dataset Statistics:")
print(df.describe())

# Check for missing values
print(f"\n🔍 Missing Values: {df.isnull().sum().sum()}")

# Feature distributions
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('🖥️ Server Logs Feature Distributions', fontsize=16, fontweight='bold')

features = ['response_time', 'request_size', 'status_code_numeric', 'hour_of_day', 'requests_per_minute']

for idx, feature in enumerate(features):
    row = idx // 3
    col = idx % 3
    
    # Plot distribution for normal vs anomaly
    df[df['is_anomaly'] == 0][feature].hist(alpha=0.7, bins=50, ax=axes[row, col], 
                                           label='Normal', color='green', density=True)
    df[df['is_anomaly'] == 1][feature].hist(alpha=0.7, bins=50, ax=axes[row, col], 
                                           label='Anomaly', color='red', density=True)
    
    axes[row, col].set_title(f'{feature.replace("_", " ").title()}')
    axes[row, col].set_xlabel(feature.replace("_", " ").title())
    axes[row, col].set_ylabel('Density')
    axes[row, col].legend()

# Remove the empty subplot
fig.delaxes(axes[1, 2])

plt.tight_layout()
plt.show()

# Correlation matrix
plt.figure(figsize=(10, 8))
correlation_matrix = df[features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('🔗 Feature Correlation Matrix')
plt.show()


In [None]:
# Prepare features for training
print("🔧 Preparing features for training...")

# Select feature columns
feature_columns = ['response_time', 'request_size', 'status_code_numeric', 'hour_of_day', 'requests_per_minute']
X = df[feature_columns].copy()
y = df['is_anomaly'].copy()

print(f"Features shape: {X.shape}")
print(f"Features: {feature_columns}")

# Check for any infinite or NaN values
print(f"Infinite values: {np.isinf(X).sum().sum()}")
print(f"NaN values: {X.isnull().sum().sum()}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n📊 Data Split:")
print(f"   Training set: {X_train.shape[0]} samples")
print(f"   Test set: {X_test.shape[0]} samples")
print(f"   Training anomaly rate: {y_train.mean():.2%}")
print(f"   Test anomaly rate: {y_test.mean():.2%}")

# Display sample data
print(f"\n🔍 Sample Training Data:")
print(X_train.head())


In [None]:
# Feature scaling
print("⚖️ Scaling features...")

# Initialize scaler
scaler = StandardScaler()

# Fit scaler on training data only
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✅ Features scaled successfully!")
print(f"   Scaler mean: {scaler.mean_}")
print(f"   Scaler scale: {scaler.scale_}")

# Verify scaling
print(f"\n📊 Scaled Training Data Statistics:")
print(f"   Mean: {X_train_scaled.mean(axis=0)}")
print(f"   Std: {X_train_scaled.std(axis=0)}")

# Convert back to DataFrame for easier handling
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)

print(f"\n🔍 Sample Scaled Training Data:")
print(X_train_scaled_df.head())


In [None]:
# Train Isolation Forest model
print("🌲 Training Isolation Forest model...")

# For training, we only use normal data (unsupervised anomaly detection)
X_train_normal = X_train_scaled[y_train == 0]
print(f"Training on {len(X_train_normal)} normal samples")

# Initialize and train Isolation Forest
model = IsolationForest(
    contamination=0.05,  # Expected proportion of anomalies
    random_state=42,
    n_estimators=100,    # Number of trees
    max_samples=0.8,     # Fraction of samples to train each tree
    max_features=1.0,    # Use all features
    n_jobs=-1           # Use all CPU cores
)

# Train the model on normal data only
model.fit(X_train_normal)

print("✅ Model training completed!")
print(f"   Model type: {type(model).__name__}")
print(f"   Number of estimators: {model.n_estimators}")
print(f"   Contamination rate: {model.contamination}")

# Make predictions on training set
y_train_pred = model.predict(X_train_scaled)
y_train_pred_binary = (y_train_pred == -1).astype(int)  # Convert -1/1 to 0/1

# Make predictions on test set
y_test_pred = model.predict(X_test_scaled)
y_test_pred_binary = (y_test_pred == -1).astype(int)  # Convert -1/1 to 0/1

print(f"\n📊 Training Set Predictions:")
print(f"   Predicted anomalies: {y_train_pred_binary.sum()}")
print(f"   Predicted normal: {len(y_train_pred_binary) - y_train_pred_binary.sum()}")
print(f"   Predicted anomaly rate: {y_train_pred_binary.mean():.2%}")

print(f"\n📊 Test Set Predictions:")
print(f"   Predicted anomalies: {y_test_pred_binary.sum()}")
print(f"   Predicted normal: {len(y_test_pred_binary) - y_test_pred_binary.sum()}")
print(f"   Predicted anomaly rate: {y_test_pred_binary.mean():.2%}")


In [None]:
# Model evaluation
print("📊 Evaluating model performance...")

# Classification report for test set
print("🎯 Test Set Classification Report:")
print(classification_report(y_test, y_test_pred_binary, 
                          target_names=['Normal', 'Anomaly'], 
                          digits=3))

# Confusion matrix
cm = confusion_matrix(y_test, y_test_pred_binary)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Anomaly'], 
            yticklabels=['Normal', 'Anomaly'])
plt.title('🎯 Confusion Matrix - Server Logs Anomaly Detection')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Calculate metrics
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"\n📈 Detailed Metrics:")
print(f"   True Positives (TP): {tp}")
print(f"   True Negatives (TN): {tn}")
print(f"   False Positives (FP): {fp}")
print(f"   False Negatives (FN): {fn}")
print(f"   Accuracy: {accuracy:.3f}")
print(f"   Precision: {precision:.3f}")
print(f"   Recall: {recall:.3f}")
print(f"   F1-Score: {f1_score:.3f}")

# Feature importance (based on path lengths in isolation trees)
# Note: Isolation Forest doesn't have traditional feature importance, 
# but we can analyze which features contribute to anomaly scores
print(f"\n🔍 Model Analysis:")
print(f"   Decision function range: [{model.decision_function(X_test_scaled).min():.3f}, {model.decision_function(X_test_scaled).max():.3f}]")
print(f"   Threshold for anomaly: < 0")


In [None]:
# Visualize anomaly detection results
print("📊 Creating visualization of anomaly detection results...")

# Create a sample of data for visualization (to avoid overcrowding)
sample_size = 2000
sample_indices = np.random.choice(len(X_test), size=min(sample_size, len(X_test)), replace=False)
X_test_sample = X_test.iloc[sample_indices]
y_test_sample = y_test.iloc[sample_indices]
y_pred_sample = y_test_pred_binary[sample_indices]

# Plot pairwise feature comparisons
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('🖥️ Server Logs Anomaly Detection Results', fontsize=16, fontweight='bold')

# Define feature pairs for visualization
feature_pairs = [
    ('response_time', 'request_size'),
    ('response_time', 'requests_per_minute'),
    ('hour_of_day', 'status_code_numeric'),
    ('request_size', 'requests_per_minute')
]

for idx, (feat1, feat2) in enumerate(feature_pairs):
    row = idx // 2
    col = idx % 2
    
    # True labels
    normal_mask = y_test_sample == 0
    anomaly_mask = y_test_sample == 1
    
    # Plot true normal points
    axes[row, col].scatter(X_test_sample[normal_mask][feat1], 
                          X_test_sample[normal_mask][feat2],
                          c='lightgreen', alpha=0.6, s=30, label='True Normal', marker='o')
    
    # Plot true anomalies
    axes[row, col].scatter(X_test_sample[anomaly_mask][feat1], 
                          X_test_sample[anomaly_mask][feat2],
                          c='red', alpha=0.8, s=50, label='True Anomaly', marker='x')
    
    # Highlight false positives and false negatives
    fp_mask = (y_test_sample == 0) & (y_pred_sample == 1)
    fn_mask = (y_test_sample == 1) & (y_pred_sample == 0)
    
    if fp_mask.sum() > 0:
        axes[row, col].scatter(X_test_sample[fp_mask][feat1], 
                              X_test_sample[fp_mask][feat2],
                              c='orange', alpha=0.8, s=60, label='False Positive', marker='s', edgecolors='black')
    
    if fn_mask.sum() > 0:
        axes[row, col].scatter(X_test_sample[fn_mask][feat1], 
                              X_test_sample[fn_mask][feat2],
                              c='purple', alpha=0.8, s=60, label='False Negative', marker='^', edgecolors='black')
    
    axes[row, col].set_xlabel(feat1.replace('_', ' ').title())
    axes[row, col].set_ylabel(feat2.replace('_', ' ').title())
    axes[row, col].set_title(f'{feat1.replace("_", " ").title()} vs {feat2.replace("_", " ").title()}')
    axes[row, col].legend()
    axes[row, col].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Anomaly score distribution
plt.figure(figsize=(12, 6))
scores = model.decision_function(X_test_scaled)
scores_normal = scores[y_test == 0]
scores_anomaly = scores[y_test == 1]

plt.subplot(1, 2, 1)
plt.hist(scores_normal, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(scores_anomaly, bins=50, alpha=0.7, label='Anomaly', color='red', density=True)
plt.axvline(x=0, color='black', linestyle='--', label='Decision Threshold')
plt.xlabel('Anomaly Score')
plt.ylabel('Density')
plt.title('🎯 Anomaly Score Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

# ROC-like curve using anomaly scores
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, -scores)  # Negative because lower scores = more anomalous
roc_auc = auc(fpr, tpr)

plt.subplot(1, 2, 2)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('📊 ROC Curve')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"📊 ROC AUC Score: {roc_auc:.3f}")


In [None]:
# Save the trained model and scaler
print("💾 Saving trained model and scaler...")

# Save the model
model_filename = 'model_server_logs.joblib'
joblib.dump(model, model_filename)
print(f"✅ Model saved as: {model_filename}")

# Save the scaler
scaler_filename = 'scaler_server_logs.joblib'
joblib.dump(scaler, scaler_filename)
print(f"✅ Scaler saved as: {scaler_filename}")

# Test loading the saved model
print("\n🔄 Testing saved model...")
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)

# Test with a few samples
test_samples = X_test.iloc[:5]
print(f"\n🧪 Testing with {len(test_samples)} samples:")
print(test_samples)

# Scale the test samples
test_samples_scaled = loaded_scaler.transform(test_samples)

# Make predictions
predictions = loaded_model.predict(test_samples_scaled)
anomaly_scores = loaded_model.decision_function(test_samples_scaled)

print(f"\n🎯 Predictions:")
for i, (pred, score) in enumerate(zip(predictions, anomaly_scores)):
    is_anomaly = pred == -1
    print(f"   Sample {i+1}: {'🚨 ANOMALY' if is_anomaly else '✅ Normal'} (score: {score:.3f})")

print(f"\n🎉 Model training and saving completed successfully!")
print(f"📁 Model files created:")
print(f"   - {model_filename}")
print(f"   - {scaler_filename}")

# Show final model summary
print(f"\n📋 Final Model Summary:")
print(f"   Model Type: Isolation Forest")
print(f"   Training Samples: {len(X_train_normal)}")
print(f"   Features: {len(feature_columns)}")
print(f"   Test Accuracy: {accuracy:.3f}")
print(f"   Test Precision: {precision:.3f}")
print(f"   Test Recall: {recall:.3f}")
print(f"   Test F1-Score: {f1_score:.3f}")
print(f"   ROC AUC: {roc_auc:.3f}")
print(f"   Model is ready for deployment! 🚀")
