# Phase 1: Network Traffic Anomaly Detection - Training
This notebook covers the data loading, cleaning, robust feature engineering, and training of an optimized Isolation Forest model for AIOps-ready network telemetry analysis.

## 1. Environment Setup

In [None]:
import io
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Plot defaults shared by every figure in this notebook.
# The seaborn theme is applied first and the default figure size second,
# so the size set here is the one that remains in effect.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

## 2. Load Data
Upload the `network_traffic_data.csv` file from your local machine.

In [None]:
# Name the dataset is expected to have. It is only used to pick among several
# uploads, or as an on-disk fallback when nothing was uploaded in this run.
DEFAULT_DATA_FILENAME = 'network_traffic_data.csv'

# files.upload() blocks until the browser upload finishes and returns a
# mapping of {filename: file contents as bytes}.
uploaded = files.upload()

if uploaded:
    # Parse the bytes that were just uploaded instead of re-opening a
    # hard-coded path: this works when the file has a different name and
    # avoids silently reading a stale copy left on disk by an earlier upload.
    if DEFAULT_DATA_FILENAME in uploaded:
        uploaded_name = DEFAULT_DATA_FILENAME
    else:
        uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Upload dialog was cancelled: fall back to a copy already present in the
    # working directory (raises FileNotFoundError if there is none).
    df = pd.read_csv(DEFAULT_DATA_FILENAME)

print(f"Loaded {len(df)} records.")
df.head()

## 3. Data Cleaning & Feature Engineering
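We clean the data (filling or dropping missing values) and transform the features: the protocol is label-encoded and the numeric columns are standardized. The resulting anomaly-score distribution is visualized in Section 5.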

In [None]:
# Numeric columns that are standardized and fed to the model.
numeric_features = ['dest_port', 'bytes_sent', 'bytes_recv']

# Handle missing values
# dns_query is not a model feature; a placeholder keeps those rows usable.
df['dns_query'] = df['dns_query'].fillna('none')
# Every column the model consumes must be present. The numeric features are
# included here so that NaNs cannot leak into the feature matrix.
df = df.dropna(subset=['source_ip', 'dest_ip', 'protocol'] + numeric_features)
if df.empty:
    raise ValueError("No complete records remain after dropping rows with missing values.")

# Encoding Categorical Features
le_protocol = LabelEncoder()
df['protocol_enc'] = le_protocol.fit_transform(df['protocol'])

# Scaling Numeric Features
# The scaled values go into a separate frame and the raw columns in `df` are
# left untouched. Overwriting them in place would make this cell
# non-idempotent: on a re-run the scaler would be fitted on already
# standardized data, and that (near-identity) scaler is what gets exported.
scaler = StandardScaler()
scaled_numeric = pd.DataFrame(
    scaler.fit_transform(df[numeric_features]),
    columns=numeric_features,
    index=df.index,
)

# Feature Selection
# `features` fixes the column order the model is trained on.
features = numeric_features + ['protocol_enc']
X = scaled_numeric.assign(protocol_enc=df['protocol_enc'])[features]
print(f"Feature matrix shape: {X.shape}")

## 4. Train Optimized Model
We use a **Robust Isolation Forest** configuration:
*   `n_estimators=300`: More trees for better stability and convergence.
*   `bootstrap=True`: Each tree is fit on a random subset of the data drawn *with* replacement (with `bootstrap=False`, the default, subsets are drawn without replacement).
*   `n_jobs=-1`: Utilizes all CPU cores for faster training.

In [None]:
print("Training Optimized Isolation Forest model...")

# Hyperparameters are collected in one place; see the notes above this cell
# for the rationale behind the non-default choices.
forest_params = dict(
    n_estimators=300,
    max_samples='auto',
    contamination='auto',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,  # fixed seed so repeated runs build the same forest
)
model = IsolationForest(**forest_params)
model.fit(X)

# Evaluate
# Score the training data itself: a continuous score per record plus the
# hard label from predict(), where -1 marks a record flagged as an anomaly.
df['anomaly_score'] = model.decision_function(X)
predictions = model.predict(X)
df['is_anomaly'] = predictions

anomaly_count = (predictions == -1).sum()
print(f"Detected {anomaly_count} anomalies out of {len(df)} records ({anomaly_count/len(df):.2%}).")

## 5. Model Evaluation & Visualization
Analyzing the distribution of anomaly scores helps confirm if the model is effectively separating outliers (left tail) from normal traffic (right).

In [None]:
# `anomaly_score` holds decision_function() values, which are the raw scores
# with model.offset_ already subtracted. On this axis the decision boundary is
# therefore 0 (scores below 0 are labelled -1), not model.offset_.
decision_threshold = 0.0

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['anomaly_score'], bins=50, kde=True, color='purple', ax=ax)
ax.axvline(
    x=decision_threshold,
    color='red',
    linestyle='--',
    label=f'Threshold ({decision_threshold:.3f}; raw-score offset {model.offset_:.3f})',
)
ax.set_title('Distribution of Anomaly Scores (Lower = More Anomalous)')
ax.set_xlabel('Anomaly Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

## 6. Export Artifacts
Save the robust model and preprocessing objects for the API.

In [None]:
# Bundle the trained model together with the fitted preprocessing objects and
# the feature order, so everything needed to score new data sits in one file.
artifact_path = 'anomaly_model.joblib'
model_artifacts = dict(
    model=model,
    scaler=scaler,
    le_protocol=le_protocol,
    features=features,
)

joblib.dump(model_artifacts, artifact_path)
print(f"Model saved to {artifact_path}")

# Download back to local machine
files.download(artifact_path)