# Phase 1: Network Traffic Anomaly Detection - Training
This notebook covers the data loading, cleaning, robust feature engineering, and training of an optimized Isolation Forest model for AIOps-ready network telemetry analysis.

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Detect the runtime: the google.colab package can only be imported inside Colab.
try:
    from google.colab import files
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

print("Running in Google Colab environment." if IN_COLAB else "Running in Local environment.")

# Plot defaults shared by every figure in this notebook
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load Data
Automatically detects `network_traffic_data.csv` in the current working directory or in the project's `data/` directory; in Colab, it prompts for an upload if the file is not found.

In [None]:
# Candidate dataset locations, checked in priority order:
#   1. the current working directory (e.g. a file already uploaded to the Colab root)
#   2. the project's data/ directory, relative to ml/notebooks/
#   3. an interactive upload (Colab only)
input_file = 'network_traffic_data.csv'
local_path = '../../data/network_traffic_data.csv' # Relative path from ml/notebooks/

if os.path.exists(input_file):
    # File exists in current directory (e.g., uploaded to Colab root)
    data_path = input_file
elif os.path.exists(local_path):
    # File exists in project structure
    data_path = local_path
elif IN_COLAB:
    print("Data not found. Please upload network_traffic_data.csv")
    uploaded = files.upload()
    if not uploaded:
        # The upload dialog was cancelled: fail here with a clear message
        # instead of letting pd.read_csv fail on a missing path.
        raise FileNotFoundError("No file was uploaded. Please upload network_traffic_data.csv and re-run this cell.")
    # files.upload() returns a dict keyed by the uploaded file names, so read
    # whatever was actually uploaded even if it was named differently.
    data_path = input_file if input_file in uploaded else next(iter(uploaded))
else:
    raise FileNotFoundError("Could not find network_traffic_data.csv. Please ensure it is in the data/ directory.")

df = pd.read_csv(data_path)
print(f"Loaded {len(df)} records from {data_path}.")
df.head()

## 3. Data Cleaning & Feature Engineering
We clean the data and transform the features into a numeric matrix for the model. The distribution of the resulting anomaly scores is visualized in Section 5.

In [None]:
# Handle missing values.
# .copy() gives the cleaned frame its own data so the column assignment below
# never writes into a view of the loaded frame.
df['dns_query'] = df['dns_query'].fillna('none')
df = df.dropna(subset=['source_ip', 'dest_ip', 'dest_port', 'protocol']).copy()

# Encoding Categorical Features
le_protocol = LabelEncoder()
df['protocol_enc'] = le_protocol.fit_transform(df['protocol'])

# Feature Selection
numeric_features = ['dest_port', 'bytes_sent', 'bytes_recv']
features = numeric_features + ['protocol_enc']

# Scaling Numeric Features
# The scaled values go into X only; df keeps the raw columns. Overwriting df
# in place would make this cell non-idempotent: a second run would refit the
# scaler on already-standardised data and the exported scaler would no longer
# match raw inputs.
scaler = StandardScaler()
X = df[features].copy()
X[numeric_features] = scaler.fit_transform(df[numeric_features])
print(f"Feature matrix shape: {X.shape}")

## 4. Train Optimized Model
We use a **Robust Isolation Forest** configuration:
*   `n_estimators=300`: More trees for better stability and convergence.
*   `bootstrap=True`: Each tree is fit on a random subset of the data sampled with replacement, which diversifies the trees and reduces overfitting.
*   `n_jobs=-1`: Utilizes all CPU cores for faster training.

In [None]:
print("Training Optimized Isolation Forest model...")

# Hyperparameters gathered in one place so they are easy to review and tune.
forest_params = dict(
    n_estimators=300,
    contamination='auto',
    max_samples='auto',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
model = IsolationForest(**forest_params).fit(X)

# Evaluate: score the training data itself (labels first, then continuous scores)
predictions = model.predict(X)
scores = model.decision_function(X)
df['anomaly_score'] = scores
df['is_anomaly'] = predictions

# predict() labels anomalies as -1
anomaly_count = (df['is_anomaly'] == -1).sum()
print(f"Detected {anomaly_count} anomalies out of {len(df)} records ({anomaly_count/len(df):.2%}).")

## 5. Model Evaluation & Visualization
Analyzing the distribution of anomaly scores helps confirm if the model is effectively separating outliers (left tail) from normal traffic (right).

In [None]:
# Histogram of the decision_function scores stored in df['anomaly_score'].
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['anomaly_score'], bins=50, kde=True, color='purple', ax=ax)

# decision_function() already subtracts model.offset_ from the raw scores, so
# the boundary used by predict() sits at 0 on this axis, not at model.offset_.
decision_threshold = 0.0
ax.axvline(x=decision_threshold, color='red', linestyle='--',
           label=f'Threshold ({decision_threshold:.3f})')

ax.set_title('Distribution of Anomaly Scores (Lower = More Anomalous)')
ax.set_xlabel('Anomaly Score')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

## 6. Export Artifacts
Save the trained model and preprocessing objects for the API. In a local environment, these are saved to `ml/models/`, falling back to the current working directory if that folder does not exist.

In [None]:
# Bundle the fitted model together with the preprocessing objects and the
# feature column order, so they are saved and loaded as a single artifact.
model_artifacts = {
    'model': model,
    'scaler': scaler,
    'le_protocol': le_protocol,
    'features': features
}

output_dir = '../../ml/models/'
if not os.path.exists(output_dir) and not IN_COLAB:
    # Fallback if directory structure is different: save to the current directory
    output_dir = ''

artifact_fname = 'anomaly_model.joblib'
artifact_path = os.path.join(output_dir, artifact_fname)

# Create directory if needed (skipped when saving to the current directory,
# because os.makedirs('') raises)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

joblib.dump(model_artifacts, artifact_path)
print(f"Model saved to {artifact_path}")

# Download back to local machine if in Colab
if IN_COLAB:
    files.download(artifact_path)