# Phase 1: Network Traffic Anomaly Detection - Training
This notebook covers the data loading, cleaning, feature engineering, and training of an Isolation Forest model for AIOps-ready network telemetry analysis.

## 1. Environment Setup

In [None]:
import io
import os

import joblib
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler

## 2. Load Data
Upload the `network_traffic_data.csv` file from your local machine.

In [None]:
# Name of the telemetry CSV this notebook expects.
DATA_FILENAME = 'network_traffic_data.csv'

# files.upload() opens the Colab upload widget and returns a dict that maps
# each uploaded file's name to its raw bytes.
uploaded = files.upload()

# Parse the bytes from THIS upload instead of re-reading a fixed path from disk:
# when a file with the same name already exists, Colab stores the new upload
# under a different name (e.g. "name (1).csv"), so reading the fixed path would
# silently load the stale copy from an earlier run.
if DATA_FILENAME in uploaded:
    df = pd.read_csv(io.BytesIO(uploaded[DATA_FILENAME]))
elif len(uploaded) == 1:
    # Exactly one file under a different name: assume it is the dataset.
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    print(f"Expected '{DATA_FILENAME}' but received '{uploaded_name}'; loading it instead.")
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing usable was uploaded (e.g. the dialog was cancelled); fall back to
    # a copy already present in the working directory, as the cell did before.
    df = pd.read_csv(DATA_FILENAME)

df.head()

## 3. Data Cleaning & Feature Engineering
We will clean the data and transform categorical features into a format suitable for the Isolation Forest algorithm.

In [None]:
# Feature Selection Documentation:
# - dest_port: Identifies target services, useful for detecting port scanning or lateral movement.
# - bytes_sent/recv: Volume metrics are key to identifying data exfiltration (high sent) or command-and-control (low, periodic traffic).
# - protocol_enc: Baseline behaviors differ significantly between TCP, UDP, and ICMP.
numeric_features = ['dest_port', 'bytes_sent', 'bytes_recv']
features = numeric_features + ['protocol_enc']

# Handle missing values.
# Rows lacking an identifying field or any model input cannot be scored, so they
# are dropped; bytes_sent/bytes_recv are included so no NaN reaches the model.
# .copy() gives an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
required_columns = ['source_ip', 'dest_ip', 'protocol'] + numeric_features
df = df.dropna(subset=required_columns).copy()
df['dns_query'] = df['dns_query'].fillna('none')

# Encoding Categorical Features
le_protocol = LabelEncoder()
df['protocol_enc'] = le_protocol.fit_transform(df['protocol'])

# Scaling Numeric Features
# The scaled values go into the feature matrix X only; df keeps the raw values.
# Overwriting df in place would make this cell non-idempotent: a second run
# would refit the scaler on already-standardized data, and the scaler exported
# later would no longer match raw inputs.
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(df[numeric_features]),
    columns=numeric_features,
    index=df.index,  # keep row alignment with df for the per-row results added later
)
X['protocol_enc'] = df['protocol_enc']
X = X[features]  # enforce the exact column order recorded in `features`

print(f"Feature matrix shape: {X.shape}")

## 4. Train Model
We use **Isolation Forest**, an unsupervised learning algorithm that isolates each observation with random feature splits. Anomalies tend to be isolated in fewer splits than normal points, so they receive lower anomaly scores.

In [None]:
# Fit the unsupervised detector on the engineered feature matrix.
# random_state is fixed so repeated runs build the same forest.
print("Training Isolation Forest model...")
model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
).fit(X)  # fit() returns the fitted estimator itself

# Evaluate on the training data.
# predict() labels each row -1 (outlier) or 1 (inlier); decision_function()
# gives the continuous score, where lower values are more abnormal.
predictions = model.predict(X)
df['anomaly_score'] = model.decision_function(X)
df['is_anomaly'] = predictions

# Count the rows flagged as outliers.
is_outlier = predictions == -1
anomaly_count = is_outlier.sum()
print(f"Detected {anomaly_count} anomalies out of {len(df)} records.")

## 5. Export Artifacts
Save the model and preprocessing objects for use in the REST API (Phase 2).

In [None]:
# Output file for the serialized bundle.
artifact_path = 'anomaly_model.joblib'

# Bundle the fitted model with the preprocessing objects and the feature
# column order so they are saved and loaded together as one artifact.
model_artifacts = dict(
    model=model,
    scaler=scaler,
    le_protocol=le_protocol,
    features=features,
)

joblib.dump(model_artifacts, artifact_path)
print(f"Model saved to {artifact_path}")

# Download back to local machine
files.download(artifact_path)