<a href="https://colab.research.google.com/github/DiptangshuPattadar/ML-project/blob/main/ANOMALY_DETECTION_IN_NETWORK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

# Network Anomaly Detection System
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, roc_curve, auc)
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Set modern visualization style
sns.set_theme(style="whitegrid", palette="husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

def plot_class_distribution(y, title, filename):
    plt.figure()
    sns.countplot(x=y)
    plt.title(f'Class Distribution: {title}')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

def plot_clusters(X, clusters, filename):
    plt.figure()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=clusters, palette="viridis")
    plt.title('Cluster Visualization (PCA)')
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

def plot_anomaly_scores(scores, y_true, filename):
    plt.figure()
    sns.kdeplot(scores[y_true==0], label='Normal', fill=True)
    sns.kdeplot(scores[y_true==1], label='Anomaly', fill=True)
    plt.title('Anomaly Score Distribution')
    plt.xlabel('Anomaly Score')
    plt.legend()
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

def plot_feature_importance(model, feature_names, top_n=20, filename='feature_importance.png'):
    plt.figure()
    importances = model.feature_importances_
    indices = np.argsort(importances)[-top_n:]
    plt.title(f'Top {top_n} Feature Importance')
    plt.barh(range(top_n), importances[indices], align='center')
    plt.yticks(range(top_n), [feature_names[i] for i in indices])
    plt.tight_layout()
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

def plot_confusion_matrix(y_true, y_pred, filename):
    plt.figure()
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

def plot_roc_curve(y_true, y_proba, filename):
    plt.figure()
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

# Load and preprocess data
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")

# Clean and prepare data
train_df = train_df.dropna(subset=['label'])
test_df = test_df.dropna(subset=['label'])

# Label Encoding with alignment
label_encoders = {}
for col in train_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    label_encoders[col] = le

# Feature engineering with safe operations
def create_features(df):
    eps = 1e-9
    df = df.copy()
    return df.assign(
        sbytes_dbytes_ratio = np.divide(df['sbytes'], df['dbytes'] + eps),
        sttl_dttl_ratio = np.divide(df['sttl'], df['dttl'] + eps),
        sload_dload_ratio = np.divide(df['sload'], df['dload'] + eps),
        spkts_dpkts_ratio = np.divide(df['spkts'], df['dpkts'] + eps)
    ).fillna(0).replace([np.inf, -np.inf], 0)

# Create and align features
X_train = create_features(train_df.drop(columns=['label']))
X_test = create_features(test_df.drop(columns=['label']))
y_train = train_df['label'].astype(int)
y_test = test_df['label'].astype(int)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_scaled, y_train)
plot_class_distribution(y_balanced, 'Balanced Training Data', 'balanced_dist.png')

# Cluster modeling
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_balanced)
plot_clusters(X_balanced, clusters, 'clusters.png')

# Train cluster models
cluster_models = []
train_scores = np.zeros(X_balanced.shape[0])

for cluster in range(5):
    cluster_data = X_balanced[clusters == cluster]
    model = IsolationForest(
        n_estimators=300,
        max_samples='auto',
        contamination=0.15,
        random_state=42,
        n_jobs=-1
    )
    model.fit(cluster_data)
    cluster_models.append(model)
    train_scores[clusters == cluster] = -model.decision_function(cluster_data)

# Generate test scores
test_scores = np.zeros(X_test_scaled.shape[0])
for model in cluster_models:
    scores = -model.decision_function(X_test_scaled)
    test_scores += scores
test_scores /= len(cluster_models)

# Visualization
plot_anomaly_scores(test_scores, y_test, 'anomaly_scores.png')

# Create enhanced features
X_train_enhanced = np.hstack([X_balanced, train_scores.reshape(-1, 1)])
X_test_enhanced = np.hstack([X_test_scaled, test_scores.reshape(-1, 1)])

# Final model training
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train_enhanced, y_balanced)

# Evaluation
final_pred = rf_model.predict(X_test_enhanced)
y_proba = rf_model.predict_proba(X_test_enhanced)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, final_pred))
print(f"Accuracy: {accuracy_score(y_test, final_pred):.2f}")

# Visual evaluation
plot_feature_importance(rf_model, X_train.columns.tolist() + ['anomaly_score'])
plot_confusion_matrix(y_test, final_pred, 'confusion_matrix.png')
plot_roc_curve(y_test, y_proba, 'roc_curve.png')

# Precision-Recall Curve
plt.figure()
PrecisionRecallDisplay.from_predictions(y_test, y_proba)
plt.title('Precision-Recall Curve')
plt.savefig('precision_recall.png', bbox_inches='tight')
plt.close()



Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     56000
           1       1.00      0.92      0.96    119341

    accuracy                           0.95    175341
   macro avg       0.93      0.96      0.94    175341
weighted avg       0.96      0.95      0.95    175341

Accuracy: 0.95


<Figure size 1200x800 with 0 Axes>