In [None]:
import pandas as pd
import numpy as np
import time
import joblib
import os
import gc
import logging
from typing import List, Dict, Tuple, Optional, Any
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, accuracy_score,
                             precision_score, recall_score, f1_score)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Configure logging: records are written to a log file and to a stream handler.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [
    logging.FileHandler("intrusion_detection.log"),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)

class NetworkIntrusionDetector:
    """A professional network intrusion detection system using machine learning"""

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the intrusion detection system

        Args:
            config: Configuration dictionary with paths and parameters
        """
        self.config = config
        self.models = {}
        self.results = {}
        self.scaler = None

        # Create directories for models and plots
        os.makedirs(self.config['model_dir'], exist_ok=True)
        os.makedirs(self.config['plot_dir'], exist_ok=True)

        logger.info("Network Intrusion Detector initialized")

    def load_and_preprocess_data(self, file_paths: List[str], dataset_type: str,
                                sample_frac: float = 1.0) -> pd.DataFrame:
        """
        Load and preprocess data based on dataset type

        Args:
            file_paths: List of file paths to load
            dataset_type: Type of dataset ('CICIDS' or 'UNSW')
            sample_frac: Fraction of data to sample

        Returns:
            Preprocessed DataFrame
        """
        dfs = []

        for file_path in file_paths:
            try:
                logger.info(f"Loading data from {file_path}...")

                # Load the data with error handling for different encodings
                try:
                    df = pd.read_csv(file_path, low_memory=False)
                except UnicodeDecodeError:
                    df = pd.read_csv(file_path, low_memory=False, encoding='latin-1')

                # Clean column names
                df.columns = df.columns.str.strip().str.lower()

                # Sample data if needed
                if sample_frac < 1.0:
                    df = df.sample(frac=sample_frac, random_state=self.config['random_state'])
                    logger.info(f"Using {sample_frac*100:.0f}% random sample of data")

                dfs.append(df)
                logger.info(f"Data loaded successfully. Shape: {df.shape}")

            except FileNotFoundError:
                logger.error(f"File not found at {file_path}")
                continue
            except Exception as e:
                logger.error(f"Error loading {file_path}: {e}")
                continue

        if not dfs:
            raise ValueError("No data files were successfully loaded")

        # Combine all dataframes
        combined_df = pd.concat(dfs, ignore_index=True, sort=False)
        logger.info(f"Combined dataset shape: {combined_df.shape}")

        # Preprocess based on dataset type
        if dataset_type == "CICIDS":
            return self._preprocess_cicids_data(combined_df)
        elif dataset_type == "UNSW":
            return self._preprocess_unsw_data(combined_df)
        else:
            raise ValueError(f"Unknown dataset type: {dataset_type}")

    def _preprocess_cicids_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocessing of CICIDS2017 dataset"""
        logger.info("Preprocessing CICIDS2017 data...")

        # Handle missing values and infinities
        initial_count = len(df)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Remove columns with too many missing values
        missing_threshold = self.config.get('missing_threshold', 0.5)
        missing_ratios = df.isnull().sum() / len(df)
        cols_to_drop = missing_ratios[missing_ratios > missing_threshold].index

        if len(cols_to_drop) > 0:
            logger.info(f"Dropping {len(cols_to_drop)} columns with >{missing_threshold*100}% missing values")
            df.drop(columns=cols_to_drop, inplace=True)

        # Remove rows with any remaining missing values
        df.dropna(inplace=True)
        logger.info(f"Removed {initial_count - len(df)} rows with issues")

        # Convert labels to binary classification
        logger.info("Processing labels...")

        # Handle different label formats in CICIDS dataset
        if 'label' in df.columns:
            df['label'] = df['label'].apply(
                lambda x: 0 if 'benign' in str(x).lower() else 1
            )
        else:
            # Try to find label column with different names
            label_candidates = ['label', 'class', 'attack', 'result']
            for candidate in label_candidates:
                if candidate in df.columns:
                    df['label'] = df[candidate].apply(
                        lambda x: 0 if 'benign' in str(x).lower() or x == 0 else 1
                    )
                    break
            else:
                raise ValueError("Could not find label column in CICIDS data")

        # Remove any non-binary labels
        df = df[df['label'].isin([0, 1])]

        # Convert non-numeric columns
        logger.info("Converting non-numeric features...")
        object_cols = df.select_dtypes(include=['object', 'category']).columns
        object_cols = [col for col in object_cols if col != 'label']

        for col in object_cols:
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except Exception as e:
                logger.warning(f"Could not convert column {col} to numeric: {e}")
                # If conversion fails, use label encoding
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))

        # Final cleanup
        df.dropna(inplace=True)

        # Show class distribution
        self._log_class_distribution(df)

        return df

    def _preprocess_unsw_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocessing of UNSW-NB15 dataset"""
        logger.info("Preprocessing UNSW-NB15 data...")

        # Handle missing values and infinities
        initial_count = len(df)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Remove columns with too many missing values
        missing_threshold = self.config.get('missing_threshold', 0.5)
        missing_ratios = df.isnull().sum() / len(df)
        cols_to_drop = missing_ratios[missing_ratios > missing_threshold].index

        if len(cols_to_drop) > 0:
            logger.info(f"Dropping {len(cols_to_drop)} columns with >{missing_threshold*100}% missing values")
            df.drop(columns=cols_to_drop, inplace=True)

        # Remove rows with any remaining missing values
        df.dropna(inplace=True)
        logger.info(f"Removed {initial_count - len(df)} rows with issues")

        # Convert labels to binary classification
        logger.info("Processing labels...")

        # UNSW-NB15 uses different label column names
        label_col = None
        for possible_label in ['label', 'attack_cat', 'classification']:
            if possible_label in df.columns:
                label_col = possible_label
                break

        if label_col is None:
            raise ValueError("Could not find label column in UNSW-NB15 data")

        if label_col == 'label':
            df['label'] = df['label'].apply(lambda x: 0 if x == 0 else 1)
        else:
            # For attack_cat or other categorical labels
            df['label'] = df[label_col].apply(
                lambda x: 0 if str(x).lower() in ['normal', 'benign', '0'] else 1
            )

        # Remove any non-binary labels
        df = df[df['label'].isin([0, 1])]

        # Convert non-numeric columns
        logger.info("Converting non-numeric features...")
        object_cols = df.select_dtypes(include=['object', 'category']).columns
        object_cols = [col for col in object_cols if col != 'label']

        for col in object_cols:
            try:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
            except Exception as e:
                logger.warning(f"Could not convert column {col} using label encoding: {e}")
                # If conversion fails, drop the column
                df.drop(columns=[col], inplace=True)

        # Final cleanup
        df.dropna(inplace=True)

        # Show class distribution
        self._log_class_distribution(df)

        return df

    def _log_class_distribution(self, df: pd.DataFrame):
        """Log class distribution information"""
        class_counts = df['label'].value_counts()
        benign_count = class_counts.get(0, 0)
        attack_count = class_counts.get(1, 0)

        logger.info("\nClass Distribution:")
        logger.info(f"Benign (0): {benign_count} samples")
        logger.info(f"Attack (1): {attack_count} samples")

        if attack_count > 0:
            imbalance_ratio = benign_count / attack_count
            logger.info(f"Imbalance Ratio: {imbalance_ratio:.2f}:1")
        else:
            logger.warning("Attack class count is zero, cannot compute imbalance ratio.")

    def _align_features(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Align features between train and test datasets"""
        # Get common columns
        common_cols = list(set(train_df.columns) & set(test_df.columns))

        # Ensure label is included
        if 'label' not in common_cols:
            common_cols.append('label')

        # Filter both datasets to only include common columns
        train_df = train_df[common_cols]
        test_df = test_df[common_cols]

        logger.info(f"Aligned datasets to {len(common_cols)} common features")

        return train_df, test_df

    def _remove_low_variance_features(self, df: pd.DataFrame, threshold: float = 0.01) -> pd.DataFrame:
        """
        Drop the feature columns rejected by a ``VarianceThreshold`` selector.

        Args:
            df: Data whose non-'label' columns are treated as features.
            threshold: Value passed to ``VarianceThreshold``.

        Returns:
            ``df`` itself when nothing is removed (or there are no features),
            otherwise a new frame with the kept features followed by 'label'.
        """
        variance_filter = VarianceThreshold(threshold=threshold)
        feature_frame = df.drop('label', axis=1, errors='ignore')

        # Nothing to select from
        if feature_frame.empty:
            return df

        variance_filter.fit(feature_frame)

        # Boolean support mask -> names of the surviving columns
        kept_columns = feature_frame.columns[variance_filter.get_support()]
        n_removed = len(feature_frame.columns) - len(kept_columns)

        if not n_removed > 0:
            return df

        logger.info(f"Removing {n_removed} low-variance features")
        return pd.concat([df[kept_columns], df['label']], axis=1)

    def _remove_highly_correlated_features(self, df: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
        """Remove highly correlated features"""
        features = df.drop('label', axis=1, errors='ignore')

        # Handle case where there are no features left
        if features.empty or len(features.columns) < 2:
            return df

        # Calculate correlation matrix
        try:
            corr_matrix = features.corr().abs()
        except:
            # Fallback to simple correlation calculation
            corr_matrix = features.astype(float).corr().abs()

        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Find features with correlation greater than threshold
        to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

        if len(to_drop) > 0:
            logger.info(f"Removing {len(to_drop)} highly correlated features")
            return df.drop(columns=to_drop)

        return df

    def _select_numeric_features(self, df: pd.DataFrame, max_features: int = 30) -> List[str]:
        """Select only numeric features and limit to top features"""
        # Select only numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        # Remove label from features if it's there
        if 'label' in numeric_cols:
            numeric_cols.remove('label')

        logger.info(f"Found {len(numeric_cols)} numeric features")

        # If too many features, select the most important ones (based on variance)
        if len(numeric_cols) > max_features:
            logger.info(f"Selecting top {max_features} features by variance...")
            try:
                variances = df[numeric_cols].var().sort_values(ascending=False)
                selected_cols = variances.head(max_features).index.tolist()
                return selected_cols
            except:
                # Fallback: just take the first max_features columns
                return numeric_cols[:max_features]

        return numeric_cols

    def _evaluate_model(self, model, X_test: np.ndarray, y_test: np.ndarray,
                       model_name: str) -> Dict[str, Any]:
        """Evaluate model performance"""
        try:
            # Generate predictions
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            # Calculate AUC if model supports probability predictions
            roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else 0.5

            # Confusion matrix
            cm = confusion_matrix(y_test, y_pred)

            return {
                'model_name': model_name,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': roc_auc,
                'confusion_matrix': cm,
                'y_pred': y_pred,
                'y_proba': y_proba
            }
        except Exception as e:
            logger.error(f"Error evaluating model {model_name}: {e}")
            # Return default values in case of error
            return {
                'model_name': model_name,
                'accuracy': 0,
                'precision': 0,
                'recall': 0,
                'f1_score': 0,
                'roc_auc': 0.5,
                'confusion_matrix': np.zeros((2, 2)),
                'y_pred': np.zeros_like(y_test),
                'y_proba': None
            }

    def _plot_comprehensive_comparison(self, results: List[Dict[str, Any]],
                                     y_test: np.ndarray, dataset_name: str):
        """Create comprehensive comparison plots for all models"""
        try:
            # 1. Model performance comparison bar chart
            metrics_df = pd.DataFrame(results)
            metrics_df = metrics_df[['model_name', 'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']]
            metrics_df.set_index('model_name', inplace=True)

            plt.figure(figsize=(12, 8))
            metrics_df.plot(kind='bar', figsize=(12, 8))
            plt.title(f'Model Performance Comparison on {dataset_name}')
            plt.ylabel('Score')
            plt.xticks(rotation=45)
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.savefig(f"{self.config['plot_dir']}/model_comparison_{dataset_name}.png",
                       dpi=300, bbox_inches='tight')
            plt.close()

            # 2. Confusion matrices for all models
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            axes = axes.ravel()

            for i, result in enumerate(results):
                cm = result['confusion_matrix']
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
                           xticklabels=['Pred Benign', 'Pred Attack'],
                           yticklabels=['True Benign', 'True Attack'])
                axes[i].set_title(f'{result["model_name"]}\nAccuracy: {result["accuracy"]:.3f}, F1: {result["f1_score"]:.3f}')

            plt.tight_layout()
            plt.savefig(f"{self.config['plot_dir']}/confusion_matrices_{dataset_name}.png",
                       dpi=300, bbox_inches='tight')
            plt.close()

            # 3. F1-Score comparison
            plt.figure(figsize=(10, 6))
            f1_scores = [result['f1_score'] for result in results]
            model_names = [result['model_name'] for result in results]
            colors = ['red', 'blue', 'green', 'orange', 'purple']
            plt.bar(model_names, f1_scores, color=colors[:len(model_names)])
            plt.title(f'F1-Score Comparison Across Models on {dataset_name}')
            plt.ylabel('F1-Score')
            plt.xticks(rotation=45)
            plt.ylim(0, 1)
            for i, v in enumerate(f1_scores):
                plt.text(i, v + 0.01, f'{v:.3f}', ha='center')
            plt.tight_layout()
            plt.savefig(f"{self.config['plot_dir']}/f1_score_comparison_{dataset_name}.png",
                       dpi=300, bbox_inches='tight')
            plt.close()

            return metrics_df
        except Exception as e:
            logger.error(f"Error creating plots: {e}")
            return pd.DataFrame()

    def train_and_evaluate_models(self, X_train: np.ndarray, y_train: np.ndarray,
                                 X_test: np.ndarray, y_test: np.ndarray,
                                 dataset_name: str) -> List[Dict[str, Any]]:
        """
        Scale the data, rebalance the training set and fit/evaluate every model.

        The scaler is fitted on the training features only and stored in
        ``self.scaler``. The scaler and each fitted model are saved with joblib
        under ``config['model_dir']``. The test data is used for evaluation
        only; it is never passed to a model's ``fit``.

        Args:
            X_train: Training features.
            y_train: Training labels (0/1).
            X_test: Test features.
            y_test: Test labels (0/1).
            dataset_name: Used in saved file names.

        Returns:
            One result dict per model that trained successfully (see
            ``_evaluate_model``), each extended with 'training_time' in
            seconds. Models that fail are logged and skipped.
        """
        # Feature scaling
        logger.info("Scaling features...")
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Save the scaler
        scaler_path = f"{self.config['model_dir']}/scaler_{dataset_name}.pkl"
        joblib.dump(self.scaler, scaler_path)
        logger.info(f"Scaler saved as {scaler_path}")

        # Handle class imbalance with SMOTE
        logger.info("Handling class imbalance with SMOTE...")
        # Vectorized counts: the builtin sum() would iterate element by element
        logger.info(f"Before SMOTE: Class 0: {int((y_train == 0).sum())}, "
                    f"Class 1: {int((y_train == 1).sum())}")

        try:
            smote = SMOTE(random_state=self.config['random_state'])
            X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
            logger.info(f"After SMOTE: Class 0: {int((y_train_res == 0).sum())}, "
                        f"Class 1: {int((y_train_res == 1).sum())}")
        except Exception as e:
            # Best effort: fall back to the unbalanced data
            logger.error(f"SMOTE failed: {e}. Using original data.")
            X_train_res, y_train_res = X_train_scaled, y_train

        # Define models
        models = {
            "Logistic Regression": LogisticRegression(
                class_weight='balanced',
                random_state=self.config['random_state'],
                max_iter=1000,
                n_jobs=-1,
                solver='saga'
            ),
            "Random Forest": RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                class_weight='balanced',
                random_state=self.config['random_state'],
                n_jobs=-1
            ),
            # use_label_encoder is deprecated in XGBoost and is not passed
            "XGBoost": XGBClassifier(
                n_estimators=200,
                learning_rate=0.1,
                max_depth=6,
                min_child_weight=1,
                gamma=0,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0,
                reg_lambda=1,
                eval_metric='logloss',
                random_state=self.config['random_state'],
                n_jobs=-1
            ),
            "CatBoost": CatBoostClassifier(
                iterations=200,
                learning_rate=0.1,
                depth=6,
                l2_leaf_reg=3,
                random_state=self.config['random_state'],
                verbose=False,
                thread_count=-1
            )
        }

        # Train and evaluate each model
        results = []
        for name, model in models.items():
            logger.info(f"\nTraining {name}...")
            model_start = time.time()

            # No eval_set is passed: supplying the test data to fit() would let
            # the boosting libraries use it for iteration selection, which
            # leaks test information into the model.
            try:
                model.fit(X_train_res, y_train_res)
            except Exception as e:
                logger.error(f"Error training {name}: {e}")
                continue

            # Measured right after fit so saving and evaluation are not counted
            training_time = time.time() - model_start

            try:
                # Save the model
                model_path = f"{self.config['model_dir']}/{name.lower().replace(' ', '_')}_{dataset_name}_model.pkl"
                joblib.dump(model, model_path)
                logger.info(f"{name} model saved as {model_path}")

                # Evaluate the model
                model_results = self._evaluate_model(model, X_test_scaled, y_test, name)
                model_results['training_time'] = training_time
                results.append(model_results)

                logger.info(f"{name} training completed in {model_results['training_time']:.2f} seconds")
                logger.info(f"{name} Accuracy: {model_results['accuracy']:.4f}")
                logger.info(f"{name} F1-Score: {model_results['f1_score']:.4f}")

            except Exception as e:
                logger.error(f"Unexpected error with {name}: {e}")
                continue

        return results

    def process_dataset(self, train_paths: List[str], test_paths: List[str],
                       dataset_name: str) -> List[Dict[str, Any]]:
        """Process a complete dataset"""
        logger.info(f"\n{'='*60}")
        logger.info(f"PROCESSING {dataset_name} DATASET")
        logger.info(f"{'='*60}")

        start_time = time.time()

        try:
            # Load and preprocess training data
            train_df = self.load_and_preprocess_data(train_paths, dataset_name,
                                                   self.config['sample_frac'])

            # Load and preprocess testing data
            test_df = self.load_and_preprocess_data(test_paths, dataset_name,
                                                  self.config['sample_frac'])

            # Align features between train and test
            train_df, test_df = self._align_features(train_df, test_df)

            # Enhanced feature selection
            logger.info("Performing feature selection...")
            train_df = self._remove_low_variance_features(train_df, threshold=0.01)
            train_df = self._remove_highly_correlated_features(train_df, threshold=0.95)

            # Apply the same feature selection to test data
            selected_features = list(train_df.columns)
            test_df = test_df[selected_features]

            # Select only numeric features
            feature_cols = self._select_numeric_features(train_df,
                                                       max_features=self.config.get('max_features', 30))

            # Prepare features and labels
            X_train = train_df[feature_cols]
            y_train = train_df['label']

            X_test = test_df[feature_cols]
            y_test = test_df['label']

            logger.info(f"Selected {len(feature_cols)} features for modeling")
            logger.info(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
            logger.info(f"Testing set: {X_test.shape[0]} samples, {X_test.shape[1]} features")

            # Train and evaluate models
            results = self.train_and_evaluate_models(X_train, y_train, X_test, y_test, dataset_name)

            # Create comprehensive comparison plots
            metrics_df = self._plot_comprehensive_comparison(results, y_test, dataset_name)

            if not metrics_df.empty:
                # Find best model based on F1-score
                best_model_idx = metrics_df['f1_score'].idxmax()
                best_model_name = metrics_df.index[best_model_idx]
                best_model_score = metrics_df.loc[best_model_name, 'f1_score']

                logger.info(f"\nBest model for {dataset_name}: {best_model_name} (F1-score: {best_model_score:.4f})")

            logger.info(f"{dataset_name} processing completed in {time.time() - start_time:.2f} seconds")

            return results

        except Exception as e:
            logger.error(f"Error processing {dataset_name}: {e}")
            return []

    def run(self):
        """Run the complete intrusion detection pipeline"""
        start_time = time.time()

        logger.info("Starting Network Intrusion Detection Pipeline")

        # Process CICIDS2017 dataset
        cicids_results = self.process_dataset(
            self.config['cicids_training_paths'],
            self.config['cicids_testing_paths'],
            "CICIDS2017"
        )

        # Process UNSW-NB15 dataset
        unsw_results = self.process_dataset(
            self.config['unsw_paths'],
            self.config['unsw_paths'],  # Using same paths for train/test for UNSW
            "UNSW_NB15"
        )

        total_time = time.time() - start_time
        logger.info(f"\nTotal execution time: {total_time:.2f} seconds")

        # Final message
        logger.info("\n" + "="*60)
        logger.info("PROCESS COMPLETED!")
        logger.info("="*60)

        return {
            'cicids_results': cicids_results,
            'unsw_results': unsw_results
        }


# Configuration
# Input files, listed per dataset split.
CICIDS_TRAINING_PATHS = [
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv",
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv",
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv",
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infiltration.pcap_ISCX.csv",
]
CICIDS_TESTING_PATHS = [
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
]
UNSW_PATHS = [
    "/content/drive/MyDrive/Machine learning project/MachineLearningCVE/UNSW_NB15_testing-set.csv",
]

# Settings consumed by NetworkIntrusionDetector.
config = {
    'cicids_training_paths': CICIDS_TRAINING_PATHS,
    'cicids_testing_paths': CICIDS_TESTING_PATHS,
    'unsw_paths': UNSW_PATHS,
    'sample_frac': 0.2,              # fraction of rows sampled from each file
    'random_state': 42,
    'model_dir': "models",
    'plot_dir': "performance_plots",
    'max_features': 30,
}


if __name__ == "__main__":
    # Build the detector from the configuration above and run the pipeline
    detector = NetworkIntrusionDetector(config)
    results = detector.run()