## Intrusion Detection System

- Name: Kaushik Srivasan
- Branch: 4th year, School of Computing & Data Science, Sai University

For Applied Data Analytics Course, CMI

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import OneClassSVM

### Loading the Data and Preprocessing

In [4]:
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", 
    "num_compromised", "root_shell", "su_attempted", "num_root", 
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", 
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", 
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", 
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

train = pd.read_csv('all_data\Intrusion-detection_Train.csv', header=None, names=columns)
test = pd.read_csv('all_data\Intrusion-detection_Test.csv', header=None, names=columns)

  train = pd.read_csv('all_data\Intrusion-detection_Train.csv', header=None, names=columns)
  test = pd.read_csv('all_data\Intrusion-detection_Test.csv', header=None, names=columns)


In [5]:
print("Train Data Head:\n", train.head())


Train Data Head:
    duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10          

In [6]:
print("Null values in train:\n", train.isnull().sum())

Null values in train:
 duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate       

In [7]:
categorical_cols = ["protocol_type", "service", "flag"]
encoder = LabelEncoder()
for col in categorical_cols:
    train[col] = encoder.fit_transform(train[col])
    test[col] = encoder.transform(test[col])

In [8]:
train['label'] = train['label'].apply(lambda x: 0 if x == 'normal' else 1)
test['label'] = test['label'].apply(lambda x: 0 if x == 'normal' else 1)

## EDA

In [9]:
def perform_eda(data):
    # Basic information about the dataset
    print("Dataset Information:")
    print(data.info())
    
    # Check for missing values
    print("\nMissing Values:")
    print(data.isnull().sum())
    
    # Distribution of labels
    print("\nLabel Distribution:")
    print(data['label'].value_counts(normalize=True))
    
    # Visualize label distribution
    plt.figure(figsize=(10, 6))
    data['label'].value_counts().plot(kind='bar')
    plt.title('Distribution of Labels')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('label_distribution.png')
    plt.close()
    
    # Correlation heatmap for numerical features
    numeric_features = data.select_dtypes(include=[np.number]).columns
    plt.figure(figsize=(20, 16))
    sns.heatmap(data[numeric_features].corr(), annot=False, cmap='coolwarm')
    plt.title('Correlation Heatmap of Numerical Features')
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png')
    plt.close()


In [10]:
perform_eda(train)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  int32  
 2   service                      125973 non-null  int32  
 3   flag                         125973 non-null  int32  
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 n

In [11]:
def label_distribution_analysis(data):
    plt.figure(figsize=(12, 6))
    label_counts = data['label'].value_counts()
    
    # Pie Chart
    plt.subplot(121)
    label_counts.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Label Distribution (Pie Chart)')
    
    # Bar Chart
    plt.subplot(122)
    label_counts.plot(kind='bar')
    plt.title('Label Distribution (Bar Chart)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('label_distribution_analysis.png')
    plt.close()
    
    print("\nLabel Distribution:")
    print(label_counts)
    print("\nLabel Percentages:")
    print(label_counts / len(data) * 100)
label_distribution_analysis(train)


Label Distribution:
label
0    67343
1    58630
Name: count, dtype: int64

Label Percentages:
label
0    53.458281
1    46.541719
Name: count, dtype: float64


In [None]:
def numerical_features_analysis(data):
       
        numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
        numeric_data = data[numeric_columns]
        
        # Descriptive Statistics
        print("\nDescriptive Statistics:")
        print(numeric_data.describe())
        
        plt.figure(figsize=(20, 10))
        numeric_data.boxplot()
        plt.title('Boxplot of Numerical Features')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.savefig('numerical_features_boxplot.png')
        plt.close()
numerical_features_analysis(train)


Descriptive Statistics:
           duration  protocol_type        service           flag  \
count  125973.00000  125973.000000  125973.000000  125973.000000   
mean      287.14465       1.053202      31.226469       6.979996   
std      2604.51531       0.426620      16.346470       2.689365   
min         0.00000       0.000000       0.000000       0.000000   
25%         0.00000       1.000000      20.000000       5.000000   
50%         0.00000       1.000000      24.000000       9.000000   
75%         0.00000       1.000000      49.000000       9.000000   
max     42908.00000       2.000000      69.000000      10.000000   

          src_bytes     dst_bytes           land  wrong_fragment  \
count  1.259730e+05  1.259730e+05  125973.000000   125973.000000   
mean   4.556674e+04  1.977911e+04       0.000198        0.022687   
std    5.870331e+06  4.021269e+06       0.014086        0.253530   
min    0.000000e+00  0.000000e+00       0.000000        0.000000   
25%    0.000000e+00  0

In [None]:
def categorical_features_analysis(data):
        categorical_columns = ['protocol_type', 'service', 'flag']
        
        for col in categorical_columns:
            print(f"\n{col.capitalize()} Distribution:")
            cat_counts = data[col].value_counts()
            print(cat_counts)
            
            # Stacked Bar Chart of Categorical Features vs Label
            plt.figure(figsize=(12, 6))
            pd.crosstab(data[col], data['label'], normalize='index').plot(kind='bar', stacked=True)
            plt.title(f'{col.capitalize()} vs Label Distribution')
            plt.xlabel(col.capitalize())
            plt.ylabel('Proportion')
            plt.legend(title='Label', bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.close()
categorical_features_analysis(train)


Protocol_type Distribution:
protocol_type
1    102689
2     14993
0      8291
Name: count, dtype: int64

Service Distribution:
service
24    40338
49    21853
12     9043
54     7313
20     6860
      ...  
61        3
27        2
3         2
22        2
25        1
Name: count, Length: 70, dtype: int64

Flag Distribution:
flag
9     74945
5     34851
1     11233
4      2421
2      1562
6       365
10      271
7       127
3       103
8        49
0        46
Name: count, dtype: int64


<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
def correlation_analysis(data):
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    numeric_data = data[numeric_columns]
    
    # Correlation Matrix
    correlation_matrix = numeric_data.corr()
    
    plt.figure(figsize=(20, 16))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f", square=True)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.close()
    
    # High Correlation Features
    high_corr_features = np.where(np.abs(correlation_matrix) > 0.8)
    high_corr_features = [(correlation_matrix.columns[x], correlation_matrix.columns[y]) 
                            for x, y in zip(*high_corr_features) if x != y and x < y]
    
    print("\nHighly Correlated Features:")
    for feat1, feat2 in high_corr_features:
        print(f"{feat1} and {feat2}")

correlation_analysis(train)


Highly Correlated Features:
hot and is_guest_login
num_compromised and num_root
serror_rate and srv_serror_rate
serror_rate and dst_host_serror_rate
serror_rate and dst_host_srv_serror_rate
srv_serror_rate and dst_host_serror_rate
srv_serror_rate and dst_host_srv_serror_rate
rerror_rate and srv_rerror_rate
rerror_rate and dst_host_rerror_rate
rerror_rate and dst_host_srv_rerror_rate
srv_rerror_rate and dst_host_rerror_rate
srv_rerror_rate and dst_host_srv_rerror_rate
dst_host_srv_count and dst_host_same_srv_rate
dst_host_serror_rate and dst_host_srv_serror_rate
dst_host_rerror_rate and dst_host_srv_rerror_rate


In [15]:
def statistical_significance_tests(data):
    # ANOVA to check if numerical features differ significantly across labels
    from scipy import stats
    
    numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    numerical_columns = [col for col in numerical_columns if col != 'label']
    
    print("\nStatistical Significance of Features:")
    for col in numerical_columns:
        # Group data by label
        groups = [group[col].values for name, group in data.groupby('label')]
        
        # Perform one-way ANOVA
        f_value, p_value = stats.f_oneway(*groups)
        print(f"{col}: F-statistic = {f_value:.4f}, p-value = {p_value:.4f}")
statistical_significance_tests(train)


Statistical Significance of Features:
duration: F-statistic = 300.5273, p-value = 0.0000
protocol_type: F-statistic = 10829.1723, p-value = 0.0000
service: F-statistic = 10431.9632, p-value = 0.0000
flag: F-statistic = 90735.9557, p-value = 0.0000
src_bytes: F-statistic = 4.4170, p-value = 0.0356
dst_bytes: F-statistic = 2.1358, p-value = 0.1439
land: F-statistic = 6.5139, p-value = 0.0107
wrong_fragment: F-statistic = 1169.4199, p-value = 0.0000
urgent: F-statistic = 0.9785, p-value = 0.3226
hot: F-statistic = 21.5669, p-value = 0.0000
num_failed_logins: F-statistic = 1.7763, p-value = 0.1826
logged_in: F-statistic = 114585.7611, p-value = 0.0000
num_compromised: F-statistic = 13.1031, p-value = 0.0003
root_shell: F-statistic = 51.8581, p-value = 0.0000
su_attempted: F-statistic = 63.5132, p-value = 0.0000
num_root: F-statistic = 16.5245, p-value = 0.0000
num_file_creations: F-statistic = 57.0206, p-value = 0.0000
num_shells: F-statistic = 11.3030, p-value = 0.0008
num_access_files: 

  res = hypotest_fun_out(*samples, **kwds)


is_guest_login: F-statistic = 194.6537, p-value = 0.0000
count: F-statistic = 62689.6775, p-value = 0.0000
srv_count: F-statistic = 0.0748, p-value = 0.7844
serror_rate: F-statistic = 92481.5048, p-value = 0.0000
srv_serror_rate: F-statistic = 91324.6847, p-value = 0.0000
rerror_rate: F-statistic = 8643.6175, p-value = 0.0000
srv_rerror_rate: F-statistic = 8651.4094, p-value = 0.0000
same_srv_rate: F-statistic = 163866.5772, p-value = 0.0000
diff_srv_rate: F-statistic = 5451.0421, p-value = 0.0000
srv_diff_host_rate: F-statistic = 1821.1547, p-value = 0.0000
dst_host_count: F-statistic = 20620.1012, p-value = 0.0000
dst_host_srv_count: F-statistic = 137598.2861, p-value = 0.0000
dst_host_same_srv_rate: F-statistic = 116917.2767, p-value = 0.0000
dst_host_diff_srv_rate: F-statistic = 7898.2052, p-value = 0.0000
dst_host_same_src_port_rate: F-statistic = 1085.8242, p-value = 0.0000
dst_host_srv_diff_host_rate: F-statistic = 491.3467, p-value = 0.0000
dst_host_serror_rate: F-statistic = 9

## Data Preprocessing

In [None]:
def create_interaction_features(df):
        # Byte-related interactions
        df['total_bytes'] = df['src_bytes'] + df['dst_bytes']
        df['byte_ratio'] = df['src_bytes'] / (df['dst_bytes'] + 1)  # Add 1 to avoid division by zero
        
        # Network-related interactions
        df['connection_complexity'] = df['count'] * df['srv_count']
        df['error_interaction'] = df['serror_rate'] * df['srv_serror_rate']
        
        return df
    
train= create_interaction_features(train)
test = create_interaction_features(test)

In [None]:


# Categorical Feature Encoding
def categorical_encoding(df):
    # One-Hot Encoding for low-cardinality categorical variables
    categorical_columns = ['protocol_type', 'service', 'flag']
    df_encoded = pd.get_dummies(df, columns=categorical_columns)
    
    return df_encoded
train = categorical_encoding(train)
test = categorical_encoding(test)

In [None]:
  # Feature Selection using Random Forest Feature Importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def feature_selection(df, label_column='label', n_features=15):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder
    
    # Separate features and target
    X = df.drop(label_column, axis=1)
    y = df[label_column]
    
    # Encode target variable
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    
    # Train Random Forest for feature importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y_encoded)
    
    # Get feature importances
    importances = rf.feature_importances_
    feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)
    
    # Plot feature importances
    plt.figure(figsize=(10, 6))
    feature_importances[:n_features].plot(kind='bar')
    plt.title(f'Top {n_features} Feature Importances')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.close()
    
    # Select top features
    selected_features = feature_importances[:n_features].index.tolist()
    selected_features.append('label')
    
    return df[selected_features]

In [22]:
train_features = feature_selection(train)
train_features.head()

Unnamed: 0,total_bytes,src_bytes,same_srv_rate,byte_ratio,diff_srv_rate,dst_bytes,logged_in,dst_host_diff_srv_rate,dst_host_same_srv_rate,flag_9,dst_host_srv_count,count,dst_host_same_src_port_rate,connection_complexity,dst_host_serror_rate,label
0,491,491,1.0,491.0,0.0,0,0,0.03,0.17,True,25,2,0.17,4,0.0,0
1,146,146,0.08,146.0,0.15,0,0,0.6,0.0,True,1,13,0.88,13,0.0,0
2,0,0,0.05,0.0,0.07,0,0,0.05,0.1,False,26,123,0.0,738,1.0,1
3,8385,232,1.0,0.028452,0.0,8153,1,0.0,1.0,True,255,5,0.03,25,0.03,0
4,619,199,1.0,0.472684,0.0,420,1,0.0,1.0,True,255,30,0.0,960,0.0,0


In [23]:
test_features = feature_selection(test)
test_features.head()

Unnamed: 0,dst_bytes,src_bytes,service_24,byte_ratio,total_bytes,dst_host_rerror_rate,dst_host_diff_srv_rate,flag_9,dst_host_srv_count,dst_host_same_srv_rate,dst_host_srv_rerror_rate,duration,logged_in,connection_complexity,srv_count,label
0,0,0,False,0.0,0,1.0,0.06,False,10,0.04,1.0,0,0,2290,10,1
1,0,0,False,0.0,0,1.0,0.06,False,1,0.0,1.0,0,0,136,1,1
2,0,12983,False,12983.0,12983,0.0,0.04,True,86,0.61,0.0,2,0,1,1,0
3,0,20,False,20.0,20,0.0,0.0,True,57,1.0,0.0,0,0,65,65,1
4,15,0,False,0.0,15,0.83,0.17,False,86,0.31,0.71,1,0,8,8,1


In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

def preprocess_data(train_features, test_features, target_col, n_features=10):


    # Separate target from features
    y_train = train_features[target_col]
    X_train = train_features.drop(columns=[target_col])
    
    y_test = test_features[target_col]
    X_test = test_features.drop(columns=[target_col])

    # Feature Selection using SelectKBest
    selector = SelectKBest(score_func=f_classif, k=n_features)
    selector.fit(X_train, y_train)
    selected_features = X_train.columns[selector.get_support()].tolist()

    # Select the same features in test set
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Initialize and fit the scaler on training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    return X_train_scaled, X_test_scaled, y_train, y_test, selected_features



In [33]:
X_train_scaled, X_test_scaled, y_train, y_test, features = preprocess_data(
    train, test, target_col='label', n_features=10
)

  f = msb / msw


## Model Building

In [None]:

def isolation_forest_anomaly_detection(X_train, X_test, y_test):
    # Train Isolation Forest
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    iso_forest.fit(X_train)
    
    # Predict anomalies
    y_pred_forest = iso_forest.predict(X_test)
    
    # Convert predictions to binary (0 for inliers, 1 for outliers)
    y_pred_forest_binary = np.where(y_pred_forest == 1, 0, 1)
    

    y_test_binary = y_test
    
    # Evaluation
    print("\nIsolation Forest Anomaly Detection Results:")
    print(classification_report(y_test_binary, y_pred_forest_binary))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test_binary, y_pred_forest_binary)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Isolation Forest')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.savefig('isolation_forest_confusion_matrix.png')
    plt.close()
    


In [None]:
def one_class_svm_anomaly_detection(X_train, X_test, y_test):
    # Train One-Class SVM
    svm = OneClassSVM(kernel='rbf', nu=0.1, gamma='scale')
    svm.fit(X_train)
    
    # Predict anomalies
    y_pred_svm = svm.predict(X_test)
    
    # Convert predictions to binary (0 for inliers, 1 for outliers)
    y_pred_svm_binary = np.where(y_pred_svm == 1, 0, 1)
    
    y_test_binary = y_test
    
    # Evaluation
    print("\nOne-Class SVM Anomaly Detection Results:")
    print(classification_report(y_test_binary, y_pred_svm_binary))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test_binary, y_pred_svm_binary)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - One-Class SVM')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.savefig('one_class_svm_confusion_matrix.png')
    plt.close()
 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

def random_forest_classification(X_train, X_test, y_train, y_test):
   

    # Optimized hyperparameter tuning using a reduced grid
    param_grid = {
        'n_estimators': [100, 200],          # Reduced options
        'max_depth': [None, 10, 20],         # Fewer depth options
        'min_samples_split': [2, 5],         # Reduced options
        'min_samples_leaf': [1, 2]           # Reduced options
    }

    # Initialize Random Forest Classifier
    rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Perform Grid Search with fewer cross-validation folds
    grid_search = GridSearchCV(
        estimator=rf_classifier, 
        param_grid=param_grid, 
        cv=3,                             # Reduced CV folds
        scoring='f1_weighted', 
        n_jobs=-1,
        verbose=1                        # To monitor progress
    )
    grid_search.fit(X_train, y_train)

    # Best model
    best_rf = grid_search.best_estimator_

    # Print best parameters
    print("\nBest Hyperparameters:")
    print(grid_search.best_params_)

    # Predictions
    y_pred = best_rf.predict(X_test)

    # Detailed Evaluation
    print("\nRandom Forest Classification Report:")
    print(classification_report(
        y_test, 
        y_pred, 
        target_names=['Normal', 'Attack']
    ))

    # Confusion Matrix Visualization
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Normal', 'Attack'], 
                yticklabels=['Normal', 'Attack'])
    plt.title('Confusion Matrix - Random Forest')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.savefig('random_forest_confusion_matrix.png')
    plt.close()

    # ROC Curve for Binary Classification
    plt.figure(figsize=(10, 8))
    y_pred_proba = best_rf.predict_proba(X_test)[:, 1]  # Probability for the positive class

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve - Random Forest')
    plt.legend(loc="lower right")
    plt.savefig('random_forest_roc_curve.png')
    plt.close()





## Evaluation and Results

In [40]:

isolation_forest_anomaly_detection(X_train_scaled, X_test_scaled, y_test)
    




Isolation Forest Anomaly Detection Results:
              precision    recall  f1-score   support

           0       0.49      0.98      0.66      9711
           1       0.94      0.24      0.38     12833

    accuracy                           0.56     22544
   macro avg       0.72      0.61      0.52     22544
weighted avg       0.75      0.56      0.50     22544



In [41]:
one_class_svm_anomaly_detection(X_train_scaled, X_test_scaled, y_test)


One-Class SVM Anomaly Detection Results:
              precision    recall  f1-score   support

           0       0.49      0.98      0.65      9711
           1       0.93      0.23      0.37     12833

    accuracy                           0.55     22544
   macro avg       0.71      0.60      0.51     22544
weighted avg       0.74      0.55      0.49     22544



In [48]:
label_encoder = LabelEncoder()
random_forest_model = random_forest_classification(
    X_train_scaled, 
    X_test_scaled, 
    y_train, 
    y_test
)

Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best Hyperparameters:
{'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

Random Forest Classification Report:
              precision    recall  f1-score   support

      Normal       0.62      0.96      0.75      9711
      Attack       0.95      0.55      0.70     12833

    accuracy                           0.73     22544
   macro avg       0.78      0.75      0.72     22544
weighted avg       0.81      0.73      0.72     22544

