In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('Train_data.csv')

# Display basic information
print(f"Dataset shape: {df.shape}")
print(f"Number of normal samples: {len(df[df['class'] == 'normal'])}")
print(f"Number of anomaly samples: {len(df[df['class'] == 'anomaly'])}")

# Handle categorical features
categorical_cols = ['protocol_type', 'service', 'flag', 'class']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split features and target
X = df.drop('class', axis=1)
y = df['class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Dataset shape: (25192, 42)
Number of normal samples: 13449
Number of anomaly samples: 11743
Training set shape: (20153, 41)
Testing set shape: (5039, 41)


In [20]:
pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
from sklearn.feature_selection import SelectKBest, f_classif

# Select top k features
k = 20  # Number of features to select
selector = SelectKBest(f_classif, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Get selected feature indices
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print(f"Selected features: {selected_features.tolist()}")

# Plot feature importance scores
plt.figure(figsize=(12, 8))
scores = selector.scores_
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': scores})
feature_scores = feature_scores.sort_values('Score', ascending=False)
sns.barplot(x='Score', y='Feature', data=feature_scores.head(20))
plt.title('Top 20 Features by F-score')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()


Selected features: ['protocol_type', 'service', 'flag', 'logged_in', 'count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Define hyperparameters for grid search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform grid search
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_
print(f"Best Random Forest parameters: {grid_search.best_params_}")

# Make predictions
y_pred_rf = best_rf_model.predict(X_test_selected)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1 Score: {f1_rf:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.tight_layout()
plt.savefig('rf_confusion_matrix.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(12, 8))
importances = best_rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
selected_feature_names = selected_features.tolist()
plt.bar(range(X_train_selected.shape[1]), importances[indices])
plt.xticks(range(X_train_selected.shape[1]), [selected_feature_names[i] for i in indices], rotation=90)
plt.title('Feature Importance - Random Forest')
plt.tight_layout()
plt.savefig('rf_feature_importance.png')
plt.close()


Best Random Forest parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Random Forest Results:
Accuracy: 0.9948
Precision: 0.9933
Recall: 0.9970
F1 Score: 0.9952

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2349
           1       0.99      1.00      1.00      2690

    accuracy                           0.99      5039
   macro avg       0.99      0.99      0.99      5039
weighted avg       0.99      0.99      0.99      5039



In [23]:
from sklearn.svm import SVC

# Define the model
svm_model = SVC(random_state=42)

# Define hyperparameters for grid search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Perform grid search
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Get the best model
best_svm_model = grid_search.best_estimator_
print(f"Best SVM parameters: {grid_search.best_params_}")

# Make predictions
y_pred_svm = best_svm_model.predict(X_test_selected)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print("\nSVM Results:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1 Score: {f1_svm:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred_svm)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - SVM')
plt.tight_layout()
plt.savefig('svm_confusion_matrix.png')
plt.close()


Best SVM parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

SVM Results:
Accuracy: 0.9815
Precision: 0.9790
Recall: 0.9866
F1 Score: 0.9828

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2349
           1       0.98      0.99      0.98      2690

    accuracy                           0.98      5039
   macro avg       0.98      0.98      0.98      5039
weighted avg       0.98      0.98      0.98      5039



In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the model
gb_model = GradientBoostingClassifier(random_state=42)

# Define hyperparameters for grid search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}

# Perform grid search
grid_search = GridSearchCV(gb_model, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Get the best model
best_gb_model = grid_search.best_estimator_
print(f"Best Gradient Boosting parameters: {grid_search.best_params_}")

# Make predictions
y_pred_gb = best_gb_model.predict(X_test_selected)

# Evaluate the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

print("\nGradient Boosting Results:")
print(f"Accuracy: {accuracy_gb:.4f}")
print(f"Precision: {precision_gb:.4f}")
print(f"Recall: {recall_gb:.4f}")
print(f"F1 Score: {f1_gb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred_gb)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Gradient Boosting')
plt.tight_layout()
plt.savefig('gb_confusion_matrix.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(12, 8))
importances = best_gb_model.feature_importances_
indices = np.argsort(importances)[::-1]
selected_feature_names = selected_features.tolist()
plt.bar(range(X_train_selected.shape[1]), importances[indices])
plt.xticks(range(X_train_selected.shape[1]), [selected_feature_names[i] for i in indices], rotation=90)
plt.title('Feature Importance - Gradient Boosting')
plt.tight_layout()
plt.savefig('gb_feature_importance.png')
plt.close()


Best Gradient Boosting parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}

Gradient Boosting Results:
Accuracy: 0.9952
Precision: 0.9952
Recall: 0.9959
F1 Score: 0.9955

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2349
           1       1.00      1.00      1.00      2690

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039



In [30]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=42)

# With this:
from lightgbm import LGBMClassifier
lgb_model = LGBMClassifier(random_state=42)

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <54A1AE05-1E14-3DA2-A8D0-062134694298> /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]
