# Cybersecurity Intrusion Detection - EDA & Modeling

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Location of the cleaned intrusion-detection CSV.
file_path = "cybersecurity_intrusion_data_cleaned.csv"  # Adjust path if needed

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Column/dtype summary first, then a preview of the leading rows.
df.info()
df.head(n=5)


## Exploratory Data Analysis

In [None]:

# Target distribution: share of each class (in percent) plus a count plot
attack_counts = df['attack_detected'].value_counts(normalize=True) * 100

plt.figure(figsize=(5,4))
# seaborn >= 0.13 deprecates `palette` without `hue`, so the x column is also
# mapped to hue. dodge=False keeps full-width bars on older seaborn releases,
# which would otherwise offset the bars per hue level.
ax = sns.countplot(x='attack_detected', hue='attack_detected', data=df,
                   palette='Set2', dodge=False)
# Older seaborn adds a legend that merely repeats the x tick labels; drop it.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Attack Detection Distribution')
plt.xlabel('Attack Detected (0=No, 1=Yes)')
plt.ylabel('Count')
plt.show()

# Last expression of the cell: shows the percentage table in a notebook
attack_counts


In [None]:

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
corr = df.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,       # write each coefficient into its cell
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Heatmap - Numeric Features')
plt.show()


In [None]:

# Boxplots for numeric vs target: one figure per feature, split by class
for col in ['network_packet_size','session_duration','ip_reputation_score','failed_logins','login_attempts']:
    plt.figure(figsize=(6,3))
    # seaborn >= 0.13 deprecates `palette` without `hue`, so the x column is
    # also mapped to hue. dodge=False keeps each box centred on its category
    # on older seaborn releases.
    ax = sns.boxplot(x='attack_detected', y=col, hue='attack_detected', data=df,
                     palette='Set3', dodge=False)
    # Older seaborn adds a legend that merely repeats the x tick labels; drop it.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    plt.title(f'{col} vs Attack Detected')
    plt.show()


In [None]:

# Categorical distributions and vs target
for col in ['protocol_type','encryption_used','browser_type']:
    # Levels sorted by frequency (most common first); used for bar and colour order
    level_order = df[col].value_counts().index

    plt.figure(figsize=(7,3))
    # seaborn >= 0.13 deprecates `palette` without `hue`, so the plotted column
    # is also mapped to hue. hue_order mirrors `order` so colours still follow
    # the frequency ordering; dodge=False keeps full-width bars on older seaborn.
    ax = sns.countplot(y=col, hue=col, data=df, order=level_order,
                       hue_order=level_order, palette='Set1', dodge=False)
    # Older seaborn adds a legend that merely repeats the y tick labels; drop it.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    plt.title(f'{col} Distribution')
    plt.show()

    # Same feature, now split by the target class (hue is meaningful here)
    plt.figure(figsize=(7,3))
    sns.countplot(x=col, hue='attack_detected', data=df, palette='Set2')
    plt.title(f'{col} vs Attack Detected')
    plt.xticks(rotation=45)
    plt.show()


## Modeling (Baseline & Advanced Models)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import seaborn as sns

# Features and target ('session_id' is dropped along with the label)
X = df.drop(columns=['attack_detected','session_id'])
y = df['attack_detected']

# Train-test split: 80/20, stratified on the target so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing
# Pick numeric (and boolean) columns by dtype family and treat everything else
# as categorical. Testing `dtype == 'object'` alone misses text columns stored
# with pandas' string dtype or as `category`; those would be routed to the
# numeric branch, where median imputation and scaling fail on non-numeric data.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric branch: fill gaps with the median, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' avoids errors on levels unseen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [None]:

# Baseline model: class-weighted logistic regression behind the shared preprocessor
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
])
pipe_lr.fit(X_train, y_train)

# Hard labels feed the threshold metrics; column 1 of predict_proba feeds ROC AUC
y_pred = pipe_lr.predict(X_test)
y_proba = pipe_lr.predict_proba(X_test)[:, 1]

print("Logistic Regression Results")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}:", metric_fn(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# Confusion matrix with integer cell annotations
cm_lr = confusion_matrix(y_test, y_pred)
sns.heatmap(cm_lr, annot=True, fmt='d')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()


In [None]:

# Random forest: 200 class-weighted trees behind the shared preprocessor
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
)
pipe_rf = Pipeline(steps=[('preprocessor', preprocessor), ('clf', rf_classifier)])
pipe_rf.fit(X_train, y_train)

# Hard labels feed the threshold metrics; column 1 of predict_proba feeds ROC AUC
y_pred_rf = pipe_rf.predict(X_test)
y_proba_rf = pipe_rf.predict_proba(X_test)[:, 1]

print("Random Forest Results")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}:", metric_fn(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print(classification_report(y_test, y_pred_rf))

# Confusion matrix with integer cell annotations
cm_rf = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm_rf, annot=True, fmt='d')
plt.title('Confusion Matrix - Random Forest')
plt.show()


In [None]:

# XGBoost model (requires the xgboost package; it is imported unconditionally)
# The other two models use class_weight='balanced'; XGBoost's counterpart is
# scale_pos_weight = (#negative / #positive) on the training labels.
# Assumes the target is coded 0 = no attack, 1 = attack, as the EDA axis label states.
neg_count = int((y_train == 0).sum())
pos_count = int((y_train == 1).sum())
xgb_pos_weight = neg_count / pos_count if pos_count else 1.0

# `use_label_encoder` is intentionally not passed: it is deprecated, and current
# XGBoost releases only report it as an unused parameter.
pipe_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clf', XGBClassifier(eval_metric='logloss', random_state=42,
                                                 scale_pos_weight=xgb_pos_weight))])
pipe_xgb.fit(X_train, y_train)

# Hard labels feed the threshold metrics; column 1 of predict_proba feeds ROC AUC
y_pred_xgb = pipe_xgb.predict(X_test)
y_proba_xgb = pipe_xgb.predict_proba(X_test)[:,1]

print("XGBoost Results")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1:", f1_score(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, y_proba_xgb))
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix with integer cell annotations
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt='d')
plt.title('Confusion Matrix - XGBoost')
plt.show()
