# 🧪 Model Prototyping with Processed Threat Datasets

This notebook demonstrates how to train basic ML models on a small sample from the cleaned datasets.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Read the cleaned NSL-KDD training split into a DataFrame.
# Point the path at a CIC-IDS-2017 file instead to prototype on that dataset.
df = pd.read_csv(
    '../data/processed/NSL-KDD99/KDDTrain+_cleaned.csv'
)
# Last expression of the cell: the notebook displays (n_rows, n_columns).
df.shape


In [None]:
# Fail fast when the target column is absent: a silent `y = None` would only
# surface later as an obscure error inside train_test_split / fit.
if 'label' not in df.columns:
    raise KeyError("Expected a 'label' column in the loaded dataset")

# Separate the target from the features. 'difficulty' is removed from the
# feature matrix too when present (errors='ignore' tolerates its absence).
y = df['label']
X = df.drop(columns=['label', 'difficulty'], errors='ignore')

# One-hot encode any remaining categorical columns; numeric columns pass
# through unchanged.
X = pd.get_dummies(X)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) on the training
# split; fit() returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics plus accuracy.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')


In [None]:
# Pin the class order explicitly (sorted union of true and predicted labels)
# so the matrix rows/columns and the tick labels are guaranteed to line up.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Label both axes with the class names instead of bare 0..k-1 indices, so the
# plot can be read without cross-referencing the label ordering.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Imported in-cell so this cell runs on its own without touching the import cell.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before the linear model: unscaled inputs slow or
# prevent lbfgs convergence and make the L2 penalty weigh features unevenly.
# The scaler is fit on the training split only (inside the pipeline), so no
# test-set statistics leak into training.
# NOTE: `lr` is now a Pipeline; the fitted LogisticRegression is `lr[-1]`.
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Report the same metrics as the Random Forest cell for a like-for-like comparison.
print(classification_report(y_test, y_pred_lr))
print(f'Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}')
