In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Flow-level columns fed to both models. The order matters: it fixes the
# column order of every feature matrix built from this list.
selected_features = [
    # Overall flow volume and rate
    'Flow Duration',
    'Flow Bytes/s',
    'Flow Packets/s',
    # Packet counts and sizes
    'Total Fwd Packets',
    'Total Backward Packets',
    'Average Packet Size',
    'Packet Length Std',
    # Inter-arrival time (IAT) statistics
    'Flow IAT Mean',
    'Flow IAT Std',
    'Fwd IAT Mean',
    'Bwd IAT Mean',
    # TCP flag counters
    'SYN Flag Count',
    'ACK Flag Count',
    'RST Flag Count',
]

# CSVs that form the One-Class SVM baseline.
# NOTE(review): the doubled slash in the second path is kept exactly as
# written; confirm it is intentional.
oneclass_svm_train_files = [
    'Dataset/normal_traffic_labeled.csv',
    'Dataset//home_traffic_labeled.csv',
]

# CSVs used to train the Gradient Boosting classifier. They are
# concatenated in the order listed here.
gradient_boosting_train_files = [
    'Dataset/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    'Dataset/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    'Dataset/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Dataset/Monday-WorkingHours.pcap_ISCX.csv',
    'Dataset/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Dataset/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Dataset/Tuesday-WorkingHours.pcap_ISCX.csv',
    'Dataset/Wednesday-workingHours.pcap_ISCX.csv',
]

# -----------------------------
# Shared loading / cleaning routine for both training datasets
# -----------------------------
def _load_flow_dataset(csv_paths, feature_columns, label_column='Label'):
    """Read, merge, and clean labeled flow CSVs for model training.

    Args:
        csv_paths: Paths of the CSV files to read and concatenate, in order.
        feature_columns: Names of the feature columns that must be present;
            their NaN/inf values are imputed with the column median.
        label_column: Name of the column holding the traffic label.

    Returns:
        A DataFrame with whitespace-stripped column names, a unique row
        index, no duplicate rows, no rows without a label, no NaN/inf in
        ``feature_columns``, and ``label_column`` encoded as
        0 (exactly ``'BENIGN'``) or 1 (anything else).

    Raises:
        KeyError: If a feature column or the label column is missing.
    """
    frames = []
    for path in csv_paths:
        frame = pd.read_csv(path)
        # Strip header whitespace per file, before concatenating, so that
        # e.g. ' Label' and 'Label' from different files share one column
        # instead of producing two half-empty ones.
        frame.columns = frame.columns.str.strip()
        frames.append(frame)

    # ignore_index gives every row a unique label; without it each file
    # contributes its own 0..n range and row labels repeat.
    data = pd.concat(frames, ignore_index=True)

    # Fail early and clearly rather than silently skipping imputation for a
    # column that is absent and crashing later on feature selection.
    required = [*feature_columns, label_column]
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in input data: {missing}")

    data = data.drop_duplicates()
    data = data.replace([np.inf, -np.inf], np.nan)

    # A row without a label carries no ground truth; keeping it would encode
    # it as MALICIOUS below purely because NaN != 'BENIGN'.
    data = data.dropna(subset=[label_column])

    # Impute remaining gaps in the training features with per-column medians.
    feature_list = list(feature_columns)
    medians = data[feature_list].median()
    data[feature_list] = data[feature_list].fillna(medians)

    # Encode labels (0 = BENIGN, 1 = MALICIOUS) with a vectorised comparison.
    data[label_column] = (data[label_column] != 'BENIGN').astype('int64')
    return data


# -----------------------------
# Load and Preprocess Data for One-Class SVM
# -----------------------------
print("Loading and concatenating One-Class SVM baseline datasets...")
svm_data = _load_flow_dataset(oneclass_svm_train_files, selected_features)

# Prepare features for One-Class SVM, using only BENIGN samples
X_svm_train = svm_data.loc[svm_data['Label'] == 0, selected_features].copy()

# -----------------------------
# Load and Preprocess Data for Gradient Boosting
# -----------------------------
print("Loading and concatenating Gradient Boosting training datasets...")
gb_data = _load_flow_dataset(gradient_boosting_train_files, selected_features)

# Prepare features and labels for Gradient Boosting
X_gb_train = gb_data[selected_features]
y_gb_train = gb_data['Label']

# -----------------------------
# Shared Feature Scaling
# -----------------------------
# A single StandardScaler serves both models. It is fitted on BENIGN rows
# only: the One-Class SVM training rows plus the label-0 rows of the
# Gradient Boosting data.
print("Fitting scaler and scaling data...")
X_combined_benign = pd.concat([X_svm_train, X_gb_train.loc[y_gb_train == 0]])
scaler = StandardScaler().fit(X_combined_benign)

# Apply the fitted scaler to each model's full training matrix.
X_gb_train_scaled = scaler.transform(X_gb_train)
X_svm_train_scaled = scaler.transform(X_svm_train)

# Persist the scaler together with the list of feature names.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(selected_features, 'selected_features.joblib')
print("Scaler and selected features have been saved.")

# -----------------------------
# One-Class SVM: fit, persist, evaluate
# -----------------------------
print("Training One-Class SVM on BENIGN samples from One-Class SVM training data...")
svm_model = OneClassSVM(kernel="rbf", gamma=0.08, nu=0.05).fit(X_svm_train_scaled)

# Persist the fitted model.
joblib.dump(svm_model, 'oneclass_svm_model.joblib')
print("One-Class SVM model has been saved.")

# Score every row of the baseline dataset. This includes the BENIGN rows the
# model was fitted on, so the figures below are an in-sample check.
y_svm_test = svm_data['Label']
X_svm_test_scaled = scaler.transform(svm_data[selected_features])

# Map the model's -1 predictions to 1 (MALICIOUS) and everything else to 0.
svm_raw_pred = svm_model.predict(X_svm_test_scaled)
y_svm_pred = np.where(svm_raw_pred == -1, 1, 0)

svm_report = classification_report(y_svm_test, y_svm_pred)
svm_confusion = confusion_matrix(y_svm_test, y_svm_pred)
print("Classification Report (One-Class SVM):\n", svm_report)
print("Confusion Matrix (One-Class SVM):\n", svm_confusion)

# -----------------------------
# Gradient Boosting: split, fit, persist, evaluate
# -----------------------------
print("Splitting data into training and testing sets for Gradient Boosting...")
# 80/20 split, stratified on the label, with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_gb_train}
X_train, X_test, y_train, y_test = train_test_split(
    X_gb_train_scaled, y_gb_train, **split_options
)

print("\nTraining Gradient Boosting...")
gb = GradientBoostingClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Persist the fitted classifier.
joblib.dump(gb, 'gradient_boosting_model.joblib')
print("Gradient Boosting model has been saved.")

# Report performance on the 20% split that was not used for fitting.
y_pred_gb = gb.predict(X_test)
gb_report = classification_report(y_test, y_pred_gb)
gb_confusion = confusion_matrix(y_test, y_pred_gb)
print("\nGradient Boosting Classification Report:\n", gb_report)
print("Gradient Boosting Confusion Matrix:\n", gb_confusion)
