# **BINARY CLASSIFICATION**

**TRAINING**

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

In [None]:
# Mount Google Drive so the datasets and the results directory are reachable.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Global path variables.
# NOTE(review): the original assignments had no right-hand side, which is a
# SyntaxError. The real locations are not recoverable from this notebook, so
# each one is read from an environment variable of the same name. To hard-code
# a path instead, put it in the fallback string (second argument).
BALANCED_TRAIN_DATASET_PATH = os.environ.get('BALANCED_TRAIN_DATASET_PATH', '')  # CSV read as the training set
TEST_DATASET_PATH = os.environ.get('TEST_DATASET_PATH', '')  # CSV read as the test set
BINARY_RESULTS_SAVE_PATH = os.environ.get('BINARY_RESULTS_SAVE_PATH', '')  # directory receiving heatmaps and the report

# Fail fast with an actionable message instead of a confusing error from
# pd.read_csv / os.makedirs several cells further down.
_unset_paths = [
    name
    for name, value in (
        ('BALANCED_TRAIN_DATASET_PATH', BALANCED_TRAIN_DATASET_PATH),
        ('TEST_DATASET_PATH', TEST_DATASET_PATH),
        ('BINARY_RESULTS_SAVE_PATH', BINARY_RESULTS_SAVE_PATH),
    )
    if not value
]
if _unset_paths:
    raise ValueError(
        "Set the following path variables before running the notebook: "
        + ", ".join(_unset_paths)
    )

# Text report written by the evaluation cell; it lives inside the results directory.
BINARY_RESULTS_FILE_PATH = os.path.join(BINARY_RESULTS_SAVE_PATH, "binary_model_evaluation_results.txt")

Mounted at /content/drive


In [None]:
# Read the balanced training set from the configured CSV path.
train_df = pd.read_csv(BALANCED_TRAIN_DATASET_PATH)

# Count the rows per label up front; the counts are printed after the shape.
train_class_distribution = train_df.loc[:, 'category'].value_counts()

# Report the frame dimensions, then the per-class row counts.
print(f"train_df shape: {train_df.shape}")
print("Train class distribution:\n", train_class_distribution)

train_df shape: (401860, 110)
Train class distribution:
 category
0    200000
1    180000
2     13860
3      5000
4      3000
Name: count, dtype: int64


In [None]:
# Convert category column to binary classes: label 0 stays 0 (normal) and every
# other label becomes 1 (attack).
# A vectorised comparison replaces the per-row Python lambda; the result is the
# same int64 column, computed without one interpreter call per row.
train_df['category'] = (train_df['category'] != 0).astype('int64')

# Verify the new class distribution
binary_class_distribution = train_df['category'].value_counts()
print("Binary Class Distribution:\n", binary_class_distribution)

Binary Class Distribution:
 category
1    201860
0    200000
Name: count, dtype: int64


In [None]:
# Pull the label vector out first, then build the feature matrix from every
# remaining column. Both are plain arrays, so column order is fixed from here on.
y = train_df.loc[:, 'category'].values  # Labels
X = train_df.drop('category', axis=1).values  # Features

# Sanity-check the array shapes and the label balance.
print(f"Train Data Shape: X = {X.shape}, y = {y.shape}")
print("Class Distribution:\n", train_df.loc[:, 'category'].value_counts())

Train Data Shape: X = (401860, 109), y = (401860,)
Class Distribution:
 category
1    201860
0    200000
Name: count, dtype: int64


In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Registry of the classifiers to train, keyed by a snake_case model name.
# Keyword order is insertion order, so models are trained and evaluated in the
# sequence listed here.
optimized_models = dict(
    random_forest=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42,
    ),
    knn=KNeighborsClassifier(
        n_neighbors=3,
        weights='uniform',
        metric='euclidean',
    ),
    logistic_regression=LogisticRegression(
        C=1,
        solver='liblinear',
        max_iter=3000,
        random_state=42,
    ),
    # XGBoost takes the slot previously held by an SVM.
    xgboost=XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
    ),
    naive_bayes=GaussianNB(),
)

In [None]:
# Fitted estimators, keyed by the same names used in `optimized_models`.
trained_models = {}

# Fit every configured estimator on the full training arrays (no hold-out split).
for model_name, model in optimized_models.items():
    print(f"\nTraining {model_name.replace('_', ' ').capitalize()}...")
    # fit() returns the estimator itself, so the fitted object is stored directly.
    trained_models[model_name] = model.fit(X, y)

print("All models trained and mapped successfully.")


Training Random forest...

Training Knn...

Training Logistic regression...

Training Xgboost...

Training Naive bayes...
All models trained and mapped successfully.


**TESTING**

In [None]:
# Load test dataset
test_df = pd.read_csv(TEST_DATASET_PATH)

# Check for the label column BEFORE it is first used. The original code read
# test_df['category'] ahead of this check, so the "not found" branch could
# never run and a missing column surfaced as a bare KeyError.
if 'category' not in test_df.columns:
    raise KeyError("'category' column not found in test_df!")

# Verify the class distribution in the test data
test_class_distribution = test_df['category'].value_counts()
print("Test class distribution:\n", test_class_distribution)

# View the shape of the testing dataset
print(f"Test DataFrame shape: {test_df.shape}")

# Verify unique values in the 'category' column
print("Unique values in 'category' column:", test_df['category'].unique())

Test class distribution:
 category
0    47913
1    23568
3     3058
2     2682
4       70
Name: count, dtype: int64
Test DataFrame shape: (77291, 103)
Unique values in 'category' column: [0 2 1 3 4]


In [None]:
# Convert category column to binary classes: label 0 stays 0 (normal) and every
# other label becomes 1 (attack).
# A vectorised comparison replaces the per-row Python lambda; the result is the
# same int64 column, computed without one interpreter call per row.
test_df['category'] = (test_df['category'] != 0).astype('int64')

# Verify the new class distribution
# (this reuses the name already assigned for the training labels, so from here
# on `binary_class_distribution` refers to the test set).
binary_class_distribution = test_df['category'].value_counts()
print("Binary Class Distribution:\n", binary_class_distribution)

Binary Class Distribution:
 category
0    47913
1    29378
Name: count, dtype: int64


In [None]:
# Work out which columns are exclusive to each frame.
train_columns = set(train_df.columns)
test_columns = set(test_df.columns)
only_in_train = train_columns.difference(test_columns)  # present in train, absent from test
only_in_test = test_columns.difference(train_columns)   # present in test, absent from train

# Report both directions of the mismatch.
print(
    f"Columns in TRAIN but NOT in TEST: {only_in_train}"
    if only_in_train
    else "No extra columns found in TRAIN."
)
print(
    f"Columns in TEST but NOT in TRAIN: {only_in_test}"
    if only_in_test
    else "No extra columns found in TEST."
)

# Shapes before the test frame is aligned.
print("\nBefore reindexing:")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Give the test frame exactly the training columns, in training order:
# columns the test set lacks are added filled with 0, and test-only columns
# are dropped. This matters because the models were fitted on positional arrays.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# Shapes after alignment; the column counts should now match.
print("\nAfter reindexing:")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Columns in TRAIN but NOT in TEST: {'srv_serror_rate', 'service_http_8001', 'serror_rate', 'service_red_i', 'service_urh_i', 'service_aol', 'service_http_2784', 'service_harvest'}
Columns in TEST but NOT in TRAIN: {'service_icmp'}

Before reindexing:
Train shape: (401860, 110)
Test shape: (77291, 103)

After reindexing:
Train shape: (401860, 110)
Test shape: (77291, 110)


In [None]:
# Guard clause: without the label column there is nothing to evaluate against.
if 'category' not in test_df.columns:
    raise ValueError("'category' column not found in test dataset!")

# Labels are the 'category' column; features are every other column.
y_test_binary = test_df['category'].values
X_test_binary = test_df.drop(columns=['category']).values

print("Test dataset split successfully.")
print(f"X_test shape: {X_test_binary.shape}")
print(f"y_test shape: {y_test_binary.shape}")

Test dataset split successfully.
X_test shape: (77291, 109)
y_test shape: (77291,)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Create the results directory (and any missing parents) up front so that the
# later heatmap and report writes cannot fail on a missing folder.
# exist_ok=True makes this safe to re-run.
os.makedirs(BINARY_RESULTS_SAVE_PATH, exist_ok=True)

print("Setup completed.")

 Setup completed.


In [None]:
# Totals per class in the binary test labels; these are the denominators used
# when the evaluation cell reports false negatives and false positives.
total_attacks = np.sum(y_test_binary == 1)   # Total attack samples
total_normals = np.sum(y_test_binary == 0)   # Total normal samples

# File to save evaluation results.
# The original joined onto `results_dir`, which is never defined in this
# notebook (NameError on a fresh kernel). The configured results directory is
# used instead.
# NOTE(review): the evaluation cell writes to BINARY_RESULTS_FILE_PATH, not to
# this name; it is kept only in case other cells still reference it.
results_file = os.path.join(BINARY_RESULTS_SAVE_PATH, "evaluation_results_binary.txt")

In [None]:
# Global variable for binary thesis evaluation results file
BINARY_RESULTS_FILE_PATH = os.path.join(BINARY_RESULTS_SAVE_PATH, "binary_model_evaluation_results.txt")

# Fixed label order shared by every metric call below. Passing it explicitly
# keeps the confusion matrix 2x2 and the report keyed by '0' and '1' even when
# a class is absent from both the labels and a model's predictions; otherwise
# the heatmap tick labels and the report lookups would break.
binary_labels = [0, 1]                     # 0 (Normal), 1 (Attack)
binary_label_names = ["Normal", "Attack"]  # same order as binary_labels

# Evaluate every trained model on the binary test set. For each model this
# writes a text section to the results file and saves a row-normalised
# confusion-matrix heatmap as a PNG in the results directory.
with open(BINARY_RESULTS_FILE_PATH, "w") as f:
    for model_name, model in trained_models.items():
        display_name = model_name.replace('_', ' ').capitalize()
        f.write(f"\nEvaluating {display_name} on the Binary Test Set:\n{'-' * 40}\n")

        # Predict on the test set
        y_pred_binary = model.predict(X_test_binary)

        # Calculate standard metrics
        accuracy = accuracy_score(y_test_binary, y_pred_binary)
        report = classification_report(
            y_test_binary, y_pred_binary,
            labels=binary_labels, output_dict=True, zero_division=0,
        )
        confusion = confusion_matrix(y_test_binary, y_pred_binary, labels=binary_labels)

        # Row-normalise so each cell is the share of that actual class.
        # A class with no test samples has a zero row total; its row is left at
        # 0.0 instead of dividing by zero and producing NaN cells.
        row_totals = confusion.sum(axis=1, keepdims=True)
        normalized_matrix = np.divide(
            confusion, row_totals,
            out=np.zeros(confusion.shape, dtype=float),
            where=row_totals != 0,
        )

        # Save confusion matrix heatmap. An explicit figure/axes pair is used so
        # that exactly this figure is saved and closed on every iteration.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(normalized_matrix, annot=True, fmt=".2f", cmap="Blues",
                    xticklabels=binary_label_names,
                    yticklabels=binary_label_names,
                    ax=ax)
        ax.set_title(f"Confusion Matrix Heatmap: {display_name}")
        ax.set_xlabel("Predicted Class")
        ax.set_ylabel("Actual Class")
        heatmap_path = os.path.join(BINARY_RESULTS_SAVE_PATH, f"{model_name.replace('_', '-')}_Binary_Heatmap.png")
        fig.savefig(heatmap_path)
        plt.close(fig)
        f.write(f"Saved heatmap to {heatmap_path}\n")

        # Extract precision, recall, and F1 scores per class
        class_scores = {
            class_label: {
                "Precision": report[str(class_label)]['precision'],
                "Recall": report[str(class_label)]['recall'],
                "F1 Score": report[str(class_label)]['f1-score']
            }
            for class_label in binary_labels
        }

        # Overall metrics are the unweighted mean over the two classes.
        overall_precision = np.mean([class_scores[label]["Precision"] for label in class_scores])
        overall_recall = np.mean([class_scores[label]["Recall"] for label in class_scores])
        overall_f1_score = np.mean([class_scores[label]["F1 Score"] for label in class_scores])

        # Calculate false negatives (attacks predicted as normal) and
        # false positives (normal traffic predicted as attack).
        misclassified_as_normal = np.sum((y_pred_binary == 0) & (y_test_binary == 1))
        false_alarms = np.sum((y_pred_binary == 1) & (y_test_binary == 0))

        # Save overall metrics to file
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Overall Precision: {overall_precision:.4f}\n")
        f.write(f"Overall Recall: {overall_recall:.4f}\n")
        f.write(f"Overall F1 Score: {overall_f1_score:.4f}\n")
        f.write(f"Misclassified as Normal (False Negatives): {misclassified_as_normal}/{total_attacks}\n")
        f.write(f"False Alarms (False Positives): {false_alarms}/{total_normals}\n")

        # Save class-wise metrics
        f.write("\nClass-Wise Metrics:\n")
        for class_label in binary_labels:
            f.write(f"Class {class_label} ({binary_label_names[class_label]}):\n")
            f.write(f"  Precision: {class_scores[class_label]['Precision']:.4f}\n")
            f.write(f"  Recall:    {class_scores[class_label]['Recall']:.4f}\n")
            f.write(f"  F1 Score:  {class_scores[class_label]['F1 Score']:.4f}\n")

        # Save the raw (un-normalised) confusion matrix in the file
        f.write("\nConfusion Matrix:\n")
        f.write(np.array2string(confusion) + "\n")

print(f"All evaluation results saved to {BINARY_RESULTS_FILE_PATH}")


All evaluation results saved to /content/drive/MyDrive/MachineLearning/new_imported_dataset/binary_thesis/Confusion_matrices/evaluation_results_binary.txt
