In [2]:
import pandas as pd
from sklearn.metrics import classification_report

In [6]:
# Load the test dataset
# isolation forest
# test_df = pd.read_csv("isolation-forest/test_with_final_anomalies.csv")

# svm
test_df = pd.read_csv("svm/test_with_svm_anomalies.csv")

In [7]:
# Manually label my ground truth for comparison

# Define thresholds for labeling anomalies
high_response_time_threshold = test_df['response_time'].quantile(0.95)  # Top 5% response times
error_status_codes = [404, 500, 503]  #  Common error codes

# Label ground truth based on rules
def label_ground_truth(row):
    if row['status_code'] in error_status_codes:
        return -1  # Anomalous: Error status codes
    elif row['response_time'] > high_response_time_threshold:
        return -1  # Anomalous: High response time
    elif pd.isnull(row['internal_ip']):
        return -1  # Anomalous: Missing client IP
    else:
        return 1  # Normal log

# Apply the labeling function
test_df['true_label'] = test_df.apply(label_ground_truth, axis=1)

# Save the labeled dataset (isolation-forest)
test_df.to_csv("isolation-forest/test_with_ground_truth.csv", index=False)

# one class svm
# test_df.to_csv("svm/test_with_ground_truth.csv", index=False)

print("Ground truth labels added and saved to 'test_with_ground_truth.csv'.")


Ground truth labels added and saved to 'test_with_ground_truth.csv'.


In [5]:
# Accuracy of isolation-forest
test_df_with_ground_truth = pd.read_csv("isolation-forest/test_with_ground_truth.csv")


y_true = test_df_with_ground_truth['true_label'] 
y_pred = test_df_with_ground_truth['is_anomalous'].apply(lambda x: -1 if x else 1)  # Convert to match format

# Generate a classification report
print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomalous']))


              precision    recall  f1-score   support

      Normal       1.00      0.29      0.46      2397
   Anomalous       0.93      1.00      0.97     24227

    accuracy                           0.94     26624
   macro avg       0.97      0.65      0.71     26624
weighted avg       0.94      0.94      0.92     26624



In [8]:
# Accuracy of svm
# load data with ground truth
test_df_with_ground_truth = pd.read_csv("svm/test_with_ground_truth.csv")


y_true = test_df_with_ground_truth['true_label'] 
y_pred = test_df_with_ground_truth['is_anomalous_svm'].apply(lambda x: -1 if x else 1)  # Convert to match format

# Generate a classification report
print(classification_report(y_true, y_pred, target_names=['Normal', 'Anomalous']))


              precision    recall  f1-score   support

      Normal       0.80      0.47      0.59      2397
   Anomalous       0.95      0.99      0.97     24227

    accuracy                           0.94     26624
   macro avg       0.88      0.73      0.78     26624
weighted avg       0.94      0.94      0.93     26624

