In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = "dataset.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Preprocessing: Ensure numeric data
# Convert datetime columns to numeric (Unix timestamp)
for col in data.select_dtypes(include=['object', 'datetime']):
    try:
        data[col] = pd.to_datetime(data[col], errors='coerce').astype(int) / 10**9
    except:
        pass  # Skip non-datetime columns

# Drop or encode remaining non-numeric columns
data = pd.get_dummies(data, drop_first=True)  # One-hot encoding for categorical data

# Ensure no missing values
data = data.dropna()  # Drop rows with missing values or handle them appropriately

# Separate features and labels
X = data.drop(columns=['label'])  # Replace 'label' with your actual label column name
y = data['label']  # Binary labels: 1 for normal, 0 for anomaly

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data to avoid large values affecting models (especially OneClassSVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
models = {
    "LocalOutlierFactor": LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1),
    "OneClassSVM": OneClassSVM(kernel="rbf", gamma=0.1),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42)
}

# Evaluate models
def analyze_model(model, X_train, X_test, y_train, y_test, model_name):
    if hasattr(model, 'fit_predict'):  # For LOF
        model.fit(X_train)
        y_pred = model.predict(X_test)
        y_pred = np.where(y_pred == 1, 1, 0)  # Convert LOF output to binary
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate metrics
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nModel: {model_name}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix:\n{cm}")

    return {
        "Model": model_name,
        "Precision": report['1']['precision'],
        "Recall": report['1']['recall'],
        "F1-Score": report['1']['f1-score'],
        "False Positives": cm[0][1],
        "False Negatives": cm[1][0]
    }

# Perform analysis
results = []

for model_name, model in models.items():
    print(f"Analyzing {model_name}...")
    result = analyze_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name)
    results.append(result)

# Convert results to DataFrame for summary
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df)

# Save results to CSV (optional)
results_df.to_csv("model_discrepancy_analysis.csv", index=False)


Analyzing LocalOutlierFactor...

Model: LocalOutlierFactor
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00      1754
           1       0.57      0.88      0.69      2520

    accuracy                           0.52      4274
   macro avg       0.19      0.29      0.23      4274
weighted avg       0.34      0.52      0.41      4274

Confusion Matrix:
[[   0    0    0]
 [  92    0 1662]
 [ 304    0 2216]]
Analyzing OneClassSVM...

Model: OneClassSVM
              precision    recall  f1-score   support

           0       0.23      0.29      0.25      1754
           1       0.40      0.33      0.36      2520

    accuracy                           0.31      4274
   macro avg       0.31      0.31      0.31      4274
weighted avg       0.33      0.31      0.31      4274

Confusion Matrix:
[[ 502 1252]
 [1700  820]]
Analyzing KNeighborsClassifier...

Model: KNeighborsClassifier
       