In [None]:
# GPS_IDS_Reproduction.py
# Reproduction of Abrar et al. (2024) GPS-IDS anomaly-based spoofing detection

# If running in Colab, uncomment to mount Google Drive:
# from google.colab import drive
# drive.mount('/content/drive')

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve
import joblib

# ==== CONFIG ====
# Location of the input CSV. When the file lives on Google Drive, the drive
# has to be mounted before this script runs.
CSV_PATH = '/content/drive/MyDrive/thesis/combined_data_Auto_pilot.csv'

# Column holding the class label (expected values: 'live' or 'spoofed').
LABEL_COL = 'label'

# Columns used as model inputs; the list order is the column order of the
# feature matrix built from it.
FEATURE_COLS = [
    # position
    'latitude',
    'longitude',
    # accel_* columns
    'accel_x',
    'accel_y',
    'accel_z',
    # gyro_* columns
    'gyro_x',
    'gyro_y',
    'gyro_z',
    # vehicle control / motion columns
    'steering_angle',
    'throttle',
    'brake',
    'speed',
]

# ==== Load & Validate Data ====
# Fail early, with actionable messages, on anything that would otherwise make
# training crash later or silently learn from a mis-encoded target.
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}. Ensure path is correct and Drive is mounted.")

df = pd.read_csv(CSV_PATH)
missing = [c for c in FEATURE_COLS + [LABEL_COL] if c not in df.columns]
if missing:
    raise KeyError(f"Missing columns: {missing}. Please verify your dataset.")
if df.empty:
    raise ValueError(f"{CSV_PATH} contains no data rows.")

# Request a float matrix so non-numeric cells are rejected here (pandas raises
# ValueError) instead of deep inside the classifier's fit().
X = df[FEATURE_COLS].to_numpy(dtype=float)

# NaN / inf would make the estimator's input validation fail later; report
# the offending columns now so the dataset can be cleaned.
finite_mask = np.isfinite(X)
if not finite_mask.all():
    bad_cols = [c for c, ok in zip(FEATURE_COLS, finite_mask.all(axis=0)) if not ok]
    bad_rows = int((~finite_mask.all(axis=1)).sum())
    raise ValueError(
        f"{bad_rows} row(s) contain NaN/inf feature values in columns {bad_cols}. "
        "Clean or impute them before training."
    )

# Binary target: 1 for 'spoofed', 0 for every other label value.
labels = df[LABEL_COL]
is_spoofed = labels == 'spoofed'

# Anything that is neither 'spoofed' nor 'live' still ends up as class 0;
# make that visible instead of letting typos or NaN labels pass unnoticed.
unexpected = sorted(labels[~is_spoofed & (labels != 'live')].astype(str).unique().tolist())
if unexpected:
    print(f"WARNING: unexpected values in '{LABEL_COL}' treated as not spoofed: {unexpected}")

y = is_spoofed.astype(int).to_numpy()

# A single-class target cannot be used for stratified CV or threshold tuning;
# the usual cause is a label column that uses a different encoding.
n_spoofed = int(y.sum())
if n_spoofed == 0 or n_spoofed == len(y):
    raise ValueError(
        f"Column '{LABEL_COL}' must contain both 'spoofed' and non-spoofed rows; "
        f"found {n_spoofed} spoofed out of {len(y)}."
    )
print(f"Loaded {len(df)} samples with features: {FEATURE_COLS}")

# ==== 5-Fold CV & Threshold Tuning ====
# For each of 5 stratified, shuffled folds: fit a fresh MLP on the training
# part, score the held-out part, and record the decision threshold that
# maximises F1 on that held-out part together with the F1 it reaches.
#
# NOTE(review): the threshold is selected on the same fold whose F1 is
# reported, so the printed F1 values are optimistic estimates.
# NOTE(review): features are passed to the MLP unscaled; scikit-learn
# recommends standardising inputs for MLPs. Left unchanged here so the CV
# models stay consistent with the model trained on the full dataset.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores, thresholds = [], []
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, random_state=42)
    clf.fit(X_train, y_train)
    # Second predict_proba column = predicted probability of class 1.
    y_prob = clf.predict_proba(X_test)[:, 1]

    prec, rec, th = precision_recall_curve(y_test, y_prob)
    # precision_recall_curve appends a final (precision=1, recall=0) point
    # that has no matching threshold, so prec/rec are one element longer than
    # th. Drop it to keep all three arrays index-aligned; otherwise a selected
    # index equal to len(th) would raise IndexError below.
    prec, rec = prec[:-1], rec[:-1]
    f1 = 2 * (prec * rec) / (prec + rec + 1e-9)  # epsilon avoids 0/0
    # Undefined scores (NaN precision/recall) must never win the argmax.
    f1 = np.where(np.isnan(f1), 0.0, f1)
    best_idx = int(np.argmax(f1))
    thresholds.append(th[best_idx])
    f1_scores.append(f1[best_idx])
    print(f"Fold {fold}: Best Threshold={th[best_idx]:.3f}, F1={f1[best_idx]:.3f}")

# Aggregate across folds; the mean threshold is what gets persisted later.
mean_f1 = np.mean(f1_scores)
mean_thresh = np.mean(thresholds)
print(f"\nAverage F1 across folds: {mean_f1:.3f}, Average Threshold: {mean_thresh:.3f}\n")

# ==== Retrain on Full Dataset & Save ====
# Output locations (relative paths, resolved against the working directory).
model_path = 'gps_ids_mlp_model.pkl'
threshold_path = 'gps_ids_threshold.txt'

# Same architecture, iteration cap and seed as the per-fold models, this time
# fitted on every available sample.
final_clf = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=200,
    random_state=42,
)
final_clf.fit(X, y)

# Persist the fitted estimator, then the fold-averaged decision threshold as a
# single line of text.
joblib.dump(final_clf, model_path)
with open(threshold_path, 'w') as out:
    out.write(f"{mean_thresh}\n")

print(f"✅ Model saved to {model_path}")
print(f"✅ Threshold saved to {threshold_path}")
