# Feature Extraction Demo - GPS Spoofing Detection

This notebook demonstrates the feature extraction pipeline for GPS spoofing detection.

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils.synthetic_data import generate_synthetic_dataset
from preprocessing.signal_processing import generate_ca_code
from features.pipeline import build_feature_vector, build_feature_dataframe, preprocess_features
from utils.plots import plot_feature_distributions

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Generate Test Dataset

In [None]:
# Request 50 authentic and 50 spoofed synthetic signals with a fixed random_state,
# then unpack the three parallel outputs used by the rest of the notebook.
signals, labels, metadata = generate_synthetic_dataset(
    num_authentic=50, num_spoofed=50,
    fs=5e6, duration=0.5,
    prn_range=(1, 5),
    random_state=42,
)

# Tally the classes; the report below counts label 0 as authentic and label 1 as spoofed.
n_authentic = sum(1 for lbl in labels if lbl == 0)
n_spoofed = sum(1 for lbl in labels if lbl == 1)

print(f"Generated {len(signals)} signals")
print(f"Authentic: {n_authentic}")
print(f"Spoofed: {n_spoofed}")

## 2. Extract Features from Single Signal

Demonstrate feature extraction for a single signal window.

In [None]:
# Use the first generated signal as the worked example for this section.
signal = signals[0]
prn = metadata[0]['prn']
label = labels[0]

# Generate the PRN code for this signal's PRN number.
ca_code = generate_ca_code(prn)

# Extract the feature dictionary for this one signal.
features = build_feature_vector(
    signal=signal,
    prn_code=ca_code,
    fs=5e6,
    label=label,
    metadata={'prn': prn, 'segment_index': 0}
)

# Display features, one per line, sorted by name.
print("\nExtracted Features:")
print("="*60)
for key, value in sorted(features.items()):
    # np.floating also matches NumPy float scalars such as float32, which do
    # not subclass the built-in float and would otherwise skip the aligned
    # fixed-width formatting.
    if isinstance(value, (float, np.floating)):
        print(f"{key:25s}: {value:12.4f}")
    else:
        print(f"{key:25s}: {value}")

## 3. Build Complete Feature DataFrame

Extract features for all signals and create a DataFrame.

In [None]:
# Extract features for all signals
print("Extracting features for all signals...")

# The loop pairs signals[i] with labels[i] and metadata[i]; fail early with a
# clear message if the three sequences do not line up.
assert len(signals) == len(labels) == len(metadata), (
    "signals, labels and metadata must have the same length"
)

all_features = []
# Loop names carry a sig_ prefix so the single-signal example variables from
# Section 2 (signal, prn, ca_code, features) are not overwritten by this cell.
for sig_index, sig_samples in enumerate(signals):
    sig_prn = metadata[sig_index]['prn']
    sig_features = build_feature_vector(
        signal=sig_samples,
        prn_code=generate_ca_code(sig_prn),
        fs=5e6,
        label=labels[sig_index],
        metadata={'prn': sig_prn, 'segment_index': sig_index}
    )
    all_features.append(sig_features)

# Create the DataFrame once from the list of per-signal dictionaries.
df_features = pd.DataFrame(all_features)
assert len(df_features) == len(signals), "expected one feature row per signal"

print(f"\nFeature DataFrame shape: {df_features.shape}")
print(f"\nColumns: {list(df_features.columns)}")
print("\nFirst few rows:")
df_features.head()

## 4. Feature Statistics and Correlations

In [None]:
# Columns that identify a sample rather than measure it; every other column
# is treated as a feature.
non_feature_cols = ('label', 'prn', 'segment_index')
feature_cols = [name for name in df_features.columns if name not in non_feature_cols]

# Summary statistics (the describe() result is the cell's displayed output).
print("Feature Summary Statistics:")
print("=" * 80)
df_features.loc[:, feature_cols].describe()

In [None]:
# Candidate features for the correlation study; only those actually present
# in the DataFrame are kept.
key_features = [
    'peak_height', 'peak_to_secondary', 'fwhm', 'fpw', 'asymmetry',
    'cn0_estimate', 'total_power', 'snr_estimate',
]
available_features = [name for name in key_features if name in df_features.columns]

# Pairwise correlation between the selected feature columns.
corr_matrix = df_features[available_features].corr()

# Draw the annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 5. Visualize Feature Distributions

Compare feature distributions between authentic and spoofed signals.

In [None]:
# Features to compare across classes; names missing from the DataFrame are
# dropped before plotting.
key_features = [
    'peak_height', 'peak_to_secondary', 'fpw', 'asymmetry',
    'cn0_estimate', 'snr_estimate', 'total_power', 'fwhm',
]
available_features = [name for name in key_features if name in df_features.columns]

# Delegate the plotting to the project helper, using 'label' as the class column.
plot_feature_distributions(df_features, features=available_features,
                           label_col='label', figsize=(16, 12))
plt.show()

## 6. Feature Preprocessing

Demonstrate feature preprocessing (imputation, scaling).

In [None]:
# Split the frame into the target vector and the feature matrix; the metadata
# columns are dropped only if they are present.
y = df_features['label'].values
X = df_features.drop(columns=['label', 'prn', 'segment_index'], errors='ignore')

# Fit the preprocessing on this data and keep the fitted imputer and scaler;
# the fourth return value is not used in this notebook.
X_processed, imputer, scaler, _ = preprocess_features(X, y, fit=True)

print(f"\nOriginal feature shape: {X.shape}")
print(f"Processed feature shape: {X_processed.shape}")
print("\nProcessed features (first 5 samples):")
print(X_processed[:5])

## 7. Feature Importance Analysis

Use a simple Random Forest to assess feature importance.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a simple model on the preprocessed features; the fixed random_state
# keeps the importances reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_processed, y)

# Feature names come from the unprocessed frame, importances from the model
# fit on the processed matrix.
feature_names = list(X.columns)
importances = rf.feature_importances_

# If preprocessing changed the number of columns, positional labels would be
# silently wrong, so stop here instead of plotting misattributed importances.
assert len(importances) == len(feature_names), (
    f"model was fit on {len(importances)} features but X has "
    f"{len(feature_names)} columns; importances cannot be labeled by position"
)

# Sort by importance, most important first.
indices = np.argsort(importances)[::-1]
sorted_names = [feature_names[i] for i in indices]

# Plot feature importances as a horizontal bar chart.
plt.figure(figsize=(12, 8))
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), sorted_names)
plt.xlabel('Feature Importance', fontsize=12)
plt.title('Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
# Invert the y-axis so the most important feature appears at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Print top features
print("\nTop 10 Most Important Features:")
for rank, idx in enumerate(indices[:10], start=1):
    print(f"{rank}. {feature_names[idx]:25s}: {importances[idx]:.4f}")

## 8. Save Feature Dataset

Save the extracted features for use in model training.

In [None]:
from pathlib import Path

# Save to CSV
output_path = '../data/processed/features_demo.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists; on a fresh checkout the write would otherwise fail.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_features.to_csv(output_path, index=False)
print(f"Features saved to: {output_path}")

## Summary

Key features for GPS spoofing detection:

**Correlation-based (SQMs):**
- `peak_to_secondary`: Ratio of primary to secondary peak (decreases with spoofing)
- `fpw`: Fractional Peak Width (increases with spoofing)
- `asymmetry`: Peak asymmetry (changes with synchronized spoofing)

**Power-based:**
- `cn0_estimate`: Carrier-to-noise-density ratio, C/N0 (increases with power attacks)
- `total_power`: Signal power (often elevated in spoofing)

Next: See `training_eval.ipynb` for model training and evaluation.