# Exploratory Data Analysis (EDA) - GPS Spoofing Detection

This notebook demonstrates exploratory analysis of GPS signals for spoofing detection.
We use synthetic data for demonstration, but you can replace it with real FGI-SpoofRepo or TEXBAT data.

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils.synthetic_data import generate_synthetic_dataset
from preprocessing.signal_processing import generate_ca_code
from features.correlation import compute_cross_correlation
from utils.plots import plot_correlation_profile, plot_signal_spectrum

# Global plotting defaults shared by every figure in this notebook:
# seaborn's white grid theme and a 12x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Generate Synthetic Dataset

Generate a small dataset with both authentic and spoofed signals for demonstration.

In [None]:
# Parameters of the synthetic dataset, collected in one place so they are easy to tune.
dataset_config = dict(
    num_authentic=10,
    num_spoofed=10,
    fs=5e6,
    duration=0.5,
    prn_range=(1, 3),
    random_state=42,
)
signals, labels, metadata = generate_synthetic_dataset(**dataset_config)

# Class balance check: label 0 is reported as authentic, label 1 as spoofed.
n_authentic = len([lbl for lbl in labels if lbl == 0])
n_spoofed = len([lbl for lbl in labels if lbl == 1])

print(f"Generated {len(signals)} signals")
print(f"Authentic: {n_authentic}")
print(f"Spoofed: {n_spoofed}")

## 2. Visualize Signal Examples

Compare authentic and spoofed signals in the time domain (in-phase component) and as IQ constellations.

In [None]:
# Pick the first example of each class (label 0 = authentic, label 1 = spoofed).
# Use a default with next() so a missing class gives a clear error instead of
# a bare StopIteration.
authentic_idx = next((i for i, lbl in enumerate(labels) if lbl == 0), None)
spoofed_idx = next((i for i, lbl in enumerate(labels) if lbl == 1), None)
if authentic_idx is None or spoofed_idx is None:
    raise ValueError(
        "Need at least one authentic (label 0) and one spoofed (label 1) signal to compare"
    )

auth_signal = signals[authentic_idx]
spoof_signal = signals[spoofed_idx]

SAMPLE_RATE_HZ = 5e6    # must match the `fs` used when the dataset was generated
PREVIEW_SAMPLES = 1000  # number of samples shown in the time-domain panels
IQ_STRIDE = 100         # plot every 100th sample in the IQ scatter to keep it light

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One column per class: top row = in-phase component vs. time, bottom row = IQ scatter.
panels = [
    ('Authentic', auth_signal, 'b-', 'blue'),
    ('Spoofed', spoof_signal, 'r-', 'red'),
]
for col, (name, sig, line_fmt, color) in enumerate(panels):
    # Time domain. The time axis is derived from the slice actually taken, so
    # x and y always have the same length even for signals shorter than the preview.
    preview = np.real(sig[:PREVIEW_SAMPLES])
    t = np.arange(len(preview)) / SAMPLE_RATE_HZ * 1e6  # sample index -> microseconds
    time_ax = axes[0, col]
    time_ax.plot(t, preview, line_fmt, linewidth=0.5)
    time_ax.set_title(f'{name} Signal - Time Domain (I)')
    time_ax.set_xlabel('Time (μs)')
    time_ax.set_ylabel('Amplitude')
    time_ax.grid(True, alpha=0.3)

    # IQ constellation of the decimated signal.
    decimated = sig[::IQ_STRIDE]
    iq_ax = axes[1, col]
    iq_ax.scatter(np.real(decimated), np.imag(decimated), alpha=0.5, s=1, c=color)
    iq_ax.set_title(f'{name} Signal - IQ Constellation')
    iq_ax.set_xlabel('I')
    iq_ax.set_ylabel('Q')
    iq_ax.grid(True, alpha=0.3)
    iq_ax.axis('equal')  # equal scaling so the constellation is not distorted

plt.tight_layout()
plt.show()

## 3. Analyze Correlation Profiles

Compare correlation profiles for authentic vs spoofed signals.

In [None]:
# Each signal is correlated against the C/A code of ITS OWN PRN. The dataset is
# generated over a range of PRNs, so the authentic and spoofed examples may carry
# different PRNs; reusing the authentic PRN's code for the spoofed signal would
# correlate against the wrong code and make the comparison meaningless.
prn = metadata[authentic_idx]['prn']
ca_code = generate_ca_code(prn)

# NOTE(review): assumes spoofed metadata entries expose 'prn' like authentic ones - confirm.
spoof_prn = metadata[spoofed_idx]['prn']
spoof_ca_code = ca_code if spoof_prn == prn else generate_ca_code(spoof_prn)

# Compute correlations
auth_corr = compute_cross_correlation(auth_signal, ca_code)
spoof_corr = compute_cross_correlation(spoof_signal, spoof_ca_code)

# Conversion from sample index to code phase in chips.
fs = 5e6                # must match the `fs` used when the dataset was generated
ca_chip_rate = 1.023e6  # C/A code chipping rate used for the chip axis (chips per second)
samples_per_chip = fs / ca_chip_rate

# Plot correlation profiles, one panel per class.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

profiles = [
    ('Authentic', auth_corr, 'b-'),
    ('Spoofed', spoof_corr, 'r-'),
]
for ax, (name, corr, line_fmt) in zip(axes, profiles):
    # The chip axis is built per profile so it always matches that profile's length.
    chips = np.arange(len(corr)) / samples_per_chip
    peak_idx = np.argmax(corr)

    ax.plot(chips, corr, line_fmt, linewidth=1.5)
    ax.plot(chips[peak_idx], corr[peak_idx], 'ro', markersize=10, label='Peak')
    ax.set_title(f'{name} Signal - Correlation Profile', fontsize=14, fontweight='bold')
    ax.set_xlabel('Code Phase (chips)')
    ax.set_ylabel('Correlation Magnitude')
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()

auth_peak = np.max(auth_corr)
spoof_peak = np.max(spoof_corr)

print(f"\nAuthentic - Peak value: {auth_peak:.2f}")
print(f"Spoofed - Peak value: {spoof_peak:.2f}")
# Relative change of the spoofed peak with respect to the authentic peak.
print(f"Difference: {((spoof_peak - auth_peak) / auth_peak * 100):.1f}%")
print(f"PRNs used - authentic: {prn}, spoofed: {spoof_prn}")

## 4. Analyze C/N0 Distribution

Compare C/N0 values between authentic and spoofed signals.

In [None]:
# One record per signal: its C/N0 from the metadata plus a readable class name
# derived from the numeric label (1 -> 'Spoofed', anything else -> 'Authentic').
cn0_records = [
    {
        'cn0': meta['cn0'],
        'label': 'Spoofed' if labels[idx] == 1 else 'Authentic',
    }
    for idx, meta in enumerate(metadata)
]
df_cn0 = pd.DataFrame(cn0_records)

# Two side-by-side views of the same data: box plot (left) and histogram (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
box_ax, hist_ax = axes

# Box plot
sns.boxplot(x='label', y='cn0', data=df_cn0, ax=box_ax)
box_ax.set_title('C/N0 Distribution by Class', fontsize=14, fontweight='bold')
box_ax.set_ylabel('C/N0 (dB-Hz)')
box_ax.grid(True, alpha=0.3)

# Histogram: one semi-transparent, density-normalised histogram per class.
for class_name in ('Authentic', 'Spoofed'):
    class_cn0 = df_cn0.loc[df_cn0['label'] == class_name, 'cn0']
    hist_ax.hist(class_cn0, bins=10, alpha=0.6, label=class_name, density=True)
hist_ax.set_title('C/N0 Histogram', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('C/N0 (dB-Hz)')
hist_ax.set_ylabel('Density')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Per-class summary statistics (count, mean, std, quartiles, ...).
cn0_summary = df_cn0.groupby('label')['cn0'].describe()
print("\nC/N0 Statistics:")
print(cn0_summary)

## 5. Load Real Dataset (Optional)

Uncomment and modify the following cell to load real FGI-SpoofRepo or TEXBAT data.

In [None]:
# from utils.data_loader import load_fgi_dataset, load_texbat_dataset

# # Option 1: Load FGI-SpoofRepo
# # signals, labels, metadata = load_fgi_dataset('../data/raw/fgi-spoof-repo')

# # Option 2: Load TEXBAT
# # signals, labels, metadata = load_texbat_dataset(
# #     '../data/raw/texbat',
# #     fs=5e6,
# #     segment_duration=0.5,
# #     spoof_start_time=17.0,
# #     max_segments=100
# # )

## Summary

Key observations:
1. **Signal Characteristics**: Spoofed signals typically show higher power and altered correlation profiles
2. **Correlation Profile**: Secondary peaks or asymmetry indicate potential spoofing
3. **C/N0**: Spoofed signals often have elevated C/N0 values

Next steps:
- See `feature_demo.ipynb` for detailed feature extraction
- See `training_eval.ipynb` for model training and evaluation