# ECG Data Exploration

This notebook demonstrates how to explore and visualize ECG datasets for the ECG-LLM project.

## Contents
1. Load ECG datasets
2. Explore signal characteristics
3. Visualize ECG waveforms
4. Analyze data quality
5. Statistical analysis
6. Frequency domain analysis

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.fft import rfft, rfftfreq

# Make the parent directory importable so the `src` package resolves
sys.path.append('..')

from src.utils.data_loader import ECGDataLoader
from src.utils.visualization import ECGVisualizer
from src.preprocessing.signal_filter import ECGFilter

# Set up plotting
# Apply a seaborn-flavoured matplotlib style sheet to all figures below.
# NOTE(review): the 'seaborn-v0_8' style name appears to require matplotlib >= 3.6 — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
# Use seaborn's 'husl' colour palette for subsequent plots.
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load ECG Dataset

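First, let's point the data loader at the PTB-XL dataset and print its summary information. The signals analysed in the rest of this notebook are synthetic and are generated in the cell that follows.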

In [None]:
# Initialize data loader
# The path is relative to the notebook's working directory.
# NOTE(review): presumably this directory holds a local copy of PTB-XL — confirm it exists before running.
data_path = Path('../data/raw/ptb-xl')
loader = ECGDataLoader(data_path)

# Get dataset information
# NOTE(review): the structure returned by get_dataset_info() is not visible from this
# notebook; it is only interpolated into the printed string below.
dataset_info = loader.get_dataset_info()
print(f"Dataset Info: {dataset_info}")

In [None]:
# Load sample data (or use synthetic data for demonstration)
# For demonstration, we'll create synthetic ECG data

def create_synthetic_ecg(length=5000, sampling_rate=500, leads=12, seed=None):
    """Create synthetic, ECG-like multi-lead data for demonstration.

    Every lead shares the same deterministic waveform (a slow baseline
    wander plus a decaying burst that restarts every 0.8 s) and receives its
    own independent Gaussian noise.

    Args:
        length: Number of samples per lead.
        sampling_rate: Samples per second (Hz).
        leads: Number of leads (columns) to generate.
        seed: Optional seed for the noise generator. ``None`` (the default)
            draws fresh entropy, so repeated calls produce different noise.

    Returns:
        Tuple ``(ecg_data, t)``: ``ecg_data`` has shape ``(length, leads)``;
        ``t`` has shape ``(length,)`` and holds the sample times in seconds,
        starting at 0 and spaced exactly ``1 / sampling_rate`` apart.
    """
    rng = np.random.default_rng(seed)

    # Sample k is taken at k / sampling_rate. np.linspace(0, duration, length)
    # would include the endpoint and stretch the spacing to
    # duration / (length - 1), which no longer matches the sampling rate.
    t = np.arange(length) / sampling_rate

    # The deterministic part is identical for every lead, so compute it once.
    baseline = 0.1 * np.sin(2 * np.pi * 0.5 * t)  # Baseline wander
    qrs = 0.8 * np.sin(2 * np.pi * 1.2 * t) * np.exp(-5 * (t % 0.8)**2)  # QRS-like burst
    clean = baseline + qrs

    # Independent noise per lead, drawn in one call.
    noise = 0.05 * rng.normal(0.0, 1.0, size=(length, leads))

    # Broadcast the shared waveform across the lead axis.
    ecg_data = clean[:, np.newaxis] + noise

    return ecg_data, t

# Generate synthetic data (fixed seed so Restart & Run All reproduces the same signals)
ecg_signal, time_axis = create_synthetic_ecg(seed=42)
print(f"ECG Signal Shape: {ecg_signal.shape}")

## 2. Signal Characteristics

In [None]:
# Analyze signal characteristics
# Build every report line first, then emit them with a single print call.
characteristics_report = [
    f"Signal duration: {len(ecg_signal)/500:.2f} seconds",
    f"Number of leads: {ecg_signal.shape[1]}",
    f"Sampling rate: 500 Hz",
    f"Signal range: {np.min(ecg_signal):.3f} to {np.max(ecg_signal):.3f}",
    f"Signal mean: {np.mean(ecg_signal):.3f}",
    f"Signal std: {np.std(ecg_signal):.3f}",
]
print("\n".join(characteristics_report))

## 3. ECG Visualization

In [None]:
# Plot first few leads
lead_names = ['Lead I', 'Lead II', 'Lead III']

# One stacked panel per lead.
fig, axes = plt.subplots(3, 1, figsize=(15, 10))

for lead_idx, (ax, lead_title) in enumerate(zip(axes, lead_names)):
    # Only the first 2000 samples are drawn.
    ax.plot(time_axis[:2000], ecg_signal[:2000, lead_idx], linewidth=1)
    ax.set_title(f'{lead_title}')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude (mV)')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 12-lead ECG visualization
lead_names = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

# 4 x 3 grid, flattened so panel k shows column k of the signal array.
fig, axes = plt.subplots(4, 3, figsize=(20, 15))
axes = axes.flatten()

for lead_idx, lead_label in enumerate(lead_names):
    panel = axes[lead_idx]
    # Only the first 2000 samples are drawn.
    panel.plot(time_axis[:2000], ecg_signal[:2000, lead_idx], 'b-', linewidth=0.8)
    panel.set_title(f'Lead {lead_label}')
    panel.set_xlabel('Time (s)')
    panel.set_ylabel('Amplitude (mV)')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Signal Quality Analysis

In [None]:
# Analyze signal quality
filter_obj = ECGFilter(sampling_rate=500)
quality_metrics = filter_obj.get_signal_quality(ecg_signal)

print("Signal Quality Metrics:")
# Walk the per-lead entries by key and look each one up explicitly.
per_lead_quality = quality_metrics['leads']
for lead_name in per_lead_quality:
    metrics = per_lead_quality[lead_name]
    print(f"{lead_name}: SNR = {metrics['snr_db']:.2f} dB, Range = {metrics['amplitude_range']:.3f}")

print(f"\nOverall SNR: {quality_metrics['overall_snr_db']:.2f} dB")

## 5. Statistical Analysis

In [None]:
# Statistical analysis of ECG signals
# Each statistic is reduced over the sample axis (axis 0), giving one value
# per lead in a single vectorized call. The number of leads is read from the
# array, so the table stays correct if the signal has more or fewer than 12
# columns.
n_leads = ecg_signal.shape[1]

stats_df = pd.DataFrame({
    # Labels are 0-based column indices, not clinical lead names.
    'Lead': [f'Lead_{i}' for i in range(n_leads)],
    'Mean': ecg_signal.mean(axis=0),
    'Std': ecg_signal.std(axis=0),
    'Min': ecg_signal.min(axis=0),
    'Max': ecg_signal.max(axis=0),
    'Range': np.ptp(ecg_signal, axis=0),
})

print("Statistical Summary:")
print(stats_df.round(4))

In [None]:
# Correlation analysis
# rowvar=False treats each column (lead) as a variable and each row as an observation.
correlation_matrix = np.corrcoef(ecg_signal, rowvar=False)

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    xticklabels=lead_names,
    yticklabels=lead_names,
)

corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=corr_ax, **heatmap_options)
corr_ax.set_title('ECG Lead Correlation Matrix')
plt.tight_layout()
plt.show()

## 6. Frequency Domain Analysis

In [None]:
# Frequency domain analysis
# `rfft` / `rfftfreq` are imported in the notebook's import cell.
sampling_rate_hz = 500  # same rate the other cells in this notebook use

# Analyze Lead II (column index 1)
lead_ii = ecg_signal[:, 1]

# The signal is real-valued, so use the real-input FFT: it returns only the
# non-negative frequency bins and no manual half-spectrum slicing is needed.
fft_vals = rfft(lead_ii)
freqs = rfftfreq(len(lead_ii), d=1 / sampling_rate_hz)

# Time domain on the left, magnitude spectrum on the right.
fig, (ax_time, ax_freq) = plt.subplots(1, 2, figsize=(15, 6))

ax_time.plot(time_axis[:2000], lead_ii[:2000])
ax_time.set_title('Lead II - Time Domain')
ax_time.set_xlabel('Time (s)')
ax_time.set_ylabel('Amplitude (mV)')
ax_time.grid(True, alpha=0.3)

ax_freq.plot(freqs, np.abs(fft_vals))
ax_freq.set_title('Lead II - Frequency Domain')
ax_freq.set_xlabel('Frequency (Hz)')
ax_freq.set_ylabel('Magnitude')
ax_freq.set_xlim(0, 100)  # show only the 0-100 Hz band
ax_freq.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Conclusion

This notebook demonstrated basic ECG data exploration including:
- Loading and visualizing ECG signals
- Analyzing signal quality metrics
- Statistical analysis of leads
- Frequency domain analysis

Next steps:
1. Preprocessing and filtering (notebook 02)
2. Signal encoding experiments (notebook 03)
3. Model training and evaluation (notebooks 04-05)