# Dataset 1: SNMC Patient-wise Excel Data Exploration

This notebook demonstrates how to load and explore the SNMC patient-wise EEG data stored in Excel files.

## Dataset Structure

- **Location**: `data/raw/patient_wise_mat/`
- **Files**: 12 patients × 4 books each = 48 Excel files
- **Format**: Each Excel file contains multiple sheets
- **Columns**: Time (HH-MM-SS) + 16 bipolar EEG channels
- **Channels**:
  - Right: FP2-F4, F4-C4, C4-P4, P4-O2, FP2-F8, F8-T4, T4-T6, T6-O2
  - Left: FP1-F3, F3-C3, C3-P3, P3-O1, FP1-F7, F7-T3, T3-T5, T5-O1

## Seizure Information

- **Patient 1** (ID 363) → Has seizures
- **Patient 11** (ID 1306) → Has seizures
- All other patients → No seizures

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from loaders import (
    load_patient_book,
    load_patient_data,
    list_available_snmc_files,
    get_sheet_info,
    extract_eeg_data,
    convert_to_numpy,
    has_seizures,
    get_patient_seizure_id,
)

# Plotting defaults applied once for the figures created in this notebook:
# seaborn's white-grid theme and a 15x8 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (15, 8)})

## 1. List Available Files

In [None]:
# Ask the loader which patient files are available. The result is indexed by
# patient ID and each entry holds that patient's book files.
available_files = list_available_snmc_files()

print(f"Total patients found: {len(available_files)}")
print("\nPatients and their files:")

# One summary line per patient: optional seizure ID, book count, seizure flag.
for patient_id in sorted(available_files):
    book_paths = available_files[patient_id]
    if has_seizures(patient_id):
        status_label = "HAS SEIZURES"
    else:
        status_label = "No seizures"
    sid = get_patient_seizure_id(patient_id)
    # Only patients with a known seizure ID get the "(ID: ...)" suffix.
    suffix = f" (ID: {sid})" if sid else ""
    print(f"  Patient {patient_id}{suffix}: {len(book_paths)} books - {status_label}")

## 2. Load a Single Book (Excel File)

Let's load one Excel file to see its structure:

In [None]:
# Load one workbook so its sheet layout can be inspected in the next cells.
# Defines `patient_id`, `book_file` and `sheets` for later cells; nothing is
# defined when no data files were discovered.
if not available_files:
    print("⚠️ No data files found!")
    print("Please download the SNMC dataset first:")
    print("  python scripts/download_datasets.py --dataset snmc")
else:
    # Prefer Patient 1 (patient with seizures). If that patient is not among
    # the discovered files, fall back to the first available patient in
    # sorted order instead of failing with a KeyError.
    patient_id = 1
    if patient_id not in available_files:
        patient_id = sorted(available_files)[0]
        print(f"Patient 1 not found; using Patient {patient_id} instead.")

    book_file = available_files[patient_id][0]  # First book

    print(f"Loading: {book_file}")
    sheets = load_patient_book(book_file)

    print(f"\nLoaded {len(sheets)} sheets:")
    for sheet_name in sheets:
        print(f"  - {sheet_name}")

## 3. Examine a Single Sheet

In [None]:
if available_files:
    # Take the first (name, sheet) pair of the workbook loaded above.
    first_sheet_name, first_sheet = list(sheets.items())[0]

    print(f"Sheet: {first_sheet_name}")
    print(f"Shape: {first_sheet.shape}")

    column_names = list(first_sheet.columns)
    print(f"\nColumns ({len(column_names)}):")
    print(column_names)

    print("\nFirst few rows:")
    display(first_sheet.head())

    # Summary produced by the loader helper for this sheet.
    info = get_sheet_info(first_sheet)
    print("\nSheet Info:")
    summary_rows = (
        ("Rows", 'n_rows'),
        ("EEG Channels", 'n_channels'),
        ("Has Time Column", 'has_time_column'),
    )
    for label, key in summary_rows:
        print(f"  {label}: {info[key]}")
    # The time column name is only reported when the sheet has one.
    if info['has_time_column']:
        print(f"  Time Column: {info['time_column']}")

## 4. Extract EEG Data

In [None]:
if available_files:
    # Split the sheet into its time series and its EEG channel table.
    time_series, eeg_data = extract_eeg_data(first_sheet)

    # The time series may be absent (None); report 'N/A' in that case.
    time_shape = 'N/A' if time_series is None else time_series.shape
    print(f"Time series shape: {time_shape}")
    print(f"EEG data shape: {eeg_data.shape}")

    channel_names = eeg_data.columns
    print(f"\nEEG Channels ({len(channel_names)}):")
    # Numbered listing, starting at 1.
    for position, name in enumerate(channel_names, start=1):
        print(f"  {position:2d}. {name}")

## 5. Convert to NumPy Arrays

In [None]:
if available_files:
    # The helper returns a dict with 'data', 'channels' and, optionally, 'time'.
    arrays = convert_to_numpy(first_sheet)
    signal = arrays['data']

    print(f"Data array shape: {signal.shape}")
    print(f"Channels: {arrays['channels']}")
    if 'time' in arrays:
        print(f"Time array shape: {arrays['time'].shape}")

    # Global statistics computed over the whole data array.
    print("\nData Statistics:")
    reducers = (
        ("Mean", np.mean),
        ("Std", np.std),
        ("Min", np.min),
        ("Max", np.max),
    )
    for label, reducer in reducers:
        print(f"  {label}: {reducer(signal):.2f}")

## 6. Load Complete Patient Data

Load all 4 books for a patient:

In [None]:
if available_files:
    # Load all data for Patient 1 (has seizures)
    patient_data = load_patient_data(1)

    print(f"Patient ID: {patient_data['patient_id']}")
    print(f"Seizure ID: {patient_data['seizure_id']}")
    print(f"Has Seizures: {patient_data['has_seizures']}")

    metadata = patient_data['metadata']
    print("\nMetadata:")
    print(f"  Books loaded: {metadata['num_books']}")
    print(f"  Total sheets: {metadata['total_sheets']}")

    print("\nBooks and their sheets:")
    # Use a dedicated loop variable: rebinding the notebook-level `sheets`
    # here would silently replace the workbook loaded in section 2, so
    # re-running the sheet-inspection cell afterwards would look at a
    # different book.
    for book_num in sorted(patient_data['books']):
        book_sheets = patient_data['books'][book_num]
        print(f"  Book {book_num}: {len(book_sheets)} sheets - {list(book_sheets.keys())}")

## 7. Visualize EEG Signals

In [None]:
if available_files:
    # Re-extract the channel table from the sheet inspected earlier.
    time_series, eeg_data = extract_eeg_data(first_sheet)

    # Cap the figure at 4 channels and the first 1000 samples.
    n_channels_to_plot = min(4, len(eeg_data.columns))
    n_samples_to_plot = min(1000, len(eeg_data))

    fig, axes = plt.subplots(n_channels_to_plot, 1, figsize=(15, 10))
    # With a single row, subplots returns one Axes rather than a sequence.
    axes = [axes] if n_channels_to_plot == 1 else axes

    # One stacked panel per channel, labelled with the channel name.
    for ax, channel in zip(axes, eeg_data.columns[:n_channels_to_plot]):
        trace = eeg_data[channel].iloc[:n_samples_to_plot].values
        ax.plot(trace, linewidth=0.5)
        ax.set_ylabel(channel)
        ax.set_xlim(0, n_samples_to_plot)
        ax.grid(True, alpha=0.3)

    # Only the bottom panel carries the x-axis label.
    axes[-1].set_xlabel('Sample')
    title = f'EEG Signals - Patient {patient_id}, Book 1, Sheet: {first_sheet_name}'
    fig.suptitle(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 8. Compare Patients With and Without Seizures

In [None]:
if available_files and len(available_files) >= 2:
    # Load data from patient with seizures (Patient 1)
    patient_with_seizures = load_patient_data(1)

    # Candidates without seizures, sorted so the chosen patient does not
    # depend on the order in which the files were discovered.
    patients_without_seizures = sorted(
        p for p in available_files if not has_seizures(p)
    )

    if patients_without_seizures:
        # Compare against the first seizure-free patient in sorted order.
        patient_without_seizures = load_patient_data(patients_without_seizures[0])

        print("Comparison:")
        # Both patients are reported with the same layout; only the status
        # text in parentheses differs.
        comparison = (
            (
                patient_with_seizures,
                f"HAS SEIZURES - ID {patient_with_seizures['seizure_id']}",
            ),
            (patient_without_seizures, "No seizures"),
        )
        for record, status in comparison:
            print(f"\nPatient {record['patient_id']} ({status}):")
            print(f"  Books: {record['metadata']['num_books']}")
            print(f"  Total sheets: {record['metadata']['total_sheets']}")
    else:
        # Say so explicitly instead of leaving the cell output empty.
        print("No patient without seizures is available for comparison.")

## 9. Summary

This notebook demonstrated:
1. Loading Excel files with multiple sheets
2. Extracting patient metadata (seizure status)
3. Accessing time series and EEG channel data
4. Converting to NumPy arrays for analysis
5. Visualizing EEG signals
6. Loading all books for a single patient
7. Comparing patients with and without seizures

## Next Steps

- **Feature Extraction**: Extract time-domain and frequency-domain features
- **Signal Processing**: Apply filtering, artifact removal
- **Seizure Detection**: Build models to detect seizures
- **Cross-Patient Analysis**: Compare patterns across patients
- **Integration**: Combine with other datasets (Delhi Hospital, CSV dataset)