# RespiScope-AI: Exploratory Data Analysis

This notebook performs comprehensive EDA on the ICBHI respiratory sound dataset.

## Contents
1. Dataset Overview
2. Class Distribution
3. Audio Characteristics
4. Quality Metrics
5. Sample Spectrograms
6. Recommendations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
from pathlib import Path
import warnings
# Suppress all warnings for the rest of the session so they do not clutter
# cell outputs.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('✓ Libraries imported')

## 1. Dataset Overview

In [None]:
# Read the processed metadata table; `df` is the frame every later cell uses.
metadata_path = '../data/processed/processed_metadata.csv'
df = pd.read_csv(metadata_path)

# Report row count and column names, then announce the preview below.
print(
    f"Total samples: {len(df)}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)

# Last expression of the cell: rendered by the notebook as a rich table.
df.head()

## 2. Class Distribution

In [None]:
# Class distribution: rows per class, drawn as a bar chart and a pie chart
class_counts = df['class'].value_counts()
total_rows = len(df)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: absolute counts per class
class_counts.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Class Distribution (Count)', fontsize=14)
bar_ax.set_xlabel('Class')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the same counts as percentage slices
class_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Class Distribution (Percentage)', fontsize=14)
pie_ax.set_ylabel('')  # blank out the y-axis label on the pie panel

plt.tight_layout()
plt.show()

# Text summary: count and share of all rows (len(df)) for each class
print("\nClass Distribution:")
summary_lines = [
    f"  {cls}: {count} ({count/total_rows*100:.1f}%)"
    for cls, count in class_counts.items()
]
for line in summary_lines:
    print(line)

# Imbalance ratio: largest class count divided by the smallest
max_count, min_count = class_counts.max(), class_counts.min()
print(f"\nImbalance Ratio: {max_count/min_count:.2f}:1")

## 3. Audio Characteristics

In [None]:
# Duration analysis; the whole cell is a no-op when the metadata has no
# 'duration' column.
if 'duration' in df.columns:
    durations = df['duration']

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    # Left panel: histogram of all durations with the mean marked in red
    hist_ax.hist(durations, bins=30, color='skyblue', edgecolor='black')
    hist_ax.axvline(durations.mean(), color='red', linestyle='--', label='Mean')
    hist_ax.set_title('Duration Distribution')
    hist_ax.set_xlabel('Duration (seconds)')
    hist_ax.set_ylabel('Count')
    hist_ax.legend()

    # Right panel: one box per class
    df.boxplot(column='duration', by='class', ax=box_ax)
    box_ax.set_title('Duration by Class')
    box_ax.set_xlabel('Class')
    box_ax.set_ylabel('Duration (seconds)')
    # Clear the figure-level title so only the per-panel titles remain.
    plt.suptitle('')

    plt.tight_layout()
    plt.show()

    # Summary statistics, one line each, formatted to two decimals
    print("\nDuration Statistics:")
    duration_stats = (
        ('Mean', durations.mean()),
        ('Median', durations.median()),
        ('Std', durations.std()),
        ('Min', durations.min()),
        ('Max', durations.max()),
    )
    for label, value in duration_stats:
        print(f"  {label}: {value:.2f}s")

## 4. Quality Metrics

In [None]:
# SNR distribution and quality-check summary; the whole cell is a no-op when
# the metadata has no 'snr_db' column.

# Reference level for the dashed line on the histogram. This cell only draws
# it; no rows are filtered against it here.
SNR_THRESHOLD_DB = 20

if 'snr_db' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # SNR histogram with the reference threshold marked in red
    axes[0].hist(df['snr_db'], bins=30, color='lightgreen', edgecolor='black')
    axes[0].set_title('SNR Distribution')
    axes[0].set_xlabel('SNR (dB)')
    axes[0].set_ylabel('Count')
    axes[0].axvline(
        SNR_THRESHOLD_DB,
        color='red',
        linestyle='--',
        label=f'Threshold ({SNR_THRESHOLD_DB}dB)',
    )
    axes[0].legend()

    # SNR by class
    df.boxplot(column='snr_db', by='class', ax=axes[1])
    axes[1].set_title('SNR by Class')
    axes[1].set_xlabel('Class')
    axes[1].set_ylabel('SNR (dB)')
    # Clear the figure-level title so only the per-panel titles remain.
    plt.suptitle('')

    plt.tight_layout()
    plt.show()

    # Quality statistics.
    # `quality_pass` keeps its 0 fallback for any later code that reads it,
    # but the pass rate is only printed when the column really exists:
    # reporting "0/N (0.0%)" for a missing column would read as "every clip
    # failed" rather than "no data".
    has_quality_flag = 'quality_pass' in df.columns
    quality_pass = df['quality_pass'].sum() if has_quality_flag else 0

    print("\nQuality Statistics:")
    if has_quality_flag:
        print(f"  Passed quality check: {quality_pass}/{len(df)} ({quality_pass/len(df)*100:.1f}%)")
    else:
        print("  Passed quality check: n/a ('quality_pass' column not found)")
    print(f"  Mean SNR: {df['snr_db'].mean():.2f} dB")
    print(f"  Median SNR: {df['snr_db'].median():.2f} dB")

## 5. Sample Spectrograms

In [None]:
# Load and display one sample spectrogram per class on a 2x2 grid.
spec_dir = '../data/processed/spectrograms'

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# One panel per class, capped at the number of panels in the grid (classes
# beyond that are not shown). Missing labels are dropped first: a NaN label
# never equals itself, so the row lookup below would come back empty and
# `.iloc[0]` would raise IndexError.
classes = df['class'].dropna().unique()[:len(axes)]

for ax, cls in zip(axes, classes):
    # First metadata row of this class supplies the file to display.
    sample = df[df['class'] == cls].iloc[0]
    spec_path = Path(spec_dir) / sample['clip_filename']

    if not spec_path.exists():
        # Say so inside the panel instead of leaving an unexplained empty plot.
        ax.set_title(f'{cls}', fontsize=12)
        ax.text(
            0.5, 0.5,
            f'Spectrogram not found:\n{spec_path.name}',
            ha='center', va='center', transform=ax.transAxes,
        )
        ax.set_axis_off()
        continue

    spec = np.load(spec_path)

    # Transposed before plotting; with the axis labels below this presumably
    # means the saved array is (time, mel) - TODO confirm against the
    # preprocessing code.
    im = ax.imshow(spec.T, aspect='auto', origin='lower', cmap='viridis')
    ax.set_title(f'{cls}', fontsize=12)
    ax.set_xlabel('Time')
    ax.set_ylabel('Mel Frequency')
    plt.colorbar(im, ax=ax)

# Hide grid panels that have no class assigned (fewer classes than panels).
for unused_ax in axes[len(classes):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Recommendations

Based on the analysis:

1. **Class Imbalance**: Use focal loss and class weights
2. **Duration Variance**: Extract fixed 3-second clips
3. **Quality Issues**: Filter low SNR recordings
4. **Data Augmentation**: Apply SpecAugment, mixup
5. **Model Strategy**: Use ensemble of multiple folds