In [15]:
!pip install numpy matplotlib librosa scipy plotly nbformat



In [16]:
import numpy as np
from scipy.io import wavfile
from IPython.display import Audio, display
import plotly.graph_objects as go

# ============================================================================
# SECTION 1: SIGNAL GENERATION (The Physics)
# ============================================================================
# Why: Create synthetic audio signals to learn DSP fundamentals.
# What we'll build: Two pure tones at different frequencies + noise
#
# Real-world analogy: Imagine a room with a low hum (e.g., AC power at 150 Hz)
# and a bird chirping (high-pitched at 3000 Hz). We'll simulate this.
# ============================================================================

# --- Setup: Key audio parameters ---
sample_rate = 16000  # Hz (samples per second). Standard for speech/phone audio.
duration = 3  # seconds. How long is our recording?
amplitude = 0.5  # Max amplitude for our sine waves (range -1 to 1)

# --- Function: Pure sine wave generator ---
def generate_tone(freq, duration, sr=16000):
    """
    Create a pure sine wave at a given frequency.
    
    Math: y(t) = A * sin(2π*f*t)
    - A = 0.5 (Amplitude, controls loudness)
    - f = Frequency in Hz (e.g., 150 Hz = 150 cycles per second)
    - t = time array from 0 to duration
    
    Returns:
      t: time array (used for plotting)
      signal: the actual waveform (numpy array of audio samples)
    """
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)
    signal = amplitude * np.sin(2 * np.pi * freq * t) 
    return t, signal

# --- Create two component signals ---
t, tone_low = generate_tone(150, duration, sample_rate)  # Low tone (like AC hum)
_, tone_high = generate_tone(3000, duration, sample_rate)  # High tone (like birdsong)
audio = tone_low + tone_high  # Mix them together (linear superposition)

# --- Add realistic noise ---
# Why: Real recordings are never clean. Microphone noise, wind, etc.
noise = np.random.normal(0, 0.1, audio.shape)  # Gaussian noise: mean=0, std=0.1
noisy_audio = audio + noise  # Composite signal: tone_low + tone_high + noise

# --- Info & Playback ---
print(f"Signal Generated. shape: {noisy_audio.shape}, Sampling Rate: {sample_rate}")
print(f"  → {noisy_audio.shape[0]} samples at {sample_rate} Hz = {duration} seconds")
print(f"  → To reconstruct audio: play 16000 samples per second")

rms_manual = np.sqrt(np.mean(tone_high*2))
print(f"RMS Amplitude (Perceived Loudness): {rms_manual:.4f}")
print(f"  → Range typically 0–1 for normalized audio")
display(Audio(audio, rate=sample_rate, embed=True))

Signal Generated. shape: (48000,), Sampling Rate: 16000
  → 48000 samples at 16000 Hz = 3 seconds
  → To reconstruct audio: play 16000 samples per second
RMS Amplitude (Perceived Loudness): 0.0000
  → Range typically 0–1 for normalized audio


In [17]:
# 1) Pick a short time window so the plot is responsive and features are clear.
# At 16 kHz, 1000 samples ≈ 0.0625 seconds.
N = 1000

# 2) Build an interactive line plot with three traces:
#    - Noisy Signal: what you would "record" (tone_low + tone_high + noise)
#    - High Tone (3000 Hz): fast oscillations
#    - Underlying Low Tone (150 Hz): slow oscillations (dashed for contrast)
fig = go.Figure([
    go.Scatter(x=t[:N], y=noisy_audio[:N], name='Noisy Signal', opacity=0.7),
    go.Scatter(x=t[:N], y=tone_high[:N], name='High Tone (3000 Hz)', opacity=0.5),
    go.Scatter(x=t[:N], y=tone_low[:N], name='Low Tone (150 Hz)', line=dict(dash='dash'), opacity=0.9)
])

# 3) Tidy layout. Tip: Use the legend to hide/show components as you inspect.
fig.update_layout(
    title='Time Domain: Composite vs. Component Tones',
    xaxis_title='Time (s)',
    yaxis_title='Amplitude',
    width=1400,
    height=400,
    hovermode='x unified'  # One unified tooltip across traces at a given time
)

# Try this: Click a legend item to toggle visibility. Drag to zoom.
fig.show()

In [18]:
# ============================================================================
# SECTION 2: TIME DOMAIN ANALYSIS
# ============================================================================
# Why: Measure signal properties directly from time samples.
# Use case: RMS (Root Mean Square) is the "loudness" or energy metric.
# Definition: RMS is basically average amplitude over time without direction.
# Peak: measures the tallest wave in the wavform/highest amplitude.
# ============================================================================

# --- RMS Amplitude (Energy/Loudness Metric) ---
# Formula: RMS = sqrt( mean(x[n]^2) for all n )
# 
# Intuition:
#   1. Square each sample (x[n]^2) to get energy
#   2. Average the energy across all samples (mean)
#   3. Take the square root to get back to amplitude scale
#
# Why it matters:
#   - RMS is what our ears perceive as "loudness"
#   - Used in audio gain control, dynamic range, etc.
#   - In dB scale: dB = 20*log10(RMS/reference)

def rms(x):
    # Formula: RMS = sqrt( mean(x[n]^2) for all n )
    return np.sqrt(np.mean(x**2))

print("RMS tone_low             :", rms(tone_low))
print("RMS tone_high            :", rms(tone_high))
print(f"  → RMS of tone_high and tone_low remain ~ same since their amplitudes are same at {amplitude} irrespective of the number of oscillations.\n")
print("RMS Audio                :", rms(audio))
print("RMS Audio with Noise     :", rms(noisy_audio))
print("Peak Amplitude           :", np.max(np.abs(noisy_audio)))

print(f"  → Range typically 0–1 for normalized audio")

RMS tone_low             : 0.3535533905932738
RMS tone_high            : 0.3535533905932739
  → RMS of tone_high and tone_low remain ~ same since their amplitudes are same at 0.5 irrespective of the number of oscillations.

RMS Audio                : 0.49999999999999967
RMS Audio with Noise     : 0.5106067242375291
Peak Amplitude           : 1.3047473519675035
  → Range typically 0–1 for normalized audio


In [19]:
# ============================================================================
# SECTION 3: FREQUENCY DOMAIN (FFT Intuition)
# ============================================================================
# Why: Transform time-domain samples into frequency bins to see which
#      frequencies are present in our signal.
# What we'll see: A peak at 150 Hz (low tone) and 3000 Hz (high tone).
# Key insight: Hard-to-see components in time domain pop out clearly here!
# ============================================================================

# --- The FFT Transformation ---
# How: Real FFT converts our ~16000 time samples -> ~8000 frequency bins
# Math: FFT decomposes the signal into its frequency components
# Expected: Peaks where our pure tones (150 Hz, 3000 Hz) exist
# Calculates one FFT over single audio, Which frequencies exist anywhere in this whole recording?
# global frequency snapshot

fft_spectrum = np.fft.rfft(noisy_audio)
freqs = np.fft.rfftfreq(len(noisy_audio), 1 / sample_rate)
magnitude = np.abs(fft_spectrum) #Amplitude for each frequency component

# Convert to dB scale (decibels)
# Why: dB scale compresses the dynamic range and matches human hearing perception
# Formula: dB = 20 * log10(magnitude) + small offset (1e-12) to avoid log(0)
magnitude_db = 20 * np.log10(magnitude + 1e-12)

# --- Interactive Visualization ---
# Create an interactive plot to explore the frequency spectrum
fig = go.Figure()
fig.add_trace(go.Scatter(x=freqs, y=magnitude_db, mode='lines', name='Magnitude (dB)'))

# Configure layout for clarity
fig.update_layout(
    title='Frequency Domain: FFT Magnitude (dB)',
    xaxis_title='Frequency (Hz)',
    yaxis_title='Magnitude (dB)',
    xaxis=dict(range=[0, 4000]),  # Zoom into the relevant frequency range
    width=1000,
    height=400,
)

# --- Annotate Expected Tones ---
# Visual markers to highlight where we expect peaks
# Red line: Low tone at 150 Hz (AC hum, background noise)
# Green line: High tone at 3000 Hz (bird chirp, foreground sound)
fig.add_vline(x=150, line=dict(color='red', dash='dash'), opacity=0.3, annotation_text='150 Hz', annotation_position='top right')
fig.add_vline(x=3000, line=dict(color='green', dash='dash'),  opacity=0.3, annotation_text='3000 Hz', annotation_position='top right')


fig.show()

In [20]:
# ============================================================================
# SECTION 4: SPECTROGRAM (Time-Frequency Analysis)
# ============================================================================
# Why: Combine time and frequency information. FFT shows \"what\" frequencies
#      are present, but loses \"when\" they occur. Spectrogram fixes this!
# What we'll see: A 2D heatmap where time flows left→right, frequency bottom→top,
#                 and color intensity shows energy (magnitude in dB).
# Key insight: Constant tones (150 Hz, 3000 Hz) appear as horizontal lines.
#              The spectrogram reveals how the signal evolves over time.
# ============================================================================

# --- The Short-Time Fourier Transform (STFT) ---
# How: Divide the signal into overlapping windows, compute FFT for each window
# Math: For each time window, compute FFT to get frequency content at that moment
# Parameters:
#   - nperseg=1024: window size (larger = better frequency resolution, worse time)
#   - noverlap=614: 60% overlap between windows (smooth transitions)
#   - scaling='spectrum': returns power spectrum (magnitude squared)

from scipy import signal
f_spec, t_spec, Sxx = signal.spectrogram(noisy_audio, fs=sample_rate, nperseg=1024, noverlap=614, scaling='spectrum')

# Convert power spectrum to dB scale (same as before)
# Why: dB compression makes weak and strong signals equally visible
# Formula: dB = 10 * log10(power) + small offset (1e-12) to avoid log(0)
Sxx_db = 10 * np.log10(Sxx + 1e-12)
# Sxx_db is POWER, not amplitude
# power ∝ amplitude2

# --- Interactive Heatmap Visualization ---
# Create an interactive 2D heatmap to explore time-frequency content
import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(
    x=t_spec,          # Time axis (horizontal)
    y=f_spec,          # Frequency axis (vertical)
    z=Sxx_db,          # Magnitude in dB (color intensity)
    colorscale='Viridis',  # Color gradient (dark=low energy, bright=high energy)
    colorbar=dict(title='dB')
))

# Configure layout for clarity
fig.update_layout(
    title='Spectrogram (dB): Time-Frequency Heatmap',
    xaxis_title='Time (s)',
    yaxis_title='Frequency (Hz)',
    width=1000,
    height=500,
)

# Limit y-axis to 0-4000 Hz for clarity (focus on relevant frequency range)
fig.update_yaxes(range=[0,4000])

fig.show()

In [21]:
# ============================================================================
# SECTION 5: MEL-SPECTROGRAM (Perceptually-Motivated Time-Frequency)
# ============================================================================
# Why: Human hearing doesn't perceive frequency on a linear scale.
#      We hear differences better in lower frequencies (e.g., 100 Hz vs 200 Hz)
#      than in higher frequencies (e.g., 5000 Hz vs 5100 Hz).
#      Mel-scale warps frequency to match human perception!
# What we'll see: Same time-frequency heatmap as spectrogram, but frequency
#                 axis is compressed at low frequencies, stretched at high.
# Key insight: Mel-spectrograms are the standard input for audio ML models.
# ============================================================================

import librosa

# --- Compute Mel-Spectrogram ---
# Parameters:
#   - n_fft=2048: FFT window size (frequency resolution)
#   - hop_length=512: number of samples between frames (time resolution)
#   - n_mels=128: number of mel-frequency bands (perceptual frequency buckets)
#   - fmin/fmax: frequency range (20 Hz to 8000 Hz covers most speech)

hop_length = 614
# Compute mel-spectrogram (returns power directly in mel-scale)
S_mel = librosa.feature.melspectrogram(
    y=noisy_audio,
    sr=sample_rate,
    n_fft=1024,
    hop_length=hop_length,
    n_mels=128,
    fmin=20,
    fmax=8000,
)

# Convert power to dB scale (same formula as before)
# Why: Compresses dynamic range so weak and strong signals are both visible
S_mel_db = librosa.power_to_db(S_mel, ref=np.max)

# Create time axis (for x-axis labels)
# Each frame corresponds to hop_length samples
t_mel = librosa.frames_to_time(
    np.arange(S_mel_db.shape[1]), sr=sample_rate, hop_length=hop_length
)

# --- Interactive Mel-Spectrogram Heatmap ---
fig = go.Figure(data=go.Heatmap(
    x=t_mel,                                    # Time axis (seconds)
    y=np.arange(S_mel_db.shape[0]),             # Mel-frequency bin index
    z=S_mel_db,                                 # Magnitude in dB
    colorscale='Viridis',                       # Color gradient (dark=low, bright=high)
    colorbar=dict(title='dB')
))

fig.update_layout(
    title='Mel-Spectrogram (dB): Perceptually-Motivated Time-Frequency',
    xaxis_title='Time (s)',
    yaxis_title='Mel-Frequency Bin',
    width=1000,
    height=500,
)

fig.show()

print(f"Mel-Spectrogram shape: {S_mel_db.shape}")
print(f"  → {S_mel_db.shape[1]} time frames × {S_mel_db.shape[0]} mel-frequency bins")
print(f"  → Frequency range: 20 Hz → 8000 Hz (human speech & audio)")
print(f"  → {S_mel_db.shape[0]} mel bands (perceptual buckets) compress low frequencies, stretch high")

Mel-Spectrogram shape: (128, 79)
  → 79 time frames × 128 mel-frequency bins
  → Frequency range: 20 Hz → 8000 Hz (human speech & audio)
  → 128 mel bands (perceptual buckets) compress low frequencies, stretch high


In [26]:
# ============================================================================
# SECTION 6: MFCCs (Mel-Frequency Cepstral Coefficients)
# ============================================================================
# Why: MFCCs are compact, perceptually-motivated features widely used in
#      speech and audio ML. They summarize spectral shape on a mel-scale and
#      capture timbral characteristics useful for classification and recognition.
# What: Compute a small set of MFCC coefficients per frame and visualize them
#       as an interactive heatmap over time using Plotly.
# Key params:
#   - n_mfcc: number of cepstral coefficients to keep (commonly 13 for speech)
#   - hop_length: frame hop in samples (controls time resolution)
#   - n_fft / n_mels: used internally by MFCC computation via the mel-spectrogram
#Additional Context:
#   - MFCC is calculated by taking the DCT (Discrete Cosine Transform) of the
#     log-mel-spectrogram. This decorrelates the mel bands and compacts
#     information into a few coefficients.
#   - After DCT (Discrete Cosine Transform), covers F1,F2,F3
#       - MFCC 0 to 2   : Overall energy and coarse slope
#       - MFCC 3 to 12  : Formant structure and vowel identity
#       - MFCC above 13 : Fine spectral detail like pitch harmonics, noise texture, mic artifacts
# ============================================================================

# Compute MFCCs (shape: n_mfcc × n_frames)
# We reuse `hop_length` defined earlier to keep time alignment consistent.
n_mfcc = 13
mfccs = librosa.feature.mfcc(y=noisy_audio, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)

# Convert frame indices to time (seconds) for the x-axis
t_mfcc = librosa.frames_to_time(np.arange(mfccs.shape[1]), sr=sample_rate, hop_length=hop_length)

# --- Interactive Plotly Heatmap for MFCCs ---
# x: time (s), y: MFCC coefficient index (1..n_mfcc), z: coefficient value
fig = go.Figure(data=go.Heatmap(
    x=t_mfcc,
    y=np.arange(1, mfccs.shape[0] + 1),
    z=mfccs,
    colorscale='Viridis',
    colorbar=dict(title='MFCC')
))

fig.update_layout(
    title=f'MFCCs over Time (n_mfcc={n_mfcc}, hop_length={hop_length})',
    xaxis_title='Time (s)',
    yaxis_title='MFCC Coefficient Index',
    width=1000,
    height=400,
)

fig.show()

# Sanity output
print(f"MFCCs shape: {mfccs.shape}  → {mfccs.shape[0]} coeffs × {mfccs.shape[1]} frames")
print(f"Time range: {t_mfcc[0]:.3f}s → {t_mfcc[-1]:.3f}s, hop_length={hop_length} samples")

MFCCs shape: (13, 79)  → 13 coeffs × 79 frames
Time range: 0.000s → 2.993s, hop_length=614 samples


In [40]:
# ============================================================================
# COMPREHENSIVE AUDIO ANALYSIS FUNCTION (WITH LINKED SUBPLOTS)
# ============================================================================
# Purpose: Single function to compute and visualize all audio features
#          (time domain, frequency domain, mel-spectrogram, MFCC) as subplots.
# Input: audio waveform + sample rate
# Output: Interactive Plotly figure with 2x2 grid of subplots
# Feature: Time-based subplots are linked—zooming/panning one updates others!
# To be used in as external function
# ============================================================================

from plotly.subplots import make_subplots

def plot_audio_analysis(audio, sr, hop_length=614, n_mfcc=13, n_fft=1024):
    """
    Create a comprehensive 2x2 subplot visualization of audio analysis with linked axes.
    
    Parameters:
    -----------
    audio : numpy array
        Audio waveform (time-domain samples)
    sr : int
        Sample rate in Hz
    hop_length : int
        Number of samples between frames (controls time resolution)
    n_mfcc : int
        Number of MFCC coefficients to compute
    n_fft : int
        FFT size for mel-spectrogram
    nperseg : int
        Window length for STFT
    noverlap : int
        Window overlap for STFT
    
    Returns:
    --------
    fig : plotly.graph_objects.Figure
        Interactive subplot figure with 4 visualizations:
        - Top-left: Time-domain waveform
        - Top-right: FFT magnitude spectrum (dB)
        - Bottom-left: Mel-spectrogram (dB)
        - Bottom-right: MFCC heatmap
        
    Interaction:
    -----------
        - Time-based subplots (waveform, mel-spec, MFCC) share the same x-axis.
        - Zooming/panning in any time subplot updates the others automatically!
        - FFT subplot has its own frequency x-axis (independent).
    """

    # Create subplots: 2 rows, 2 cols
    # Key: specs with "secondary_x=False" and shared axes for linking
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Time Domain: Waveform',
            'Frequency Domain: FFT (dB)',
            'Mel-Spectrogram (dB)',
            'MFCCs'
        ),
        specs=[
            [{"secondary_y": False}, {"secondary_y": False}],
            [{"secondary_y": False}, {"secondary_y": False}]
        ],
        vertical_spacing=0.15,
        horizontal_spacing=0.12
    )
    
    # ---- Subplot 1 (top-left): Time Domain ----
    t = np.arange(len(audio)) / sr
    fig.add_trace(
        go.Scatter(x=t, y=audio, name='Waveform', 
                   mode='lines', line=dict(color='blue', width=0.5)),
        row=1, col=1
    )
    fig.update_xaxes(title_text='Time (s)', row=1, col=1)
    fig.update_yaxes(title_text='Amplitude', row=1, col=1)
    
    # ---- Subplot 2 (top-right): FFT Spectrum ----
    # Note: FFT has frequency x-axis (independent, not linked to time)
    fft_spectrum = np.fft.rfft(audio)
    freqs = np.fft.rfftfreq(len(audio), 1 / sr)
    magnitude_db = 20 * np.log10(np.abs(fft_spectrum) + 1e-12)
    
    fig.add_trace(
        go.Scatter(x=freqs, y=magnitude_db, name='FFT Magnitude',
                   mode='lines', line=dict(color='green', width=1)),
        row=1, col=2
    )
    fig.update_xaxes(title_text='Frequency (Hz)', range=[0, 8000], row=1, col=2)
    fig.update_yaxes(title_text='Magnitude (dB)', row=1, col=2)
    
    # ---- Subplot 3 (bottom-left): Mel-Spectrogram ----
    S_mel = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length,
        n_mels=128, fmin=20, fmax=8000
    )
    S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
    t_mel = librosa.frames_to_time(np.arange(S_mel_db.shape[1]), sr=sr, hop_length=hop_length)
    
    fig.add_trace(
        go.Heatmap(
            x=t_mel, y=np.arange(S_mel_db.shape[0]), z=S_mel_db,
            colorscale='Viridis', name='Mel-Spec',
            colorbar=dict(title='dB', x=0.46, len=0.4, y=0.25)
        ),
        row=2, col=1
    )
    fig.update_xaxes(title_text='Time (s)', row=2, col=1)
    fig.update_yaxes(title_text='Mel Bin', row=2, col=1)
    
    # ---- Subplot 4 (bottom-right): MFCC ----
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    t_mfcc = librosa.frames_to_time(np.arange(mfccs.shape[1]), sr=sr, hop_length=hop_length)
    
    fig.add_trace(
        go.Heatmap(
            x=t_mfcc, y=np.arange(1, mfccs.shape[0] + 1), z=mfccs,
            colorscale='Viridis', name='MFCC',
            colorbar=dict(title='MFCC', x=1.02, len=0.4, y=0.25)
        ),
        row=2, col=2
    )
    fig.update_xaxes(title_text='Time (s)', row=2, col=2)
    fig.update_yaxes(title_text='MFCC Index', row=2, col=2)
    
    # ---- LINK TIME AXES (Synchronized Zooming) ----
    # Make the x-axes of waveform (1,1), mel-spec (2,1), and MFCC (2,2) share the same range
    # When you zoom in one, the others automatically update!
    fig.update_xaxes(matches="x", row=2, col=1)  # Mel-spec x-axis matches waveform (1,1)
    fig.update_xaxes(matches="x", row=2, col=2)  # MFCC x-axis matches waveform (1,1)
    
    # Global layout config
    fig.update_layout(
        title_text='Comprehensive Audio Analysis: Time, Frequency, Mel-Spec, & MFCC (Linked Time Axes)',
        height=900,
        width=1400,
        showlegend=True,
        hovermode='closest'
    )
    
    return fig


# ============================================================================
# Call the function on noisy_audio
# ============================================================================
print("Generating comprehensive audio analysis plot with linked subplots...")
print("  → Try zooming or panning in any time-based subplot! All time subplots will update together automatically.\n")
fig_analysis = plot_audio_analysis(
    noisy_audio,
    sample_rate,
    hop_length=hop_length,  # 60% as overlap
    n_mfcc=13
)
fig_analysis.show()

Generating comprehensive audio analysis plot with linked subplots...
  → Try zooming or panning in any time-based subplot! All time subplots will update together automatically.

