# Chord Detection via FFT
Creates a 9:16 vertical video that shows real-time chord detection from an audio file, using a short-time Fourier transform and chromagram analysis.

In [1]:
import subprocess
from pathlib import Path

import numpy as np
import matplotlib
matplotlib.use("Agg")  # select the off-screen backend before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle
import imageio
import imageio_ffmpeg

In [None]:
# Configuration
# Input audio and final (video + audio) output; both are relative to the
# notebook's working directory.
AUDIO_PATH = Path("audio/input.mp3")
OUTPUT_PATH = Path("out/chord_detection.mp4")

FPS = 30  # video frames per second; also fixes how many analysis frames are computed
WINDOW_SIZE = 8192  # FFT window length in samples (one window per video frame)
HOP_SIZE = 512  # NOTE(review): not referenced by any visible cell; frames are spaced by FPS instead

# Frequency range for analysis (guitar fundamentals)
# Only FFT bins inside [FREQ_MIN, FREQ_MAX] contribute to the chroma vector
# and to the spectrum plot.
FREQ_MIN = 60   # Hz (below low E ~82Hz)
FREQ_MAX = 1000 # Hz (covers harmonics)

# Detection threshold (0-1, normalized chroma)
# NOTE(review): not referenced by any visible cell - confirm whether it is still needed.
DETECTION_THRESHOLD = 0.3

# Volume threshold - don't detect chords below this RMS level
# (RMS is measured on the peak-normalized signal, so this is relative to full scale)
VOLUME_THRESHOLD = 0.02

# Waveform display window (in seconds)
WAVEFORM_WINDOW = 0.1  # 100ms of audio displayed at a time

In [3]:
# Note names and chord templates
# Chromatic scale starting at C; a name's list index is its pitch class (0 = C ... 11 = B).
NOTE_NAMES = "C C# D D# E F F# G G# A A# B".split()

def make_chord_template(root: int, intervals: list) -> np.ndarray:
    """Build a 12-bin binary chroma mask for one chord.

    Args:
        root: Pitch class of the chord root (0 = C ... 11 = B).
        intervals: Semitone offsets from the root, one per chord tone.

    Returns:
        Array of shape (12,) with 1.0 at every chord tone and 0.0 elsewhere.
    """
    # Wrap around the octave so e.g. root 11 + interval 4 lands on pitch class 3
    chord_tones = [(root + step) % 12 for step in intervals]
    mask = np.zeros(12)
    mask[chord_tones] = 1.0
    return mask

# Chord qualities as semitone offsets from the root: major = [0, 4, 7], minor = [0, 3, 7]
MAJOR_INTERVALS = [0, 4, 7]
MINOR_INTERVALS = [0, 3, 7]

# One major and one minor template per root, inserted in the order
# "C", "Cm", "C#", "C#m", ... (minor chords carry an "m" suffix).
CHORD_TEMPLATES = {
    f"{note}{suffix}": make_chord_template(root, intervals)
    for root, note in enumerate(NOTE_NAMES)
    for suffix, intervals in (("", MAJOR_INTERVALS), ("m", MINOR_INTERVALS))
}

print(f"Loaded {len(CHORD_TEMPLATES)} chord templates")
print("Chords:", list(CHORD_TEMPLATES.keys()))

Loaded 24 chord templates
Chords: ['C', 'Cm', 'C#', 'C#m', 'D', 'Dm', 'D#', 'D#m', 'E', 'Em', 'F', 'Fm', 'F#', 'F#m', 'G', 'Gm', 'G#', 'G#m', 'A', 'Am', 'A#', 'A#m', 'B', 'Bm']


In [4]:
def extract_audio(audio_path: Path) -> tuple[np.ndarray, int]:
    """Decode an audio file with librosa and peak-normalize it.

    Args:
        audio_path: Location of the audio file to read.

    Returns:
        ``(samples, sr)``: mono float32 samples scaled so the largest absolute
        value is 1.0 (left unscaled when the signal is all zeros), and the
        sample rate returned by ``librosa.load``.
    """
    import librosa

    # sr=None keeps the file's native sample rate; mono=True downmixes to one channel
    samples, native_sr = librosa.load(str(audio_path), sr=None, mono=True)
    samples = samples.astype(np.float32)

    # Scale the peak to +/-1; skip pure silence so we never divide by zero
    peak = np.max(np.abs(samples))
    return (samples / peak if peak > 0 else samples), native_sr

# Decode the input once; `audio`, `sample_rate` and `duration` are module-level
# values that the analysis and rendering cells below read.
audio, sample_rate = extract_audio(AUDIO_PATH)
duration = len(audio) / sample_rate  # clip length in seconds
print(f"Audio: {duration:.2f}s, {sample_rate}Hz, {len(audio)} samples")

  from .autonotebook import tqdm as notebook_tqdm


Audio: 23.68s, 48000Hz, 1136640 samples


In [5]:
def freq_to_chroma(freq: float) -> int:
    """Map a frequency in Hz to the pitch class of its nearest equal-tempered note.

    Pitch classes run 0=C, 1=C#, ..., 9=A, 10=A#, 11=B. A non-positive
    frequency has no pitch and yields -1.
    """
    if freq <= 0:
        return -1
    # Semitone distance from A4 (440 Hz), shifted so that A4 is MIDI note 69
    midi_position = 12 * np.log2(freq / 440.0) + 69
    nearest_note = int(round(midi_position))
    return nearest_note % 12

def compute_chroma(spectrum: np.ndarray, freqs: np.ndarray, freq_min: float, freq_max: float) -> np.ndarray:
    """Fold an FFT magnitude spectrum into a 12-bin chroma (pitch-class) vector.

    Every bin whose frequency lies in ``[freq_min, freq_max]`` contributes its
    energy (magnitude squared) to the pitch class of the nearest equal-tempered
    note, using the same A4 = 440 Hz = MIDI 69 mapping as ``freq_to_chroma``.
    The work is done with array operations instead of a per-bin Python loop.

    Args:
        spectrum: FFT magnitudes, one per bin.
        freqs: Frequency in Hz of each bin, paired with ``spectrum`` by position.
        freq_min: Lowest frequency (inclusive) that may contribute.
        freq_max: Highest frequency (inclusive) that may contribute.

    Returns:
        Array of shape (12,), index 0 = C ... 11 = B, scaled so the strongest
        pitch class is 1.0. All zeros when no in-band bin carries energy.
    """
    spectrum = np.asarray(spectrum, dtype=float)
    freqs = np.asarray(freqs, dtype=float)

    # Pair bins by position and ignore any unmatched tail (what zip() would do)
    n_bins = min(spectrum.shape[0], freqs.shape[0])
    spectrum, freqs = spectrum[:n_bins], freqs[:n_bins]

    # Bins inside the analysis band; non-positive frequencies have no pitch
    in_band = (freqs >= freq_min) & (freqs <= freq_max) & (freqs > 0)

    # Nearest MIDI note number, then wrap onto the 12 pitch classes
    midi = 12 * np.log2(freqs[in_band] / 440.0) + 69
    pitch_classes = np.rint(midi).astype(np.intp) % 12

    # Accumulate energy per pitch class; minlength keeps all 12 slots present
    chroma = np.bincount(pitch_classes, weights=spectrum[in_band] ** 2, minlength=12)

    # Normalize so the strongest pitch class is 1.0 (skip if everything is zero)
    max_val = np.max(chroma)
    if max_val > 0:
        chroma = chroma / max_val

    return chroma

def match_chord(chroma: np.ndarray, templates=None) -> tuple[str, float]:
    """Pick the chord template most similar to a chroma vector.

    Similarity is the cosine between ``chroma`` and each template (with a
    small epsilon in the denominator). Ties go to the template that comes
    first in iteration order.

    Args:
        chroma: 12-element pitch-class energy vector.
        templates: Optional mapping of chord name -> 12-element template.
            Defaults to the notebook-level ``CHORD_TEMPLATES``.

    Returns:
        ``(chord_name, confidence)``. An all-zero ``chroma`` carries no pitch
        information and yields ``("?", 0.0)`` instead of being credited to
        whichever template happens to be first. With no templates at all the
        result is ``("?", -1.0)``.
    """
    if templates is None:
        templates = CHORD_TEMPLATES

    # The chroma norm is the same for every template, so compute it once
    chroma_norm = np.linalg.norm(chroma)
    if chroma_norm == 0:
        return "?", 0.0

    best_chord, best_score = "?", -1.0
    for name, template in templates.items():
        # Cosine similarity; the epsilon guards against an all-zero template
        score = np.dot(chroma, template) / (chroma_norm * np.linalg.norm(template) + 1e-8)
        if score > best_score:
            best_chord, best_score = name, score

    return best_chord, float(best_score)

In [6]:
def compute_fft_frame(audio: np.ndarray, center: int, window_size: int, sr=None) -> tuple[np.ndarray, np.ndarray]:
    """Compute the FFT magnitude spectrum of one frame centered at sample ``center``.

    The frame spans ``window_size // 2`` samples either side of ``center``.
    Any part that falls outside the audio is zero-filled, keeping the
    available samples at their correct offset inside the frame. A Hann window
    is applied before the real FFT.

    Args:
        audio: 1-D sample array.
        center: Sample index the frame is centered on.
        window_size: Frame length in samples.
        sr: Sample rate in Hz used to label the bins. Defaults to the
            notebook-level ``sample_rate`` so existing three-argument calls
            keep working.

    Returns:
        ``(spectrum, freqs)``: magnitudes and bin frequencies in Hz for the
        non-negative frequencies, each of length ``window_size // 2 + 1``.
    """
    if sr is None:
        sr = sample_rate  # rate of the audio loaded earlier in the notebook

    half = window_size // 2
    start = max(0, center - half)
    end = min(len(audio), center + half)

    # Zero-filled frame; copy in whatever part of the window overlaps the audio
    segment = np.zeros(window_size)
    if end > start:  # a window entirely outside the audio stays all zeros
        offset = half - (center - start)  # > 0 only when clipped at the start
        segment[offset:offset + (end - start)] = audio[start:end]

    # Taper the frame edges to reduce spectral leakage
    segment = segment * np.hanning(window_size)

    # FFT (positive frequencies only)
    spectrum = np.abs(np.fft.rfft(segment))
    freqs = np.fft.rfftfreq(window_size, 1.0 / sr)

    return spectrum, freqs

In [7]:
def get_note_freq_ranges() -> list[tuple[str, int, list[tuple[float, float, float]]]]:
    """List the frequency bands occupied by each pitch class inside the analysis range.

    For every pitch class, the notes of octaves 2 through 5 (C2 to B5) are
    considered; a note is kept when its center frequency lies within
    ``[FREQ_MIN, FREQ_MAX]``. Each kept note contributes one band reaching
    half a semitone below and above its center.

    Returns:
        One ``(note_name, pitch_class, bands)`` entry per pitch class that has
        at least one kept note, where ``bands`` is a list of
        ``(freq_low, freq_high, center_freq)`` tuples in Hz, ordered from the
        lowest octave up. Note that only the center is range-checked, so a
        band's ``freq_high`` can exceed ``FREQ_MAX``.
    """
    ranges = []

    for pitch_class in range(12):
        note_name = NOTE_NAMES[pitch_class]
        # Collect all frequencies for this pitch class in our range
        note_freqs = []

        for octave in range(2, 6):  # C2 to B5
            midi = pitch_class + (octave + 1) * 12  # C4 = midi 60
            freq = 440.0 * (2 ** ((midi - 69) / 12))  # A4 = midi 69 = 440 Hz
            if FREQ_MIN <= freq <= FREQ_MAX:
                # Frequency bin width: half semitone below to half semitone above
                freq_low = freq * (2 ** (-0.5/12))
                freq_high = freq * (2 ** (0.5/12))
                note_freqs.append((freq_low, freq_high, freq))

        # Pitch classes with no note in range are left out entirely
        if note_freqs:
            ranges.append((note_name, pitch_class, note_freqs))

    return ranges

# Computed once at cell execution; render_frame reads this table on every frame.
NOTE_FREQ_RANGES = get_note_freq_ranges()
print(f"Note ranges computed for {len(NOTE_FREQ_RANGES)} pitch classes")

Note ranges computed for 12 pitch classes


In [8]:
# Precompute all frames
# One analysis frame per video frame, spread evenly over the whole clip.
total_frames = int(duration * FPS)
samples_per_frame = len(audio) / total_frames

print(f"Total frames: {total_frames}")
print(f"Samples per frame: {samples_per_frame:.1f}")

# Run the FFT / chroma / chord analysis up front so rendering only has to draw.
half = WINDOW_SIZE // 2
frame_data = []
for i in range(total_frames):
    center_sample = int(i * samples_per_frame)
    spectrum, freqs = compute_fft_frame(audio, center_sample, WINDOW_SIZE)
    chroma = compute_chroma(spectrum, freqs, FREQ_MIN, FREQ_MAX)

    # RMS loudness of the raw samples under the analysis window (clipped to the clip bounds)
    window_audio = audio[max(0, center_sample - half):min(len(audio), center_sample + half)]
    volume = np.sqrt(np.mean(window_audio ** 2)) if len(window_audio) > 0 else 0

    # Quiet frames get a placeholder instead of a chord guess
    is_silent = volume < VOLUME_THRESHOLD
    chord, confidence = ('-', 0.0) if is_silent else match_chord(chroma)

    frame_data.append({
        'spectrum': spectrum,
        'freqs': freqs,
        'chroma': chroma,
        'chord': chord,
        'confidence': confidence,
        'center_sample': center_sample,
        'volume': volume,
        'is_silent': is_silent
    })

    if i % 30 == 0:
        print(f"Precomputed frame {i}/{total_frames}")

print("Precomputation done!")

Total frames: 710
Samples per frame: 1600.9
Precomputed frame 0/710
Precomputed frame 30/710
Precomputed frame 60/710
Precomputed frame 90/710
Precomputed frame 120/710
Precomputed frame 150/710
Precomputed frame 180/710
Precomputed frame 210/710
Precomputed frame 240/710
Precomputed frame 270/710
Precomputed frame 300/710
Precomputed frame 330/710
Precomputed frame 360/710
Precomputed frame 390/710
Precomputed frame 420/710
Precomputed frame 450/710
Precomputed frame 480/710
Precomputed frame 510/710
Precomputed frame 540/710
Precomputed frame 570/710
Precomputed frame 600/710
Precomputed frame 630/710
Precomputed frame 660/710
Precomputed frame 690/710
Precomputation done!


In [9]:
# Color scheme - LIGHT MODE
BG_COLOR = "#ffffff"
TEXT_COLOR = "#1a1a1a"
WAVEFORM_COLOR = "#2563eb"  # blue
FFT_COLOR = "#2563eb"  # blue
# The two endpoint colors are numpy arrays (RGB scaled to 0-1) because
# lerp_color does arithmetic on them directly.
DETECTED_COLOR = np.array([22, 163, 74]) / 255  # green for detected notes
UNDETECTED_COLOR = np.array([220, 38, 38]) / 255  # red for undetected
SILENT_COLOR = (0.6, 0.6, 0.6)  # gray for silent (plain tuple; used as-is, never interpolated)

# Waveform window in samples
# Depends on `sample_rate` from the audio-loading cell, so that cell must run first.
WAVEFORM_SAMPLES = int(WAVEFORM_WINDOW * sample_rate)

def lerp_color(val, color_low, color_high):
    """Linearly interpolate between two colors.

    Args:
        val: Blend position; clipped to [0, 1]. 0 gives ``color_low``,
            1 gives ``color_high``.
        color_low: Color at ``val == 0``. Any numeric sequence (tuple, list
            or numpy array) of channel values.
        color_high: Color at ``val == 1``, same length as ``color_low``.

    Returns:
        Tuple with one interpolated value per channel.
    """
    t = np.clip(val, 0, 1)
    # Coerce to arrays so plain tuples/lists work as well as numpy arrays
    low = np.asarray(color_low, dtype=float)
    high = np.asarray(color_high, dtype=float)
    return tuple(low + t * (high - low))

def get_band_energy(spectrum, freqs, freq_low, freq_high):
    """Sum the energy (magnitude squared) of the bins inside a frequency band.

    The band is inclusive at both ends. The value is the raw sum; no
    normalization is applied here. A band containing no bins yields 0.0.
    """
    in_band = spectrum[(freqs >= freq_low) & (freqs <= freq_high)]
    if in_band.size == 0:
        return 0.0
    return np.sum(in_band ** 2)

def render_frame(fig, axes, frame_idx: int, data: dict, audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """Draw one video frame onto the shared figure and return it as an RGB image.

    All five axes are cleared and redrawn from scratch on every call.

    Args:
        fig: Matplotlib figure that owns ``axes``; its canvas is drawn and read back.
        axes: Five axes, unpacked in order as (title, waveform, FFT, chord, blank).
        frame_idx: Index of the frame. Not used in the body.
        data: Per-frame record. Reads 'center_sample', 'spectrum', 'freqs',
            'chroma', 'chord', 'confidence' and, if present, 'is_silent'.
        audio: Sample array the waveform panel is sliced from.
        sample_rate: Not used in the body; the waveform width comes from the
            module-level WAVEFORM_SAMPLES.

    Returns:
        Contiguous uint8 array of shape (height, width, 3) with the RGB
        channels of the rendered canvas.
    """
    ax_title, ax_wave, ax_fft, ax_chord, ax_blank = axes

    # Clear all axes
    # (also strips ticks and spines; panels that need them re-enable them below)
    for ax in axes:
        ax.clear()
        ax.set_facecolor(BG_COLOR)
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)

    # 1. Title
    ax_title.text(0.5, 0.5, "Chord Detection via STFT\n(Short-Time Fourier Transform)", ha="center", va="center",
                  color=TEXT_COLOR, fontsize=36, fontweight="bold", transform=ax_title.transAxes)

    # 2. Dynamic waveform (current window only, like an oscilloscope)
    center_sample = data['center_sample']
    half_window = WAVEFORM_SAMPLES // 2
    wave_start = max(0, center_sample - half_window)
    wave_end = min(len(audio), center_sample + half_window)

    # Extract current waveform segment
    wave_segment = audio[wave_start:wave_end]

    # Pad if necessary (at start/end of audio)
    # Padding goes on the left when the window was clipped at the start of the
    # audio, otherwise on the right.
    if len(wave_segment) < WAVEFORM_SAMPLES:
        if wave_start == 0:
            wave_segment = np.pad(wave_segment, (WAVEFORM_SAMPLES - len(wave_segment), 0))
        else:
            wave_segment = np.pad(wave_segment, (0, WAVEFORM_SAMPLES - len(wave_segment)))

    # Downsample for display
    # (keeps every `step`-th sample so roughly `display_samples` points are drawn)
    display_samples = 500
    step = max(1, len(wave_segment) // display_samples)
    wave_display = wave_segment[::step]
    wave_x = np.linspace(0, 1, len(wave_display))

    # Shade between the signal and its mirror image, then trace the signal on top
    ax_wave.fill_between(wave_x, wave_display, -wave_display, alpha=0.4, color=WAVEFORM_COLOR)
    ax_wave.plot(wave_x, wave_display, color=WAVEFORM_COLOR, linewidth=1.5)
    ax_wave.set_xlim(0, 1)
    ax_wave.set_ylim(-1, 1)

    # Center line
    ax_wave.axhline(0, color=TEXT_COLOR, linewidth=0.5, alpha=0.3)
    ax_wave.set_title("Audio Waveform (Real-Time)", color=TEXT_COLOR, fontsize=20, pad=5)

    # 3. FFT with note overlays
    spectrum = data['spectrum']
    freqs = data['freqs']
    chroma = data['chroma']  # read but not used below; band colors use per-band energy
    is_silent = data.get('is_silent', False)

    # Limit to frequency range of interest
    freq_mask = (freqs >= FREQ_MIN) & (freqs <= FREQ_MAX)
    plot_freqs = freqs[freq_mask]
    plot_spectrum = spectrum[freq_mask]

    # Normalize spectrum for display
    # (epsilon avoids dividing by zero on an all-zero frame)
    plot_spectrum = plot_spectrum / (np.max(plot_spectrum) + 1e-8)

    # Compute energy for each individual frequency band (for per-octave coloring)
    # Bands whose upper edge exceeds FREQ_MAX are skipped; the drawing loop
    # below applies the same filter in the same order, so indices line up.
    band_energies = []
    for note_name, pitch_class, freq_ranges in NOTE_FREQ_RANGES:
        for freq_low, freq_high, center_freq in freq_ranges:
            if freq_high <= FREQ_MAX:
                energy = get_band_energy(spectrum, freqs, freq_low, freq_high)
                band_energies.append(energy)

    # Normalize band energies
    # (relative to the strongest band in this frame)
    max_band_energy = max(band_energies) if band_energies else 1.0
    if max_band_energy > 0:
        band_energies_norm = [e / max_band_energy for e in band_energies]
    else:
        band_energies_norm = [0.0] * len(band_energies)

    # Draw note frequency bands with per-octave coloring
    band_idx = 0  # position in band_energies_norm; advanced once per drawn band
    for note_name, pitch_class, freq_ranges in NOTE_FREQ_RANGES:
        is_sharp = '#' in note_name

        for freq_low, freq_high, center_freq in freq_ranges:
            if freq_high <= FREQ_MAX:
                # Use per-band energy instead of chroma
                # (silent frames force every band to the "undetected" color)
                band_val = band_energies_norm[band_idx] if not is_silent else 0
                band_color = lerp_color(band_val, UNDETECTED_COLOR, DETECTED_COLOR)
                alpha = 0.2 + 0.5 * band_val  # alpha ranges from 0.2 to 0.7

                ax_fft.axvspan(freq_low, freq_high, alpha=alpha, color=band_color)
                # Label the note at the center of the band
                # (sharps sit 0.06 higher so neighboring labels are staggered)
                ax_fft.text(center_freq, 1.02 + is_sharp*0.06, note_name, ha='center', va='bottom',
                           fontsize=14, color=band_color, fontweight='bold')
                band_idx += 1

    # Draw FFT magnitude
    ax_fft.fill_between(plot_freqs, plot_spectrum, alpha=0.5, color=FFT_COLOR)
    ax_fft.plot(plot_freqs, plot_spectrum, color=FFT_COLOR, linewidth=1.0)
    ax_fft.set_xlim(FREQ_MIN, FREQ_MAX)
    ax_fft.set_ylim(0, 1.15)  # Slightly more room for labels
    ax_fft.set_title("Frequency Spectrum (K-Space)", color=TEXT_COLOR, fontsize=20, pad=10)

    # Add frequency labels
    # The log scale and ticks are re-applied on every frame, after the axes
    # were cleared at the top of this function.
    ax_fft.set_xlabel("Frequency (Hz, Log Scale)", color=TEXT_COLOR, fontsize=16)
    ax_fft.tick_params(axis='x', colors=TEXT_COLOR, labelsize=20)
    ax_fft.set_xscale('log')
    ax_fft.set_xticks([100, 200, 400, 800])
    ax_fft.set_xticklabels([100, 200, 400, 800])
    for spine in ['bottom']:
        ax_fft.spines[spine].set_visible(True)
        ax_fft.spines[spine].set_color(TEXT_COLOR)

    # 4. Detected chord
    chord = data['chord']
    confidence = data['confidence']
    if is_silent:
        chord_color = SILENT_COLOR
        conf_text = "(silent)"
    else:
        # Chord name shades from red (low confidence) to green (high confidence)
        chord_color = lerp_color(confidence, UNDETECTED_COLOR, DETECTED_COLOR)
        conf_text = f"confidence: {confidence:.0%}"

    ax_chord.text(0.5, 0.6, chord, ha="center", va="center",
                  color=chord_color, fontsize=72, fontweight="bold", transform=ax_chord.transAxes)
    ax_chord.text(0.5, 0.15, conf_text, ha="center", va="center",
                  color=TEXT_COLOR, fontsize=24, alpha=0.7, transform=ax_chord.transAxes)

    # 5. Blank area for video (just label it)
    ax_blank.text(0.5, 0.5, "[Video Overlay Area]", ha="center", va="center",
                  color=TEXT_COLOR, fontsize=24, alpha=0.2, transform=ax_blank.transAxes)

    # Render to array
    # Draw the canvas, view its RGBA buffer as (h, w, 4) and drop the alpha channel
    fig.canvas.draw()
    w, h = fig.canvas.get_width_height()
    buf = fig.canvas.buffer_rgba()
    rgba = np.frombuffer(buf, dtype=np.uint8).reshape((h, w, 4))
    return np.ascontiguousarray(rgba[..., :3])

In [10]:
# Setup figure (9:16 portrait, 1080x1920)
# figsize is in inches, so pixels / dpi gives a 1080x1920 canvas at this dpi.
dpi = 100
fig = plt.figure(figsize=(1080/dpi, 1920/dpi), dpi=dpi, facecolor=BG_COLOR)

# Layout: Title (6%), Waveform (14%), FFT (30%), Chord (12%), Blank (38%)
# (shares follow from the height ratios below, which sum to 10; hspace gaps not counted)
gs = GridSpec(5, 1, figure=fig, height_ratios=[0.6, 1.4, 3.0, 1.2, 3.8], hspace=0.24,
              top=0.98, bottom=0.02, left=0.05, right=0.95)
ax_title = fig.add_subplot(gs[0])
ax_wave = fig.add_subplot(gs[1])
ax_fft = fig.add_subplot(gs[2])
ax_chord = fig.add_subplot(gs[3])
ax_blank = fig.add_subplot(gs[4])
# Order matters: render_frame unpacks the axes in exactly this order.
axes = (ax_title, ax_wave, ax_fft, ax_chord, ax_blank)

# Create output directory
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
print(f"Output will be saved to: {OUTPUT_PATH.resolve()}")

Output will be saved to: C:\Users\kaustav\OneDrive - Microsoft\Documents\Code\Fun\February2026\FourierChord\out\chord_detection.mp4


In [11]:
# Render video (without audio first)
# The silent intermediate is written next to the final output, so it always
# lands in a directory that exists even when OUTPUT_PATH is changed.
VIDEO_ONLY_PATH = OUTPUT_PATH.with_name(f"{OUTPUT_PATH.stem}_video_only{OUTPUT_PATH.suffix}")
VIDEO_ONLY_PATH.parent.mkdir(parents=True, exist_ok=True)

writer = imageio.get_writer(
    str(VIDEO_ONLY_PATH), fps=FPS, codec="libx264",
    macro_block_size=None,
    ffmpeg_params=["-pix_fmt", "yuv420p", "-crf", "18"]
)

try:
    for i, data in enumerate(frame_data):
        frame = render_frame(fig, axes, i, data, audio, sample_rate)
        writer.append_data(frame)

        if i % 30 == 0:
            print(f"Frame {i}/{total_frames} - Chord: {data['chord']} ({data['confidence']:.0%})")
finally:
    # Always finalize the file and release the figure, even if a frame fails.
    # Closing the figure means the figure-setup cell must be re-run before
    # this cell can be executed again.
    writer.close()
    plt.close(fig)

print(f"\nVideo (no audio) saved to: {VIDEO_ONLY_PATH.resolve()}")

# Combine video with audio using imageio-ffmpeg
# (imageio_ffmpeg and subprocess are imported in the notebook's import cell)
ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
print(f"\nUsing ffmpeg from: {ffmpeg_exe}")
print("Adding audio to video...")

# Copy the video stream untouched, encode the original audio as AAC, and stop
# at the shorter of the two streams. check=True raises if ffmpeg fails.
subprocess.run([
    ffmpeg_exe, "-y",
    "-i", str(VIDEO_ONLY_PATH),
    "-i", str(AUDIO_PATH),
    "-c:v", "copy",
    "-c:a", "aac",
    "-shortest",
    str(OUTPUT_PATH)
], check=True)

print(f"\nFinal video with audio saved to: {OUTPUT_PATH.resolve()}")

Frame 0/710 - Chord: - (0%)
Frame 30/710 - Chord: Dm (85%)
Frame 60/710 - Chord: Dm (85%)
Frame 90/710 - Chord: G (71%)
Frame 120/710 - Chord: G (65%)
Frame 150/710 - Chord: G (67%)
Frame 180/710 - Chord: Em (75%)
Frame 210/710 - Chord: C (79%)
Frame 240/710 - Chord: C (83%)
Frame 270/710 - Chord: C (79%)
Frame 300/710 - Chord: Am (85%)
Frame 330/710 - Chord: Am (93%)
Frame 360/710 - Chord: Am (88%)
Frame 390/710 - Chord: Dm (95%)
Frame 420/710 - Chord: Dm (93%)
Frame 450/710 - Chord: D (65%)
Frame 480/710 - Chord: G (93%)
Frame 510/710 - Chord: G (83%)
Frame 540/710 - Chord: Em (70%)
Frame 570/710 - Chord: Cm (64%)
Frame 600/710 - Chord: C (86%)
Frame 630/710 - Chord: F#m (64%)
Frame 660/710 - Chord: Am (76%)
Frame 690/710 - Chord: Am (69%)

Video (no audio) saved to: C:\Users\kaustav\OneDrive - Microsoft\Documents\Code\Fun\February2026\FourierChord\out\chord_detection_video_only.mp4

Using ffmpeg from: c:\Users\kaustav\anaconda3\envs\personalDS\lib\site-packages\imageio_ffmpeg\binari