In [None]:
!pip install pyworld

In [None]:
# ============================================================
# PYWORLD-BASED SPEECH ANALYSIS PIPELINE
# ============================================================
# This version replaces custom cepstrum + harmonic heuristics (Cepstrum ≠ pitch tracker)
# with a robust, production-grade approach using pyworld.
# | System              | What “pitch” means                            |
# | ------------------- | --------------------------------------------- |
# | **Cepstrum-based**  | Dominant periodic spacing in the spectrum     |
# | **WORLD (pyworld)** | Estimated vocal fold vibration (F₀) over time |
# Cepstrum-based estimates often lock onto the 2nd harmonic (2×F₀) or even formant-spacing artifacts.
# WORLD estimates glottal excitation, not spectral repetition: it models vocal fold vibration and suppresses harmonics.
# So WORLD’s pitch is the physical F₀.
# ------------------------------------------------------------

import numpy as np
import pyworld as pw
from scipy.signal import welch

# ============================================================
# Core Feature Extraction
# ============================================================

def extract_f0(audio, sr):
    """
    Extract fundamental frequency (F0) using WORLD.

    A coarse F0 contour is estimated with DIO and then refined with
    StoneMask.

    Parameters
    ----------
    audio : array-like
        Mono waveform samples. Any numeric dtype is accepted; the data is
        converted to float64 before being handed to WORLD.
    sr : int
        Sampling rate of ``audio`` in Hz.

    Returns
    -------
    f0 : np.ndarray
        Refined F0 value per analysis frame (Hz).
    t : np.ndarray
        Time axis of the analysis frames, as returned by ``pw.dio``.
    """
    # pyworld's bindings only accept C-contiguous float64 buffers; float32 or
    # integer audio (the usual output of audio loaders) would otherwise fail
    # with a buffer dtype mismatch. This is a no-op copy-wise when the input
    # already satisfies the requirement.
    samples = np.ascontiguousarray(audio, dtype=np.float64)

    f0, t = pw.dio(samples, sr)
    f0 = pw.stonemask(samples, f0, t, sr)
    return f0, t


def spectral_entropy(signal, sr, n_fft=2048):
    """
    Shannon entropy (in bits) of the normalised magnitude spectrum.

    The magnitude spectrum of a single ``n_fft``-point real FFT is scaled to
    sum to one and treated as a probability distribution. ``np.fft.rfft``
    zero-pads input shorter than ``n_fft`` and crops longer input, so only
    the first ``n_fft`` samples contribute. ``sr`` is accepted but not used.
    """
    magnitudes = np.abs(np.fft.rfft(signal, n=n_fft))
    # The small constants guard against division by zero and log2(0).
    total = np.sum(magnitudes) + 1e-12
    probabilities = magnitudes / total
    weighted_logs = probabilities * np.log2(probabilities + 1e-12)
    return -np.sum(weighted_logs)


def spectral_flatness(signal, n_fft=2048):
    """
    Spectral flatness: geometric mean over arithmetic mean of the magnitudes.

    Computed from a single ``n_fft``-point real FFT of ``signal``. A small
    offset is added to every bin so the logarithm is always defined.
    """
    magnitudes = np.abs(np.fft.rfft(signal, n=n_fft)) + 1e-12
    # Geometric mean evaluated in the log domain.
    geometric_mean = np.exp(np.mean(np.log(magnitudes)))
    arithmetic_mean = np.mean(magnitudes)
    return geometric_mean / arithmetic_mean

# ============================================================
# Pitch statistics and voicing
# ============================================================

def analyze_pitch(audio, sr):
    """
    Summarise the F0 contour of ``audio``.

    Frames whose F0 is greater than zero are counted as voiced. Returns a
    dict with the raw contour (``"f0"``), the fraction of voiced frames
    (``"voiced_ratio"``), and the standard deviation and median of F0 over
    the voiced frames (``"pitch_std"``, ``"pitch_median"``). When no frame
    is voiced, ``pitch_std`` is 0.0 and ``pitch_median`` is NaN.
    """
    # The time axis returned alongside F0 is not needed for these statistics.
    f0, _ = extract_f0(audio, sr)

    is_voiced = f0 > 0
    voiced_f0 = f0[is_voiced]

    if voiced_f0.size:
        spread, centre = np.std(voiced_f0), np.median(voiced_f0)
    else:
        spread, centre = 0.0, np.nan

    return dict(
        f0=f0,
        voiced_ratio=np.mean(is_voiced),
        pitch_std=spread,
        pitch_median=centre,
    )


# ============================================================
# High-level classification logic
# ============================================================

def _label_from_features(voiced_ratio, pitch_std, flatness, entropy):
    """Map scalar features to a decision label; the first matching rule wins."""
    if voiced_ratio < 0.2:
        return "unvoiced_or_noise"
    if pitch_std < 5:
        return "periodic_non_speech"
    if 0.25 < flatness < 0.8 and 4 < entropy < 9:
        return "voiced_speech"
    return "ambiguous"


def classify_signal(audio, sr):
    """
    Classify ``audio`` from its pitch statistics and spectral shape.

    Returns a dict holding the ``"decision"`` label (one of
    ``"unvoiced_or_noise"``, ``"periodic_non_speech"``, ``"voiced_speech"``
    or ``"ambiguous"``) together with the features it was derived from.
    """
    # Pitch-derived features first, then the two spectral descriptors.
    pitch = analyze_pitch(audio, sr)
    entropy = spectral_entropy(audio, sr)
    flatness = spectral_flatness(audio)

    ratio = pitch["voiced_ratio"]
    spread = pitch["pitch_std"]

    return dict(
        decision=_label_from_features(ratio, spread, flatness, entropy),
        pitch_median=pitch["pitch_median"],
        pitch_std=spread,
        voiced_ratio=ratio,
        entropy=entropy,
        flatness=flatness,
    )

# ============================================================
# Example usage
# ============================================================
# Classify one recording and print the resulting decision/feature dictionary.
# NOTE(review): `noisy_audio` and `sample_rate` are not defined in this
# section — presumably they come from an earlier notebook cell; confirm.
result = classify_signal(noisy_audio, sample_rate)
print(result)
