# **Spoken Digit Classification Using MFCC and Mel-Spectrogram Features with a CNN-Based Audio Recognition Model.**
**Group Members:**

**- Fio Ulaa' Octriyanti (24031554030)**

**- Elvira Tiara Suci T (24031554213)**

## **INSTALLATION**

In [None]:
# Fetch the "free-spoken-digits" dataset through kagglehub; `path` is the value
# returned by the download call and is used by later cells as the dataset root.
import kagglehub
path = kagglehub.dataset_download("alanchn31/free-spoken-digits")

In [None]:
import shutil

# Mirror the downloaded dataset into a fixed working directory. The target is
# bound once and reused so the copy destination and the reported path cannot
# drift apart. dirs_exist_ok=True lets the cell be re-run without failing.
general_path = "/content/free_spoken_digits"
shutil.copytree(path, general_path, dirs_exist_ok=True)

print("Dataset berhasil didownload ke:", general_path)

In [None]:
# Notebook shell command: install the third-party packages imported by the cells below.
!pip install numpy matplotlib scikit-learn librosa

## **Load File and Import Library**

In [None]:
import os
import zipfile
import librosa
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt
from librosa.feature import spectral_centroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
general_path = "/content/free_spoken_digits/free-spoken-digit-dataset-master/recordings"

# Gather every .wav recording together with its digit label. The label is the
# integer in front of the first underscore of the filename (0-9).
audio_files = []
labels = []
for file in os.listdir(general_path):
    if not file.endswith(".wav"):
        continue
    digit_label = int(file.partition("_")[0])
    audio_files.append(os.path.join(general_path, file))
    labels.append(digit_label)

print("Jumlah file audio:", len(audio_files))


In [None]:
DATASET_PATH = "/content/free_spoken_digits/free-spoken-digit-dataset-master/recordings"


# Column-oriented metadata table: each key maps to a list with one entry per
# .wav file, appended in the same order, so index i describes the same file
# in every column.
data = {
    "filename": [],
    "digit": [],
    "speaker": [],
    "iteration": [],
    "duration": [],
    "is_loadable": []
}

for file in os.listdir(DATASET_PATH):
    if not file.endswith(".wav"):
        continue

    file_path = os.path.join(DATASET_PATH, file)

    # The name is parsed as "<digit>_<speaker>_<iteration>.wav". Only the
    # errors that a malformed name can produce are caught (non-numeric field
    # -> ValueError, missing field -> IndexError); anything else, including
    # KeyboardInterrupt, is allowed to propagate.
    try:
        fields = file.split("_")
        digit = int(fields[0])
        speaker = fields[1]
        iteration = int(fields[2].replace(".wav", ""))
    except (ValueError, IndexError):
        digit = speaker = iteration = None

    # sr=None is passed through to librosa.load so the duration is measured
    # on the file as stored. A file that cannot be decoded is still recorded
    # (is_loadable=False) but the reason is reported instead of being
    # silently discarded.
    try:
        y, sr = librosa.load(file_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        is_loadable = True
    except Exception as exc:
        print(f"Could not load {file}: {exc}")
        duration = None
        is_loadable = False

    data["filename"].append(file)
    data["digit"].append(digit)
    data["speaker"].append(speaker)
    data["iteration"].append(iteration)
    data["duration"].append(duration)
    data["is_loadable"].append(is_loadable)

In [None]:
# Number of files whose is_loadable flag is False in the metadata table.
unloadable_count = data["is_loadable"].count(False)
print("Missing values:")
print(unloadable_count)

# **MFCC (Mel-Frequency Cepstral Coefficients)**

### **PREPROCESSING AUDIO**


In [None]:
def normalize_audio(y):
    """Scale an audio signal so that its largest absolute sample equals 1.

    Args:
        y: Array-like of audio samples.

    Returns:
        A new array equal to ``y / max(|y|)``. An empty or all-zero signal
        is returned as an unscaled copy: it has no peak to normalise
        against, and dividing by a zero peak would fill the result with
        NaN values.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        return samples.copy()

    peak = np.max(np.abs(samples))
    if peak == 0:
        return samples.copy()

    return samples / peak

In [None]:
def reduce_noise_hpss(y):
    """Return only the harmonic component of ``y``.

    ``librosa.effects.hpss`` returns a (harmonic, percussive) pair; the
    percussive part is discarded.
    """
    return librosa.effects.hpss(y)[0]

In [None]:
def trim_silence(y):
    """Return ``y`` as trimmed by ``librosa.effects.trim`` with ``top_db=20``.

    The second value returned by ``trim`` is not needed and is dropped.
    """
    return librosa.effects.trim(y, top_db=20)[0]

In [None]:
def preprocess_audio(path, sr=16000):
    """Load an audio file and run it through the preprocessing chain.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        sr: Sampling rate passed to ``librosa.load``.

    Returns:
        ``(signal, sr)``: the processed samples and the sampling rate
        reported by ``librosa.load``.
    """
    signal, sr = librosa.load(path, sr=sr)

    # Stages run in this fixed order: trim -> harmonic extraction -> peak normalisation.
    for stage in (trim_silence, reduce_noise_hpss, normalize_audio):
        signal = stage(signal)

    return signal, sr

In [None]:
def extract_mfcc(path, sr=16000, n_mfcc=40, max_len=100):
    """Compute a fixed-width MFCC matrix from a preprocessed audio file.

    Args:
        path: Path of the audio file, forwarded to ``preprocess_audio``.
        sr: Sampling rate forwarded to ``preprocess_audio``.
        n_mfcc: Number of coefficients requested from ``librosa.feature.mfcc``.
        max_len: Required size of the second axis of the result.

    Returns:
        The MFCC array with exactly ``max_len`` columns: shorter results are
        zero-padded on the right, longer ones are truncated.
    """
    signal, sr = preprocess_audio(path, sr=sr)
    coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    n_frames = coeffs.shape[1]
    if n_frames >= max_len:
        return coeffs[:, :max_len]

    # np.pad in 'constant' mode fills the missing columns with zeros.
    return np.pad(coeffs, ((0, 0), (0, max_len - n_frames)), mode='constant')

### **Ekstraksi fitur (MFCC)**

In [None]:
def extract_mfcc(path, sr=16000, n_mfcc=40, max_len=100):
    """Compute a fixed-width MFCC matrix straight from the raw audio file.

    NOTE(review): this definition replaces the earlier ``extract_mfcc`` that
    went through ``preprocess_audio``; here the signal is loaded directly, so
    no trimming, harmonic extraction or normalisation is applied. Confirm
    that skipping the preprocessing is intended.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        sr: Sampling rate passed to ``librosa.load``.
        n_mfcc: Number of coefficients requested from ``librosa.feature.mfcc``.
        max_len: Required size of the second axis of the result.

    Returns:
        The MFCC array with exactly ``max_len`` columns (zero-padded on the
        right or truncated).
    """
    signal, sr = librosa.load(path, sr=sr)
    features = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    missing = max_len - features.shape[1]
    if missing > 0:
        # Too short: append `missing` zero-filled columns.
        features = np.pad(features, ((0, 0), (0, missing)), mode='constant')
    else:
        # Long enough: keep only the first max_len columns.
        features = features[:, :max_len]

    return features

In [None]:
# One MFCC matrix per recording, stacked into a single array; `y` holds the
# matching digit labels in the same order.
X_mfcc = np.array([extract_mfcc(wav_path) for wav_path in audio_files])
y = np.array(labels)

print("Shape MFCC:", X_mfcc.shape)

### **Buat Dataset MFCC**

### **Flatten MFCC**

### **Train-test split**

### **OPSIONAL TRAIN MODEL CNN**

### **Visualisasi MFCC**

# **Mel-Spectrogram**

### **STEP 1 — Load Audio + Fix Sampling Rate + Noise Removal + Quality Selection**

**Prepare Module**

In [None]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

**Set style untuk plotting**

In [None]:
# Select the 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl"
# colour palette for the figures drawn in the following cells.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
def explore_dataset(path):
    """Print an indented directory tree of ``path``.

    Each directory is printed with a trailing slash, indented two spaces per
    nesting level. Of the first five entries in a directory's file list only
    the ``.wav`` files are printed; when a directory holds more than five
    files, a line reports how many were left out.
    """
    preview_limit = 5

    print("=== EXPLORING DATASET STRUCTURE ===")
    for current_dir, _subdirs, filenames in os.walk(path):
        depth = current_dir.replace(path, '').count(os.sep)
        dir_pad = ' ' * (2 * depth)
        file_pad = ' ' * (2 * (depth + 1))

        print(f"{dir_pad}{os.path.basename(current_dir)}/")
        for name in filenames[:preview_limit]:
            if name.endswith('.wav'):
                print(f"{file_pad}{name}")

        hidden = len(filenames) - preview_limit
        if hidden > 0:
            print(f"{file_pad}... and {hidden} more files")
    print()

In [None]:
def load_and_resample_audio(file_path, target_sr=16000):
    """Load an audio file at the requested sampling rate.

    Args:
        file_path: Path of the audio file.
        target_sr: Sampling rate passed to ``librosa.load``.

    Returns:
        ``(audio, sr)`` as returned by ``librosa.load``, or ``(None, None)``
        when loading raises; the error is printed rather than propagated.
    """
    try:
        samples, rate = librosa.load(file_path, sr=target_sr)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None
    return samples, rate

**Explore dataset structure first**

In [None]:
# `path` is the value returned by the kagglehub download in the first cell;
# print it and show the directory tree beneath it.
print(f"Dataset path: {path}")
explore_dataset(path)

**Cari semua file .wav dalam dataset**

In [None]:
# Recursively collect the full path of every .wav file below `path`.
audio_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path)
    for name in names
    if name.endswith('.wav')
]

print(f"Found {len(audio_files)} audio files")

**Load semua file audio**

In [None]:
labels = []
audio_data = []
valid_files = []
# Sampling rate of the most recent file that loaded successfully. Kept
# separate from the per-file return value so a failed load (which returns
# None) cannot overwrite the rate that later cells read from `sr`.
sr = None

# Only the first 100 paths are loaded. NOTE(review): the order comes from
# os.walk, so this subset is not guaranteed to be balanced across digits.
for file_path in audio_files[:100]:
    audio, loaded_sr = load_and_resample_audio(file_path, target_sr=16000)
    if audio is None:
        continue
    sr = loaded_sr
    filename = os.path.basename(file_path)
    # The label is kept as the string in front of the first underscore.
    label = filename.split('_')[0]
    valid_files.append(filename)
    labels.append(label)
    audio_data.append(audio)

print(f"\n=== LOADING RESULTS ===")
print(f"Successfully loaded: {len(audio_data)} files")
if audio_data:
    lengths = [len(audio) for audio in audio_data]
    print(f"Sampling rate: {sr} Hz")
    print(f"Audio length range: {min(lengths)} - {max(lengths)} samples")
    print(f"Labels found: {sorted(set(labels))}")
    print(f"Labels distribution:")
    for digit in sorted(set(labels)):
        count = labels.count(digit)
        print(f"  Digit {digit}: {count} files")
else:
    # min()/max() over an empty list would raise; report the real problem instead.
    print("No audio files could be loaded; check the dataset path.")

**Tampilkan sample filenames untuk verifikasi**

In [None]:
# Show up to five (filename, label) pairs so the label parsing can be checked by eye.
print(f"\nSample filenames:")
for sample_name, sample_label in zip(valid_files[:5], labels[:5]):
    print(f"  {sample_name} -> label: {sample_label}")

**Tampilkan statistik audio pertama untuk verifikasi**

In [None]:
# Basic sanity figures for the first loaded clip: length, duration and amplitude range.
if audio_data:
    first_clip = audio_data[0]
    n_samples = len(first_clip)
    print(f"\nFirst audio stats:")
    print(f"  Length: {n_samples} samples")
    print(f"  Duration: {n_samples/sr:.2f} seconds")
    print(f"  Min/Max amplitude: {np.min(first_clip):.3f}/{np.max(first_clip):.3f}")

### **STEP 2 — Generate Mel-Spectrogram**

**Parameter mel-spectrogram**

In [None]:
# Mel-spectrogram parameters, passed to generate_mel_spectrogram below
n_fft = 2048       # forwarded as n_fft to librosa.feature.melspectrogram
hop_length = 512   # forwarded as hop_length; also used for the time axis of the plots
n_mels = 40        # forwarded as n_mels (number of mel bands)

def generate_mel_spectrogram(audio, sr=16000, n_fft=2048, hop_length=512, n_mels=40):
    """Return the mel-spectrogram of ``audio`` converted to decibels.

    Args:
        audio: Audio samples, passed as ``y`` to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``audio``.
        n_fft: FFT size forwarded to librosa.
        hop_length: Hop length forwarded to librosa.
        n_mels: Number of mel bands forwarded to librosa.

    Returns:
        The result of ``librosa.power_to_db`` on the mel-spectrogram, using
        ``ref=np.max``.
    """
    # The mel filter bank is restricted to 80-8000 Hz (the range chosen for speech).
    power_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=80,
        fmax=8000,
    )
    return librosa.power_to_db(power_spec, ref=np.max)

# Generate a mel-spectrogram for every loaded clip
mel_spectrograms = []
total_clips = len(audio_data)

for done, clip in enumerate(audio_data, start=1):
    mel_spectrograms.append(generate_mel_spectrogram(clip, sr, n_fft, hop_length, n_mels))

    # Progress message after every 20th clip
    if done % 20 == 0:
        print(f"Generated {done}/{total_clips} mel-spectrograms")

print(f"\n=== MEL-SPECTROGRAM GENERATION COMPLETE ===")
print(f"Total generated: {len(mel_spectrograms)} mel-spectrograms")
first_shape = mel_spectrograms[0].shape
print(f"Mel-spectrogram shape example: {first_shape}")
print(f"Time frames: {first_shape[1]}")
print(f"Mel bands: {first_shape[0]}")

# Show min/max/mean for up to the first three spectrograms
print(f"\nSample mel-spectrogram statistics:")
for sample_no, (spec, digit_label) in enumerate(zip(mel_spectrograms[:3], labels[:3]), start=1):
    print(f"Sample {sample_no} (Digit {digit_label}): min={spec.min():.2f}, max={spec.max():.2f}, mean={spec.mean():.2f}")

**Parameter mel-spectrogram**

In [None]:
# Values passed to generate_mel_spectrogram in the cells below.
n_fft = 2048       # forwarded as n_fft to librosa.feature.melspectrogram
hop_length = 512   # forwarded as hop_length; also used for the time axis of the plots
n_mels = 40        # forwarded as n_mels (number of mel bands)

**Generate mel-spectrogram**

In [None]:
def generate_mel_spectrogram(audio, sr=16000, n_fft=2048, hop_length=512, n_mels=40):
    """Compute a dB-scaled mel-spectrogram for one audio signal.

    Args:
        audio: Audio samples, passed as ``y`` to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``audio``.
        n_fft: FFT size forwarded to librosa.
        hop_length: Hop length forwarded to librosa.
        n_mels: Number of mel bands forwarded to librosa.

    Returns:
        ``librosa.power_to_db`` of the mel-spectrogram, with ``ref=np.max``.
    """
    # Fixed 80-8000 Hz band limits for the mel filter bank.
    mel_options = dict(
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=80,
        fmax=8000,
    )
    mel_power = librosa.feature.melspectrogram(y=audio, **mel_options)
    return librosa.power_to_db(mel_power, ref=np.max)

**Generate mel-spectrogram untuk semua audio**

In [None]:
# One dB-scaled mel-spectrogram per loaded clip, in the same order as audio_data.
mel_spectrograms = [
    generate_mel_spectrogram(clip, sr, n_fft, hop_length, n_mels)
    for clip in audio_data
]

**Progress indicator**

In [None]:
    if (i + 1) % 20 == 0:
        print(f"Generated {i + 1}/{len(audio_data)} mel-spectrograms")

print(f"\n=== MEL-SPECTROGRAM GENERATION COMPLETE ===")
print(f"Total generated: {len(mel_spectrograms)} mel-spectrograms")
print(f"Mel-spectrogram shape example: {mel_spectrograms[0].shape}")
print(f"Time frames: {mel_spectrograms[0].shape[1]}")
print(f"Mel bands: {mel_spectrograms[0].shape[0]}")


**Tampilkan statistik untuk beberapa contoh**

In [None]:
# Min/max/mean of up to the first three spectrograms, with their digit labels.
print(f"\nSample mel-spectrogram statistics:")
for sample_no, (spec, digit_label) in enumerate(zip(mel_spectrograms[:3], labels[:3]), start=1):
    print(f"Sample {sample_no} (Digit {digit_label}): min={spec.min():.2f}, max={spec.max():.2f}, mean={spec.mean():.2f}")

### **STEP 3 — Visualisasi Mel-Spectrogram**

In [None]:
# Distinct digit labels present in the loaded clips. `digits` was used by this
# and the following cells without ever being assigned, so it is defined here.
digits = sorted(set(labels))

if not digits:
    print("Tidak ada label digit yang tersedia untuk divisualisasikan")
else:
    # A 2x5 grid holds ten digits; fewer labels get a single row.
    # squeeze=False always yields a 2-D axes array, so flatten() works for any count.
    if len(digits) >= 10:
        fig, axes = plt.subplots(2, 5, figsize=(20, 8), squeeze=False)
    else:
        fig, axes = plt.subplots(1, len(digits), figsize=(4*len(digits), 4), squeeze=False)
    axes = axes.flatten()

    # Widest spectrogram; every other one is right-padded to this width so
    # they can be averaged element-wise.
    max_time_frames = max(mel_spec.shape[1] for mel_spec in mel_spectrograms)

    # zip stops at the shorter sequence, so digits beyond the available axes are skipped.
    for ax, digit in zip(axes, digits):
        digit_indices = [idx for idx, label in enumerate(labels) if label == digit]

        # Pad with -80.0 (used here as the dB floor). No truncation branch is
        # needed because no spectrogram can be wider than the maximum.
        padded_digit_specs = [
            np.pad(
                mel_spectrograms[idx],
                ((0, 0), (0, max_time_frames - mel_spectrograms[idx].shape[1])),
                mode='constant',
                constant_values=-80.0,
            )
            for idx in digit_indices
        ]
        avg_mel = np.mean(padded_digit_specs, axis=0)

        img = librosa.display.specshow(
            avg_mel,
            sr=sr,
            hop_length=hop_length,
            x_axis='time',
            y_axis='mel',
            ax=ax,
            cmap='viridis'
        )
        ax.set_title(f'Rata-rata Digit {digit}\n({len(digit_indices)} samples)')
        ax.set_xlabel('Waktu (s)')
        ax.set_ylabel('Frekuensi Mel')

    plt.tight_layout()
    plt.colorbar(img, ax=axes, format='%+2.0f dB')
    plt.suptitle('Rata-rata Mel-Spectrogram per Digit', y=1.02, fontsize=16, fontweight='bold')
    plt.show()

## **Step 4 — Analisis Pola**

**Analisis pola untuk digit spesifik - fokus pada 0, 3, 7**

In [None]:
analysis_digits = ['0', '3', '7']
# Labels are the filename prefixes kept as strings, so membership is tested
# against the labels themselves rather than a separately maintained list.
present_labels = set(labels)
available_digits = [d for d in analysis_digits if d in present_labels]

if available_digits:
    # Computed here so the cell does not rely on a value left behind by another cell.
    max_time_frames = max(mel_spec.shape[1] for mel_spec in mel_spectrograms)

    # squeeze=False always yields a 2-D axes array, so one or several digits
    # are handled the same way.
    fig, axes = plt.subplots(1, len(available_digits), figsize=(5*len(available_digits), 5), squeeze=False)
    axes = axes.flatten()

    for ax, digit in zip(axes, available_digits):
        digit_mels = [mel for mel, label in zip(mel_spectrograms, labels) if label == digit]

        # Right-pad each spectrogram with -80.0 (used here as the dB floor)
        # to the common width so they can be averaged element-wise.
        padded_digit_mels = [
            np.pad(
                mel_spec,
                ((0, 0), (0, max_time_frames - mel_spec.shape[1])),
                mode='constant',
                constant_values=-80.0,
            )
            for mel_spec in digit_mels
        ]
        avg_mel = np.mean(padded_digit_mels, axis=0)

        img = librosa.display.specshow(
            avg_mel,
            sr=sr,
            hop_length=hop_length,
            x_axis='time',
            y_axis='mel',
            ax=ax,
            cmap='viridis'
        )
        ax.set_title(f'Rata-rata Mel-Spectrogram - Digit {digit}\nAnalisis Pola Karakteristik', fontsize=14, fontweight='bold')
        ax.set_xlabel('Waktu (s)')
        ax.set_ylabel('Frekuensi Mel')
    plt.tight_layout()
    plt.colorbar(img, ax=axes, format='%+2.0f dB')
    plt.show()

**Analisis statistik mendalam**

In [None]:
print("\n=== ANALISIS POLA KARAKTERISTIK DETAIL ===")
# Iterate over the labels actually present; the previously used `digits`
# name is not assigned anywhere before this cell.
for digit in sorted(set(labels)):
    digit_mels = [mel for mel, label in zip(mel_spectrograms, labels) if label == digit]

    # Mean squared difference between consecutive time frames.
    spectral_flux = [np.mean(np.diff(mel, axis=1)**2) for mel in digit_mels]
    avg_spectral_flux = np.mean(spectral_flux)

    # Mean of the squared spectrogram values (the spectrograms are in dB).
    spectral_energy = [np.mean(mel**2) for mel in digit_mels]
    avg_energy = np.mean(spectral_energy)

    # Variance along the time axis, averaged over the mel bands.
    temporal_variance = [np.var(mel, axis=1).mean() for mel in digit_mels]
    avg_temporal_var = np.mean(temporal_variance)

    # Mean index of the strongest mel band per frame. NOTE(review): this is a
    # peak-band index, not a weighted spectral centroid - confirm the label.
    spectral_centroids = [np.mean(np.argmax(mel, axis=0)) for mel in digit_mels]
    avg_centroid = np.mean(spectral_centroids)

    print(f"\nDigit {digit} ({len(digit_mels)} samples):")
    print(f"  - Variasi Spektral (Spectral Flux): {avg_spectral_flux:.4f}")
    print(f"  - Energi Rata-rata: {avg_energy:.4f}")
    print(f"  - Variasi Temporal: {avg_temporal_var:.4f}")
    print(f"  - Centroid Spektral: {avg_centroid:.2f}")

**Visualisasi perbandingan metrik**

In [None]:
# Per-digit metrics computed from the loaded spectrograms, using the same
# formulas as the detailed statistics cell, so the charts show the real data
# rather than fixed placeholder numbers.
metrics_data = {}
for digit in sorted(set(labels)):
    digit_mels = [mel for mel, label in zip(mel_spectrograms, labels) if label == digit]
    metrics_data[digit] = {
        # Mean squared difference between consecutive time frames.
        'spectral_flux': float(np.mean([np.mean(np.diff(mel, axis=1)**2) for mel in digit_mels])),
        # Mean of the squared spectrogram values (the spectrograms are in dB).
        'spectral_energy': float(np.mean([np.mean(mel**2) for mel in digit_mels])),
        # Variance along the time axis, averaged over the mel bands.
        'temporal_variance': float(np.mean([np.var(mel, axis=1).mean() for mel in digit_mels])),
        # Mean index of the strongest mel band per frame.
        'spectral_centroid': float(np.mean([np.mean(np.argmax(mel, axis=0)) for mel in digit_mels])),
    }

if metrics_data:
    # (dictionary key, axis/title label) for each metric, in display order.
    metrik_names = [
        ('spectral_flux', 'Variasi Spektral (Spectral Flux)'),
        ('spectral_energy', 'Energi Spektral'),
        ('temporal_variance', 'Variasi Temporal'),
        ('spectral_centroid', 'Centroid Spektral')
    ]

    # --- Figure 1: one bar chart per metric ---
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()
    colors = plt.cm.Set3(np.linspace(0, 1, len(metrics_data)))
    digits_list = list(metrics_data)
    for ax, (metric_key, metric_label) in zip(axes, metrik_names):
        # A comprehension is used instead of looping with a variable named
        # `data`, which would overwrite the metadata table built earlier.
        values_list = [metrics_data[digit][metric_key] for digit in digits_list]
        bars = ax.bar(digits_list, values_list, color=colors, alpha=0.8, edgecolor='black')

        ax.set_title(f'{metric_label} per Digit', fontsize=14, fontweight='bold')
        ax.set_xlabel('Digit', fontsize=12)
        ax.set_ylabel(metric_label, fontsize=12)
        ax.grid(True, alpha=0.3, axis='y')

        # Value label just above each bar; the offset is 1% of the tallest bar.
        max_val = max(values_list) if values_list else 0
        for bar, value in zip(bars, values_list):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, height + max_val*0.01,
                    f'{value:.4f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.suptitle('Analisis Metrik Spektral per Digit', y=1.02, fontsize=16, fontweight='bold')
    plt.show()

    # --- Figure 2: heatmap of all metrics, one row per digit ---
    fig, ax = plt.subplots(figsize=(12, 6))

    metric_labels = [name[1] for name in metrik_names]
    sorted_digits = sorted(metrics_data.keys())
    digit_labels = [f'Digit {digit}' for digit in sorted_digits]
    metrics_matrix = np.array([
        [metrics_data[digit][metric_key] for metric_key, _ in metrik_names]
        for digit in sorted_digits
    ])

    # Column-wise min-max scaling for the colours only; a constant column
    # gets a divisor of 1 to avoid dividing by zero.
    min_vals = metrics_matrix.min(axis=0)
    max_vals = metrics_matrix.max(axis=0)
    diff = max_vals - min_vals
    diff[diff == 0] = 1

    metrics_normalized = (metrics_matrix - min_vals) / diff
    im = ax.imshow(metrics_normalized, cmap='YlOrRd', aspect='auto')

    ax.set_xticks(range(len(metric_labels)))
    ax.set_yticks(range(len(digit_labels)))
    ax.set_xticklabels(metric_labels, rotation=45, ha='right')
    ax.set_yticklabels(digit_labels)

    # Annotate each cell with the raw (un-normalised) value.
    for i in range(len(digit_labels)):
        for j in range(len(metric_labels)):
            ax.text(j, i, f'{metrics_matrix[i, j]:.4f}',
                    ha="center", va="center", color="black", fontweight='bold', fontsize=9)

    ax.set_title('Heatmap Perbandingan Metrik Spektral (Normalized)', fontsize=14, fontweight='bold')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()

else:
    print("Tidak ada data metrik yang tersedia untuk divisualisasikan")

## **Step 5 — Normalisasi dan Padding**

**Normalisasi**

**Padding**

**Normalisasi dan Padding**