<a href="https://colab.research.google.com/github/Deviprasanna-17/audio_analysis_infysp_group1/blob/main/audio_datasets_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# INSTALL DEPENDENCIES
# ===============================
!pip install librosa soundfile noisereduce webrtcvad pydub --quiet

import os
import numpy as np
import librosa
import soundfile as sf
import noisereduce as nr
import webrtcvad
import struct
from pydub import AudioSegment

# ===============================
# DATASET LOADING
# ===============================
def load_dataset(path, extensions=(".wav", ".mp3", ".flac", ".ogg")):
    """Return the audio files found directly inside a directory.

    Args:
        path: Directory to scan. Sub-directories are not descended into.
        extensions: File suffix(es) to accept, compared case-insensitively.
            May be a single string or any iterable of strings.

    Returns:
        A sorted list of ``os.path.join(path, name)`` for every regular file
        whose name ends with one of ``extensions``.

    Raises:
        OSError: If ``path`` cannot be listed (for example, it does not exist).
    """
    # str.endswith accepts only a str or a tuple, so normalise other iterables.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    matches = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Skip directories that merely happen to be named like audio files.
        if name.lower().endswith(suffixes) and os.path.isfile(full_path):
            matches.append(full_path)
    return matches

# ===============================
# PREPROCESSING STEPS
# ===============================

# 1. Load + Resample to 16kHz + Mono
def load_and_resample(path, target_sr=16000):
    """Load an audio file as a mono signal, resampled to ``target_sr``.

    Args:
        path: Path of the audio file to read.
        target_sr: Desired sampling rate in Hz, passed to ``librosa.load`` as
            ``sr``. ``None`` asks librosa to keep the file's native rate.

    Returns:
        Tuple ``(audio, sr)``: the samples returned by ``librosa.load`` and
        the sampling rate librosa reports for them.
    """
    audio, sr = librosa.load(path, sr=target_sr, mono=True)
    # Return the rate librosa reports rather than echoing target_sr: the two
    # are equal when a target is given, but with target_sr=None the reported
    # rate is the only meaningful value.
    return audio, sr

# 2. Normalize audio
def normalize(audio):
    """Scale a signal so that its largest absolute sample is about 1.0.

    Args:
        audio: Array-like of samples.

    Returns:
        ``audio`` divided by ``(peak + 1e-9)``, where ``peak`` is the largest
        absolute sample. An empty input yields an empty array and an all-zero
        input stays all-zero.
    """
    audio = np.asarray(audio)
    # np.max raises on a zero-size array, so treat an empty signal as peak 0.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    # The small epsilon keeps the division finite for silent signals.
    return audio / (peak + 1e-9)

# 3. Noise reduction
def denoise(audio, sr):
    """Pass the signal through ``noisereduce.reduce_noise``.

    Args:
        audio: Samples to clean, forwarded as ``y``.
        sr: Sampling rate of ``audio``, forwarded as ``sr``.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given signal.
    """
    cleaned = nr.reduce_noise(y=audio, sr=sr)
    return cleaned

# 4. Silence removal
def remove_silence(audio, sr, top_db=25):
    """Trim the signal with ``librosa.effects.trim``.

    Args:
        audio: Samples to trim.
        sr: Sampling rate of ``audio``. Not used by the trim itself; kept so
            every pipeline stage shares the ``(audio, sr)`` signature.
        top_db: Threshold forwarded to ``librosa.effects.trim``. Defaults to
            25, the value this pipeline has always used.

    Returns:
        The trimmed samples (the first element of librosa's return value).
    """
    # trim() also returns the kept index interval, which is not needed here.
    trimmed, _ = librosa.effects.trim(audio, top_db=top_db)
    return trimmed

# 5. Voice Activity Detection (WebRTC VAD)
# Single module-level detector, created once when this cell runs and reused by
# apply_vad() for every frame of every file.
vad = webrtcvad.Vad(2)  # aggressiveness: 0–3

def frame_generator(audio, sr, frame_ms=30):
    """Yield fixed-length frames of a float signal as 16-bit PCM bytes.

    Args:
        audio: 1-D array-like of float samples, nominally in [-1.0, 1.0].
        sr: Sampling rate of ``audio`` in Hz.
        frame_ms: Frame duration in milliseconds.
            NOTE(review): the WebRTC VAD is documented to accept only
            10/20/30 ms frames; confirm before changing the default.

    Yields:
        Tuples ``(pcm, start, end)`` where ``pcm`` is the frame as
        native-endian int16 bytes and ``audio[start:end]`` is the
        corresponding slice. A trailing partial frame is dropped.
    """
    audio = np.asarray(audio)
    frame_len = int(sr * frame_ms / 1000)
    for start in range(0, len(audio), frame_len):
        end = start + frame_len
        chunk = audio[start:end]
        if len(chunk) < frame_len:
            # Only complete frames are emitted; the remainder is discarded.
            break
        # A sample of exactly +1.0 scales to 32768, one past the int16
        # maximum. Clip before casting so peaks saturate instead of wrapping
        # around to large negative values.
        scaled = np.clip(chunk * 32768, -32768, 32767)
        # tobytes() on a native int16 array gives the same bytes as
        # struct.pack("%dh", ...) without unpacking every sample in Python.
        pcm = scaled.astype(np.int16).tobytes()
        yield pcm, start, end

def apply_vad(audio, sr):
    """Keep only the frames that the module-level ``vad`` flags as speech.

    Args:
        audio: 1-D array of float samples.
        sr: Sampling rate of ``audio``, passed to ``vad.is_speech``.

    Returns:
        A new array made of the voiced frames joined in their original
        order. If no frame is flagged, an empty array with the same dtype as
        ``audio`` is returned.
    """
    # Collect whole slices instead of extending a Python list sample by
    # sample; this keeps the work inside NumPy.
    voiced_chunks = [
        audio[start:end]
        for pcm, start, end in frame_generator(audio, sr)
        if vad.is_speech(pcm, sr)
    ]
    if not voiced_chunks:
        # np.concatenate rejects an empty list; keep the input's dtype so the
        # result type does not depend on whether speech was found.
        return np.array([], dtype=np.asarray(audio).dtype)
    return np.concatenate(voiced_chunks)

# ===============================
# FULL PIPELINE
# ===============================
def preprocess_audio(path):
    """Run the whole cleaning pipeline on one audio file.

    The file is loaded with ``load_and_resample`` and normalised, then passed
    in order through ``denoise``, ``remove_silence`` and ``apply_vad``.

    Args:
        path: Path of the audio file to process.

    Returns:
        Tuple ``(audio, sr)``: the processed samples and the sampling rate
        reported by ``load_and_resample``.
    """
    signal, sr = load_and_resample(path)
    signal = normalize(signal)

    # The remaining stages share the (audio, sr) signature; apply them in order.
    for stage in (denoise, remove_silence, apply_vad):
        signal = stage(signal, sr)

    return signal, sr

# ===============================
# PROCESS ENTIRE DATASET
# ===============================
# Input and output locations for the batch run.
raw_dir = "/content/raw_audio"       # ← Put your folder of audio files here
processed_dir = "/content/processed_audio"
os.makedirs(processed_dir, exist_ok=True)

audio_paths = load_dataset(raw_dir)

print("Found", len(audio_paths), "audio files.")

# Files that produced no output, reported at the end of the run.
skipped = []

for path in audio_paths:
    print("Processing:", path)
    try:
        audio, sr = preprocess_audio(path)
    except Exception as exc:
        # Top-level batch boundary: one unreadable or unsupported file should
        # not abort the remaining files. Report it and move on.
        print("  Skipped (processing failed):", exc)
        skipped.append(path)
        continue

    if len(audio) == 0:
        # Trimming and VAD can remove every sample; do not write empty WAVs.
        print("  Skipped (no audio left after preprocessing).")
        skipped.append(path)
        continue

    # NOTE: inputs sharing a stem (a.wav, a.mp3) map to the same output name.
    stem = os.path.splitext(os.path.basename(path))[0]
    out_path = os.path.join(processed_dir, stem + "_clean.wav")

    sf.write(out_path, audio, sr)

print("Processing complete! Cleaned files saved to:", processed_dir)
if skipped:
    print("Skipped", len(skipped), "file(s):", ", ".join(skipped))
