[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ayush-1204/Speaker_Recognition_System/blob/main/notebooks/01_data_preparation.ipynb)

# Notebook 01 — Data Preparation & VAD
- ✅ Use this notebook to build the dataset and to segment recordings with voice activity detection (VAD).
- ✅ Runs on Google Colab or in a local Jupyter environment.


In [None]:
# Install the required packages (run this once per Colab session)
!pip install numpy scipy soundfile webrtcvad librosa sounddevice


In [None]:
import os, json
from pathlib import Path
import soundfile as sf
import numpy as np
import webrtcvad
from scipy.signal import resample_poly

# Project layout: every location below is resolved relative to the current
# working directory (ROOT).
ROOT = Path('.')
DATA_RAW = ROOT.joinpath('data/raw')
DATA_PROCESSED = ROOT.joinpath('data/processed')
FAMILIAR_DIR = DATA_PROCESSED.joinpath('familiar')
STRANGER_DIR = DATA_PROCESSED.joinpath('stranger')
METADATA_DIR = ROOT.joinpath('metadata')
ENROLL_DB = METADATA_DIR.joinpath('enrollment_db.json')
MANIFEST_JSON = METADATA_DIR.joinpath('manifest.json')

# Create the directory tree up front; existing directories are left untouched.
# Only directories are created here, not the two JSON files.
for p in (DATA_RAW, DATA_PROCESSED, FAMILIAR_DIR, STRANGER_DIR, METADATA_DIR):
    p.mkdir(parents=True, exist_ok=True)

def read_wav_mono(path, target_sr=16000):
    """Load an audio file as a single-channel float32 signal at ``target_sr``.

    Args:
        path: Location of the audio file; converted to ``str`` before reading.
        target_sr: Sample rate (Hz) the returned signal should have.

    Returns:
        A ``(samples, target_sr)`` tuple where ``samples`` is a float32 array.
    """
    samples, native_sr = sf.read(str(path))

    # Anything with more than one dimension is collapsed by averaging over
    # axis 1 (the channel axis of the array returned by soundfile).
    is_multichannel = samples.ndim > 1
    mono = np.mean(samples, axis=1) if is_multichannel else samples

    # Polyphase resampling is only needed when the file's rate differs.
    if native_sr != target_sr:
        mono = resample_poly(mono, target_sr, native_sr)

    return mono.astype(np.float32), target_sr

def write_wav(path, audio, sr=16000):
    """Write ``audio`` to ``path`` as a 16-bit PCM file at sample rate ``sr``.

    Args:
        path: Destination file; converted to ``str`` before writing.
        audio: Sample data handed to ``soundfile.write`` unchanged.
        sr: Sample rate in Hz stored in the file.
    """
    destination = str(path)
    sf.write(destination, audio, sr, subtype='PCM_16')

class Frame:
    """One fixed-length slice of 16-bit PCM audio.

    Attributes:
        bytes: Raw PCM payload of the frame.
        timestamp: Start of the frame, in seconds from the start of the audio.
        duration: Length of the frame, in seconds.
    """

    def __init__(self, bytes_, timestamp, duration):
        self.bytes = bytes_
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_ms, audio, sr):
    """Yield consecutive, non-overlapping ``Frame`` objects cut from ``audio``.

    Args:
        frame_ms: Frame length in milliseconds.
        audio: 1-D sequence of float samples, nominally in [-1.0, 1.0].
        sr: Sample rate of ``audio`` in Hz.

    Yields:
        ``Frame`` instances holding ``int(sr * frame_ms / 1000)`` int16
        samples each. A trailing remainder shorter than one frame is dropped.

    Raises:
        ValueError: If the frame length works out to less than one sample.
    """
    n = int(sr * frame_ms / 1000)
    if n <= 0:
        raise ValueError(
            f"frame_ms={frame_ms} at sr={sr} gives a frame of {n} samples"
        )
    duration = n / sr

    # Clip before casting: a full-scale sample of +1.0 scales to 32768, which
    # does not fit in int16 and would otherwise wrap around to -32768.
    scaled = np.asarray(audio) * 32768
    int16_audio = np.clip(scaled, -32768, 32767).astype('int16')

    # "+ 1" keeps the last complete frame when the sample count is an exact
    # multiple of the frame length.
    for i in range(0, len(int16_audio) - n + 1, n):
        chunk = int16_audio[i:i + n]
        # Derive the timestamp from the sample offset so rounding error does
        # not accumulate over long recordings.
        yield Frame(chunk.tobytes(), i / sr, duration)

def vad_segment(in_path, out_folder, aggressiveness=2):
    """Write every frame of ``in_path`` that the VAD flags as speech to disk.

    The file is loaded through ``read_wav_mono``, cut into 30 ms frames and
    each frame is classified independently; every speech frame becomes its
    own ``seg_<frame index>.wav`` file inside ``out_folder``.

    Args:
        in_path: Audio file to analyse.
        out_folder: Directory for the output files; created if missing.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.

    Returns:
        List of the written file paths as strings, in frame order.
    """
    audio, sr = read_wav_mono(in_path)
    detector = webrtcvad.Vad(aggressiveness)
    frames = list(frame_generator(30, audio, sr))
    os.makedirs(out_folder, exist_ok=True)
    out_dir = Path(out_folder)

    saved = []
    for i, fr in enumerate(frames):
        if not detector.is_speech(fr.bytes, sr):
            continue
        # Decode the int16 payload back to floats for write_wav.
        pcm = np.frombuffer(fr.bytes, dtype='int16')
        samples = pcm.astype('float32') / 32768.0
        op = out_dir / f"seg_{i}.wav"
        write_wav(op, samples, sr)
        saved.append(str(op))
    return saved

# Printed once the imports, folder setup and helper definitions above have run.
print("✅ Notebook initialized — ready for VAD segmentation!")
