# 一、函数与库

In [2]:
import os
import random
import shutil
from pathlib import Path
import numpy as np
import torch
import torchaudio
import soundfile as sf
from multiprocessing import Pool, cpu_count
import multiprocessing as mp
from silero_vad import load_silero_vad
from IPython.display import Audio
from scipy.fft import rfft, fft

In [3]:
class MFCCProcessor:
    """
    Python implementation of MFCC (Mel-Frequency Cepstral Coefficients)
    Based on the RISC-V DSP library implementation
    """
    
    def __init__(self, fft_len=256, nb_mel_filters=40, nb_dct_outputs=13, use_cfft=False):
        """
        Initialize MFCC processor
        
        Args:
            fft_len: FFT length (should match window length)
            nb_mel_filters: Number of mel filter banks
            nb_dct_outputs: Number of DCT outputs (MFCC coefficients)
            use_cfft: Whether to use CFFT instead of RFFT (default: False)
        """
        self.fft_len = fft_len
        self.nb_mel_filters = nb_mel_filters
        self.nb_dct_outputs = nb_dct_outputs
        self.use_cfft = use_cfft
        
        # Initialize coefficient arrays - to be filled with actual data
        self.window_coefs = None
        self.filter_pos = None
        self.filter_lengths = None
        self.filter_coefs = None
        self.dct_coefs = None
        
        self._load_coefficients()
    
    def _load_coefficients(self) -> None:
        """
        Populate the window, mel filter-bank and DCT tables from literals.

        The tables are hard-coded for the default configuration: a 256-point
        window, 40 mel filters and a 13 x 40 DCT matrix. They are NOT
        recomputed from the constructor arguments, so:

        * the final reshape only succeeds when
          ``nb_dct_outputs * nb_mel_filters == 520``;
        * compute_mfcc skips windowing when ``fft_len`` is not 256.

        NOTE(review): the table names in the comments below
        (mfcc_window_coefs_hann256, ...) presumably refer to arrays exported
        from the reference DSP library -- confirm the provenance.
        """
        
        # Hann window, 256 points (values of mfcc_window_coefs_hann256),
        # rounded to 6 decimals. Multiplied element-wise with the frame.
        self.window_coefs = np.array([
            # 256 values: rises from 0.0 to 1.0 at index 128, then falls
            # back symmetrically.
            0.000000,0.000151,0.000602,0.001355,0.002408,0.003760,0.005412,0.007361,0.009607,0.012149,
            0.014984,0.018112,0.021530,0.025236,0.029228,0.033504,0.038060,0.042895,0.048005,0.053388,
            0.059039,0.064957,0.071136,0.077573,0.084265,0.091208,0.098396,0.105827,0.113495,0.121396,
            0.129524,0.137876,0.146447,0.155230,0.164221,0.173414,0.182803,0.192384,0.202150,0.212096,
            0.222215,0.232501,0.242949,0.253551,0.264302,0.275194,0.286222,0.297379,0.308658,0.320052,
            0.331555,0.343159,0.354858,0.366644,0.378510,0.390449,0.402455,0.414519,0.426635,0.438795,
            0.450991,0.463218,0.475466,0.487729,0.500000,0.512271,0.524534,0.536782,0.549009,0.561205,
            0.573365,0.585481,0.597545,0.609551,0.621490,0.633356,0.645142,0.656841,0.668445,0.679947,
            0.691342,0.702621,0.713778,0.724806,0.735698,0.746449,0.757051,0.767499,0.777785,0.787904,
            0.797850,0.807616,0.817197,0.826586,0.835779,0.844770,0.853553,0.862124,0.870476,0.878604,
            0.886505,0.894173,0.901604,0.908792,0.915735,0.922427,0.928864,0.935044,0.940961,0.946612,
            0.951995,0.957105,0.961940,0.966496,0.970772,0.974764,0.978470,0.981888,0.985016,0.987851,
            0.990393,0.992639,0.994588,0.996240,0.997592,0.998645,0.999398,0.999849,1.000000,0.999849,
            0.999398,0.998645,0.997592,0.996240,0.994588,0.992639,0.990393,0.987851,0.985016,0.981888,
            0.978470,0.974764,0.970772,0.966496,0.961940,0.957105,0.951995,0.946612,0.940961,0.935044,
            0.928864,0.922427,0.915735,0.908792,0.901604,0.894173,0.886505,0.878604,0.870476,0.862124,
            0.853553,0.844770,0.835779,0.826586,0.817197,0.807616,0.797850,0.787904,0.777785,0.767499,
            0.757051,0.746449,0.735698,0.724806,0.713778,0.702621,0.691342,0.679947,0.668445,0.656841,
            0.645142,0.633356,0.621490,0.609551,0.597545,0.585481,0.573365,0.561205,0.549009,0.536782,
            0.524534,0.512271,0.500000,0.487729,0.475466,0.463218,0.450991,0.438795,0.426635,0.414519,
            0.402455,0.390449,0.378510,0.366644,0.354858,0.343159,0.331555,0.320052,0.308658,0.297379,
            0.286222,0.275194,0.264302,0.253551,0.242949,0.232501,0.222215,0.212096,0.202150,0.192384,
            0.182803,0.173414,0.164221,0.155230,0.146447,0.137876,0.129524,0.121396,0.113495,0.105827,
            0.098396,0.091208,0.084265,0.077573,0.071136,0.064957,0.059039,0.053388,0.048005,0.042895,
            0.038060,0.033504,0.029228,0.025236,0.021530,0.018112,0.014984,0.012149,0.009607,0.007361,
            0.005412,0.003760,0.002408,0.001355,0.000602,0.000151
        ], dtype=np.float32)
        
        # Mel filter start positions (values of mfcc_filter_pos_mel40):
        # filter_pos[i] is the first spectrum bin used by filter i.
        self.filter_pos = np.array([
            # 40 values, one per mel filter.
            1,2,3,4,5,6,8,9,11,12,
            14,15,17,19,21,23,25,27,30,32,
            35,38,40,43,46,50,53,57,60,64,
            68,73,77,82,87,92,97,103,109,115,
        ], dtype=np.uint32)
        
        # Mel filter lengths (values of mfcc_filter_len_mel40):
        # filter_lengths[i] is the number of consecutive bins (and weights)
        # used by filter i.
        self.filter_lengths = np.array([
            # 40 values, one per mel filter; they sum to 247.
            2,2,2,2,3,3,3,3,3,3,
            3,4,4,4,4,4,5,5,5,6,
            5,5,6,7,7,7,7,7,8,9,
            9,9,10,10,10,11,12,12,13,13,
        ], dtype=np.uint32)
        
        # Mel filter weights (values of mfcc_filter_coefs_mel40), stored as
        # one flat array. compute_mfcc walks it sequentially, consuming
        # filter_lengths[i] weights for filter i.
        self.filter_coefs = np.array([
            # 247 values: the weights of filter 0, then filter 1, and so on.
            0.940365,0.158628,0.841372,0.293816,0.706184,0.462403,0.537597,0.661904,0.338096,0.890104,
            0.145015,0.109896,0.854985,0.424850,0.575150,0.727995,0.052989,0.272005,0.947011,0.398503,
            0.601497,0.763326,0.146352,0.236674,0.853648,0.546566,0.453434,0.963036,0.394905,0.036964,
            0.605095,0.841380,0.301730,0.158620,0.698270,0.775275,0.261386,0.224725,0.738614,0.759477,
            0.269002,0.240523,0.730998,0.789451,0.320349,0.210549,0.679651,0.861250,0.411736,0.138750,
            0.588264,0.971416,0.539920,0.116902,0.028584,0.460080,0.883098,0.702035,0.295011,0.297965,
            0.704989,0.895539,0.503343,0.118164,0.104461,0.496657,0.881836,0.739755,0.367882,0.002322,
            0.260245,0.632118,0.997678,0.642866,0.289313,0.357134,0.710687,0.941471,0.599160,0.262206,
            0.058529,0.400840,0.737794,0.930444,0.603716,0.281873,0.069556,0.396284,0.718127,0.964769,
            0.652268,0.344238,0.040553,0.035231,0.347732,0.655761,0.959447,0.741092,0.445738,0.154382,
            0.258908,0.554262,0.845618,0.866915,0.583236,0.303246,0.026850,0.133085,0.416764,0.696754,
            0.973150,0.753958,0.484481,0.218335,0.246042,0.515519,0.781665,0.955439,0.695714,0.439085,
            0.185479,0.044561,0.304286,0.560915,0.814521,0.934825,0.687055,0.442105,0.199909,0.065175,
            0.312945,0.557895,0.800091,0.960408,0.723542,0.489253,0.257486,0.028188,0.039592,0.276458,
            0.510747,0.742514,0.971812,0.801306,0.576789,0.354590,0.134660,0.198694,0.423211,0.645410,
            0.865340,0.916954,0.701428,0.488037,0.276741,0.067498,0.083046,0.298572,0.511963,0.723259,
            0.932502,0.860269,0.655015,0.451700,0.250287,0.050740,0.139731,0.344985,0.548300,0.749713,
            0.949260,0.853026,0.657111,0.462962,0.270549,0.079839,0.146974,0.342889,0.537038,0.729451,
            0.920161,0.890805,0.703415,0.517642,0.333459,0.150837,0.109195,0.296585,0.482358,0.666541,
            0.849163,0.969752,0.790177,0.612087,0.435458,0.260267,0.086489,0.030248,0.209823,0.387913,
            0.564542,0.739733,0.913511,0.914103,0.743086,0.573416,0.405074,0.238037,0.072286,0.085897,
            0.256914,0.426584,0.594926,0.761963,0.927714,0.907802,0.744564,0.582555,0.421755,0.262148,
            0.103715,0.092198,0.255436,0.417445,0.578245,0.737852,0.896285,0.946440,0.790305,0.635294,
            0.481391,0.328580,0.176846,0.026174,0.053560,0.209695,0.364706,0.518609,0.671420,0.823154,
            0.973826,0.876550,0.727957,0.580384,0.433814,0.288236,0.143636
        ], dtype=np.float32)
        
        # DCT matrix (values of mfcc_dct_coefs_dct13), stored row-major as a
        # flat array of 13 * 40 = 520 values and reshaped below.
        dct_coefs_flat = np.array([
            # Row 0 is constant (0.223607); each later row is one cosine
            # basis vector over the 40 log-mel outputs.
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223434,0.222057,0.219310,0.215212,0.209786,0.203067,0.195096,0.185922,0.175602,0.164200,
            0.151784,0.138434,0.124229,0.109259,0.093615,0.077394,0.060696,0.043624,0.026282,0.008779,
            -0.008779,-0.026282,-0.043624,-0.060696,-0.077394,-0.093615,-0.109259,-0.124229,-0.138434,-0.151784,
            -0.164200,-0.175602,-0.185922,-0.195096,-0.203067,-0.209786,-0.215212,-0.219310,-0.222057,-0.223434,
            0.222917,0.217429,0.206586,0.190656,0.170032,0.145221,0.116834,0.085571,0.052200,0.017544,
            -0.017544,-0.052200,-0.085571,-0.116834,-0.145221,-0.170032,-0.190656,-0.206586,-0.217429,-0.222917,
            -0.222917,-0.217429,-0.206586,-0.190656,-0.170032,-0.145221,-0.116834,-0.085571,-0.052200,-0.017544,
            0.017544,0.052200,0.085571,0.116834,0.145221,0.170032,0.190656,0.206586,0.217429,0.222917,
            0.222057,0.209786,0.185922,0.151784,0.109259,0.060696,0.008779,-0.043624,-0.093615,-0.138434,
            -0.175602,-0.203067,-0.219310,-0.223434,-0.215212,-0.195096,-0.164200,-0.124229,-0.077394,-0.026282,
            0.026282,0.077394,0.124229,0.164200,0.195096,0.215212,0.223434,0.219310,0.203067,0.175602,
            0.138434,0.093615,0.043624,-0.008779,-0.060696,-0.109259,-0.151784,-0.185922,-0.209786,-0.222057,
            0.220854,0.199235,0.158114,0.101515,0.034980,-0.034980,-0.101515,-0.158114,-0.199235,-0.220854,
            -0.220854,-0.199235,-0.158114,-0.101515,-0.034980,0.034980,0.101515,0.158114,0.199235,0.220854,
            0.220854,0.199235,0.158114,0.101515,0.034980,-0.034980,-0.101515,-0.158114,-0.199235,-0.220854,
            -0.220854,-0.199235,-0.158114,-0.101515,-0.034980,0.034980,0.101515,0.158114,0.199235,0.220854,
            0.219310,0.185922,0.124229,0.043624,-0.043624,-0.124229,-0.185922,-0.219310,-0.219310,-0.185922,
            -0.124229,-0.043624,0.043624,0.124229,0.185922,0.219310,0.219310,0.185922,0.124229,0.043624,
            -0.043624,-0.124229,-0.185922,-0.219310,-0.219310,-0.185922,-0.124229,-0.043624,0.043624,0.124229,
            0.185922,0.219310,0.219310,0.185922,0.124229,0.043624,-0.043624,-0.124229,-0.185922,-0.219310,
            0.217429,0.170032,0.085571,-0.017544,-0.116834,-0.190656,-0.222917,-0.206586,-0.145221,-0.052200,
            0.052200,0.145221,0.206586,0.222917,0.190656,0.116834,0.017544,-0.085571,-0.170032,-0.217429,
            -0.217429,-0.170032,-0.085571,0.017544,0.116834,0.190656,0.222917,0.206586,0.145221,0.052200,
            -0.052200,-0.145221,-0.206586,-0.222917,-0.190656,-0.116834,-0.017544,0.085571,0.170032,0.217429,
            0.215212,0.151784,0.043624,-0.077394,-0.175602,-0.222057,-0.203067,-0.124229,-0.008779,0.109259,
            0.195096,0.223434,0.185922,0.093615,-0.026282,-0.138434,-0.209786,-0.219310,-0.164200,-0.060696,
            0.060696,0.164200,0.219310,0.209786,0.138434,0.026282,-0.093615,-0.185922,-0.223434,-0.195096,
            -0.109259,0.008779,0.124229,0.203067,0.222057,0.175602,0.077394,-0.043624,-0.151784,-0.215212,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,-0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.209786,0.109259,-0.043624,-0.175602,-0.223434,-0.164200,-0.026282,0.124229,0.215212,0.203067,
            0.093615,-0.060696,-0.185922,-0.222057,-0.151784,-0.008779,0.138434,0.219310,0.195096,0.077394,
            -0.077394,-0.195096,-0.219310,-0.138434,0.008779,0.151784,0.222057,0.185922,0.060696,-0.093615,
            -0.203067,-0.215212,-0.124229,0.026282,0.164200,0.223434,0.175602,0.043624,-0.109259,-0.209786,
            0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,
            -0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,
            -0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,
            0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,
            0.203067,0.060696,-0.124229,-0.222057,-0.164200,0.008779,0.175602,0.219310,0.109259,-0.077394,
            -0.209786,-0.195096,-0.043624,0.138434,0.223434,0.151784,-0.026282,-0.185922,-0.215212,-0.093615,
            0.093615,0.215212,0.185922,0.026282,-0.151784,-0.223434,-0.138434,0.043624,0.195096,0.209786,
            0.077394,-0.109259,-0.219310,-0.175602,-0.008779,0.164200,0.222057,0.124229,-0.060696,-0.203067,
            0.199235,0.034980,-0.158114,-0.220854,-0.101515,0.101515,0.220854,0.158114,-0.034980,-0.199235,
            -0.199235,-0.034980,0.158114,0.220854,0.101515,-0.101515,-0.220854,-0.158114,0.034980,0.199235,
            0.199235,0.034980,-0.158114,-0.220854,-0.101515,0.101515,0.220854,0.158114,-0.034980,-0.199235,
            -0.199235,-0.034980,0.158114,0.220854,0.101515,-0.101515,-0.220854,-0.158114,0.034980,0.199235
        ], dtype=np.float32)
        
        # Reshape the flat DCT table to (nb_dct_outputs, nb_mel_filters).
        # The zero-matrix branch only applies if the literal above is ever
        # emptied; with the current 520-value table it is never taken.
        if len(dct_coefs_flat) > 0:
            self.dct_coefs = dct_coefs_flat.reshape(self.nb_dct_outputs, self.nb_mel_filters)
        else:
            self.dct_coefs = np.zeros((self.nb_dct_outputs, self.nb_mel_filters), dtype=np.float32)
    
    def compute_mfcc(self, input_signal):
        """
        Compute MFCC features from input signal
        
        Args:
            input_signal: Input audio signal (numpy array)
            
        Returns:
            mfcc_features: MFCC coefficients (numpy array)
        """
        # Ensure input is the correct length and type
        if len(input_signal) != self.fft_len:
            raise ValueError(f"Input signal length ({len(input_signal)}) must match FFT length ({self.fft_len})")
        
        src = input_signal.astype(np.float32).copy()
        
        # # Step 1: Normalize
        # max_value = np.max(np.abs(src))
        # max_index = np.argmax(np.abs(src))
        
        # # if max_value != 0.0:
        # #     src = src / max_value
        
        # Step 2: Apply window function
        if self.window_coefs is not None and len(self.window_coefs) == self.fft_len:
            src = src * self.window_coefs
        
        # Step 3: Compute spectrum magnitude
        if self.use_cfft:
            # CFFT-based implementation
            # Convert real to complex
            complex_signal = np.zeros(self.fft_len, dtype=np.complex64)
            complex_signal.real = src
            complex_signal.imag = 0.0
            
            # Compute FFT
            fft_result = fft(complex_signal)
            spectrum_mag = np.abs(fft_result)
        else:
            # RFFT-based implementation (default)
            fft_result = rfft(src)
            
            # Unpack real values (mimic the C code behavior)
            tmp = np.zeros(self.fft_len + 2, dtype=np.float32)
            tmp[0] = fft_result[0].real  # DC component
            
            # Pack real and imaginary parts
            for i in range(1, len(fft_result)):
                if i < self.fft_len // 2:
                    tmp[2*i] = fft_result[i].real
                    tmp[2*i + 1] = fft_result[i].imag
            
            # Handle Nyquist frequency
            if len(fft_result) > self.fft_len // 2:
                tmp[self.fft_len] = fft_result[-1].real
                tmp[self.fft_len + 1] = 0.0
            
            tmp[1] = 0.0  # Set imaginary part of DC to 0
            
            # Compute magnitude
            spectrum_mag = np.zeros(self.fft_len, dtype=np.float32)
            for i in range(self.fft_len):
                if i == 0:
                    spectrum_mag[i] = abs(tmp[0])
                elif i < self.fft_len // 2:
                    spectrum_mag[i] = np.sqrt(tmp[2*i]**2 + tmp[2*i + 1]**2)
                else:
                    spectrum_mag[i] = spectrum_mag[self.fft_len - i]
        
        # # Restore original scale if normalization was applied
        # if max_value != 0.0:
        #     spectrum_mag = spectrum_mag * max_value
        # spectrum_mag *= 32.0
        # Step 4: Apply MEL filters
        mel_outputs = np.zeros(self.nb_mel_filters, dtype=np.float32)
        coef_idx = 0
        
        if (self.filter_pos is not None and self.filter_lengths is not None and 
            self.filter_coefs is not None):
            
            for i in range(self.nb_mel_filters):
                pos = self.filter_pos[i]
                length = self.filter_lengths[i]
                
                # Compute dot product
                result = 0.0
                for j in range(length):
                    if pos + j < len(spectrum_mag) and coef_idx + j < len(self.filter_coefs):
                        result += spectrum_mag[pos + j] * self.filter_coefs[coef_idx + j]
                
                mel_outputs[i] = result
                coef_idx += length
        
        # Step 5: Compute logarithm
        # Add small offset to avoid log(0)
        mel_outputs = mel_outputs + 1.0e-6
        log_mel = np.log(mel_outputs)
        
        # Step 6: Apply DCT transformation
        if self.dct_coefs is not None:
            mfcc_features = np.dot(self.dct_coefs, log_mel)
        else:
            mfcc_features = log_mel[:self.nb_dct_outputs]
        # ② 计算帧能量并替换 C0
        # log_energy = np.log(np.sum(src**2) + 1e-30)
        # mfcc_features = np.dot(self.dct_coefs, log_mel)
        # mfcc_features[0] = log_energy
        return mfcc_features


# 工厂函数：返回一个新的 MFCCProcessor 实例
def mfcc_processor_factory():
    """Build a new MFCCProcessor: 256-sample frames, 40 mel filters, 13 outputs, CFFT path."""
    settings = dict(
        fft_len=256,
        nb_mel_filters=40,
        nb_dct_outputs=13,
        use_cfft=True,
    )
    return MFCCProcessor(**settings)

# 子进程中调用的函数
def process_row(row_data):
    """
    Pool worker: convert one audio row into a (31, 13) float32 MFCC matrix.

    The first 31 * 256 = 7936 samples of the row are cut into 31
    back-to-back, non-overlapping frames of 256 samples, and each frame is
    passed through compute_mfcc. Samples beyond that are ignored.
    """
    segment_size = 256
    num_segments = 31
    num_features = 13

    # The processor is built here, inside the worker, rather than being
    # passed in from the parent process.
    extractor = mfcc_processor_factory()

    features = np.zeros((num_segments, num_features), dtype=np.float32)
    frame_starts = range(0, num_segments * segment_size, segment_size)
    for seg_idx, offset in enumerate(frame_starts):
        frame = row_data[offset:offset + segment_size]
        features[seg_idx, :] = extractor.compute_mfcc(frame)

    return features

def compute_batch_mfcc_features_parallel(input_array):
    """
    Compute MFCC features for a batch of audio rows using a process pool.

    Args:
        input_array: Array of shape (n, samples); each row is handed to
            process_row, which reads its first 7936 samples.

    Returns:
        float32 array of shape (n, 31, 13). For an empty batch (n == 0) an
        empty array of shape (0, 31, 13) is returned instead of failing in
        np.stack.
    """
    n_rows = input_array.shape[0]

    if n_rows == 0:
        # Nothing to do: skip the pool entirely. The trailing dimensions
        # mirror what process_row produces for every row.
        return np.zeros((0, 31, 13), dtype=np.float32)

    # Never start more workers than there are rows to process.
    worker_count = min(mp.cpu_count(), n_rows)
    with mp.Pool(processes=worker_count) as pool:
        results = pool.map(process_row, [input_array[i] for i in range(n_rows)])

    # Stack the per-row (31, 13) matrices along a new leading axis.
    return np.stack(results, axis=0)



def play_audio_jupyter(numpy_array, index, sample_rate=8000):
    """Return an IPython Audio player for row `index` of `numpy_array`, cast to float32."""
    selected_clip = numpy_array[index]
    return Audio(selected_clip.astype(np.float32), rate=sample_rate)

def concat_wav_files(input_folder, output_file):
    """
    Concatenate every .wav file under a folder (recursively) into one file.

    Files are joined in sorted path order. If `output_file` itself lies
    inside `input_folder` (e.g. left over from a previous run) it is
    excluded from the inputs, so re-running does not fold the old result
    into the new one.

    Args:
        input_folder (str or Path): Folder searched recursively for *.wav.
        output_file (str or Path): Path of the concatenated file to write.

    Returns:
        None. Prints a message and writes nothing when no input is found.

    Raises:
        ValueError: If a file's sample rate or channel count differs from
            that of the first file.
    """
    input_folder = Path(input_folder)
    output_path = Path(output_file).resolve()
    wav_files = sorted(
        p for p in input_folder.rglob("*.wav") if p.resolve() != output_path
    )

    if not wav_files:
        print("未找到任何wav文件！")
        return

    samplerate = None
    channels = None
    pieces = []
    for wav_file in wav_files:
        data, sr = sf.read(wav_file)
        if data.ndim == 1:
            data = data[:, np.newaxis]  # mono -> (samples, 1)

        if samplerate is None:
            # The first file defines the reference format.
            samplerate, channels = sr, data.shape[1]
        elif sr != samplerate:
            raise ValueError(f"采样率不匹配: {wav_file} 采样率 {sr} != {samplerate}")
        elif data.shape[1] != channels:
            raise ValueError(f"声道数不匹配: {wav_file} 声道数 {data.shape[1]} != {channels}")

        pieces.append(data)

    # Join once at the end; growing the array inside the loop would copy
    # all previously read audio on every iteration.
    data_all = np.concatenate(pieces, axis=0)

    sf.write(output_file, data_all, samplerate)
    print(f"拼接完成，输出文件: {output_file}")
# from pathlib import Path

# # 假设你想获取 'root_dir' 目录下的所有子文件夹路径
# root_dir = Path('/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/00origin_wav/noise')

# # 使用 Path.iterdir() + is_dir() 过滤出子文件夹
# subfolders = [p for p in root_dir.iterdir() if p.is_dir()]
# for subfolder in subfolders:
#     input_folder_path = subfolder
#     output_file_path = subfolder.name + ".wav"  # 输出文件名为子文件夹名加上.wav后缀
#     concat_wav_files(input_folder_path, output_file_path)



# 秒数 -> hh:mm:ss 或 mm:ss
def format_duration(seconds: float) -> str:
    """
    Format a duration in seconds as ``hh:mm:ss``, or ``mm:ss`` when it is
    shorter than one hour. Fractional seconds are truncated.
    """
    hours = int(seconds // 3600)
    minutes = int(seconds % 3600 // 60)
    secs = int(seconds % 60)

    clock = f"{minutes:02}:{secs:02}"
    return f"{hours:02}:{clock}" if hours > 0 else clock

# # 目标文件夹
# folder_path = Path("/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/noise")

# for wav_file in folder_path.rglob("*.wav"):
#     with sf.SoundFile(wav_file) as f:
#         duration = len(f) / f.samplerate
#     print(f"{wav_file}: {duration:.2f} 秒 ({format_duration(duration)})")




def split_wav(file_path: Path, chunk_size=8000):
    """
    Split one WAV file into consecutive files of `chunk_size` samples each.

    The chunks are written to a folder named after the input file without
    its extension, as ``<stem>_000.wav``, ``<stem>_001.wav``, ... The last
    chunk holds whatever samples remain and may be shorter than
    `chunk_size`.

    Args:
        file_path (str or Path): WAV file to split.
        chunk_size (int): Samples per output file; must be positive.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    # Checked before any I/O, so a bad value cannot leave an empty folder.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size 必须为正整数，当前为 {chunk_size}")

    # Accept plain strings as well, like the other path helpers here.
    file_path = Path(file_path)

    data, samplerate = sf.read(file_path)
    total_samples = data.shape[0]

    # Output folder: the input path with its extension removed.
    output_dir = file_path.with_suffix('')
    os.makedirs(output_dir, exist_ok=True)

    # range() steps through the chunk starts; slicing truncates the last
    # chunk at the end of the data.
    for i, start in enumerate(range(0, total_samples, chunk_size)):
        chunk_data = data[start:start + chunk_size]

        output_filename = f"{file_path.stem}_{i:03d}.wav"  # e.g. file_000.wav
        output_path = output_dir / output_filename

        sf.write(output_path, chunk_data, samplerate)
        print(f"已保存: {output_path}")

# from pathlib import Path

# directory = Path("/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/noise")  # 替换为你的路径
# files = [f for f in directory.iterdir() if f.is_file()]

# print(files)

# for wav_file in files:
#     split_wav(wav_file)




def random_copy_files(src_dir, dst_dir, num_files):
    """
    Copy a random selection of files from one folder into another.

    Only files directly inside `src_dir` are candidates (sub-folders are
    not searched). When fewer than `num_files` files exist, all of them
    are copied.

    Args:
        src_dir (str or Path): Folder to pick files from.
        dst_dir (str or Path): Destination folder; created if missing.
        num_files (int): Number of files to copy.
    """
    src_dir, dst_dir = Path(src_dir), Path(dst_dir)

    # Make sure the destination exists before copying anything.
    dst_dir.mkdir(parents=True, exist_ok=True)

    # Candidate pool: regular files at the top level of the source folder.
    candidates = []
    for entry in src_dir.iterdir():
        if entry.is_file():
            candidates.append(entry)

    # Cap the request at the number of files actually available.
    if num_files > len(candidates):
        num_files = len(candidates)

    for picked in random.sample(candidates, num_files):
        shutil.copy(picked, dst_dir / picked.name)

    print(f"已从 {src_dir} 随机复制 {num_files} 个文件到 {dst_dir}。")

# # 示例用法
# random_copy_files("/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/noise/clip/music", "/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/noise/clip/music_500", 500)




def load_wav_as_tensor(file_path):
    """Load one WAV file with torchaudio and return ``(audio_tensor, sample_rate)``."""
    waveform, rate = torchaudio.load(file_path)
    return waveform, rate

def load_all_wav_files_to_numpy_check_length(input_folder, expected_length=8000):
    """
    Recursively load every .wav file under a folder into one 2-D array.

    Only the first channel of each file is used. Files whose sample count
    differs from `expected_length` are reported with a printed warning and
    left out of the result.

    Args:
        input_folder (str or Path): Folder searched recursively for *.wav.
        expected_length (int): Required number of samples per file.

    Returns:
        numpy array of shape (n, expected_length), in sorted path order.
        When no file qualifies (or the folder has no .wav files) an empty
        float32 array of shape (0, expected_length) is returned.
    """
    input_folder = Path(input_folder)

    audio_segments = []
    for wav_file in sorted(input_folder.rglob("*.wav")):
        audio_tensor, _ = load_wav_as_tensor(wav_file)
        first_channel = audio_tensor[0]
        n_samples = first_channel.shape[0]

        if n_samples != expected_length:
            print(f"⚠️ 文件 {wav_file} 采样点数: {n_samples}（期望 {expected_length}）")
            continue

        audio_segments.append(first_channel.numpy())

    if not audio_segments:
        # np.stack rejects an empty list; return an empty batch instead.
        # float32 is assumed to match torchaudio.load's default output
        # dtype -- TODO confirm.
        return np.empty((0, expected_length), dtype=np.float32)

    return np.stack(audio_segments, axis=0)
# path_list = [
# "/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/noise",
# "/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/others",
# "/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/XiaoXin",
# "/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/01combined_wav/XiaoYuan",
# ]

# noise_numpy = load_all_wav_files_to_numpy_check_length(path_list[0])
# print(noise_numpy.shape)  # 期望输出: (n, 8000)
# others_numpy = load_all_wav_files_to_numpy_check_length(path_list[1])
# print(others_numpy.shape)  # 期望输出: (n, 8000)
# xiaoxin_numpy = load_all_wav_files_to_numpy_check_length(path_list[2])
# print(xiaoxin_numpy.shape)  # 期望输出: (n, 8000)
# xiaoyuan_numpy = load_all_wav_files_to_numpy_check_length(path_list[3])
# print(xiaoyuan_numpy.shape)  # 期望输出: (n, 8000)

def normalize_audio_batch(numpy_array: np.ndarray) -> np.ndarray:
    """
    Peak-normalise every row of a 2-D audio batch.

    Each row is divided by its largest absolute value, so its peak
    amplitude becomes 1. Rows that are entirely zero are returned as zeros
    (their divisor is replaced by 1 to avoid dividing by zero).

    Args:
        numpy_array: Array of shape (n, samples); one clip per row.

    Returns:
        A new float32 array of the same shape; the input is not modified.
    """
    samples = numpy_array.astype(np.float32)

    # Per-row peak, kept as a column so it broadcasts across each row.
    peaks = np.abs(samples).max(axis=1, keepdims=True)

    # Silent rows have a peak of 0; divide those by 1 instead.
    safe_peaks = np.where(peaks == 0, np.float32(1.0), peaks)

    return samples / safe_peaks
# # 对所有音频数据进行归一化
# noise_numpy_norm = normalize_audio_batch(noise_numpy)
# others_numpy_norm = normalize_audio_batch(others_numpy)
# xiaoxin_numpy_norm = normalize_audio_batch(xiaoxin_numpy)
# xiaoyuan_numpy_norm = normalize_audio_batch(xiaoyuan_numpy)

def mix_audio_batches(signal_batch, noise_batch, snr_list, normalize_audio_batch):
    """
    Mix every signal clip with one randomly chosen noise clip at each SNR.

    Signals are visited once each in a random order; for every signal a
    noise row is drawn at random (with replacement) and reused for all
    SNRs in `snr_list`. The noise is scaled so that signal power divided
    by scaled-noise power equals the requested SNR in dB.

    Args:
        signal_batch: Array of shape (n, samples).
        noise_batch: Array of shape (m, samples).
        snr_list: Iterable of SNR values in dB.
        normalize_audio_batch: Callable applied to each stacked
            (n, samples) result before it is returned.

    Returns:
        dict mapping each SNR to its normalised (n, samples) array. Rows
        follow the random visiting order, not the order of `signal_batch`.
    """
    n, _ = signal_batch.shape
    m, _ = noise_batch.shape

    # Random visiting order of the signals (each used exactly once).
    visit_order = np.random.permutation(n)

    per_snr = {snr: [] for snr in snr_list}

    for sig_row in visit_order:
        signal = signal_batch[sig_row]
        # Noise is drawn with replacement.
        noise = noise_batch[np.random.randint(0, m)]

        signal_power = np.mean(signal ** 2)
        noise_power = np.mean(noise ** 2)
        if noise_power == 0:
            # Silent noise clip: use a tiny power to avoid dividing by zero.
            noise_power = 1e-12

        for snr in snr_list:
            # Gain that brings the noise to the power required by this SNR.
            gain = np.sqrt(signal_power / (10 ** (snr / 10)) / noise_power)
            per_snr[snr].append(signal + gain * noise)

    # Stack each SNR's clips into one array and normalise it.
    for snr in snr_list:
        stacked = np.stack(per_snr[snr], axis=0)
        per_snr[snr] = normalize_audio_batch(stacked)

    return per_snr


# snr_list = [-3, 0, 5, 10, 20]

# # 混音
# others_noise_mixed_dict = mix_audio_batches(others_numpy_norm, noise_numpy_norm, snr_list, normalize_audio_batch)
# xiaoxin_noise_mixed_dict = mix_audio_batches(xiaoxin_numpy_norm, noise_numpy_norm, snr_list, normalize_audio_batch)
# xiaoyuan_noise_mixed_dict = mix_audio_batches(xiaoyuan_numpy_norm, noise_numpy_norm, snr_list, normalize_audio_batch)


def concatenate_mixed_audio(mixed_audio_dict: dict) -> np.ndarray:
    """
    Join the per-SNR arrays of a mixing result into one array.

    Args:
        mixed_audio_dict: dict mapping SNR -> array of shape (n, samples).

    Returns:
        Array of shape (sum of all n, samples); blocks follow the dict's
        iteration order.
    """
    per_snr_blocks = [block for block in mixed_audio_dict.values()]
    return np.concatenate(per_snr_blocks, axis=0)
# # 拼接所有 SNR 的混音结果
# others_noise_numpy = concatenate_mixed_audio(others_noise_mixed_dict)
# xiaoxin_noise_numpy = concatenate_mixed_audio(xiaoxin_noise_mixed_dict)
# xiaoyuan_noise_numpy = concatenate_mixed_audio(xiaoyuan_noise_mixed_dict)
# # 保存拼接后的结果
# np.save("others_noise_numpy.npy", others_noise_numpy)
# np.save("xiaoxin_noise_numpy.npy", xiaoxin_noise_numpy)
# np.save("xiaoyuan_noise_numpy.npy", xiaoyuan_noise_numpy)




# Per-process cache for the Silero VAD model. Each worker process fills its
# own copy on first use.
_SILERO_VAD_CACHE = {}


def _get_silero_vad_model():
    """Return this process's Silero VAD model, loading it on first use."""
    if "model" not in _SILERO_VAD_CACHE:
        _SILERO_VAD_CACHE["model"] = load_silero_vad(onnx=False)
    return _SILERO_VAD_CACHE["model"]


def detect_speech_sequence(audio_tensor, sampling_rate=8000):
    """
    Run Silero VAD over a clip frame by frame.

    The clip is cut into consecutive frames of 256 samples (when
    `sampling_rate` is 8000) or 512 samples (otherwise); a final short
    frame is zero-padded to full length.

    Args:
        audio_tensor: 1-D torch.Tensor holding the audio samples.
        sampling_rate: Sample rate passed to the model.

    Returns:
        list of float: one speech probability per frame (empty for an
        empty clip).
    """
    # Reuse one model per process instead of reloading it for every clip.
    model = _get_silero_vad_model()
    # NOTE(review): Silero VAD is documented as keeping state between
    # consecutive chunks. Reset it so every clip is scored independently,
    # as it was when a fresh model was loaded per call.
    model.reset_states()

    frame_size = 256 if sampling_rate == 8000 else 512

    probs = []
    for i in range(0, len(audio_tensor), frame_size):
        chunk = audio_tensor[i:i + frame_size]

        # Zero-pad the final frame when the clip length is not a multiple
        # of the frame size.
        if len(chunk) < frame_size:
            chunk = torch.nn.functional.pad(chunk, (0, frame_size - len(chunk)))

        probs.append(model(chunk, sampling_rate).item())

    return probs

def label_speech_sequence(audio_tensor, sampling_rate=8000, threshold=0.5):
    """
    Classify a whole clip as speech (1) or non-speech (0).

    The per-frame probabilities from detect_speech_sequence are averaged;
    the clip is labelled 1 when the average reaches `threshold`.

    Args:
        audio_tensor: 1-D torch.Tensor holding the audio samples.
        sampling_rate: Sample rate passed on to detect_speech_sequence.
        threshold: Minimum average probability for a speech label.

    Returns:
        int: a single label for the clip, 1 (speech) or 0 (non-speech).
        A zero-length clip yields 0.
    """
    probs = detect_speech_sequence(audio_tensor, sampling_rate)

    if not probs:
        # No frames were produced (empty clip): treat it as non-speech
        # rather than dividing by zero below.
        return 0

    average_prob = sum(probs) / len(probs)
    return 1 if average_prob >= threshold else 0
# import torch
# import torchaudio
# # 使用示例
# audio = torchaudio.load("/home/lidonghaowsl/develop/VeriSilicon_Cup_Competition_preliminary_round/dataset/sr/combined_wav/XiaoYuan/XiaoYuan_012.wav")[0]
# audio = audio.squeeze()  # 确保音频是1D张量
# prob_sequence = detect_speech_sequence(audio)

# print(f"输入长度: {len(audio)}, 输出概率序列长度: {len(prob_sequence)}")
# print(f"概率序列: {prob_sequence[:5]}...")  # 显示前5个概率值
# # 计算平均概率
# average_prob = sum(prob_sequence) / len(prob_sequence)
# print(f"平均概率: {average_prob:.4f}")

# # 计算语音标签序列
# label = label_speech_sequence(audio, threshold=0.5)
# print(f"语音标签: {label} (1表示语音，0表示非语音)")

# def label_batch_numpy_array(numpy_array, sampling_rate=8000, threshold=0.5):
#     """
#     输入: (n, 8000) 形状的 numpy 数组
#     输出: n 维向量，包含每行的语音标签
#     """
#     # 确保输入是 numpy 数组
#     numpy_array = np.asarray(numpy_array)
#     n = numpy_array.shape[0]
#     labels = []

#     for i in range(n):
#         # 转为 torch.Tensor 并确保是 float32
#         audio_tensor = torch.from_numpy(numpy_array[i]).float()
#         label = label_speech_sequence(audio_tensor, sampling_rate, threshold)
#         labels.append(label)

#     return np.array(labels, dtype=np.int32)
# 封装一个多进程处理函数
def _process_single_audio(args):
    """Pool worker: unpack ``(audio_array, sampling_rate, threshold)`` and return the clip's label."""
    clip, rate, cutoff = args
    return label_speech_sequence(torch.from_numpy(clip).float(), rate, cutoff)
def label_batch_numpy_array(numpy_array, sampling_rate=8000, threshold=0.5):
    """
    Label every row of a batch of clips using worker processes.

    Args:
        numpy_array: array-like whose first axis indexes the clips
            (the notebook passes arrays of shape (n, 8000)).
        sampling_rate: forwarded to the per-clip labelling function.
        threshold: forwarded to the per-clip labelling function.

    Returns:
        np.ndarray of dtype int32 and shape (n,), one label per row.
    """
    numpy_array = np.asarray(numpy_array)
    n = numpy_array.shape[0]

    # Nothing to label: return immediately instead of starting worker processes.
    if n == 0:
        return np.zeros(0, dtype=np.int32)

    # One argument tuple per clip
    args_list = [(numpy_array[i], sampling_rate, threshold) for i in range(n)]

    # Never start more workers than there are clips to label.
    with Pool(processes=min(cpu_count(), n)) as pool:
        labels = pool.map(_process_single_audio, args_list)

    return np.array(labels, dtype=np.int32)

# noise_labels = label_batch_numpy_array(noise_numpy_norm)
# print(f"噪声标签: {noise_labels[:10]}")  # 显示前10个标签
# others_labels = label_batch_numpy_array(others_numpy_norm)
# print(f"其他标签: {others_labels[:10]}")  # 显示前10个标签  
# xiaoxin_labels = label_batch_numpy_array(xiaoxin_numpy_norm)
# print(f"小新标签: {xiaoxin_labels[:10]}")  # 显示前10个标签
# xiaoyuan_labels = label_batch_numpy_array(xiaoyuan_numpy_norm)
# print(f"小苑标签: {xiaoyuan_labels[:10]}")  # 显示前10个标签
# xiaoyuan_noise_labels = label_batch_numpy_array(xiaoyuan_noise_numpy)
# print(f"小苑噪声标签: {xiaoyuan_noise_labels[:10]}")  # 显示前10个标签
# xiaoxin_noise_labels = label_batch_numpy_array(xiaoxin_noise_numpy)
# print(f"小新噪声标签: {xiaoxin_noise_labels[:10]}")  # 显示前10个标签
# others_noise_labels = label_batch_numpy_array(others_noise_numpy)
# print(f"其他噪声标签: {others_noise_labels[:10]}")  # 显示前10个标签

# 二、VAD 数据准备

## 划分数据集（文件级）

In [4]:
from myutils.split_by_weighted_duration import main as split_dataset
# Per-category weights handed to split_dataset as weight_dict.
# NOTE(review): how the weights are interpreted is defined in
# myutils.split_by_weighted_duration — confirm there.
categorys = {
             "noise":1,     # later expands to 3+3*3*5=3+45=48
             "board":1,       # 1+1*3*5=16
             }              # speech:non-speech = 1:1
# Split the files under root_dir and write the result to out_dir.
# split_ratio presumably means (train, val, test) fractions — TODO confirm.
split_dataset(root_dir="vad_brd_only/00origin_wav",
               out_dir="vad_brd_only/00origin_wav_split", 
               weight_dict=categorys,
               split_ratio=(0.8, 0.12, 0.08))

限制标签：noise，最大总时长为 7271.97 秒
✅ 划分完成！结果保存在 vad_brd_only/00origin_wav_split


In [None]:
from myutils.file_copier import copy_files_to_subfolders

# One manifest per split, paired by position with its destination folder.
# The first pair must be the train split: with "test" listed twice the train
# files are never copied and the test files are copied twice.
txt_paths = ["vad_brd_only/00origin_wav_split/train.txt",
             "vad_brd_only/00origin_wav_split/val.txt",
             "vad_brd_only/00origin_wav_split/test.txt"]
target_dirs = ["vad_brd_only/00origin_wav_split/train",
               "vad_brd_only/00origin_wav_split/val",
               "vad_brd_only/00origin_wav_split/test"]

# Copy the files named in each manifest into that split's folder
for txt_path, target_dir in zip(txt_paths, target_dirs):
    copy_files_to_subfolders(target_dir, txt_path)

已复制: vad_brd_only/00origin_wav/noise/cat_122.wav -> vad_brd_only/00origin_wav_split/train/noise/cat_122.wav
已复制: vad_brd_only/00origin_wav/noise/dog_71.wav -> vad_brd_only/00origin_wav_split/train/noise/dog_71.wav
已复制: vad_brd_only/00origin_wav/noise/cat_90.wav -> vad_brd_only/00origin_wav_split/train/noise/cat_90.wav
已复制: vad_brd_only/00origin_wav/noise/dog_40.wav -> vad_brd_only/00origin_wav_split/train/noise/dog_40.wav
已复制: vad_brd_only/00origin_wav/noise/cat_98.wav -> vad_brd_only/00origin_wav_split/train/noise/cat_98.wav
已复制: vad_brd_only/00origin_wav/noise/dog_102.wav -> vad_brd_only/00origin_wav_split/train/noise/dog_102.wav
已复制: vad_brd_only/00origin_wav/noise/cat_150.wav -> vad_brd_only/00origin_wav_split/train/noise/cat_150.wav
已复制: vad_brd_only/00origin_wav/noise/noise_83.wav -> vad_brd_only/00origin_wav_split/train/noise/noise_83.wav
已复制: vad_brd_only/00origin_wav/noise/noise_45.wav -> vad_brd_only/00origin_wav_split/train/noise/noise_45.wav
已复制: vad_brd_only/00origin_wav/n

In [8]:
root_dir = Path('vad_brd_only/00origin_wav_split')
out_root_dir = f"{root_dir.parent}/01combined_wav_split"

# Walk root_dir/<split>/<category>/ and concatenate each category folder into
# a single file at out_root_dir/<split>/<category>.wav
subfolders = [entry for entry in root_dir.iterdir() if entry.is_dir()]
for subfolder in subfolders:
    output_file_dir = f"{out_root_dir}/{subfolder.name}"
    subsubfolders = [entry for entry in subfolder.iterdir() if entry.is_dir()]
    for subsubfolder in subsubfolders:
        # Make sure the destination directory exists before writing into it
        os.makedirs(output_file_dir, exist_ok=True)
        concat_wav_files(subsubfolder, f"{output_file_dir}/{subsubfolder.name}.wav")

拼接完成，输出文件: vad_brd_only/01combined_wav_split/val/board.wav
拼接完成，输出文件: vad_brd_only/01combined_wav_split/val/noise.wav
拼接完成，输出文件: vad_brd_only/01combined_wav_split/test/board.wav
拼接完成，输出文件: vad_brd_only/01combined_wav_split/test/noise.wav
拼接完成，输出文件: vad_brd_only/01combined_wav_split/train/board.wav
拼接完成，输出文件: vad_brd_only/01combined_wav_split/train/noise.wav


In [9]:
root_dir = Path('vad_brd_only/01combined_wav_split')
subfolders = [entry for entry in root_dir.iterdir() if entry.is_dir()]
numpy_dict = {}
for subfolder in subfolders:
    # Snapshot the wav files first so the listing is fixed before split_wav runs
    wav_files = [entry for entry in subfolder.iterdir()
                 if entry.suffix == '.wav' and entry.is_file()]
    if wav_files:
        for wav_file in wav_files:
            split_wav(wav_file)
    else:
        print(f"⚠️ 子文件夹 {subfolder} 中没有找到任何 wav 文件，跳过处理。")

已保存: vad_brd_only/01combined_wav_split/val/board/board_000.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_001.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_002.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_003.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_004.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_005.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_006.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_007.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_008.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_009.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_010.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_011.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_012.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_013.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_014.wav
已保存: vad_brd_only/01combined_wav_split/val/board/board_

In [10]:
root_dir = Path('vad_brd_only/01combined_wav_split')
subfolders = [entry for entry in root_dir.iterdir() if entry.is_dir()]
numpy_dict = {}
for subfolder in subfolders:
    subsubfolders = [entry for entry in subfolder.iterdir() if entry.is_dir()]
    if subsubfolders:
        for subsubfolder in subsubfolders:
            # Arrays are stored under the key "<split>_<category>"
            dict_key = subfolder.name + "_" + subsubfolder.name
            loaded = load_all_wav_files_to_numpy_check_length(subsubfolder)
            numpy_dict[dict_key] = loaded
            print(subfolder.name, "_", subsubfolder.name, ": ", loaded.shape)
    else:
        print(f"⚠️ 子文件夹 {subfolder} 没有子文件夹，跳过处理。")


⚠️ 文件 vad_brd_only/01combined_wav_split/val/board/board_871.wav 采样点数: 5472（期望 8000）
val _ board :  (871, 8000)
⚠️ 文件 vad_brd_only/01combined_wav_split/val/noise/noise_872.wav 采样点数: 1898（期望 8000）
val _ noise :  (872, 8000)
⚠️ 文件 vad_brd_only/01combined_wav_split/test/board/board_583.wav 采样点数: 5312（期望 8000）
test _ board :  (583, 8000)
⚠️ 文件 vad_brd_only/01combined_wav_split/test/noise/noise_581.wav 采样点数: 6111（期望 8000）
test _ noise :  (581, 8000)
⚠️ 文件 vad_brd_only/01combined_wav_split/train/board/board_5815.wav 采样点数: 7840（期望 8000）
train _ board :  (5815, 8000)
⚠️ 文件 vad_brd_only/01combined_wav_split/train/noise/noise_5815.wav 采样点数: 7771（期望 8000）
train _ noise :  (5815, 8000)


In [11]:
# List the "<split>_<category>" keys that were loaded into numpy_dict
print(numpy_dict.keys())

dict_keys(['val_board', 'val_noise', 'test_board', 'test_noise', 'train_board', 'train_noise'])


## 归一化音频数据

In [12]:
# Normalise every array in numpy_dict and save each result as <key>_norm.npy
numpy_norm_dict = {}
root_dir = Path('vad_brd_only/02numpy_norm')
root_dir.mkdir(parents=True, exist_ok=True)

for key, clips in numpy_dict.items():
    normalized = normalize_audio_batch(clips)
    numpy_norm_dict[key] = normalized
    np.save(root_dir / f"{key}_norm.npy", normalized)
    print(f"已保存归一化后的 {key} 到 {root_dir / f'{key}_norm.npy'}")

已保存归一化后的 val_board 到 vad_brd_only/02numpy_norm/val_board_norm.npy
已保存归一化后的 val_noise 到 vad_brd_only/02numpy_norm/val_noise_norm.npy
已保存归一化后的 test_board 到 vad_brd_only/02numpy_norm/test_board_norm.npy
已保存归一化后的 test_noise 到 vad_brd_only/02numpy_norm/test_noise_norm.npy
已保存归一化后的 train_board 到 vad_brd_only/02numpy_norm/train_board_norm.npy
已保存归一化后的 train_noise 到 vad_brd_only/02numpy_norm/train_noise_norm.npy


## 混合数据生成：不同信噪比（SNR）

In [None]:
mixed_dict = {}  # mixing results, keyed "<group>_<first source>_<second source>"
snr_list = [-3, 0, 10]

# The three dataset splits to process
groups = ['val', 'test', 'train']

for group in groups:
    # The two source arrays of this split
    keys_in_group = [f"{group}_board", f"{group}_noise"]

    # Every ordered pair of two different sources (no source is mixed with itself)
    ordered_pairs = [
        (key1, key2)
        for i, key1 in enumerate(keys_in_group)
        for j, key2 in enumerate(keys_in_group)
        if i != j
    ]

    for key1, key2 in ordered_pairs:
        # NOTE(review): this reads the raw numpy_dict, not numpy_norm_dict;
        # normalize_audio_batch is handed to the mixer instead — confirm intended.
        mixed_key = f"{group}_{key1.split('_')[1]}_{key2.split('_')[1]}"
        mixed_dict[mixed_key] = mix_audio_batches(
            numpy_dict[key1], numpy_dict[key2], snr_list, normalize_audio_batch
        )


In [19]:
# List the "<group>_<first source>_<second source>" keys produced by the mixing step
print(mixed_dict.keys())

dict_keys(['val_board_noise', 'val_noise_board', 'test_board_noise', 'test_noise_board', 'train_board_noise', 'train_noise_board'])


In [20]:
# Turn each mix's per-SNR results into a single value via concatenate_mixed_audio
# (presumably one array per SNR joined into one — confirm in that helper).
concatenated_dict = {
    mixed_key: concatenate_mixed_audio(snr_dict)
    for mixed_key, snr_dict in mixed_dict.items()
}

print(concatenated_dict.keys())

dict_keys(['val_board_noise', 'val_noise_board', 'test_board_noise', 'test_noise_board', 'train_board_noise', 'train_noise_board'])


In [21]:
# Group the concatenated mixes by the split prefix of their key
arrays_by_prefix = {"train_": [], "val_": [], "test_": []}
for mixed_key, array in concatenated_dict.items():
    matching_prefix = next(
        (prefix for prefix in arrays_by_prefix if mixed_key.startswith(prefix)),
        None,
    )
    if matching_prefix is not None:
        arrays_by_prefix[matching_prefix].append(array)

train_arrays = arrays_by_prefix["train_"]
val_arrays = arrays_by_prefix["val_"]
test_arrays = arrays_by_prefix["test_"]

# Join each group along the sample axis; a group with no arrays becomes None
train_mixed_data = None if not train_arrays else np.concatenate(train_arrays, axis=0)
val_mixed_data = None if not val_arrays else np.concatenate(val_arrays, axis=0)
test_mixed_data = None if not test_arrays else np.concatenate(test_arrays, axis=0)
# Report the shapes
print(f"训练数据形状: {train_mixed_data.shape if train_mixed_data is not None else '无数据'}")
print(f"验证数据形状: {val_mixed_data.shape if val_mixed_data is not None else '无数据'}")
print(f"测试数据形状: {test_mixed_data.shape if test_mixed_data is not None else '无数据'}")

训练数据形状: (34890, 8000)
验证数据形状: (5229, 8000)
测试数据形状: (3492, 8000)


In [22]:
# Collect the un-mixed source arrays of each split.
# NOTE(review): these come from numpy_dict (numpy_norm_dict is not used here) —
# confirm the un-normalised arrays are the intended ones.
train_arrays = [array for origin_key, array in numpy_dict.items()
                if origin_key.startswith("train_")]
val_arrays = [array for origin_key, array in numpy_dict.items()
              if origin_key.startswith("val_")]
test_arrays = [array for origin_key, array in numpy_dict.items()
               if origin_key.startswith("test_")]

# Join each group along the sample axis; a group with no arrays becomes None
train_origin_data = None if not train_arrays else np.concatenate(train_arrays, axis=0)
val_origin_data = None if not val_arrays else np.concatenate(val_arrays, axis=0)
test_origin_data = None if not test_arrays else np.concatenate(test_arrays, axis=0)
# Report the shapes
print(f"训练数据形状: {train_origin_data.shape if train_origin_data is not None else '无数据'}")
print(f"验证数据形状: {val_origin_data.shape if val_origin_data is not None else '无数据'}")
print(f"测试数据形状: {test_origin_data.shape if test_origin_data is not None else '无数据'}")

训练数据形状: (11630, 8000)
验证数据形状: (1743, 8000)
测试数据形状: (1164, 8000)


In [23]:
# Final per-split datasets: the mixed clips followed by the un-mixed source clips
train_data = np.concatenate([train_mixed_data, train_origin_data], axis=0)
val_data = np.concatenate([val_mixed_data, val_origin_data], axis=0)
test_data = np.concatenate([test_mixed_data, test_origin_data], axis=0)

# Report the shapes
print(f"训练数据形状: {train_data.shape}")
print(f"验证数据形状: {val_data.shape}")
print(f"测试数据形状: {test_data.shape}")


训练数据形状: (46520, 8000)
验证数据形状: (6972, 8000)
测试数据形状: (4656, 8000)


## 教师模型Silero自动标注

In [24]:
# Produce one label per clip for every split with label_batch_numpy_array
# (default sampling_rate and threshold).
train_labels = label_batch_numpy_array(train_data)
val_labels = label_batch_numpy_array(val_data)
test_labels = label_batch_numpy_array(test_data)
# Report the label array shapes
print(f"训练标签形状: {train_labels.shape}")
print(f"验证标签形状: {val_labels.shape}")
print(f"测试标签形状: {test_labels.shape}")

训练标签形状: (46520,)
验证标签形状: (6972,)
测试标签形状: (4656,)


In [25]:
# Output directory for the final arrays (created if missing)
root_dir = Path('vad_brd_only/03final(snr+mfcc)')
root_dir.mkdir(parents=True, exist_ok=True)

# Save the waveform arrays and their labels, one .npy file per split
np.save(root_dir / "train_data.npy", train_data)
np.save(root_dir / "val_data.npy", val_data)
np.save(root_dir / "test_data.npy", test_data)
np.save(root_dir / "train_labels.npy", train_labels)
np.save(root_dir / "val_labels.npy", val_labels)
np.save(root_dir / "test_labels.npy", test_labels)

## MFCC

In [26]:
# Compute the MFCC features of every split from the waveform arrays
train_mfcc = compute_batch_mfcc_features_parallel(train_data)
val_mfcc = compute_batch_mfcc_features_parallel(val_data)
test_mfcc = compute_batch_mfcc_features_parallel(test_data)

In [27]:
# Print the MFCC array shapes
print(f"训练MFCC形状: {train_mfcc.shape}")
print(f"验证MFCC形状: {val_mfcc.shape}")
print(f"测试MFCC形状: {test_mfcc.shape}")

训练MFCC形状: (46520, 31, 13)
验证MFCC形状: (6972, 31, 13)
测试MFCC形状: (4656, 31, 13)


In [28]:
root_dir = Path('vad_brd_only/03final(snr+mfcc)')

# Save the MFCC features of every split
np.save(root_dir / "train_mfcc.npy", train_mfcc)
np.save(root_dir / "val_mfcc.npy", val_mfcc)
np.save(root_dir / "test_mfcc.npy", test_mfcc)

In [29]:
# Count how many samples carry each label, to check whether the classes are balanced
def analyze_label_distribution(labels):
    """
    Count the occurrences of every distinct label.

    Args:
        labels: numpy array of labels, shape (n,).

    Returns:
        dict mapping each distinct label value to the number of times it occurs.
    """
    values, frequencies = np.unique(labels, return_counts=True)
    return {value: frequency for value, frequency in zip(values, frequencies)}
# Label distribution of the training set
train_label_distribution = analyze_label_distribution(train_labels)
print("训练集标签分布:", train_label_distribution)
# Label distribution of the validation set
val_label_distribution = analyze_label_distribution(val_labels)
print("验证集标签分布:", val_label_distribution)
# Label distribution of the test set
test_label_distribution = analyze_label_distribution(test_labels)
print("测试集标签分布:", test_label_distribution)

训练集标签分布: {np.int32(0): np.int64(26580), np.int32(1): np.int64(19940)}
验证集标签分布: {np.int32(0): np.int64(4081), np.int32(1): np.int64(2891)}
测试集标签分布: {np.int32(0): np.int64(2439), np.int32(1): np.int64(2217)}


In [31]:
import numpy as np

# Class distribution of the training labels before balancing
unique, counts = np.unique(train_labels, return_counts=True)
print("当前分布:", dict(zip(unique, counts)))

# Row indices of each class
class_0_indices = np.where(train_labels == 0)[0]
class_1_indices = np.where(train_labels == 1)[0]

# Target size per class = size of the smaller class. It is taken from the two
# index arrays rather than from `counts`, so a missing class is detected
# instead of being silently ignored.
min_count = min(len(class_0_indices), len(class_1_indices))
if min_count == 0:
    raise ValueError("train_labels must contain both class 0 and class 1 to be balanced")
print(f"目标数量: {min_count}")

print(f"类别0的样本数: {len(class_0_indices)}")
print(f"类别1的样本数: {len(class_1_indices)}")

# Fixed seed so the random downsampling is repeatable
np.random.seed(42)

# Downsample whichever class is larger; the smaller class is kept in full.
# (Only the larger class draws from the RNG, so when class 0 is the majority
# the selection is the same as always sampling class 0.)
sampled_class_0_indices = (
    np.random.choice(class_0_indices, size=min_count, replace=False)
    if len(class_0_indices) > min_count else class_0_indices
)
sampled_class_1_indices = (
    np.random.choice(class_1_indices, size=min_count, replace=False)
    if len(class_1_indices) > min_count else class_1_indices
)

# Merge the two index sets
balanced_indices = np.concatenate([sampled_class_0_indices, sampled_class_1_indices])

# Shuffle so the classes are interleaved
np.random.shuffle(balanced_indices)

# Select the balanced features and labels
train_mfcc_balanced = train_mfcc[balanced_indices]
train_labels_balanced = train_labels[balanced_indices]

# Verify the result
unique_balanced, counts_balanced = np.unique(train_labels_balanced, return_counts=True)
print("平衡后分布:", dict(zip(unique_balanced, counts_balanced)))
print(f"平衡后总样本数: {len(train_labels_balanced)}")

当前分布: {np.int32(0): np.int64(26580), np.int32(1): np.int64(19940)}
目标数量: 19940
类别0的样本数: 26580
类别1的样本数: 19940
平衡后分布: {np.int32(0): np.int64(19940), np.int32(1): np.int64(19940)}
平衡后总样本数: 39880


In [32]:
import numpy as np

# Class distribution of the test labels before balancing
unique, counts = np.unique(test_labels, return_counts=True)
print("当前分布:", dict(zip(unique, counts)))

# Row indices of each class
class_0_indices = np.where(test_labels == 0)[0]
class_1_indices = np.where(test_labels == 1)[0]

# Target size per class = size of the smaller class. It is taken from the two
# index arrays rather than from `counts`, so a missing class is detected
# instead of being silently ignored.
min_count = min(len(class_0_indices), len(class_1_indices))
if min_count == 0:
    raise ValueError("test_labels must contain both class 0 and class 1 to be balanced")
print(f"目标数量: {min_count}")

print(f"类别0的样本数: {len(class_0_indices)}")
print(f"类别1的样本数: {len(class_1_indices)}")

# Fixed seed so the random downsampling is repeatable
np.random.seed(42)

# Downsample whichever class is larger; the smaller class is kept in full.
# (Only the larger class draws from the RNG, so when class 0 is the majority
# the selection is the same as always sampling class 0.)
sampled_class_0_indices = (
    np.random.choice(class_0_indices, size=min_count, replace=False)
    if len(class_0_indices) > min_count else class_0_indices
)
sampled_class_1_indices = (
    np.random.choice(class_1_indices, size=min_count, replace=False)
    if len(class_1_indices) > min_count else class_1_indices
)

# Merge the two index sets
balanced_indices = np.concatenate([sampled_class_0_indices, sampled_class_1_indices])

# Shuffle so the classes are interleaved
np.random.shuffle(balanced_indices)

# Select the balanced features and labels
test_mfcc_balanced = test_mfcc[balanced_indices]
test_labels_balanced = test_labels[balanced_indices]

# Verify the result
unique_balanced, counts_balanced = np.unique(test_labels_balanced, return_counts=True)
print("平衡后分布:", dict(zip(unique_balanced, counts_balanced)))
print(f"平衡后总样本数: {len(test_labels_balanced)}")

当前分布: {np.int32(0): np.int64(2439), np.int32(1): np.int64(2217)}
目标数量: 2217
类别0的样本数: 2439
类别1的样本数: 2217
平衡后分布: {np.int32(0): np.int64(2217), np.int32(1): np.int64(2217)}
平衡后总样本数: 4434


In [33]:
import numpy as np

# Class distribution of the validation labels before balancing
unique, counts = np.unique(val_labels, return_counts=True)
print("当前分布:", dict(zip(unique, counts)))

# Row indices of each class
class_0_indices = np.where(val_labels == 0)[0]
class_1_indices = np.where(val_labels == 1)[0]

# Target size per class = size of the smaller class. It is taken from the two
# index arrays rather than from `counts`, so a missing class is detected
# instead of being silently ignored.
min_count = min(len(class_0_indices), len(class_1_indices))
if min_count == 0:
    raise ValueError("val_labels must contain both class 0 and class 1 to be balanced")
print(f"目标数量: {min_count}")

print(f"类别0的样本数: {len(class_0_indices)}")
print(f"类别1的样本数: {len(class_1_indices)}")

# Fixed seed so the random downsampling is repeatable
np.random.seed(42)

# Downsample whichever class is larger; the smaller class is kept in full.
# (Only the larger class draws from the RNG, so when class 0 is the majority
# the selection is the same as always sampling class 0.)
sampled_class_0_indices = (
    np.random.choice(class_0_indices, size=min_count, replace=False)
    if len(class_0_indices) > min_count else class_0_indices
)
sampled_class_1_indices = (
    np.random.choice(class_1_indices, size=min_count, replace=False)
    if len(class_1_indices) > min_count else class_1_indices
)

# Merge the two index sets
balanced_indices = np.concatenate([sampled_class_0_indices, sampled_class_1_indices])

# Shuffle so the classes are interleaved
np.random.shuffle(balanced_indices)

# Select the balanced features and labels
val_mfcc_balanced = val_mfcc[balanced_indices]
val_labels_balanced = val_labels[balanced_indices]

# Verify the result
unique_balanced, counts_balanced = np.unique(val_labels_balanced, return_counts=True)
print("平衡后分布:", dict(zip(unique_balanced, counts_balanced)))
print(f"平衡后总样本数: {len(val_labels_balanced)}")

当前分布: {np.int32(0): np.int64(4081), np.int32(1): np.int64(2891)}
目标数量: 2891
类别0的样本数: 4081
类别1的样本数: 2891
平衡后分布: {np.int32(0): np.int64(2891), np.int32(1): np.int64(2891)}
平衡后总样本数: 5782


In [43]:
root_dir = Path('vad_brd_only/03final(snr+mfcc)')

# Save the class-balanced MFCC features and labels.
# NOTE(review): these are the same file names the unbalanced MFCCs and labels
# were saved under earlier in the notebook, so those files are overwritten here,
# while train/val/test_data.npy keep the unbalanced waveforms and no longer
# match the labels row for row — confirm this is intended.
np.save(root_dir / "train_mfcc.npy", train_mfcc_balanced)
np.save(root_dir / "val_mfcc.npy", val_mfcc_balanced)
np.save(root_dir / "test_mfcc.npy", test_mfcc_balanced)
np.save(root_dir / "train_labels.npy", train_labels_balanced)
np.save(root_dir / "val_labels.npy", val_labels_balanced)
np.save(root_dir / "test_labels.npy", test_labels_balanced)

# 三、VAD 训练

In [44]:
import numpy as np
from pathlib import Path

# Directory holding the saved arrays
root_dir = Path('vad_brd_only/03final(snr+mfcc)')

# Raw waveforms
train_data = np.load(root_dir / "train_data.npy")
val_data   = np.load(root_dir / "val_data.npy")
test_data  = np.load(root_dir / "test_data.npy")

# Labels
train_labels = np.load(root_dir / "train_labels.npy")
val_labels   = np.load(root_dir / "val_labels.npy")
test_labels  = np.load(root_dir / "test_labels.npy")

# MFCC features
train_mfcc = np.load(root_dir / "train_mfcc.npy")
val_mfcc   = np.load(root_dir / "val_mfcc.npy")
test_mfcc  = np.load(root_dir / "test_mfcc.npy")

# MFCC features and labels must be row-aligned, so a mismatch is an error.
# The waveform files may hold a different number of rows than the labels;
# that is only reported, so nobody pairs them with the labels by accident.
for split_name, raw, feats, labs in (
    ("train", train_data, train_mfcc, train_labels),
    ("val", val_data, val_mfcc, val_labels),
    ("test", test_data, test_mfcc, test_labels),
):
    if len(feats) != len(labs):
        raise ValueError(
            f"{split_name}: {len(feats)} MFCC rows but {len(labs)} labels"
        )
    if len(raw) != len(labs):
        print(
            f"⚠️ {split_name}_data.npy has {len(raw)} rows but there are "
            f"{len(labs)} labels; the raw waveforms are not aligned with the labels"
        )

# Append a trailing channel axis. For the MFCC arrays this gives the
# (batch, height, width, channels) layout Conv2D expects; the waveforms
# become (batch, samples, 1).
train_data = train_data[..., np.newaxis]
val_data   = val_data[..., np.newaxis]
test_data  = test_data[..., np.newaxis]

train_mfcc = train_mfcc[..., np.newaxis]
val_mfcc   = val_mfcc[..., np.newaxis]
test_mfcc  = test_mfcc[..., np.newaxis]

# Print the shape of every loaded array
print("Train data shape:", train_data.shape)
print("Val data shape:", val_data.shape)
print("Test data shape:", test_data.shape)

print("Train labels shape:", train_labels.shape)
print("Val labels shape:", val_labels.shape)
print("Test labels shape:", test_labels.shape)

print("Train MFCC shape:", train_mfcc.shape)
print("Val MFCC shape:", val_mfcc.shape)
print("Test MFCC shape:", test_mfcc.shape)

# From here the arrays can be combined freely or wrapped in a dataset class / DataLoader


Train data shape: (46520, 8000, 1)
Val data shape: (6972, 8000, 1)
Test data shape: (4656, 8000, 1)
Train labels shape: (39880,)
Val labels shape: (5782,)
Test labels shape: (4434,)
Train MFCC shape: (39880, 31, 13, 1)
Val MFCC shape: (5782, 31, 13, 1)
Test MFCC shape: (4434, 31, 13, 1)


In [45]:
# Drop index 0 along axis 2 of each MFCC array, i.e. the first coefficient —
# not the first dimension (assumed layout: (batch, time, n_mfcc, 1)).
# NOTE(review): each assignment overwrites its own input, so re-running this
# cell removes one more coefficient every time.
train_mfcc = train_mfcc[:, :, 1:, :]
val_mfcc   = val_mfcc[:, :, 1:, :]
test_mfcc  = test_mfcc[:, :, 1:, :]

# Print the shapes again to confirm the coefficient was removed
print("Train MFCC shape (after removal):", train_mfcc.shape)
print("Val MFCC shape (after removal):", val_mfcc.shape)
print("Test MFCC shape (after removal):", test_mfcc.shape)

# Save the trimmed MFCC features (optional)
np.save(root_dir / "train_mfcc_no0.npy", train_mfcc)
np.save(root_dir / "val_mfcc_no0.npy", val_mfcc)
np.save(root_dir / "test_mfcc_no0.npy", test_mfcc)

print("已保存去除第一维后的 MFCC 特征！")

Train MFCC shape (after removal): (39880, 31, 12, 1)
Val MFCC shape (after removal): (5782, 31, 12, 1)
Test MFCC shape (after removal): (4434, 31, 12, 1)
已保存去除第一维后的 MFCC 特征！


In [46]:
import tensorflow as tf
import numpy as np

def create_vad_model():
    """
    Build the VAD classifier for inputs of shape (31, 12, 1).

    Layers, in order: Conv2D (4 filters, 3x3, 'same' padding, no activation),
    a separate ReLU, AveragePooling2D with a (31, 12) window, Flatten, and a
    Dense layer with 2 sigmoid outputs.

    Returns:
        tf.keras.Model: the uncompiled model.
    """
    inputs = tf.keras.Input(shape=(31, 12, 1))

    # Layers are created in application order, then chained one after another
    layer_stack = [
        tf.keras.layers.Conv2D(4, kernel_size=(3, 3), padding='same'),
        # The activation is kept as its own layer rather than a Conv2D argument
        tf.keras.layers.ReLU(),
        tf.keras.layers.AveragePooling2D(pool_size=(31, 12), padding='valid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(2, activation='sigmoid'),
    ]

    outputs = inputs
    for layer in layer_stack:
        outputs = layer(outputs)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Convert a Keras model to an int8-quantized TFLite model
def quantize_model(model, X_sample):
    """
    Quantize a Keras model to int8 with the TFLite converter.

    Args:
        model: Keras model to convert.
        X_sample: calibration samples; at most the first 100 are used.

    Returns:
        The value returned by converter.convert() (the serialized TFLite model).
    """
    def representative_dataset():
        # Feed the calibration samples one at a time, each as a float32 batch of one
        sample_count = min(100, len(X_sample))
        for idx in range(sample_count):
            yield [tf.cast(X_sample[idx:idx + 1], tf.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    # Restrict the converter to int8 builtin ops, with int8 input and output
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8
    converter.representative_dataset = representative_dataset

    return converter.convert()

In [47]:
import numpy as np
import tensorflow as tf

# One-hot encode the labels (2 classes)
train_labels_cat = tf.keras.utils.to_categorical(train_labels, num_classes=2)
val_labels_cat = tf.keras.utils.to_categorical(val_labels, num_classes=2)

# Build the model
vad_model = create_vad_model()

# Compile.
# NOTE(review): the model's output layer uses sigmoid rather than softmax while
# the loss is categorical_crossentropy — confirm this pairing is intended.
vad_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Train on the MFCC features, validating on the validation split
print("开始训练 VAD 模型")
vad_history = vad_model.fit(
    train_mfcc, train_labels_cat,
    validation_data=(val_mfcc, val_labels_cat),
    epochs=20,
    batch_size=32
)

# Quantize, using the first 100 training samples for calibration,
# and write the result to disk
print("\n量化 VAD 模型")
vad_quant_tflite = quantize_model(vad_model, train_mfcc[:100])
file_path = "vad_model_quant_mfcc12_cls2_sigmoid.tflite"
with open(file_path, "wb") as f:
    f.write(vad_quant_tflite)

print(f"\n✅ 模型已训练并量化完成！文件:'{file_path}'")


开始训练 VAD 模型
Epoch 1/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.5742 - loss: 0.6851 - val_accuracy: 0.6885 - val_loss: 0.6500
Epoch 2/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.6742 - loss: 0.6464 - val_accuracy: 0.7240 - val_loss: 0.5978
Epoch 3/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7145 - loss: 0.6007 - val_accuracy: 0.7402 - val_loss: 0.5557
Epoch 4/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7456 - loss: 0.5598 - val_accuracy: 0.7615 - val_loss: 0.5237
Epoch 5/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7629 - loss: 0.5282 - val_accuracy: 0.7580 - val_loss: 0.5052
Epoch 6/20
[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.7721 - loss: 0.5049 - val_accuracy: 0.7705 - val_loss: 0.4865


INFO:tensorflow:Assets written to: /tmp/tmpmwizuqfg/assets


Saved artifact at '/tmp/tmpmwizuqfg'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 31, 12, 1), dtype=tf.float32, name='keras_tensor_12')
Output Type:
  TensorSpec(shape=(None, 2), dtype=tf.float32, name=None)
Captures:
  139946202005136: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139946204428432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139946152215184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139946152215568: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1753148672.605859   21407 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1753148672.605897   21407 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-07-22 09:44:32.606865: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpmwizuqfg
2025-07-22 09:44:32.608279: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-07-22 09:44:32.608305: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpmwizuqfg
I0000 00:00:1753148672.628107   21407 mlir_graph_optimization_pass.cc:425] MLIR V1 optimization pass is not enabled
2025-07-22 09:44:32.628860: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-07-22 09:44:32.654032: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpmwizuqfg
2025-07-22 09:44:32.661045: I tensorflow/cc/saved_model/loader.cc:471] SavedModel 


✅ 模型已训练并量化完成！文件:'vad_model_quant_mfcc12_cls2_sigmoid.tflite'
