#### 依赖

In [1]:
import os
import random
import shutil
from pathlib import Path
import numpy as np
import torch
import torchaudio
import soundfile as sf
from multiprocessing import Pool, cpu_count
import multiprocessing as mp
from silero_vad import load_silero_vad
from IPython.display import Audio
from scipy.fft import rfft, fft

#### MFCC

In [2]:
class MFCCProcessor:
    """
    Python implementation of MFCC (Mel-Frequency Cepstral Coefficients)
    Based on the RISC-V DSP library implementation
    """
    
    def __init__(self, fft_len=256, nb_mel_filters=40, nb_dct_outputs=13, use_cfft=False):
        """
        Initialize MFCC processor
        
        Args:
            fft_len: FFT length (should match window length)
            nb_mel_filters: Number of mel filter banks
            nb_dct_outputs: Number of DCT outputs (MFCC coefficients)
            use_cfft: Whether to use CFFT instead of RFFT (default: False)
        """
        self.fft_len = fft_len
        self.nb_mel_filters = nb_mel_filters
        self.nb_dct_outputs = nb_dct_outputs
        self.use_cfft = use_cfft
        
        # Initialize coefficient arrays - to be filled with actual data
        self.window_coefs = None
        self.filter_pos = None
        self.filter_lengths = None
        self.filter_coefs = None
        self.dct_coefs = None
        
        self._load_coefficients()
    
    def _load_coefficients(self):
        """Load pre-computed coefficients"""
        
        # Hanning window coefficients (256 points)
        # TODO: Fill with mfcc_window_coefs_hann256 data
        self.window_coefs = np.array([
            # Insert mfcc_window_coefs_hann256 values here
            # Format: [0.000000, 0.000151, 0.000602, ...]
            0.000000,0.000151,0.000602,0.001355,0.002408,0.003760,0.005412,0.007361,0.009607,0.012149,
            0.014984,0.018112,0.021530,0.025236,0.029228,0.033504,0.038060,0.042895,0.048005,0.053388,
            0.059039,0.064957,0.071136,0.077573,0.084265,0.091208,0.098396,0.105827,0.113495,0.121396,
            0.129524,0.137876,0.146447,0.155230,0.164221,0.173414,0.182803,0.192384,0.202150,0.212096,
            0.222215,0.232501,0.242949,0.253551,0.264302,0.275194,0.286222,0.297379,0.308658,0.320052,
            0.331555,0.343159,0.354858,0.366644,0.378510,0.390449,0.402455,0.414519,0.426635,0.438795,
            0.450991,0.463218,0.475466,0.487729,0.500000,0.512271,0.524534,0.536782,0.549009,0.561205,
            0.573365,0.585481,0.597545,0.609551,0.621490,0.633356,0.645142,0.656841,0.668445,0.679947,
            0.691342,0.702621,0.713778,0.724806,0.735698,0.746449,0.757051,0.767499,0.777785,0.787904,
            0.797850,0.807616,0.817197,0.826586,0.835779,0.844770,0.853553,0.862124,0.870476,0.878604,
            0.886505,0.894173,0.901604,0.908792,0.915735,0.922427,0.928864,0.935044,0.940961,0.946612,
            0.951995,0.957105,0.961940,0.966496,0.970772,0.974764,0.978470,0.981888,0.985016,0.987851,
            0.990393,0.992639,0.994588,0.996240,0.997592,0.998645,0.999398,0.999849,1.000000,0.999849,
            0.999398,0.998645,0.997592,0.996240,0.994588,0.992639,0.990393,0.987851,0.985016,0.981888,
            0.978470,0.974764,0.970772,0.966496,0.961940,0.957105,0.951995,0.946612,0.940961,0.935044,
            0.928864,0.922427,0.915735,0.908792,0.901604,0.894173,0.886505,0.878604,0.870476,0.862124,
            0.853553,0.844770,0.835779,0.826586,0.817197,0.807616,0.797850,0.787904,0.777785,0.767499,
            0.757051,0.746449,0.735698,0.724806,0.713778,0.702621,0.691342,0.679947,0.668445,0.656841,
            0.645142,0.633356,0.621490,0.609551,0.597545,0.585481,0.573365,0.561205,0.549009,0.536782,
            0.524534,0.512271,0.500000,0.487729,0.475466,0.463218,0.450991,0.438795,0.426635,0.414519,
            0.402455,0.390449,0.378510,0.366644,0.354858,0.343159,0.331555,0.320052,0.308658,0.297379,
            0.286222,0.275194,0.264302,0.253551,0.242949,0.232501,0.222215,0.212096,0.202150,0.192384,
            0.182803,0.173414,0.164221,0.155230,0.146447,0.137876,0.129524,0.121396,0.113495,0.105827,
            0.098396,0.091208,0.084265,0.077573,0.071136,0.064957,0.059039,0.053388,0.048005,0.042895,
            0.038060,0.033504,0.029228,0.025236,0.021530,0.018112,0.014984,0.012149,0.009607,0.007361,
            0.005412,0.003760,0.002408,0.001355,0.000602,0.000151
        ], dtype=np.float32)
        
        # MEL filter positions (40 filters)
        # TODO: Fill with mfcc_filter_pos_mel40 data
        self.filter_pos = np.array([
            # Insert mfcc_filter_pos_mel40 values here
            # Format: [1, 2, 3, 4, 5, 6, 8, 9, ...]
            1,2,3,4,5,6,8,9,11,12,
            14,15,17,19,21,23,25,27,30,32,
            35,38,40,43,46,50,53,57,60,64,
            68,73,77,82,87,92,97,103,109,115,
        ], dtype=np.uint32)
        
        # MEL filter lengths (40 filters)
        # TODO: Fill with mfcc_filter_len_mel40 data
        self.filter_lengths = np.array([
            # Insert mfcc_filter_len_mel40 values here
            # Format: [2, 2, 2, 2, 3, 3, 3, 3, ...]
            2,2,2,2,3,3,3,3,3,3,
            3,4,4,4,4,4,5,5,5,6,
            5,5,6,7,7,7,7,7,8,9,
            9,9,10,10,10,11,12,12,13,13,
        ], dtype=np.uint32)
        
        # MEL filter coefficients
        # TODO: Fill with mfcc_filter_coefs_mel40 data
        self.filter_coefs = np.array([
            # Insert mfcc_filter_coefs_mel40 values here
            # Format: [0.940365, 0.158628, 0.841372, ...]
            0.940365,0.158628,0.841372,0.293816,0.706184,0.462403,0.537597,0.661904,0.338096,0.890104,
            0.145015,0.109896,0.854985,0.424850,0.575150,0.727995,0.052989,0.272005,0.947011,0.398503,
            0.601497,0.763326,0.146352,0.236674,0.853648,0.546566,0.453434,0.963036,0.394905,0.036964,
            0.605095,0.841380,0.301730,0.158620,0.698270,0.775275,0.261386,0.224725,0.738614,0.759477,
            0.269002,0.240523,0.730998,0.789451,0.320349,0.210549,0.679651,0.861250,0.411736,0.138750,
            0.588264,0.971416,0.539920,0.116902,0.028584,0.460080,0.883098,0.702035,0.295011,0.297965,
            0.704989,0.895539,0.503343,0.118164,0.104461,0.496657,0.881836,0.739755,0.367882,0.002322,
            0.260245,0.632118,0.997678,0.642866,0.289313,0.357134,0.710687,0.941471,0.599160,0.262206,
            0.058529,0.400840,0.737794,0.930444,0.603716,0.281873,0.069556,0.396284,0.718127,0.964769,
            0.652268,0.344238,0.040553,0.035231,0.347732,0.655761,0.959447,0.741092,0.445738,0.154382,
            0.258908,0.554262,0.845618,0.866915,0.583236,0.303246,0.026850,0.133085,0.416764,0.696754,
            0.973150,0.753958,0.484481,0.218335,0.246042,0.515519,0.781665,0.955439,0.695714,0.439085,
            0.185479,0.044561,0.304286,0.560915,0.814521,0.934825,0.687055,0.442105,0.199909,0.065175,
            0.312945,0.557895,0.800091,0.960408,0.723542,0.489253,0.257486,0.028188,0.039592,0.276458,
            0.510747,0.742514,0.971812,0.801306,0.576789,0.354590,0.134660,0.198694,0.423211,0.645410,
            0.865340,0.916954,0.701428,0.488037,0.276741,0.067498,0.083046,0.298572,0.511963,0.723259,
            0.932502,0.860269,0.655015,0.451700,0.250287,0.050740,0.139731,0.344985,0.548300,0.749713,
            0.949260,0.853026,0.657111,0.462962,0.270549,0.079839,0.146974,0.342889,0.537038,0.729451,
            0.920161,0.890805,0.703415,0.517642,0.333459,0.150837,0.109195,0.296585,0.482358,0.666541,
            0.849163,0.969752,0.790177,0.612087,0.435458,0.260267,0.086489,0.030248,0.209823,0.387913,
            0.564542,0.739733,0.913511,0.914103,0.743086,0.573416,0.405074,0.238037,0.072286,0.085897,
            0.256914,0.426584,0.594926,0.761963,0.927714,0.907802,0.744564,0.582555,0.421755,0.262148,
            0.103715,0.092198,0.255436,0.417445,0.578245,0.737852,0.896285,0.946440,0.790305,0.635294,
            0.481391,0.328580,0.176846,0.026174,0.053560,0.209695,0.364706,0.518609,0.671420,0.823154,
            0.973826,0.876550,0.727957,0.580384,0.433814,0.288236,0.143636
        ], dtype=np.float32)
        
        # DCT coefficients matrix (13 x 40)
        # TODO: Fill with mfcc_dct_coefs_dct13 data
        dct_coefs_flat = np.array([
            # Insert mfcc_dct_coefs_dct13 values here
            # Format: [0.223607, 0.223607, 0.223607, ...]
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,0.223607,
            0.223434,0.222057,0.219310,0.215212,0.209786,0.203067,0.195096,0.185922,0.175602,0.164200,
            0.151784,0.138434,0.124229,0.109259,0.093615,0.077394,0.060696,0.043624,0.026282,0.008779,
            -0.008779,-0.026282,-0.043624,-0.060696,-0.077394,-0.093615,-0.109259,-0.124229,-0.138434,-0.151784,
            -0.164200,-0.175602,-0.185922,-0.195096,-0.203067,-0.209786,-0.215212,-0.219310,-0.222057,-0.223434,
            0.222917,0.217429,0.206586,0.190656,0.170032,0.145221,0.116834,0.085571,0.052200,0.017544,
            -0.017544,-0.052200,-0.085571,-0.116834,-0.145221,-0.170032,-0.190656,-0.206586,-0.217429,-0.222917,
            -0.222917,-0.217429,-0.206586,-0.190656,-0.170032,-0.145221,-0.116834,-0.085571,-0.052200,-0.017544,
            0.017544,0.052200,0.085571,0.116834,0.145221,0.170032,0.190656,0.206586,0.217429,0.222917,
            0.222057,0.209786,0.185922,0.151784,0.109259,0.060696,0.008779,-0.043624,-0.093615,-0.138434,
            -0.175602,-0.203067,-0.219310,-0.223434,-0.215212,-0.195096,-0.164200,-0.124229,-0.077394,-0.026282,
            0.026282,0.077394,0.124229,0.164200,0.195096,0.215212,0.223434,0.219310,0.203067,0.175602,
            0.138434,0.093615,0.043624,-0.008779,-0.060696,-0.109259,-0.151784,-0.185922,-0.209786,-0.222057,
            0.220854,0.199235,0.158114,0.101515,0.034980,-0.034980,-0.101515,-0.158114,-0.199235,-0.220854,
            -0.220854,-0.199235,-0.158114,-0.101515,-0.034980,0.034980,0.101515,0.158114,0.199235,0.220854,
            0.220854,0.199235,0.158114,0.101515,0.034980,-0.034980,-0.101515,-0.158114,-0.199235,-0.220854,
            -0.220854,-0.199235,-0.158114,-0.101515,-0.034980,0.034980,0.101515,0.158114,0.199235,0.220854,
            0.219310,0.185922,0.124229,0.043624,-0.043624,-0.124229,-0.185922,-0.219310,-0.219310,-0.185922,
            -0.124229,-0.043624,0.043624,0.124229,0.185922,0.219310,0.219310,0.185922,0.124229,0.043624,
            -0.043624,-0.124229,-0.185922,-0.219310,-0.219310,-0.185922,-0.124229,-0.043624,0.043624,0.124229,
            0.185922,0.219310,0.219310,0.185922,0.124229,0.043624,-0.043624,-0.124229,-0.185922,-0.219310,
            0.217429,0.170032,0.085571,-0.017544,-0.116834,-0.190656,-0.222917,-0.206586,-0.145221,-0.052200,
            0.052200,0.145221,0.206586,0.222917,0.190656,0.116834,0.017544,-0.085571,-0.170032,-0.217429,
            -0.217429,-0.170032,-0.085571,0.017544,0.116834,0.190656,0.222917,0.206586,0.145221,0.052200,
            -0.052200,-0.145221,-0.206586,-0.222917,-0.190656,-0.116834,-0.017544,0.085571,0.170032,0.217429,
            0.215212,0.151784,0.043624,-0.077394,-0.175602,-0.222057,-0.203067,-0.124229,-0.008779,0.109259,
            0.195096,0.223434,0.185922,0.093615,-0.026282,-0.138434,-0.209786,-0.219310,-0.164200,-0.060696,
            0.060696,0.164200,0.219310,0.209786,0.138434,0.026282,-0.093615,-0.185922,-0.223434,-0.195096,
            -0.109259,0.008779,0.124229,0.203067,0.222057,0.175602,0.077394,-0.043624,-0.151784,-0.215212,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.212663,0.131433,-0.000000,-0.131433,-0.212663,-0.212663,-0.131433,-0.000000,0.131433,0.212663,
            0.209786,0.109259,-0.043624,-0.175602,-0.223434,-0.164200,-0.026282,0.124229,0.215212,0.203067,
            0.093615,-0.060696,-0.185922,-0.222057,-0.151784,-0.008779,0.138434,0.219310,0.195096,0.077394,
            -0.077394,-0.195096,-0.219310,-0.138434,0.008779,0.151784,0.222057,0.185922,0.060696,-0.093615,
            -0.203067,-0.215212,-0.124229,0.026282,0.164200,0.223434,0.175602,0.043624,-0.109259,-0.209786,
            0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,
            -0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,
            -0.206586,-0.085571,0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,
            0.085571,0.206586,0.206586,0.085571,-0.085571,-0.206586,-0.206586,-0.085571,0.085571,0.206586,
            0.203067,0.060696,-0.124229,-0.222057,-0.164200,0.008779,0.175602,0.219310,0.109259,-0.077394,
            -0.209786,-0.195096,-0.043624,0.138434,0.223434,0.151784,-0.026282,-0.185922,-0.215212,-0.093615,
            0.093615,0.215212,0.185922,0.026282,-0.151784,-0.223434,-0.138434,0.043624,0.195096,0.209786,
            0.077394,-0.109259,-0.219310,-0.175602,-0.008779,0.164200,0.222057,0.124229,-0.060696,-0.203067,
            0.199235,0.034980,-0.158114,-0.220854,-0.101515,0.101515,0.220854,0.158114,-0.034980,-0.199235,
            -0.199235,-0.034980,0.158114,0.220854,0.101515,-0.101515,-0.220854,-0.158114,0.034980,0.199235,
            0.199235,0.034980,-0.158114,-0.220854,-0.101515,0.101515,0.220854,0.158114,-0.034980,-0.199235,
            -0.199235,-0.034980,0.158114,0.220854,0.101515,-0.101515,-0.220854,-0.158114,0.034980,0.199235
        ], dtype=np.float32)
        
        # Reshape DCT coefficients to matrix form
        if len(dct_coefs_flat) > 0:
            self.dct_coefs = dct_coefs_flat.reshape(self.nb_dct_outputs, self.nb_mel_filters)
        else:
            self.dct_coefs = np.zeros((self.nb_dct_outputs, self.nb_mel_filters), dtype=np.float32)
    
    def compute_mfcc(self, input_signal):
        """
        Compute MFCC features from input signal
        
        Args:
            input_signal: Input audio signal (numpy array)
            
        Returns:
            mfcc_features: MFCC coefficients (numpy array)
        """
        # Ensure input is the correct length and type
        if len(input_signal) != self.fft_len:
            raise ValueError(f"Input signal length ({len(input_signal)}) must match FFT length ({self.fft_len})")
        
        src = input_signal.astype(np.float32).copy()
        
        # # Step 1: Normalize
        # max_value = np.max(np.abs(src))
        # max_index = np.argmax(np.abs(src))
        
        # # if max_value != 0.0:
        # #     src = src / max_value
        
        # Step 2: Apply window function
        if self.window_coefs is not None and len(self.window_coefs) == self.fft_len:
            src = src * self.window_coefs
        
        # Step 3: Compute spectrum magnitude
        if self.use_cfft:
            # CFFT-based implementation
            # Convert real to complex
            complex_signal = np.zeros(self.fft_len, dtype=np.complex64)
            complex_signal.real = src
            complex_signal.imag = 0.0
            
            # Compute FFT
            fft_result = fft(complex_signal)
            spectrum_mag = np.abs(fft_result)
        else:
            # RFFT-based implementation (default)
            fft_result = rfft(src)
            
            # Unpack real values (mimic the C code behavior)
            tmp = np.zeros(self.fft_len + 2, dtype=np.float32)
            tmp[0] = fft_result[0].real  # DC component
            
            # Pack real and imaginary parts
            for i in range(1, len(fft_result)):
                if i < self.fft_len // 2:
                    tmp[2*i] = fft_result[i].real
                    tmp[2*i + 1] = fft_result[i].imag
            
            # Handle Nyquist frequency
            if len(fft_result) > self.fft_len // 2:
                tmp[self.fft_len] = fft_result[-1].real
                tmp[self.fft_len + 1] = 0.0
            
            tmp[1] = 0.0  # Set imaginary part of DC to 0
            
            # Compute magnitude
            spectrum_mag = np.zeros(self.fft_len, dtype=np.float32)
            for i in range(self.fft_len):
                if i == 0:
                    spectrum_mag[i] = abs(tmp[0])
                elif i < self.fft_len // 2:
                    spectrum_mag[i] = np.sqrt(tmp[2*i]**2 + tmp[2*i + 1]**2)
                else:
                    spectrum_mag[i] = spectrum_mag[self.fft_len - i]
        
        # # Restore original scale if normalization was applied
        # if max_value != 0.0:
        #     spectrum_mag = spectrum_mag * max_value
        # spectrum_mag *= 32.0
        # Step 4: Apply MEL filters
        mel_outputs = np.zeros(self.nb_mel_filters, dtype=np.float32)
        coef_idx = 0
        
        if (self.filter_pos is not None and self.filter_lengths is not None and 
            self.filter_coefs is not None):
            
            for i in range(self.nb_mel_filters):
                pos = self.filter_pos[i]
                length = self.filter_lengths[i]
                
                # Compute dot product
                result = 0.0
                for j in range(length):
                    if pos + j < len(spectrum_mag) and coef_idx + j < len(self.filter_coefs):
                        result += spectrum_mag[pos + j] * self.filter_coefs[coef_idx + j]
                
                mel_outputs[i] = result
                coef_idx += length
        
        # Step 5: Compute logarithm
        # Add small offset to avoid log(0)
        mel_outputs = mel_outputs + 1.0e-6
        log_mel = np.log(mel_outputs)
        
        # Step 6: Apply DCT transformation
        if self.dct_coefs is not None:
            mfcc_features = np.dot(self.dct_coefs, log_mel)
        else:
            mfcc_features = log_mel[:self.nb_dct_outputs]
        # ② 计算帧能量并替换 C0
        # log_energy = np.log(np.sum(src**2) + 1e-30)
        # mfcc_features = np.dot(self.dct_coefs, log_mel)
        # mfcc_features[0] = log_energy
        return mfcc_features


# 工厂函数：返回一个新的 MFCCProcessor 实例
def mfcc_processor_factory():
    return MFCCProcessor(
        fft_len=256,
        nb_mel_filters=40,
        nb_dct_outputs=13,
        use_cfft=True
    )

# 子进程中调用的函数
def process_row(row_data):
    segment_size = 256
    num_segments = 31
    num_features = 13
    row_features = np.zeros((num_segments, num_features), dtype=np.float32)
    
    # 在子进程中创建 mfcc_processor
    mfcc_processor = mfcc_processor_factory()
    
    for j in range(num_segments):
        start = j * segment_size
        end = start + segment_size
        segment = row_data[start:end]
        mfcc = mfcc_processor.compute_mfcc(segment)
        row_features[j, :] = mfcc
    
    return row_features

def compute_batch_mfcc_features_parallel(input_array):
    """
    多进程并行处理 (n, 8000) 音频数组，返回 (n, 31, 13) 特征。
    """
    n_rows = input_array.shape[0]

    # 启动进程池，使用所有可用核心
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(process_row, [input_array[i] for i in range(n_rows)])

    # 拼接结果
    return np.stack(results, axis=0)

# 2. 数据集构建

## 2.1 OFFICIAL数据集

In [3]:
LABEL = 'NOISE'

### 2.1.1 合并各类为单wav文件

In [4]:
import resampy
def concat_wav_files(input_folder, output_file, target_sr=8000):
    """
    拼接指定文件夹下所有wav文件，递归查找（按路径名排序）.
    自动转换为单声道、统一采样率为 target_sr。
    
    参数:
        input_folder (str or Path): 输入文件夹路径
        output_file (str or Path): 输出文件路径
        target_sr (int): 目标采样率，默认8000
    """
    input_folder = Path(input_folder)
    wav_files = sorted(input_folder.rglob("*.wav"))

    if not wav_files:
        print("未找到任何wav文件！")
        return

    data_all = []

    for wav_file in wav_files:
        data, sr = sf.read(wav_file)
        
        # 转为单声道
        if data.ndim == 2:
            data = np.mean(data, axis=1)

        # 降采样
        if sr != target_sr:
            data = resampy.resample(data, sr, target_sr)

        data_all.append(data)

    # 拼接所有数据
    data_all = np.concatenate(data_all, axis=0)

    # 写入到文件
    sf.write(output_file, data_all, target_sr)
    print(f"✅ 拼接完成，输出文件: {output_file}，采样率: {target_sr}Hz")

In [5]:
from pathlib import Path
import os

root_dir = Path(f'OFFICIAL/{LABEL}/00origin')
out_root_dir = str(root_dir.parent) + "/01combined"

# 遍历子文件夹（不再深入一层）
subfolders = [p for p in root_dir.iterdir() if p.is_dir()]
for subfolder in subfolders:
    input_folder_path = subfolder
    output_file_dir = str(out_root_dir)  # 不再用 subsubfolder.name
    os.makedirs(output_file_dir, exist_ok=True)

    # 输出文件以子文件夹命名
    output_file_path = os.path.join(output_file_dir, subfolder.name + ".wav")
    concat_wav_files(input_folder_path, output_file_path)


✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/dog.wav，采样率: 8000Hz
✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/environment_sound.wav，采样率: 8000Hz
✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/cat.wav，采样率: 8000Hz
✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/nature_sound.wav，采样率: 8000Hz
✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/noise.wav，采样率: 8000Hz
✅ 拼接完成，输出文件: OFFICIAL/NOISE/01combined/music.wav，采样率: 8000Hz


### 2.1.2 切分为一秒一段

In [6]:
def split_wav(file_path: Path, chunk_size=8000):
    """
    将指定 WAV 文件切分为多个 WAV 文件，每个文件 chunk_size 采样点。
    """
    # 获取音频数据和采样率
    data, samplerate = sf.read(file_path)
    
    # 确认 chunk_size 是否合适
    total_samples = data.shape[0]
    num_chunks = (total_samples + chunk_size - 1) // chunk_size  # 向上取整
    
    # 创建输出文件夹（使用原文件名，不带扩展名）
    output_dir = file_path.with_suffix('')
    os.makedirs(output_dir, exist_ok=True)
    
    # 切分并保存
    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, total_samples)
        chunk_data = data[start:end]
        
        output_filename = f"{file_path.stem}_{i:03d}.wav"  # 例如：file_000.wav
        output_path = Path(output_dir) / output_filename
        
        sf.write(output_path, chunk_data, samplerate)
        print(f"已保存: {output_path}")

In [7]:
root_dir = Path(f'OFFICIAL/{LABEL}/01combined')

files = [f for f in root_dir.iterdir() if f.is_file() and f.suffix == '.wav']
if not files:
    print(f"⚠️ 子文件夹 {root_dir} 中没有找到任何 wav 文件，跳过处理。")
for file in files:
    split_wav(file)

已保存: OFFICIAL/NOISE/01combined/dog/dog_000.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_001.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_002.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_003.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_004.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_005.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_006.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_007.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_008.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_009.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_010.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_011.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_012.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_013.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_014.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_015.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_016.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_017.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_018.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_019.wav
已保存: OFFICIAL/NOISE/01combined/dog/dog_020.wav
已保存: OFFICIAL

### 2.1.3 从文件夹读取为numpy数组

In [8]:

def load_wav_as_tensor(file_path):
    """加载单个wav文件为torch.tensor，返回（audio_tensor, sample_rate）"""
    audio_tensor, sample_rate = torchaudio.load(file_path)
    return audio_tensor, sample_rate

def load_all_wav_files_to_numpy_check_length(input_folder, expected_length=8000):
    """
    递归加载指定文件夹下的所有wav文件，返回一个 (n, expected_length) 的 numpy 数组。
    检查每个wav文件是否是 expected_length 个采样点，否则打印路径。
    """
    input_folder = Path(input_folder)
    wav_files = sorted(input_folder.rglob("*.wav"))

    audio_segments = []
    for wav_file in wav_files:
        audio_tensor, _ = load_wav_as_tensor(wav_file)
        audio_tensor = audio_tensor[0]  # 取第一个通道

        if audio_tensor.shape[0] != expected_length:
            print(f"⚠️ 文件 {wav_file} 采样点数: {audio_tensor.shape[0]}（期望 {expected_length}）")
        else:
            audio_segments.append(audio_tensor.numpy())

    if not audio_segments:
        print(f"⚠️ 文件夹 {input_folder} 中没有找到任何符合条件的音频文件")
        return np.empty((0, expected_length), dtype=np.float32)
    
    return np.stack(audio_segments, axis=0)

In [9]:
from pathlib import Path

root_dir = Path(f'OFFICIAL/{LABEL}/01combined')
subfolders = [p for p in root_dir.iterdir() if p.is_dir()]
numpy_dict = {}

for subfolder in subfolders:
    # 直接处理每个子文件夹下的 wav 文件
    numpy_dict[subfolder.name] = load_all_wav_files_to_numpy_check_length(subfolder)
    print(subfolder.name, ": ", numpy_dict[subfolder.name].shape)


⚠️ 文件 OFFICIAL/NOISE/01combined/dog/dog_597.wav 采样点数: 3880（期望 8000）
dog :  (597, 8000)
⚠️ 文件 OFFICIAL/NOISE/01combined/environment_sound/environment_sound_948.wav 采样点数: 4800（期望 8000）
environment_sound :  (948, 8000)
⚠️ 文件 OFFICIAL/NOISE/01combined/cat/cat_316.wav 采样点数: 5836（期望 8000）
cat :  (316, 8000)
⚠️ 文件 OFFICIAL/NOISE/01combined/nature_sound/nature_sound_388.wav 采样点数: 3314（期望 8000）
nature_sound :  (388, 8000)
⚠️ 文件 OFFICIAL/NOISE/01combined/noise/noise_492.wav 采样点数: 7304（期望 8000）
noise :  (492, 8000)
⚠️ 文件 OFFICIAL/NOISE/01combined/music/music_4527.wav 采样点数: 6646（期望 8000）
music :  (4527, 8000)


In [10]:
for name, array in numpy_dict.items():
    print(f"{name}: shape={array.shape}, length={len(array)}")


dog: shape=(597, 8000), length=597
environment_sound: shape=(948, 8000), length=948
cat: shape=(316, 8000), length=316
nature_sound: shape=(388, 8000), length=388
noise: shape=(492, 8000), length=492
music: shape=(4527, 8000), length=4527


### 2.1.4 归一化PCM

In [26]:
def normalize_audio_batch(numpy_array: np.ndarray) -> np.ndarray:
    """
    对 (n, 8000) 的音频批量数据进行逐行"音频归一化"。

    音频归一化的思路：
    - 每行除以其最大绝对值，确保最大幅度为1。
    - 如果某一行全是0，保持全0，避免除以0。

    参数:
    numpy_array: (n, 8000) 形状的numpy数组，每行是一段音频。

    返回:
    归一化后的 (n, 8000) numpy数组，数据类型仍保持 float32。
    """
    # 确保是float32，避免精度问题
    numpy_array = numpy_array.astype(np.float32)
    
    # 求每行的最大绝对值
    max_abs_values = np.max(np.abs(numpy_array), axis=1, keepdims=True)

    # 避免除以零：如果最大值是0，就保持0
    max_abs_values[max_abs_values == 0] = 1.0

    # 归一化
    normalized = numpy_array / max_abs_values

    return normalized

In [27]:
# 遍历numpy_dict每个键的值，进行归一化
numpy_norm_dict = {}
root_dir = Path(f'OFFICIAL/{LABEL}/02normed')
os.makedirs(root_dir, exist_ok=True)

for key in numpy_dict.keys():
    numpy_norm_dict[key] = normalize_audio_batch(numpy_dict[key])
    np.save(root_dir / f"{key}_norm.npy", numpy_norm_dict[key])
    print(f"已保存归一化后的 {key} 到 {root_dir / f'{key}_norm.npy'}")

已保存归一化后的 dog 到 OFFICIAL/NOISE/02normed/dog_norm.npy
已保存归一化后的 environment_sound 到 OFFICIAL/NOISE/02normed/environment_sound_norm.npy
已保存归一化后的 cat 到 OFFICIAL/NOISE/02normed/cat_norm.npy
已保存归一化后的 nature_sound 到 OFFICIAL/NOISE/02normed/nature_sound_norm.npy
已保存归一化后的 noise 到 OFFICIAL/NOISE/02normed/noise_norm.npy
已保存归一化后的 music 到 OFFICIAL/NOISE/02normed/music_norm.npy


### 2.1.5 各类处理为【等长】

In [28]:
def equalize_dict_lengths(input_dict, n):
    """
    处理输入字典，使得其中的 numpy 数组都等长为 n。
    
    参数:
        input_dict (dict): 键为整数，值为 numpy 数组。
        n (int): 目标长度。
    
    返回:
        dict: 处理后的新字典，值均为长度为 n 的 numpy 数组。
    """
    output_dict = {}
    for key, arr in input_dict.items():
        arr_len = len(arr)
        if arr_len >= n:
            # 如果数组长度大于等于 n，随机抽取 n 个索引
            idx = np.random.choice(arr_len, n, replace=False)
            new_arr = arr[idx]
        else:
            # 如果数组长度小于 n，先保留原数组
            deficit = n - arr_len
            # 随机从原数组中抽取 deficit 个索引，允许重复
            idx = np.random.choice(arr_len, deficit, replace=True)
            extra_arr = arr[idx]
            # 拼接成长度为 n
            new_arr = np.concatenate([arr, extra_arr], axis=0)
        output_dict[key] = new_arr
    return output_dict


In [29]:
for key, value in numpy_norm_dict.items():
    print(f'{key}: {len(value)}')

dog: 597
environment_sound: 948
cat: 316
nature_sound: 388
noise: 492
music: 4527


In [30]:
import numpy as np

def equalize_dict_lengths(input_dict, length_dict):
    """
    处理输入字典，使得其中的 numpy 数组达到指定的长度。
    
    参数:
        input_dict (dict): 键为整数，值为 numpy 数组。
        length_dict (dict): 键为 input_dict 的键，值为对应的目标长度。
    
    返回:
        dict: 处理后的新字典，每个键的值都是指定长度的 numpy 数组。
    """
    output_dict = {}
    for key, arr in input_dict.items():
        if key not in length_dict:
            # 如果没有指定长度，保持原数组不变
            output_dict[key] = arr.copy()
            continue
            
        target_length = length_dict[key]
        arr_len = len(arr)
        
        if arr_len >= target_length:
            # 如果数组长度大于等于目标长度，随机抽取指定数量的索引
            idx = np.random.choice(arr_len, target_length, replace=False)
            new_arr = arr[idx]
        else:
            # 如果数组长度小于目标长度，先保留原数组
            deficit = target_length - arr_len
            # 随机从原数组中抽取 deficit 个索引，允许重复
            idx = np.random.choice(arr_len, deficit, replace=True)
            extra_arr = arr[idx]
            # 拼接成目标长度
            new_arr = np.concatenate([arr, extra_arr], axis=0)
            
        output_dict[key] = new_arr
    return output_dict

In [32]:
length_dict = {
    'dog': 500,
    'cat': 500,
    'environment_sound': 1000,
    'nature_sound': 1000,
    'noise': 1000,
    'music': 3000,
}
equalized_numpy_norm_dict = equalize_dict_lengths(numpy_norm_dict, length_dict)
# 将所有数组按第一个维度拼接在一起
equalized_numpy_norm_data = np.concatenate(list(equalized_numpy_norm_dict.values()), axis=0)

# 打印结果形状
print("拼接后形状:", equalized_numpy_norm_data.shape)

拼接后形状: (7000, 8000)


In [34]:
# 打印类别和对应的数字标签
print("类别标签对应关系:")
for idx, key in enumerate(equalized_numpy_norm_dict.keys()):
    print(f"类别 {idx}: {key}")

print("\n" + "="*50)

# 生成一个大的 X 和对应的标签 Y
X_list = []
Y_list = []
for idx, (key, array) in enumerate(equalized_numpy_norm_dict.items()):
    X_list.append(array)
    Y_list.append(np.full((array.shape[0],), idx, dtype=np.int32))  # 每个类别用一个索引表示标签
    print(f"类别 '{key}' (标签 {idx}): 添加了 {array.shape[0]} 个样本")

# 拼接
X = np.concatenate(X_list, axis=0)
Y = np.concatenate(Y_list, axis=0)

print("\n" + "="*50)
print("最终结果:")
print("X 形状:", X.shape)
print("Y 形状:", Y.shape)
print("类别标签索引:", np.unique(Y))

# 统计每个类别的样本数量
print("\n各类别样本数量统计:")
unique_labels, counts = np.unique(Y, return_counts=True)
for label, count in zip(unique_labels, counts):
    # 通过索引找回对应的类别名称
    category_name = list(equalized_numpy_norm_dict.keys())[label]
    print(f"标签 {label} ({category_name}): {count} 个样本")

类别标签对应关系:
类别 0: dog
类别 1: environment_sound
类别 2: cat
类别 3: nature_sound
类别 4: noise
类别 5: music

类别 'dog' (标签 0): 添加了 500 个样本
类别 'environment_sound' (标签 1): 添加了 1000 个样本
类别 'cat' (标签 2): 添加了 500 个样本
类别 'nature_sound' (标签 3): 添加了 1000 个样本
类别 'noise' (标签 4): 添加了 1000 个样本
类别 'music' (标签 5): 添加了 3000 个样本

最终结果:
X 形状: (7000, 8000)
Y 形状: (7000,)
类别标签索引: [0 1 2 3 4 5]

各类别样本数量统计:
标签 0 (dog): 500 个样本
标签 1 (environment_sound): 1000 个样本
标签 2 (cat): 500 个样本
标签 3 (nature_sound): 1000 个样本
标签 4 (noise): 1000 个样本
标签 5 (music): 3000 个样本


In [35]:
# 存储X Y X_mfcc
root_dir = Path(f'OFFICIAL/{LABEL}/03mfcc')
os.makedirs(root_dir, exist_ok=True)

np.save(root_dir / "X.npy", X)
np.save(root_dir / "Y.npy", Y)