# 1 导入相关依赖

In [1]:
import os
import librosa  
import numpy as np  

import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.metrics import accuracy_score,mean_squared_error

# 2 该数据集真实场景的标签——12类

    Google speech commands dataset. Only 'yes', 'no', 'up', 'down', 'left',
        'right', 'on', 'off', 'stop' and 'go' are treated as known classes.
        All other classes are used as 'unknown' samples.
        See for more information: https://www.kaggle.com/c/tensorflow-speech-recognition-challenge
        

In [2]:
CLASSES = 'unknown, silence, yes, no, up, down, left, right, on, off, stop, go'.split(', ')
len(CLASSES)

12

# 3 基于FBANK特征提取函数

In [3]:
def fbank_features(audio_file):
    y, sr = librosa.load(audio_file, sr=None)  # 假设'noisy_speech.wav'是您的含噪语音文件  
  
    # 预处理（如果需要的话，可以在这里加入降噪步骤）  
    # 提取FBANK特征  
    n_fft = 2048  # FFT窗口大小  
    hop_length = 256  # 帧移  
    n_mels = 40  # Mel滤波器的数量  
    fmin = 0.0  # 最低频率  
    fmax = sr / 2  # 最高频率  

    # 使用librosa提取FBANK特征  
    fbank = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)  
    fbank_db = librosa.power_to_db(fbank, ref=np.max)  # 转换为分贝值  
    return fbank_db

# 4 定义Torch 数据集
    加入 silence_percentage=0.1 比例的安静数据集，tricks
    

In [4]:
class Speech_Commands_Dataset(Dataset):
    
    def __init__(self, folder, transform=None, classes=CLASSES, silence_percentage=0.2):
        all_classes = [d for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d)) and not d.startswith('_')]


        class_to_idx = {classes[i]: i for i in range(len(classes))}
        for c in all_classes:
            if c not in class_to_idx:
                class_to_idx[c] = 0

        data = []
        for c in all_classes:
            d = os.path.join(folder, c)
            target = class_to_idx[c]
            for f in os.listdir(d):
                path = os.path.join(d, f)
                data.append((path, target))

        # add silence
        target = class_to_idx['silence']
        data += [('', target)] * int(len(data) * silence_percentage)

        self.classes = classes
        self.data = data
        self.transform = transform
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        path, target = self.data[index]
        data = {'path': path, 'target': target}

        if self.transform is not None:
            data = self.transform(data)

        return data,target


## 4.1 torch Dataset 中的数据转化
    需要提取特征维度，这里最好是32维，经验参数，40*32  而非 40*44

In [5]:
def data_trans(data):
    file_path = data['path']
    try:
        return fbank_features(file_path)[:,:16]
    except Exception as e:
        pass
    return np.zeros((40,16),dtype=np.float32)
        

## 4.2 根据 自定义的Dataset创建Dataloader 

In [6]:
train_ds = Speech_Commands_Dataset('./train',transform=data_trans)
valid_ds = Speech_Commands_Dataset('./valid',transform=data_trans)
test_ds = Speech_Commands_Dataset('./test',transform=data_trans)

In [7]:
train_dataloader = torch.utils.data.DataLoader(train_ds,
                                num_workers =0,
                                batch_size = 128,
                                shuffle = True)

In [8]:
valid_dataloader = torch.utils.data.DataLoader(valid_ds,
                                num_workers =0,
                                batch_size = 128,
                                shuffle = True)

In [9]:
test_dataloader = torch.utils.data.DataLoader(test_ds,
                                num_workers =0,
                                batch_size = 128,
                                shuffle = True)

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
device

device(type='cuda')

In [12]:
32*40

1280

# 6 Bi-LSTM 分类预测模型

In [16]:
class Bi_LSTM(nn.Module):
    def __init__(self,num_classes):
        super(Bi_LSTM,self).__init__()
        

        self.bi_lstm = nn.LSTM(input_size=16,
                               hidden_size=16,
                               num_layers = 1,
                               batch_first = True,
                               bidirectional = True
                              )
        
        self.predict_layer = nn.Sequential(*[
            
            torch.nn.BatchNorm1d(640),
            torch.nn.Linear(640, 128),
            torch.nn.Dropout(0.2),
            torch.nn.ReLU(),
            
            
            
            
            
            torch.nn.BatchNorm1d(128),
            torch.nn.Linear(128, num_classes),
            torch.nn.Dropout(0.2),
            nn.Softmax()

            
        ])
        
        
        
    def forward(self,x):
        x = x.flatten(1)
       
        out = self.predict_layer(x)
        
        return out
        

In [17]:
# LeNet 对于 minist的参数基本是固定的 ，需要做的说设置迭代次数

net = Bi_LSTM(num_classes=len(CLASSES)).to(device) # 初始化一个net
criterion = nn.CrossEntropyLoss() # 定义交叉熵损失函数
optimizer = torch.optim.Adam(net.parameters())

# Train the network 训练网络
num_epochs = 2  # 迭代次数

train_loss_history = []
val_loss_history = []

for epoch in range(num_epochs):
    # Train step
    net.train()
    train_loss = 0
    
    for batch in tqdm.tqdm(train_dataloader, mininterval=2, desc='  - (Training)   ', leave=False):
        
        data, target = batch
        
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()

        outputs = net(data)
        
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss = train_loss / len(train_dataloader)
    train_loss_history.append(train_loss)

    # Validation step
    net.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(valid_dataloader, mininterval=2, desc='  - (Training)   ', leave=False):
            
            data, target = batch
            
            data = data.to(device)
            target = target.to(device)

        

            outputs = net(data)
            loss = criterion(outputs, target)
            
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
        
            total += target.size(0)
        
            correct += (predicted == target).sum().item()
            
    val_loss = val_loss / len(valid_dataloader)
    val_loss_history.append(val_loss)
    accuracy = 100 * correct / total

    print(f"Epoch {epoch+1}/100: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {accuracy :.2f}%")



  y, sr = librosa.load(audio_file, sr=None)  # 假设'noisy_speech.wav'是您的含噪语音文件
  return self._call_impl(*args, **kwargs)
                                                                                                                       

Epoch 1/100: Train Loss: 2.0517, Val Loss: 1.9365, Val Accuracy: 68.33%


                                                                                                                       

Epoch 2/100: Train Loss: 2.0098, Val Loss: 1.9388, Val Accuracy: 68.40%




In [18]:
# Test the network
net.eval()
test_loss = 0
correct = 0
total = 0
# confusion_matrix = torch.zeros(10, 10)

with torch.no_grad():
    for batch in tqdm.tqdm(test_dataloader, mininterval=2, desc='  - (Training)   ', leave=False):
        data, target = batch
        
            
        data = data.to(device)
        target = target.to(device)
        
        outputs = net(data)
        
        loss = criterion(outputs, target)
        
        _, predicted = torch.max(outputs.data, 1)
        
        total += target.size(0)
        
        correct += (predicted == target).sum().item()
        test_loss += loss.item()
#         for i, j in zip(predicted, labels):
#             confusion_matrix[i][j] += 1
test_loss = test_loss / len(test_dataloader)
accuracy = 100 * correct / total
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy :.2f}%")

# Save the model
# torch.save(net.state_dict(), "best_model_lenet.pt")

  y, sr = librosa.load(audio_file, sr=None)  # 假设'noisy_speech.wav'是您的含噪语音文件
                                                                                                                       

Test Loss: 1.9381, Test Accuracy: 68.68%




In [None]:
nn.NLLLoss