In [1]:
cd _202505/

/disk4/chocho/SEMamba/_202505


In [2]:
import torch  
import torch.nn as nn  
  
class NeuralNetPyTorch1(nn.Module):  
    def __init__(self,   
                 input_neurons=72,  
                 batchsize=1,   
                 nDownSample=1,   
                 kernel_size=6,   
                 scalar_output=1.0):  
        super(NeuralNetPyTorch1, self).__init__()  
          
        # 根據def_se_nn_arch72_mel.txt定義網絡結構  
        self.neurons = [input_neurons, 72, 72, 72, 72, 257]  
        self.layer_types = ['conv1d', 'lstm', 'fc', 'fc', 'fc']  
        self.activations = ['tanh', 'tanh', 'tanh', 'tanh', 'sigmoid']  
        self.dropprobs = [0.25, 0.25, 0.25, 0.25, 0.0]  # 最後一層沒有指定dropout，設為0  
          
        self.nDownSample = nDownSample  
        self.kernel_size = kernel_size  
        self.scalar_output = scalar_output  
        self.num_layers = len(self.layer_types)  
          
        # 創建層列表  
        self.layers = nn.ModuleList()  
        self.dropout_layers = nn.ModuleList()  
          
        # LSTM 狀態變量  
        self.h_states = [None] * self.num_layers  
        self.c_states = [None] * self.num_layers  
          
        # 激活函數映射  
        self.act_funcs = {  
            'tanh': nn.Tanh(),  
            'sigmoid': nn.Sigmoid(),  
            'relu6': lambda x: torch.clamp(torch.relu(x), max=6.0),  
            'linear': lambda x: x  
        }  
          
        # 構建網絡層  
        for i in range(self.num_layers):  
            layer_type = self.layer_types[i]  
            in_neurons = self.neurons[i]  
            out_neurons = self.neurons[i+1]  
              
            if layer_type == 'fc':  
                self.layers.append(nn.Linear(in_neurons, out_neurons))  
                  
            elif layer_type == 'lstm':  
                self.layers.append(nn.LSTM(  
                    input_size=in_neurons,  
                    hidden_size=out_neurons,  
                    batch_first=True,  
                    num_layers=1  
                ))  
                  
            elif layer_type == 'conv1d':  
                # 注意：原始模型使用特殊配置的Conv2D作為1D卷積  
                # 在PyTorch中，我們直接使用Conv1d  
                self.layers.append(nn.Conv1d(  
                    in_channels=in_neurons,  
                    out_channels=out_neurons,  
                    kernel_size=kernel_size,  
                    stride=nDownSample,  
                    padding='same'  # 保持輸出大小與輸入相同  
                ))  
              
            # 添加dropout層  
            self.dropout_layers.append(nn.Dropout(self.dropprobs[i]))  
      
    def forward(self, x, reset_states=False):  
        batch_size = x.shape[0]  
          
        # 初始化或重置LSTM狀態  
        if reset_states:  
            for i in range(self.num_layers):  
                if self.layer_types[i] == 'lstm':  
                    self.h_states[i] = torch.zeros(1, batch_size, self.neurons[i+1], device=x.device)  
                    self.c_states[i] = torch.zeros(1, batch_size, self.neurons[i+1], device=x.device)  
          
        # 前向傳播  
        out = x  
        for i in range(self.num_layers):  
            layer = self.layers[i]  
            layer_type = self.layer_types[i]  
            activation = self.activations[i]  
              
            if layer_type == 'fc':  
                out = layer(out)  
                out = self.act_funcs[activation](out)  
                out = self.dropout_layers[i](out)  
                  
            elif layer_type == 'lstm':  
                if self.h_states[i] is not None and self.c_states[i] is not None:  
                    # 使用先前的狀態  
                    out, (self.h_states[i], self.c_states[i]) = layer(  
                        out, (self.h_states[i], self.c_states[i])  
                    )  
                else:  
                    # 沒有先前狀態  
                    out, (self.h_states[i], self.c_states[i]) = layer(out)  
                  
                out = self.dropout_layers[i](out)  
                  
            elif layer_type == 'conv1d':  
                # 調整形狀以適應Conv1d  
                # Conv1d期望輸入形狀為 [batch_size, channels, length]  
                # 而我們的輸入是 [batch_size, sequence_length, features]  
                out = out.permute(0, 2, 1)  # 變為 [batch_size, features, sequence_length]  
                out = layer(out)  
                out = out.permute(0, 2, 1)  # 變回 [batch_size, sequence_length, features]  
                out = self.act_funcs[activation](out)  
                out = self.dropout_layers[i](out)  
          
        # 應用輸出縮放  
        out = out * self.scalar_output  
          
        return out, (self.h_states, self.c_states)

In [3]:
import torch
import torch.nn as nn

class ReLU6(nn.Module):
    def forward(self, x):
        return torch.clamp(torch.relu(x), max=6.0)

class Identity(nn.Module):
    def forward(self, x):
        return x

class NeuralNetPyTorch2(nn.Module):
    def __init__(self, input_neurons=72, batchsize=1, nDownSample=1, kernel_size=6, scalar_output=1.0):
        super(NeuralNetPyTorch2, self).__init__()
        
        self.neurons = [input_neurons, 72, 72, 72, 72, 257]
        self.layer_types = ['conv1d', 'lstm', 'fc', 'fc', 'fc']
        self.activations = ['tanh', 'tanh', 'tanh', 'tanh', 'sigmoid']
        self.dropprobs = [0.25, 0.25, 0.25, 0.25, 0.0]
        self.nDownSample = nDownSample
        self.kernel_size = kernel_size
        self.scalar_output = scalar_output
        self.num_layers = len(self.layer_types)
        
        self.h_states = [None] * self.num_layers
        self.c_states = [None] * self.num_layers
        
        # self.act_funcs = {
        #     'tanh': nn.Tanh(),
        #     'sigmoid': nn.Sigmoid(),
        #     'relu6': lambda x: torch.clamp(torch.relu(x), max=6.0),
        #     'linear': lambda x: x
        # }
        
        self.act_funcs = {
            'tanh': nn.Tanh(),
            'sigmoid': nn.Sigmoid(),
            'relu6': ReLU6(),
            'linear': Identity()
        }
        
        # 使用 nn.Sequential 組織層
        layers = []
        for i in range(self.num_layers):
            layer_type = self.layer_types[i]
            in_neurons = self.neurons[i]
            out_neurons = self.neurons[i+1]
            activation = self.act_funcs[self.activations[i]]
            
            if layer_type == 'fc':
                layers.append(nn.Linear(in_neurons, out_neurons))
                layers.append(activation)
                if self.dropprobs[i] > 0:
                    layers.append(nn.Dropout(self.dropprobs[i]))
            
            elif layer_type == 'conv1d':
                layers.append(nn.Conv1d(in_channels=in_neurons, out_channels=out_neurons, kernel_size=kernel_size,
                                        stride=nDownSample, padding='same'))
                layers.append(activation)
                if self.dropprobs[i] > 0:
                    layers.append(nn.Dropout(self.dropprobs[i]))
            
            # LSTM 無法直接放入 nn.Sequential，因為它需要狀態管理
            # 這裡我們暫時保留為單獨層
            elif layer_type == 'lstm':
                self.lstm_layer = nn.LSTM(input_size=in_neurons, hidden_size=out_neurons, batch_first=True, num_layers=1)
                self.lstm_idx = i
                layers.append(activation)
                if self.dropprobs[i] > 0:
                    layers.append(nn.Dropout(self.dropprobs[i]))
        
        self.sequential_layers = nn.Sequential(*layers)
    
    def forward(self, x, reset_states=False):
        batch_size = x.shape[0]
        
        if reset_states:
            for i in range(self.num_layers):
                if self.layer_types[i] == 'lstm':
                    self.h_states[i] = torch.zeros(1, batch_size, self.neurons[i+1], device=x.device)
                    self.c_states[i] = torch.zeros(1, batch_size, self.neurons[i+1], device=x.device)
        
        out = x
        for i in range(self.num_layers):
            if self.layer_types[i] == 'conv1d':
                out = out.permute(0, 2, 1)
                out = self.sequential_layers[i*3:(i*3)+3](out)
                out = out.permute(0, 2, 1)
            elif self.layer_types[i] == 'lstm':
                if self.h_states[i] is not None and self.c_states[i] is not None:
                    out, (self.h_states[i], self.c_states[i]) = self.lstm_layer(out, (self.h_states[i], self.c_states[i]))
                else:
                    out, (self.h_states[i], self.c_states[i]) = self.lstm_layer(out)
                out = self.sequential_layers[i*3:(i*3)+2](out)  # 應用激活和 Dropout
            else:
                out = self.sequential_layers[i*3:(i*3)+3](out)
        
        out = out * self.scalar_output
        return out, (self.h_states, self.c_states)

在語音增強的實際應用中，這個過程如下：

1. 將輸入音頻轉換為STFT頻譜
1. 從STFT頻譜計算梅爾頻譜特徵（72維）
1. 將這些特徵送入神經網絡
1. 神經網絡輸出257維的時頻掩碼
1. 將掩碼應用於原始STFT頻譜
1. 進行逆STFT轉換，得到增強後的音頻

### Notes

- 輸入的72維特徵是經過降維的梅爾頻譜特徵，這種降維可以減少計算量並保留語音的關鍵特徵
- 輸出的257維對應於原始STFT頻譜的頻率點數量，這樣掩碼可以直接應用於頻譜
- 批次大小和序列長度在輸入和輸出之間保持不變，因為模型處理的是相同數量的時間步

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 創建模型  
model1 = NeuralNetPyTorch1(  
    input_neurons=72,  
    # nDownSample=1,  # 根據def_se_nn_arch72_mel.txt中的strides=1  
    kernel_size=6,  # 根據def_se_nn_arch72_mel.txt中的kernel_size=6  
    # scalar_output=1.0  
).to(device)

model2 = NeuralNetPyTorch2(  
    input_neurons=72,  
    # nDownSample=1,  # 根據def_se_nn_arch72_mel.txt中的strides=1  
    kernel_size=6,  # 根據def_se_nn_arch72_mel.txt中的kernel_size=6  
    # scalar_output=1.0  
).to(device)

# 假設的輸入 (batch_size, sequence_length, features)  
x = torch.randn(1, 1000, 72).to(device)  # 批次大小為1，序列長度為10，特徵數為72  
  
# 前向傳播  
output1, states = model1(x, reset_states=True)  
print(f"輸入形狀: {x.shape}")  
print(f"輸出形狀: {output1.shape}")  # 應該是 [1, 10, 257]
print(output1)

# 前向傳播  
output2, states = model2(x, reset_states=True)  
print(f"輸入形狀: {x.shape}")  
print(f"輸出形狀: {output2.shape}")  # 應該是 [1, 10, 257]
print(output2)

輸入形狀: torch.Size([1, 1000, 72])
輸出形狀: torch.Size([1, 1000, 257])
tensor([[[0.4590, 0.5345, 0.4707,  ..., 0.5526, 0.5182, 0.4753],
         [0.4552, 0.5107, 0.5023,  ..., 0.5745, 0.4943, 0.4710],
         [0.4562, 0.5131, 0.4808,  ..., 0.5358, 0.4796, 0.4892],
         ...,
         [0.4596, 0.5322, 0.4696,  ..., 0.5226, 0.4915, 0.4806],
         [0.4434, 0.5271, 0.4993,  ..., 0.5379, 0.4864, 0.4891],
         [0.4629, 0.5078, 0.4882,  ..., 0.5508, 0.4919, 0.4887]]],
       device='cuda:0', grad_fn=<MulBackward0>)
輸入形狀: torch.Size([1, 1000, 72])
輸出形狀: torch.Size([1, 1000, 257])
tensor([[[0.4544, 0.4822, 0.5041,  ..., 0.5049, 0.4731, 0.4673],
         [0.4659, 0.4680, 0.5000,  ..., 0.5083, 0.4918, 0.4866],
         [0.4721, 0.4668, 0.4896,  ..., 0.4907, 0.4772, 0.4778],
         ...,
         [0.4596, 0.4753, 0.4686,  ..., 0.5078, 0.4724, 0.4816],
         [0.4676, 0.4698, 0.4776,  ..., 0.4769, 0.4835, 0.4790],
         [0.4702, 0.4775, 0.4993,  ..., 0.4789, 0.4856, 0.4861]]],
       dev

  return F.conv1d(input, weight, bias, self.stride,


In [5]:
import torch
from torchinfo import summary

batch, length, dim = 1, 1000, 72
print("B,L,D:",batch, length, dim)
x = torch.randn(batch, length, dim).to(device)

# summary_str = summary(model, input_size=[(7, 201, 286), (7, 201, 286)], depth=5, col_names=("input_size", "output_size", "num_params"), verbose=0)
summary_str = summary(model1, input_size=[x.shape], depth=15, col_names=("input_size", "output_size", "num_params"), verbose=0)
print(summary_str)

B,L,D: 1 1000 72
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NeuralNetPyTorch1                        [1, 1000, 72]             [1, 1000, 257]            --
├─ModuleList: 1-9                        --                        --                        (recursive)
│    └─Conv1d: 2-1                       [1, 72, 1000]             [1, 72, 1000]             31,176
├─ModuleList: 1-10                       --                        --                        --
│    └─Dropout: 2-2                      [1, 1000, 72]             [1, 1000, 72]             --
├─ModuleList: 1-9                        --                        --                        (recursive)
│    └─LSTM: 2-3                         [1, 1000, 72]             [1, 1000, 72]             42,048
├─ModuleList: 1-10                       --                        --                        --
│    └─Dropout: 2-4                      [1, 1000, 72]             [1, 1000, 72]        

In [6]:
import torch
from torchinfo import summary

batch, length, dim = 1, 1000, 72
print("B,L,D:",batch, length, dim)
x = torch.randn(batch, length, dim).to(device)

# summary_str = summary(model, input_size=[(7, 201, 286), (7, 201, 286)], depth=5, col_names=("input_size", "output_size", "num_params"), verbose=0)
summary_str = summary(model2, input_size=[x.shape], depth=15, col_names=("input_size", "output_size", "num_params"), verbose=0)
print(summary_str)

B,L,D: 1 1000 72
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NeuralNetPyTorch2                        [1, 1000, 72]             [1, 1000, 257]            5,256
├─Sequential: 1-3                        --                        --                        (recursive)
│    └─Conv1d: 2-1                       [1, 72, 1000]             [1, 72, 1000]             31,176
│    └─Tanh: 2-2                         [1, 72, 1000]             [1, 72, 1000]             --
│    └─Dropout: 2-3                      [1, 72, 1000]             [1, 72, 1000]             --
├─LSTM: 1-2                              [1, 1000, 72]             [1, 1000, 72]             42,048
├─Sequential: 1-3                        --                        --                        (recursive)
│    └─Tanh: 2-4                         [1, 1000, 72]             [1, 1000, 72]             --
│    └─Dropout: 2-5                      [1, 1000, 72]             [1, 1000, 72]     

In [7]:
torch.save(model2.state_dict(), 'model2_weights.pth')

In [8]:
!du -sh model2_weights.pth

408K	model2_weights.pth


In [9]:
torch.save(model2, 'model2_full.pth')

In [10]:
!du -sh model2_full.pth

412K	model2_full.pth
