In [1]:
import torch
import torch.nn as nn
import numpy as np
# import torchvision

INPUT_SIZE = 40  # feature dimension of one input frame
HIDDEN_SIZE = 512  # units inside the lstm
# DROP_RATE = 0.2  # drop-out rate
LAYERS = 1  # number of lstm layers, will be increased to 4


class toy_lstm(nn.Module):
    """LSTM followed by a per-step linear projection back to the input size.

    Args:
        input_size: Size of the last dimension of the input; also the size of
            the last dimension of the output. Defaults to ``INPUT_SIZE``.
        hidden_size: Number of hidden units in each LSTM layer. Defaults to
            ``HIDDEN_SIZE``.
        num_layers: Number of stacked LSTM layers. Defaults to ``LAYERS``.

    Calling ``toy_lstm()`` with no arguments builds the same network as
    before; the arguments only make the sizes configurable.
    """

    def __init__(self, input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, num_layers=LAYERS):
        super().__init__()

        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            # dropout=DROP_RATE,
            batch_first=True,  # tensors are laid out as (batch, seq, feature)
        )
        # Project every hidden state back to the input feature size. This was
        # a hard-coded 40, which only worked while INPUT_SIZE happened to be 40.
        self.fc = nn.Linear(hidden_size, input_size)  # fully connected layer
        # Kept for backward compatibility; forward() never assigns them.
        self.h_s = None
        self.h_c = None

    def forward(self, x):
        """Run the LSTM over ``x`` and project each time step.

        Args:
            x: Tensor of shape (batch, seq, input_size).

        Returns:
            Tensor of shape (batch, seq, input_size).
        """
        # The final hidden/cell states are not needed here and are discarded.
        r_out, _ = self.rnn(x)
        return self.fc(r_out)

In [14]:
import kaldiark

# Read the utterance index from the scp file and print the shape of the
# first few feature matrices as a sanity check.

MAX_UTTS = 11  # the loop stops after this many utterances have been printed
count = 0

with open('./data/raw_fbank_train_si284.1.scp', 'rb') as scp_file:  # use '../remote/data/wsj/fbank/' replace '/data/'
    # Iterate lazily: only the first MAX_UTTS entries are needed.
    for line in scp_file:
        # line is like b'4avc040p /home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.10.ark:2769059\n'
        # Decode the bytes rather than slicing str(line): the repr ends in
        # "\n'" or "\r\n'" (or just "'" on a final line without a newline),
        # so a fixed [:-3] can cut digits off the pointer.
        fields = line.decode('utf-8').split()
        if len(fields) < 2:
            continue  # blank or malformed line
        temp = fields[1]
        print(temp)
        ark_path, pointer = temp.rsplit(':', 1)  # pointer to the utterance
        # Keep only the ark file name, so the length of the directory prefix
        # recorded in the scp file does not matter.
        file_loc = '/' + ark_path.rsplit('/', 1)[-1]
        print(file_loc)
        print(pointer)

        # According to the file name and pointer to get the matrix
        with open('./data' + file_loc, 'rb') as ark_file:  # use '../remote/data' + file_loc replace './data/' + file_loc
            ark_file.seek(int(pointer))
            utt_mat = kaldiark.parse_feat_matrix(ark_file)
            print(utt_mat.shape)

        count = count + 1
        if count >= MAX_UTTS:
            break

/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:9\r\n'
/raw_fbank_train_si284.1.ark
9
(652, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:26439\r\n'
/raw_fbank_train_si284.1.ark
26439
(693, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:54509\r\n'
/raw_fbank_train_si284.1.ark
54509
(1069, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:97619\r\n'
/raw_fbank_train_si284.1.ark
97619
(449, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:115929\r\n'
/raw_fbank_train_si284.1.ark
115929
(373, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:131199\r\n'
/raw_fbank_train_si284.1.ark
131199
(550, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:153549\r\n'
/raw_fbank_train_si284.1.ark
153549
(344, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:167659\r\n'
/raw_fbank_train_si284.1.ark
167659
(636, 40)
/home/htang2/kaldi/wsj/fbank/raw_fbank_train_si284.1.ark:193449\r\n'
/raw_fbank_train_si284.1.a

In [8]:
import torch
import torch.nn as nn
import numpy as np

import kaldiark
from apc import toy_lstm
import glob

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Optimisation hyper-parameters.
EPOCH = 20
LEARNING_RATE = 0.1

# Prediction gap used when slicing model inputs and targets.
K = 8

# Model, objective and optimiser (the optimiser updates every model parameter).
rnn = toy_lstm().to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(params=rnn.parameters(), lr=LEARNING_RATE)

# Step decay: multiply the learning rate by 0.1 at each milestone epoch.
mult_step_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer=optimizer,
    milestones=[EPOCH // 2, (EPOCH // 4) * 3],
    gamma=0.1,
)


# Train + Dev
def _read_scp_entries(scp_path, limit=None, data_dir='./data'):
    """Parse an scp index file into (ark_path, byte_offset) pairs.

    Each non-blank line is expected to look like
    ``<utt-id> <dir>/<name>.ark:<offset>``. Only the ark file name is kept
    and joined onto ``data_dir``, so the directory recorded in the scp file
    does not have to exist locally.

    Args:
        scp_path: Path of the scp file to read.
        limit: Maximum number of entries to return; ``None`` reads them all.
        data_dir: Local directory that holds the ark files.

    Returns:
        List of ``(ark_path, offset)`` tuples in file order.
    """
    entries = []
    with open(scp_path, 'rb') as scp_file:
        for line in scp_file:
            if limit is not None and len(entries) >= limit:
                break
            # Decode instead of slicing str(line): the bytes repr has a
            # variable-length tail ("\n'", "\r\n'" or "'"), so a fixed
            # slice can corrupt the offset.
            fields = line.decode('utf-8').split()
            if len(fields) < 2:
                continue  # blank or malformed line
            ark_path, pointer = fields[1].rsplit(':', 1)
            file_name = ark_path.rsplit('/', 1)[-1]
            entries.append((data_dir + '/' + file_name, int(pointer)))
    return entries


def _load_utterance(ark_path, pointer):
    """Read one feature matrix and return it on ``device`` with a leading batch axis of size 1."""
    with open(ark_path, 'rb') as ark_file:
        ark_file.seek(pointer)
        utt_mat = kaldiark.parse_feat_matrix(ark_file)
    return torch.tensor(utt_mat, dtype=torch.float32, device=device).unsqueeze(0)


def _prediction_loss(utt_mat):
    """Loss between the model output on all but the last K steps and the input shifted by K steps."""
    output = rnn(utt_mat[:, :-K, :])
    return loss_func(output, utt_mat[:, K:, :])


# The index files do not change between epochs, so read them once up front.
train_entries = _read_scp_entries('./data/raw_fbank_train_si284.1.scp', limit=15)
valid_entries = _read_scp_entries('./data/raw_fbank_train_si284.2.scp', limit=3)  # change 1 to dev

train_loss = []
valid_loss = []
min_valid_loss = np.inf
for i in range(EPOCH):
    total_train_loss = []
    rnn.train()  # Training

    for ark_path, pointer in train_entries:
        utt_mat = _load_utterance(ark_path, pointer)
        if utt_mat.size(1) <= K:
            continue  # too short: no input/target pair exists at gap K

        loss = _prediction_loss(utt_mat)  # compute the difference
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()  # back-prop
        optimizer.step()  # apply the parameter update
        total_train_loss.append(loss.item())
    train_loss.append(np.mean(total_train_loss))
    print('train complete!')

    total_valid_loss = []
    rnn.eval()  # Validation

    with torch.no_grad():  # no gradients are needed for evaluation
        for ark_path, pointer in valid_entries:
            utt_mat = _load_utterance(ark_path, pointer)
            if utt_mat.size(1) <= K:
                continue  # too short: no input/target pair exists at gap K

            loss = _prediction_loss(utt_mat)
            total_valid_loss.append(loss.item())
    valid_loss.append(np.mean(total_valid_loss))
    print('dev complete!')

    # Keep the checkpoint with the lowest validation loss seen so far.
    # NOTE(review): this pickles the whole module object, so loading needs
    # the same class definition to be importable; saving rnn.state_dict()
    # is the more portable option if the consumers of './LSTM.model' allow it.
    if valid_loss[-1] < min_valid_loss:
        torch.save({'epoch': i, 'model': rnn, 'train_loss': train_loss,
                    'valid_loss': valid_loss}, './LSTM.model')
        min_valid_loss = valid_loss[-1]

    # Log (built before the scheduler step so it shows the rate used this epoch)
    log_string = ('iter: [{:d}/{:d}], train_loss: {:0.6f}, valid_loss: {:0.6f}, '
                  'best_valid_loss: {:0.6f}, lr: {:0.7f}').format((i + 1), EPOCH,
                                                                  train_loss[-1],
                                                                  valid_loss[-1],
                                                                  min_valid_loss,
                                                                  optimizer.param_groups[0]['lr'])
    mult_step_scheduler.step()  # update the learning rate
    print(log_string)  # print the log line

train complete!
dev complete!
iter: [1/20], train_loss: 65.319239, valid_loss: 40.543747, best_valid_loss: 40.543747, lr: 0.1000000
train complete!
dev complete!
iter: [2/20], train_loss: 16.876287, valid_loss: 21.307021, best_valid_loss: 21.307021, lr: 0.1000000
train complete!
dev complete!
iter: [3/20], train_loss: 12.337894, valid_loss: 17.886119, best_valid_loss: 17.886119, lr: 0.1000000
train complete!
dev complete!
iter: [4/20], train_loss: 11.849856, valid_loss: 17.893142, best_valid_loss: 17.886119, lr: 0.1000000
train complete!
dev complete!
iter: [5/20], train_loss: 11.805400, valid_loss: 17.802566, best_valid_loss: 17.802566, lr: 0.1000000
train complete!
dev complete!
iter: [6/20], train_loss: 11.673864, valid_loss: 17.814047, best_valid_loss: 17.802566, lr: 0.1000000
train complete!
dev complete!
iter: [7/20], train_loss: 11.637706, valid_loss: 17.654215, best_valid_loss: 17.654215, lr: 0.1000000
train complete!
dev complete!
iter: [8/20], train_loss: 11.720034, valid_los