In [1]:
import torchaudio
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from scipy import signal
import time

from paths import *
from misc_progress_bar import draw_progress_bar

In [2]:
class MFCCTransform(nn.Module): 
    """Kaldi-style MFCCs plus delta and delta-delta features, mean/variance normalized.

    The module has no learnable parameters; it only wraps the feature pipeline
    so it can be called like any other ``nn.Module``.
    """

    def __init__(self): 
        super().__init__()
    
    def forward(self, waveform, sr=16000): 
        """Turn a waveform into normalized MFCC + delta + delta-delta features.

        Args:
            waveform: audio tensor handed unchanged to
                ``torchaudio.compliance.kaldi.mfcc`` (presumably
                ``(channels, samples)`` as returned by ``torchaudio.load`` --
                confirm against callers).
            sr: sample rate of ``waveform`` in Hz, forwarded as
                ``sample_frequency``.

        Returns:
            Tensor of shape ``(frames, 3 * num_ceps)``: the static
            coefficients, their first-order deltas and their second-order
            deltas concatenated along the feature axis, each feature column
            normalized to zero mean / unit variance over the frames.
        """
        # extract mfcc -> (frames, num_ceps)
        feature = torchaudio.compliance.kaldi.mfcc(waveform, sample_frequency=sr)

        # add deltas
        # compute_deltas differentiates along the LAST dimension (it expects
        # (..., freq, time)), so put time last first; otherwise the "deltas"
        # would be taken across cepstral coefficients instead of across frames.
        coeffs_by_time = feature.transpose(0, 1)  # (num_ceps, frames)
        d1 = torchaudio.functional.compute_deltas(coeffs_by_time)
        d2 = torchaudio.functional.compute_deltas(d1)
        # Stack along the coefficient axis, then return to (frames, features).
        feature = torch.cat([coeffs_by_time, d1, d2], dim=0).transpose(0, 1).contiguous()

        # Apply normalization (CMVN) over the frame axis, per feature column
        eps = 1e-9  # keeps the division finite for constant columns
        mean = feature.mean(0, keepdim=True)
        std = feature.std(0, keepdim=True, unbiased=False)
        feature = (feature - mean) / (std + eps)
        return feature

In [3]:
# Build the MFCC feature extractor once; the extraction loop below calls it for every file.
transformer = MFCCTransform()

In [4]:
# Empty accumulator with zero rows; each row appended later has shape (25, 39).
mfcc_feats = torch.empty(0, 25, 39)

In [5]:
# NOTE(review): absolute, machine-specific path -- consider deriving it from
# the project's `paths` module so the notebook runs on other machines.
src_ = "/home/ldlmdl/Documents/wavln/src/bsc/phone_seg_random/"

# List the directory once; it holds over a million entries, so a second
# os.listdir call is wasteful and could disagree with the first.
file_names = os.listdir(src_)
total = len(file_names)

# Collect per-file features in a list and concatenate once after the loop.
# Calling torch.cat inside the loop recopies the whole buffer on every
# iteration, which is quadratic in the number of files.
collected_feats = []
for idx, file in enumerate(file_names):
    draw_progress_bar(idx, total)
    try: 
        wave, sr = torchaudio.load(os.path.join(src_, file))
        # Force every clip to exactly 4240 samples so all feature matrices
        # share one shape (presumably 25 frames at 16 kHz with Kaldi's default
        # 25 ms window / 10 ms shift, matching the (25, 39) rows of mfcc_feats).
        # NOTE(review): the file's own `sr` is ignored; the transformer
        # assumes its default of 16 kHz.
        resampled_wave = torch.tensor(signal.resample(wave, 4240, axis=1))
        single_mfccfeats = transformer(resampled_wave)
        # Reject mismatched shapes here, per file, so one odd file is skipped
        # instead of breaking the final concatenation.
        if single_mfccfeats.shape != mfcc_feats.shape[1:]:
            raise ValueError(
                f"unexpected feature shape {tuple(single_mfccfeats.shape)}"
            )
        collected_feats.append(single_mfccfeats)
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so the long-running loop can be stopped.
        # Report which file failed and why instead of an anonymous marker.
        print(f"! {file}: {err!r}")

# Single concatenation: append all new rows to whatever mfcc_feats already held.
mfcc_feats = torch.cat(
    (mfcc_feats, *(feat.unsqueeze(0) for feat in collected_feats)), dim=0
)

torch.save(mfcc_feats, os.path.join(bsc_path, "random.mfcc")) 

[                                                  ] 0%	

KeyboardInterrupt: 

In [6]:
# Show how many directory entries were found in src_ (computed in the extraction cell above).
total

1438841