In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchaudio import load, transforms
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import os
import glob
import numpy as np

In [2]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [3]:
# MFCC feature extractor configured for a sample rate of 8000 Hz.
# NOTE(review): presumably passed as `transform` to WavDataset -- confirm.
# Nothing in the visible code resamples or checks that loaded audio is
# actually 8 kHz, so verify the data matches this setting.
tf = transforms.MFCC(sample_rate=8000)

def normalize(tensor):
    """Center ``tensor`` on its mean and scale it by its largest centered value.

    Args:
        tensor: Floating-point ``torch.Tensor`` of any shape (``mean`` is taken
            over all elements).

    Returns:
        ``(tensor - mean) / max(tensor - mean)``, so the largest element maps
        to 1. When every element is identical (e.g. a silent clip) the
        centered tensor is all zeros and is returned as-is instead of being
        divided by zero.
    """
    centered = tensor - tensor.mean()
    peak = centered.max()
    # A constant input centers to all zeros; dividing by the zero peak would
    # turn every element into NaN, so return the zeros unchanged.
    if peak == 0:
        return centered
    return centered / peak

In [5]:
class WavDataset(Dataset):
    """Dataset of fixed-length, single-channel waveforms found under a folder.

    Every ``.wav`` / ``.WAV`` file below ``data_folder`` becomes one item. Its
    label is the name of the directory that directly contains the file.

    Args:
        data_folder: Root directory, walked recursively.
        length: Number of samples each returned waveform is cropped or
            zero-padded to.
        transform: Optional callable applied to the fixed-length waveform; its
            result is then passed through the module-level ``normalize``.
    """

    # File extensions (case-sensitive) that are treated as audio files.
    _WAV_EXTENSIONS = (".wav", ".WAV")

    def __init__(self, data_folder, length=300000, transform=None):
        self.data_folder = data_folder
        self.dim = length
        self.wav_list = []  # entries are [file_path, label]
        self.transform = transform

        for root, _dirnames, filenames in os.walk(data_folder):
            # basename(normpath(...)) handles any OS path separator and a
            # trailing separator, unlike splitting on "/".
            label = os.path.basename(os.path.normpath(root))
            for filename in filenames:
                if os.path.splitext(filename)[1] in self._WAV_EXTENSIONS:
                    self.wav_list.append([os.path.join(root, filename), label])

        # os.walk yields entries in a filesystem-dependent order; sorting keeps
        # the index -> file mapping stable across runs and machines.
        self.wav_list.sort()

    def __getitem__(self, item):
        """Load item ``item`` and return ``(waveform, sample_rate, path, label)``.

        Clips with at least ``self.dim`` samples are cropped at a random
        offset; shorter clips are right-padded with zeros. When a transform
        was given, the returned waveform is ``normalize(transform(waveform))``.
        """
        filename, label = self.wav_list[item]
        wb_wav, sr = load(filename)
        wb_wav = wb_wav[0, :]  # keep only the first channel (mono)

        length = len(wb_wav)
        if length >= self.dim:
            max_audio_start = length - self.dim
            # randint's upper bound is exclusive, so use +1: this makes the
            # last valid offset reachable and avoids a ValueError when the
            # clip is exactly self.dim samples long (max_audio_start == 0).
            # NOTE(review): np.random state may be duplicated across
            # DataLoader workers -- confirm worker seeding if num_workers > 0.
            audio_start = np.random.randint(0, max_audio_start + 1)
            wb_wav = wb_wav[audio_start: audio_start + self.dim]
        else:
            wb_wav = F.pad(wb_wav, (0, self.dim - length), "constant")

        if self.transform is not None:
            wb_wav = normalize(self.transform(wb_wav))

        return wb_wav, sr, filename, label

    def __len__(self):
        """Return the number of audio files that were discovered."""
        return len(self.wav_list)
