In [None]:
# Mount Google Drive inside the Colab VM; the archives unpacked in the next
# cell are read from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
!unzip -q /content/gdrive/MyDrive/Яндекс/train.zip -d train
!unzip -q /content/gdrive/MyDrive/Яндекс/test.zip -d test

In [None]:
import torch

# Resolve the compute device once. DEVICE is assigned on both branches so that
# later cells can use it on CPU-only runtimes as well as on GPU ones.
train_on_gpu = torch.cuda.is_available()
DEVICE = torch.device("cuda" if train_on_gpu else "cpu")

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

# Dataset split names (not referenced elsewhere in the visible notebook).
Data_modes = ['train', 'test']

In [None]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch.optim import Adam

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import time
from tqdm import tqdm


#for loading and visualizing audio files
import librosa
import librosa.display


#to play audio
import IPython.display as ipd

In [None]:
audio_fpath = "/content/train/train/"

# os.listdir returns entries in arbitrary order; sorting keeps reruns reproducible.
dir_entries = sorted(os.listdir(audio_fpath))

# Classify entries by their real extension. Splitting on the first '.' breaks
# on names without a dot or with several dots.
audio_clips = [el for el in dir_entries if os.path.splitext(el)[1].lower() == '.wav']
non_audio = [el for el in dir_entries if os.path.splitext(el)[1].lower() != '.wav']
if not non_audio:
    raise FileNotFoundError("No targets file (non-.wav entry) found in " + audio_fpath)

# The targets file is the non-.wav entry stored next to the clips; when there
# are several, the last one in the listing is used.
targets_name = non_audio[-1]
i_targets = dir_entries.index(targets_name)
print("No. of .wav files in audio folder = ",len(audio_clips))

In [None]:
# Read the tab-separated targets file (no header row) into a two-column table:
# the clip identifier and its numeric class label.
targets = pd.read_csv(
    audio_fpath + targets_name,
    sep='\t',
    header=None,
    names=['filename', 'idx_sex'],
)
targets

In [None]:

# 1. Path of the clip used as the running example in the following cells
filename = '/content/train/train/0006238dc99eaf68957dfc81826d1071.wav'

# 2. Load the audio as a waveform `y`
#    Store the sampling rate as `sr`
y, sr = librosa.load(filename)

# 3. Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# Depending on the librosa release, `tempo` is a float or a NumPy array; an
# array is rejected by the '{:.2f}' format spec, so reduce it to a plain float
# for printing (the `tempo` variable itself is left untouched).
print('Estimated tempo: {:.2f} beats per minute'.format(float(np.atleast_1d(tempo)[0])))

# 4. Convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

In [None]:
plt.figure(figsize=(14, 5))
# Newer librosa releases dropped `waveplot` in favour of `waveshow`; use
# whichever one this installation provides so the cell runs on both.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(y, sr=sr)

In [None]:
# Find the label row of the example clip: the table's `filename` column is
# compared against the clip's base name with everything after the first '.' cut off.
clip_id = os.path.basename(filename).split('.')[0]
print(targets[targets['filename'] == clip_id])

# Readable names for the numeric class labels.
idx_sex_dict = {1:'Female', 2: 'Male'}

# Inline audio player for the loaded waveform.
ipd.Audio(y, rate = sr)

In [None]:
# Mel spectrogram of the example clip: 128 mel bands, frequencies capped at 8 kHz.
S = librosa.feature.melspectrogram(
    y=y, sr=sr,
    n_mels=128, fmax=8000,
)

In [None]:
# Express the spectrogram in decibels relative to its largest value.
S_dB = librosa.power_to_db(S, ref=np.max)

# Draw it on an explicit Axes with time on x and the mel scale on y.
fig, ax = plt.subplots()
img = librosa.display.specshow(
    S_dB, sr=sr, fmax=8000,
    x_axis='time', y_axis='mel',
    ax=ax,
)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram')

Сделаем DataSet спектрограмм

In [None]:
class AudioDataset():
  """Builds mel-spectrogram "images" for the train, validation and test splits.

  Labelled clips are read from ``TRAIN_DIR``; their labels come from the
  non-``.wav`` file stored in the same folder (tab-separated, no header,
  columns ``filename`` and ``idx_sex``). Unlabelled test clips are read from
  ``TEST_DIR``. Every clip is converted to a ``uint8`` array of shape
  ``(128, max_length)``.

  Args:
    train_mode: when true, the train and validation splits are built in
      addition to the test split; otherwise only the test split is built.
    n_train: number of clips in the train split (default 10).
    n_valid: number of clips in the validation split, taken right after the
      train clips (default 5).
    n_test: number of clips in the test split (default 7).

  NOTE(review): ``id_sex_dict`` assumes the labels are 1/2, while a binary
  cross-entropy loss needs 0/1 targets. Confirm the encoding used in the
  targets file before training on these labels.
  """

  # Label id -> readable name (same mapping as the exploration cell).
  id_sex_dict = {1: 'Female', 2: 'Male'}

  TRAIN_DIR = "/content/train/train/"
  TEST_DIR = "/content/test/test/"

  def __init__(self,  train_mode =True, n_train = 10, n_valid = 5, n_test = 7):
    self.n_train = n_train
    self.n_valid = n_valid
    self.n_test = n_test
    # Always defined, so return_data() also works when train_mode is False.
    self.trainset = None
    self.validset = None
    if train_mode:
      self.trainset = self.create_dataset('train')
      self.validset = self.create_dataset('valid')
    self.testset = self.create_dataset('test')

  def return_sex(self, id):
    """Return the readable class name for label ``id`` (KeyError if unknown)."""
    return self.id_sex_dict[id]

  def return_data(self):
    """Return ``(trainset, validset, testset)``.

    The first two items are ``None`` when the object was built with
    ``train_mode=False``.
    """
    return self.trainset, self.validset, self.testset

  def return_train(self):
    """Return the train split as ``(spectrograms, labels)``, or ``None``."""
    return self.trainset

  def return_test(self):
    """Return the test split: a list of spectrogram images."""
    return self.testset

  @staticmethod
  def _is_wav(name):
    """True when ``name`` has a ``.wav`` extension (case-insensitive)."""
    return os.path.splitext(name)[1].lower() == '.wav'

  def _list_labelled_clips(self, audio_fpath):
    """List the ``.wav`` clips in ``audio_fpath`` and load their labels.

    Returns:
      ``(audio_clips, labels)``: the sorted clip file names and a dict that
      maps the ``filename`` column of the targets file to ``idx_sex``.

    Raises:
      FileNotFoundError: when the folder holds no non-``.wav`` regular file.
    """
    # Sorted so that the train/valid slices are the same on every run.
    entries = sorted(os.listdir(audio_fpath))
    audio_clips = [el for el in entries if self._is_wav(el)]
    other_files = [
        el for el in entries
        if not self._is_wav(el) and os.path.isfile(os.path.join(audio_fpath, el))
    ]
    if not other_files:
      raise FileNotFoundError("No targets file (non-.wav file) found in " + audio_fpath)

    # Read the identifiers as strings so purely numeric names keep leading zeros.
    targets = pd.read_csv(
        os.path.join(audio_fpath, other_files[0]), sep = '\t', header = None,
        names = ['filename', 'idx_sex'], dtype = {'filename': str})
    labels = dict(zip(targets['filename'], targets['idx_sex']))
    return audio_clips, labels

  def create_dataset(self, mode):
    """Build one split.

    Args:
      mode: ``'train'``, ``'valid'`` or ``'test'``.

    Returns:
      For ``'train'``/``'valid'``: ``(x, y)`` where ``x`` stacks the
      spectrogram images and ``y`` stacks one scalar label per clip.
      For ``'test'``: a list of spectrogram images.

    Raises:
      ValueError: for an unknown ``mode`` or when the split would be empty.
      KeyError: when a clip has no row in the targets file.
    """
    if mode == 'train' or mode == 'valid':
      audio_fpath = self.TRAIN_DIR
      audio_clips, labels = self._list_labelled_clips(audio_fpath)
      if mode == 'train':
        audio_clips = audio_clips[:self.n_train]
      else:
        # Validation clips directly follow the train clips, so the splits are disjoint.
        audio_clips = audio_clips[self.n_train:self.n_train + self.n_valid]
      if not audio_clips:
        raise ValueError("No .wav clips available for the '%s' split in %s" % (mode, audio_fpath))

      spectrograms_and_targets = []
      for filename in tqdm(audio_clips):
        clip_id = os.path.basename(filename).split('.')[0]
        if clip_id not in labels:
          raise KeyError("No label for clip '%s' in the targets file" % clip_id)
        # Pass the scalar label rather than a DataFrame slice, so the labels
        # stack into a plain numeric vector.
        spectrograms_and_targets.append(
            self.preprocess_sample(mode, os.path.join(audio_fpath, filename), labels[clip_id]))

      x,y = map(np.stack, zip(*spectrograms_and_targets))
      return x,y

    if mode == 'test':
      audio_fpath = self.TEST_DIR
      audio_clips = [el for el in sorted(os.listdir(audio_fpath)) if self._is_wav(el)]
      return [
          self.preprocess_sample_test(mode, os.path.join(audio_fpath, filename))
          for filename in tqdm(audio_clips[:self.n_test])
      ]

    raise ValueError("Unknown mode: %r (expected 'train', 'valid' or 'test')" % (mode,))

  @staticmethod
  def spec_to_image(spec, eps=1e-6):
    """Standardise a spectrogram and rescale it to ``uint8`` values in 0..255.

    A constant spectrogram has no value range to stretch, so it maps to an
    all-zero image instead of dividing by zero.
    """
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if not value_range > 0:
      return np.zeros(spec_norm.shape, dtype=np.uint8)
    # Divide first so the largest value maps to exactly 255 before truncation.
    spec_scaled = (spec_norm - spec_min) / value_range * 255
    return spec_scaled.astype(np.uint8)

  def _mel_image(self, filename, max_length):
    """Load ``filename`` and return its mel-spectrogram image.

    The spectrogram has 128 mel bands and is cut or zero-padded along its
    second axis to exactly ``max_length`` columns.
    """
    amp, sr = librosa.load(filename)
    # Audio is passed by keyword: newer librosa no longer accepts it positionally.
    spectrogram = librosa.feature.melspectrogram(
        y=amp, sr=sr, n_mels=128, fmin=1, fmax=8000)[:, :max_length]
    missing = max(0, max_length - spectrogram.shape[1])
    spectrogram = np.pad(spectrogram, [[0, 0], [0, missing]], mode='constant')
    return self.spec_to_image(np.float32(spectrogram))

  def preprocess_sample(self, mode ,filename, target = 0, max_length = 100):
    """Return ``(spectrogram_image, target)`` for a labelled clip.

    ``mode`` is accepted for call compatibility and is not used.
    """
    return self._mel_image(filename, max_length), target

  def preprocess_sample_test(self, mode ,filename, target = 0, max_length = 100):
    """Return the spectrogram image of an unlabelled clip.

    ``mode`` and ``target`` are accepted for call compatibility and are not used.
    """
    return self._mel_image(filename, max_length)

In [None]:
class DataLoader():
  """Random mini-batch sampler over in-memory (spectrogram, target) pairs.

  NOTE: this class shadows ``torch.utils.data.DataLoader`` imported earlier in
  the notebook; after this cell runs, ``DataLoader`` refers to this class.

  Args:
    spectrograms: sequence or array of equally shaped spectrogram arrays.
    targets: sequence or array with one label per spectrogram.

  Raises:
    ValueError: when the two inputs differ in length.
  """

  def __init__(self, spectrograms, targets):
    if len(spectrograms) != len(targets):
      raise ValueError(
          "spectrograms and targets differ in length: %d vs %d"
          % (len(spectrograms), len(targets)))
    # Keep each spectrogram paired with its own label.
    self.data = list(zip(spectrograms, targets))

  def next_batch(self, b_size, device):
    """Sample ``b_size`` pairs uniformly with replacement.

    Returns:
      ``(source, target)`` float tensors on ``device``; ``source`` stacks the
      sampled spectrograms along a new first axis.

    Raises:
      ValueError: when the loader holds no samples.
    """
    if not self.data:
      raise ValueError("Cannot draw a batch from an empty DataLoader")
    ind = np.random.randint(len(self.data), size = b_size)

    # Stack into single arrays first: converting a list of arrays element by
    # element is much slower.
    source = np.stack([self.data[i][0] for i in ind])
    target = np.asarray([self.data[i][1] for i in ind])

    return self.torch_batch(source, target, device)

  # Original (misspelt) method name, kept so existing calls continue to work.
  nex_batch = next_batch

  @staticmethod
  def torch_batch(source, target, device):
    """Convert ``source`` and ``target`` to float tensors on ``device``."""
    return tuple(
        torch.as_tensor(np.asarray(val), dtype = torch.float).to(device, non_blocking = True)
        for val in (source, target)
    )

In [None]:
# Build the dataset object, then unpack its three splits.
audio_dataset = AudioDataset()
train_data, valid_data, test_data = audio_dataset.return_data()

In [None]:
# train_data is a (spectrograms, targets) pair, so it unpacks into the loader.
trainset = DataLoader(*train_data)
# The test split is a bare list of spectrograms without labels, so it cannot be
# star-unpacked into (spectrograms, targets); pair it with placeholder zeros.
testset = DataLoader(test_data, np.zeros(len(test_data)))
batch_size = 64


Выберем модель 

In [None]:
class Model(nn.Module):
    """Binary classifier: parallel convolutions of several widths + max-over-time.

    Each convolution spans the full feature axis and ``window_size`` time
    steps. Its activations are max-pooled over time, the pooled vectors of all
    window sizes are concatenated, and one linear unit with a sigmoid yields a
    probability per sample.

    Args:
        window_sizes: temporal widths of the parallel convolutions.
        n_features: size of the last input axis that every kernel spans
            (default 128).
        n_filters: output channels of each convolution (default 128).

    Input: float tensor of shape ``[B, T, n_features]``.
    Output: tensor of shape ``[B]`` with values in ``[0, 1]``.
    """

    def __init__(self, window_sizes=(3, 4, 5), n_features=128, n_filters=128):
        super().__init__()
        self.n_features = n_features

        self.convs = nn.ModuleList([
            nn.Conv2d(1, n_filters, [window_size, n_features], padding=(window_size - 1, 0))
            for window_size in window_sizes
        ])

        self.fc = nn.Linear(n_filters * len(window_sizes), 1)

    def forward(self, x):
        """Return per-sample probabilities for a ``[B, T, n_features]`` batch.

        Raises:
            ValueError: when ``x`` is not 3-D or its last axis is not
                ``n_features`` (for example a batch that still has to be
                transposed).
        """
        if x.dim() != 3 or x.size(-1) != self.n_features:
            raise ValueError(
                "Expected input of shape [B, T, %d], got %s"
                % (self.n_features, tuple(x.shape)))

        # Apply a convolution + max pool layer for each window size
        x = torch.unsqueeze(x, 1)  # [B, C, T, E] Add a channel dim.
        xs = []
        for conv in self.convs:
            # `nn.functional` is used directly: no `F` alias is imported in this notebook.
            x2 = nn.functional.relu(conv(x))  # [B, F, T, 1]
            x2 = torch.squeeze(x2, -1)  # [B, F, T]
            x2 = nn.functional.max_pool1d(x2, x2.size(2))  # [B, F, 1]
            xs.append(x2)
        x = torch.cat(xs, 2)  # [B, F, window]

        # FC
        x = x.view(x.size(0), -1)  # [B, F * window]
        logits = self.fc(x)  # [B, class]
        probs = torch.sigmoid(logits).view(-1)
        return probs

    def loss(self, probs, targets):
        """Binary cross-entropy between ``probs`` and ``targets`` (both cast to float).

        ``targets`` must lie in ``[0, 1]``, as ``nn.BCELoss`` requires.
        """
        return nn.BCELoss()(probs.float(), targets.float())

In [None]:
# Move the model to the selected device. `.to(DEVICE)` works for every device
# spelling (e.g. "cuda:0"), whereas comparing against torch.device('cuda') is
# false for an indexed device and would leave the model on the CPU.
model = Model().to(DEVICE)
model.train()

# Adam over the trainable parameters only; the learning rate stays at its default.
optimizer = Adam(
    [p for p in model.parameters() if p.requires_grad], betas=(0.9, 0.999), eps=1e-5
)

In [None]:
import torch as t

# Loss value of every optimisation step, kept so training can be inspected afterwards.
train_losses = []

for i in tqdm(range(100)):

    optimizer.zero_grad()

    input, target = trainset.next_batch(batch_size, device=DEVICE)
    # Batches hold spectrograms as [B, n_mels, frames], but the model's kernels
    # span the mel axis on the last dimension ([B, frames, n_mels]), so swap
    # the last two axes before the forward pass.
    out = model(input.transpose(1, 2))
    loss = model.loss(out, target)
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())


