In [1]:
!pip install pytorch_metric_learning
!pip install -U torchaudio

Collecting pytorch_metric_learning
  Downloading pytorch_metric_learning-1.1.2-py3-none-any.whl (106 kB)
     |████████████████████████████████| 106 kB 901 kB/s            
Installing collected packages: pytorch-metric-learning
Successfully installed pytorch-metric-learning-1.1.2
Collecting torchaudio
  Downloading torchaudio-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
     |████████████████████████████████| 2.9 MB 879 kB/s            
[?25hCollecting torch==1.10.2
  Downloading torch-1.10.2-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
     |████████████████████████████████| 881.9 MB 1.4 kB/s             
Installing collected packages: torch, torchaudio
  Attempting uninstall: torch
    Found existing installation: torch 1.9.1
    Uninstalling torch-1.9.1:
      Successfully uninstalled torch-1.9.1
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 0.9.1
    Uninstalling torchaudio-0.9.1:
      Successfully uninstalled torchaudio-0.9.

In [2]:
import os
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import torchaudio
from pytorch_metric_learning import losses

In [4]:
bundle = torchaudio.pipelines.WAV2VEC2_BASE

In [5]:
wav2vec2 = bundle.get_model()

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth


  0%|          | 0.00/360M [00:00<?, ?B/s]

In [6]:
wav2vec2.encoder.transformer.layers = wav2vec2.encoder.transformer.layers[:-4]

In [7]:
root_dir = '/kaggle/input/classification-of-short-noisy-audio-speech/hackaton_ds/train/'
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [8]:
class CommandDataset(Dataset):

    def __init__(self, meta, root_dir, sample_rate, labelmap):
        self.meta = meta
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.labelmap = labelmap

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_name = self.meta['path'].iloc[idx]
        waveform, sample_rate = torchaudio.load(file_name)
        
        if random.randint(0, 1):
        
            effects = [
              ["speed", str(np.random.random() + 0.5)],  # reduce the speed
                                 # This only changes sample rate, so it is necessary to
                                 # add `rate` effect with original sample rate after this.
              ["rate", f"{sample_rate}"],
            ]

            # Apply effects
            waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
                waveform, sample_rate, effects)
            
        if random.randint(0, 1):
        
            effects = [
              ["rate", f"{sample_rate}"],
              ["reverb", "-w"],  # Reverbration gives some dramatic feeling
            ]

            # Apply effects
            waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
                waveform, sample_rate, effects)
        
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)#[:, :10**5]
        waveform = torch.nn.functional.pad(waveform, (16000-waveform.shape[1], 0))[0]
            
        label = self.meta['label'].iloc[idx]

        return waveform, self.labelmap[label]

In [9]:
labels = {
    'yes': 0, 
    'no': 1, 
    'up': 2, 
    'down': 3, 
    'left': 4, 
    'right': 5, 
    'on': 6, 
    'off': 7, 
    'stop': 8, 
    'go': 9, 
}

In [10]:
data = pd.DataFrame([
    {'label': i[0].split('/')[-1], 'path': i[0] + '/' + j}
    for i in os.walk(root_dir)
    for j in i[2]
])

In [11]:
data.label.value_counts()

stop     8925
yes      8910
no       8905
up       8905
go       8895
right    8875
on       8875
down     8845
off      8835
left     8820
Name: label, dtype: int64

In [12]:
train, val, _, _ = train_test_split(data, data['label'], test_size=0.1)

In [13]:
train_dataset = CommandDataset(meta=train, root_dir=root_dir, sample_rate=bundle.sample_rate, labelmap=labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12)

val_dataset = CommandDataset(meta=val, root_dir=root_dir, sample_rate=bundle.sample_rate, labelmap=labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=12)

  cpuset_checked))


In [14]:
class CommandClassifier(nn.Module):
    def __init__(self, feature_extractor):
        super(CommandClassifier, self).__init__()
        self.feature_extractor = feature_extractor
        self.linear = nn.Linear(768, len(labels))
        
    def forward(self, X):
        features = self.get_embeddings(X)
        logits = self.linear(features)
        return logits
    
    def get_embeddings(self, X):
        embeddings = self.feature_extractor(X)[0].mean(axis=1)
        return nn.functional.normalize(embeddings)

In [15]:
model = CommandClassifier(wav2vec2)
#model.load_state_dict(torch.load('model.pth'))
model.to(device)

CommandClassifier(
  (feature_extractor): Wav2Vec2Model(
    (feature_extractor): FeatureExtractor(
      (conv_layers): ModuleList(
        (0): ConvLayerBlock(
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        )
        (1): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (2): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (3): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (4): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        )
        (5): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        )
        (6): ConvLayerBlock(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bia

In [16]:
EPOCHS = 10
lr = 0.00001

optimizer = optim.AdamW(model.parameters(), lr)

criterion = losses.ArcFaceLoss(len(labels), 768).to(device)

In [17]:
for epoch in range(EPOCHS):
    
    model.train()     
        
    train_loss = []
    for batch, targets in tqdm(train_dataloader, desc=f"Epoch: {epoch}"):
        optimizer.zero_grad()
        batch = batch.to(device)
        targets = targets.to(device)
        
        predictions = model.get_embeddings(batch)

        loss = criterion(predictions, targets) 
        loss.backward()
        
        optimizer.step()

        train_loss.append(loss.item())
        
    print('Training loss:', np.mean(train_loss))
    
    model.eval()
        
    val_loss = []
    for batch, targets in tqdm(val_dataloader, desc=f"Epoch: {epoch}"):
        
        with torch.no_grad():
        
            batch = batch.to(device)
            targets = targets.to(device)
            
            predictions = model.get_embeddings(batch)

            loss = criterion(predictions, targets) 

            val_loss.append(loss.item())
        
    print('Val loss:', np.mean(val_loss))

Epoch: 0: 100%|██████████| 1249/1249 [08:45<00:00,  2.38it/s]


Training loss: 28.190096437501563


Epoch: 0: 100%|██████████| 139/139 [00:49<00:00,  2.82it/s]


Val loss: 22.898601614314018


Epoch: 1: 100%|██████████| 1249/1249 [08:29<00:00,  2.45it/s]


Training loss: 17.229438633227748


Epoch: 1: 100%|██████████| 139/139 [00:46<00:00,  2.96it/s]


Val loss: 11.748464800471025


Epoch: 2: 100%|██████████| 1249/1249 [08:26<00:00,  2.46it/s]


Training loss: 12.737000874274248


Epoch: 2: 100%|██████████| 139/139 [00:47<00:00,  2.94it/s]


Val loss: 11.011330409015683


Epoch: 3: 100%|██████████| 1249/1249 [08:28<00:00,  2.46it/s]


Training loss: 11.252048713861226


Epoch: 3: 100%|██████████| 139/139 [00:43<00:00,  3.18it/s]


Val loss: 9.547991982466883


Epoch: 4: 100%|██████████| 1249/1249 [08:26<00:00,  2.47it/s]


Training loss: 10.407116248189592


Epoch: 4: 100%|██████████| 139/139 [00:45<00:00,  3.08it/s]


Val loss: 9.862160144092368


Epoch: 5: 100%|██████████| 1249/1249 [08:27<00:00,  2.46it/s]


Training loss: 9.970041818481336


Epoch: 5: 100%|██████████| 139/139 [00:45<00:00,  3.03it/s]


Val loss: 9.280216170729494


Epoch: 6: 100%|██████████| 1249/1249 [08:27<00:00,  2.46it/s]


Training loss: 9.60015226154923


Epoch: 6: 100%|██████████| 139/139 [00:44<00:00,  3.10it/s]


Val loss: 9.006933380373948


Epoch: 7: 100%|██████████| 1249/1249 [08:28<00:00,  2.46it/s]


Training loss: 9.45025641217625


Epoch: 7: 100%|██████████| 139/139 [00:44<00:00,  3.11it/s]


Val loss: 8.918710077409264


Epoch: 8: 100%|██████████| 1249/1249 [08:27<00:00,  2.46it/s]


Training loss: 9.11135631604038


Epoch: 8: 100%|██████████| 139/139 [00:45<00:00,  3.08it/s]


Val loss: 8.852062199613174


Epoch: 9: 100%|██████████| 1249/1249 [08:27<00:00,  2.46it/s]


Training loss: 8.983869398184257


Epoch: 9: 100%|██████████| 139/139 [00:47<00:00,  2.92it/s]

Val loss: 8.493209423778726





In [18]:
EPOCHS = 20
lr = 0.00001

optimizer = optim.AdamW(model.parameters(), lr)

criterion = nn.CrossEntropyLoss()

In [19]:
torch.save(model.state_dict(), 'model.pth')

In [20]:
writer = SummaryWriter()

In [21]:
for epoch in range(EPOCHS):
    
    model.train()
        
    train_loss = []
    train_predictions = []
    train_targets = []
    for batch, targets in tqdm(train_dataloader, desc=f"Epoch: {epoch}"):
        optimizer.zero_grad()
        
        batch = batch.to(device)
        targets = targets.to(device)
        
        predictions = model(batch)
        
        loss = criterion(predictions, targets) 
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        
        train_predictions.extend(predictions.cpu().detach().numpy().argmax(axis=1))
        train_targets.extend(targets.cpu().detach().numpy())
        
    
    train_loss = np.mean(train_loss)
    train_accuracy = accuracy_score(train_targets, train_predictions)
    
    print('Training loss:', train_loss, end=' ')
    print('Train accuracy:', train_accuracy)
    
    model.eval()
        
    val_predictions = []
    val_targets = []
    val_loss = []
    for batch, targets in tqdm(val_dataloader, desc=f"Epoch: {epoch}"):
        
        with torch.no_grad():
        
            batch = batch.to(device)
            targets = targets.to(device)
            predictions = model(batch)
            loss = criterion(predictions, targets) 
            
            val_loss.append(loss.item())

            val_predictions.extend(predictions.cpu().numpy().argmax(axis=1))
            val_targets.extend(targets.cpu().numpy())
        
    val_loss = np.mean(val_loss)
    val_accuracy = accuracy_score(val_targets, val_predictions)
    
    print('Val loss:', val_loss, end=' ')
    print('Val accuracy:', val_accuracy, end=' ')
    
    torch.save(model.state_dict(), 'model.pth')
    
    writer.add_scalars(
        'Accuracy', 
        {'train': train_accuracy, 'val': val_accuracy,}, 
        epoch
    )
    writer.add_scalars(
        'Loss', 
        {'train': train_loss, 'val': val_loss,}, 
        epoch
    )

  cpuset_checked))
Epoch: 0: 100%|██████████| 1249/1249 [08:20<00:00,  2.50it/s]


Training loss: 1.8756983633323896 Train accuracy: 0.8143309431742813


Epoch: 0: 100%|██████████| 139/139 [00:47<00:00,  2.93it/s]


Val loss: 1.6954565039641565 Val accuracy: 0.8396215790066449 

Epoch: 1: 100%|██████████| 1249/1249 [08:20<00:00,  2.49it/s]


Training loss: 1.6208828876646544 Train accuracy: 0.8398218017544519


Epoch: 1: 100%|██████████| 139/139 [00:48<00:00,  2.88it/s]


Val loss: 1.4995400536832193 Val accuracy: 0.8471674738146188 

Epoch: 2: 100%|██████████| 1249/1249 [08:20<00:00,  2.50it/s]


Training loss: 1.4263253932575688 Train accuracy: 0.8452903855539288


Epoch: 2: 100%|██████████| 139/139 [00:45<00:00,  3.07it/s]


Val loss: 1.3201172163160584 Val accuracy: 0.849194729136164 

Epoch: 3: 100%|██████████| 1249/1249 [08:21<00:00,  2.49it/s]


Training loss: 1.2560138145955302 Train accuracy: 0.8494450075709227


Epoch: 3: 100%|██████████| 139/139 [00:44<00:00,  3.09it/s]


Val loss: 1.1594499435356196 Val accuracy: 0.852573488005406 

Epoch: 4: 100%|██████████| 1249/1249 [08:21<00:00,  2.49it/s]


Training loss: 1.0975069085820568 Train accuracy: 0.8558271076572687


Epoch: 4: 100%|██████████| 139/139 [00:44<00:00,  3.12it/s]


Val loss: 1.0177479540701393 Val accuracy: 0.8568532492397792 

Epoch: 5: 100%|██████████| 1249/1249 [08:22<00:00,  2.49it/s]


Training loss: 0.9613919271957024 Train accuracy: 0.8590181577004418


Epoch: 5: 100%|██████████| 139/139 [00:44<00:00,  3.14it/s]


Val loss: 0.895346703289224 Val accuracy: 0.8570784998310621 

Epoch: 6: 100%|██████████| 1249/1249 [08:20<00:00,  2.50it/s]


Training loss: 0.8472239873331008 Train accuracy: 0.8625095418653251


Epoch: 6: 100%|██████████| 139/139 [00:44<00:00,  3.12it/s]


Val loss: 0.7906038666800629 Val accuracy: 0.8642865187521117 

Epoch: 7: 100%|██████████| 1249/1249 [08:20<00:00,  2.49it/s]


Training loss: 0.7501768259118327 Train accuracy: 0.8668894144736019


Epoch: 7: 100%|██████████| 139/139 [00:45<00:00,  3.06it/s]


Val loss: 0.7038744061970882 Val accuracy: 0.8665390246649397 

Epoch: 8: 100%|██████████| 1249/1249 [08:22<00:00,  2.49it/s]


Training loss: 0.670487163136538 Train accuracy: 0.8695548798037817


Epoch: 8: 100%|██████████| 139/139 [00:45<00:00,  3.06it/s]


Val loss: 0.6418081826443295 Val accuracy: 0.867440027030071 

Epoch: 9: 100%|██████████| 1249/1249 [08:20<00:00,  2.49it/s]


Training loss: 0.6065149778743283 Train accuracy: 0.8721577755252719


Epoch: 9: 100%|██████████| 139/139 [00:47<00:00,  2.92it/s]


Val loss: 0.5799410686218481 Val accuracy: 0.8705935353080302 

Epoch: 10: 100%|██████████| 1249/1249 [08:19<00:00,  2.50it/s]


Training loss: 0.5581208644770355 Train accuracy: 0.8741349751598654


Epoch: 10: 100%|██████████| 139/139 [00:46<00:00,  3.02it/s]


Val loss: 0.531025377752112 Val accuracy: 0.8739722941772722 

Epoch: 11: 100%|██████████| 1249/1249 [08:21<00:00,  2.49it/s]


Training loss: 0.5137503604062372 Train accuracy: 0.8782019997246937


Epoch: 11: 100%|██████████| 139/139 [00:45<00:00,  3.08it/s]


Val loss: 0.5097522113820632 Val accuracy: 0.8708187858993129 

Epoch: 12: 100%|██████████| 1249/1249 [08:20<00:00,  2.49it/s]


Training loss: 0.4807645382167245 Train accuracy: 0.8790654603246111


Epoch: 12: 100%|██████████| 139/139 [00:45<00:00,  3.04it/s]


Val loss: 0.4717001454006854 Val accuracy: 0.8783646807072869 

Epoch: 13: 100%|██████████| 1249/1249 [08:20<00:00,  2.50it/s]


Training loss: 0.44786529895589294 Train accuracy: 0.8827946090025153


Epoch: 13: 100%|██████████| 139/139 [00:44<00:00,  3.10it/s]


Val loss: 0.4630011871135492 Val accuracy: 0.8732965424034238 

Epoch: 14: 100%|██████████| 1249/1249 [08:20<00:00,  2.49it/s]


Training loss: 0.4317512254389502 Train accuracy: 0.8834953886198396


Epoch: 14: 100%|██████████| 139/139 [00:44<00:00,  3.11it/s]


Val loss: 0.4285515723682994 Val accuracy: 0.8797161842549837 

Epoch: 15: 100%|██████████| 1249/1249 [08:21<00:00,  2.49it/s]


Training loss: 0.4109650780025533 Train accuracy: 0.8851972819761985


Epoch: 15: 100%|██████████| 139/139 [00:45<00:00,  3.07it/s]


Val loss: 0.40832706878511166 Val accuracy: 0.8833201937155085 

Epoch: 16: 100%|██████████| 1249/1249 [08:20<00:00,  2.50it/s]


Training loss: 0.39739994673704127 Train accuracy: 0.8864862159152057


Epoch: 16: 100%|██████████| 139/139 [00:44<00:00,  3.14it/s]


Val loss: 0.3991913080429859 Val accuracy: 0.8852348237414123 

Epoch: 17: 100%|██████████| 1249/1249 [08:21<00:00,  2.49it/s]


Training loss: 0.38086933570446446 Train accuracy: 0.8889014028106268


Epoch: 17: 100%|██████████| 139/139 [00:45<00:00,  3.08it/s]


Val loss: 0.39105070162591316 Val accuracy: 0.8843338213762811 

Epoch: 18: 100%|██████████| 1249/1249 [08:24<00:00,  2.48it/s]


Training loss: 0.3688080154890247 Train accuracy: 0.8916669795147101


Epoch: 18: 100%|██████████| 139/139 [00:45<00:00,  3.05it/s]


Val loss: 0.3951659688203455 Val accuracy: 0.8837706948980741 

Epoch: 19: 100%|██████████| 1249/1249 [08:25<00:00,  2.47it/s]


Training loss: 0.35933089574830257 Train accuracy: 0.8936566930710416


Epoch: 19: 100%|██████████| 139/139 [00:45<00:00,  3.06it/s]


Val loss: 0.39024318036415595 Val accuracy: 0.8823065660547359 

In [22]:
test_dir = '/kaggle/input/classification-of-short-noisy-audio-speech/hackaton_ds/test/'

In [23]:
model.eval()

pred = []
for i in tqdm(os.listdir(test_dir)):
    
    waveform, sample_rate = torchaudio.load(f'{test_dir}/{i}')
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)#[:, :10**5]
    waveform = torch.nn.functional.pad(waveform, (16000-waveform.shape[1], 0))[0]
    
    with torch.no_grad():
        predictions = model(waveform.unsqueeze(0).to(device))[0].cpu()
    
    text_lab = list(labels.keys())[predictions.argmax()]
    
    pred.append({'id': i.replace('.wav', ''), 'category': text_lab})
pred = pd.DataFrame(pred)

100%|██████████| 29620/29620 [09:42<00:00, 50.83it/s]


In [24]:
pred.to_csv('submission.csv', index=False)