In [1]:
# path to your train/test/meta folders
DATA_PATH = './data/'

# names of valuable files/folders
train_meta_fname = 'train.csv'
test_meta_fname = 'sample_submission.csv'
train_data_folder = 'audio_train/train/'
test_data_folder = 'audio_test/test/'

In [2]:
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import torchvision
from torchaudio import transforms
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

In [3]:
# set seeds
import random
import numpy as np

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True

In [4]:
df_train = pd.read_csv(os.path.join(DATA_PATH, train_meta_fname))
df_test = pd.read_csv(os.path.join(DATA_PATH, test_meta_fname))
df_train.head(2)

Unnamed: 0,fname,label
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping
1,00d77b917e241afa06f1.wav,Squeak


In [5]:
# parameters
fft_size = 2048
overlap = 4
hop = fft_size // overlap
mels = 64


n_classes = df_train.label.nunique()
print(n_classes)
classes_dict = {cl:i for i,cl in enumerate(df_train.label.unique())}
df_train['label_encoded'] = df_train.label.map(classes_dict)
df_train.head()

41


Unnamed: 0,fname,label,label_encoded
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping,0
1,00d77b917e241afa06f1.wav,Squeak,1
2,17bb93b73b8e79234cb3.wav,Electric_piano,2
3,7d5c7a40a936136da55e.wav,Harmonica,3
4,17e0ee7565a33d6c2326.wav,Snare_drum,4


In [6]:
# https://github.com/lukemelas/EfficientNet-PyTorch
class BaseLineModel(nn.Module):
    
    def __init__(self, sample_rate=16000, n_classes=41):
        super().__init__()
        self.ms = torchaudio.transforms.MelSpectrogram(sample_rate, n_mels = mels, n_fft=fft_size, hop_length=hop)
#         self.bn1 = nn.BatchNorm2d(1)
        
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=3, padding=1)
        self.cnn3 = nn.Conv2d(in_channels=10, out_channels=3, kernel_size=3, padding=1)
        
        self.features = EfficientNet.from_pretrained('efficientnet-b1')
        # print(self.features)
        # use it as features
        # for param in self.features.parameters():
        #     param.requires_grad = False
            
        self.lin1 = nn.Linear(1000, 333)
        
        self.lin2 = nn.Linear(333, 111)
                
        self.lin3 = nn.Linear(111, n_classes)
        
    def forward(self, x):
        x = self.ms(x)
#         x = self.bn1(x)
                
        x = F.relu(self.cnn1(x))
        x = F.relu(self.cnn3(x))
        
        x = self.features(x)

        x = x.view(x.shape[0], -1)
        x = F.relu(x)
        
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x
    
    def inference(self, x):
        x = self.forward(x)
        x = F.softmax(x)
        return x

In [7]:
def sample_or_pad(waveform, wav_len=32000):
    m, n = waveform.shape
    if n < wav_len:
        padded_wav = torch.zeros(1, wav_len)
        padded_wav[:, :n] = waveform
        return padded_wav
    elif n > wav_len:
        offset = np.random.randint(0, n - wav_len)
        sampled_wav = waveform[:, offset:offset+wav_len]
        return sampled_wav
    else:
        return waveform
        
class EventDetectionDataset(Dataset):
    def __init__(self, data_path, x, y=None):
        self.x = x
        self.y = y
        self.data_path = data_path
    
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        path2wav = os.path.join(self.data_path, self.x[idx])
        #waveform, sample_rate = torchaudio.load(path2wav, normalization=True)
        waveform, sample_rate = torchaudio.load(path2wav)
        waveform = sample_or_pad(waveform)
        if self.y is not None:
            return waveform, self.y[idx]
        return waveform

In [8]:
X_train, X_val, y_train, y_val = train_test_split(df_train.fname.values, df_train.label_encoded.values, 
                                                  test_size=0.2, random_state=42)
train_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, train_data_folder), X_train, y_train),
                        batch_size=41
                )
val_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, train_data_folder), X_val, y_val),
                        batch_size=41
                )
test_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, test_data_folder), df_test.fname.values, None),
                        batch_size=41, shuffle=False
                )

In [9]:
def eval_model(model, eval_dataset):
    model.eval()
    forecast, true_labs = [], []
    with torch.no_grad():
        for wavs, labs in tqdm(eval_dataset):
            #wavs, labs = wavs.cuda(), labs.detach().numpy()
            labs = labs.detach().numpy()
            true_labs.append(labs)
            outputs = model.inference(wavs)
            
            outputs = outputs.detach().cpu().numpy().argmax(axis=1)
            forecast.append(outputs)
    forecast = [x for sublist in forecast for x in sublist]
    true_labs = [x for sublist in true_labs for x in sublist]
    return f1_score(forecast, true_labs, average='macro')

In [10]:
criterion = nn.CrossEntropyLoss()
model = BaseLineModel()
#model = model.cuda()
lr = 1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


Loaded pretrained weights for efficientnet-b1


In [11]:
n_epoch = 10
best_f1 = 0
for epoch in range(n_epoch):
    model.train()
    for wavs, labs in tqdm(train_loader):
        optimizer.zero_grad()
        #wavs, labs = wavs.cuda(), labs.cuda()
        outputs = model(wavs)
        loss = criterion(outputs, labs)
        loss.backward()
        optimizer.step()
#     if epoch % 10 == 0:
    f1 = eval_model(model, val_loader)
    f1_train = eval_model(model, train_loader)
    print(f'epoch: {epoch}, f1_test: {f1}, f1_train: {f1_train}')
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), '../baseline_fulldiv.pt')
        
    lr = lr * 0.95
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

100%|██████████| 111/111 [10:11<00:00,  5.51s/it]
100%|██████████| 28/28 [01:45<00:00,  3.78s/it]
100%|██████████| 111/111 [03:55<00:00,  2.13s/it]


epoch: 0, f1_test: 0.07767871742336283, f1_train: 0.08078379672255603


100%|██████████| 111/111 [08:04<00:00,  4.37s/it]
100%|██████████| 28/28 [00:56<00:00,  2.03s/it]
100%|██████████| 111/111 [04:22<00:00,  2.36s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 1, f1_test: 0.07563025842291002, f1_train: 0.09203902712513712


100%|██████████| 111/111 [08:15<00:00,  4.46s/it]
100%|██████████| 28/28 [00:56<00:00,  2.03s/it]
100%|██████████| 111/111 [13:53<00:00,  7.50s/it]  
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 2, f1_test: 0.1428102915236478, f1_train: 0.15460786352792671


100%|██████████| 111/111 [09:15<00:00,  5.01s/it]
100%|██████████| 28/28 [01:22<00:00,  2.94s/it]
100%|██████████| 111/111 [04:09<00:00,  2.25s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 3, f1_test: 0.17246514664654855, f1_train: 0.2138671698634403


100%|██████████| 111/111 [08:41<00:00,  4.70s/it]
100%|██████████| 28/28 [01:05<00:00,  2.35s/it]
100%|██████████| 111/111 [04:14<00:00,  2.29s/it]


epoch: 4, f1_test: 0.18094917144083028, f1_train: 0.21149008362659855


100%|██████████| 111/111 [10:14<00:00,  5.54s/it]
100%|██████████| 28/28 [01:10<00:00,  2.53s/it]
100%|██████████| 111/111 [04:02<00:00,  2.18s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 5, f1_test: 0.2082591584620916, f1_train: 0.20439043148659375


100%|██████████| 111/111 [08:54<00:00,  4.81s/it]
100%|██████████| 28/28 [01:02<00:00,  2.23s/it]
100%|██████████| 111/111 [04:55<00:00,  2.67s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 6, f1_test: 0.27389970931627894, f1_train: 0.3286745690364979


100%|██████████| 111/111 [09:22<00:00,  5.07s/it]
100%|██████████| 28/28 [00:59<00:00,  2.13s/it]
100%|██████████| 111/111 [05:11<00:00,  2.81s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 7, f1_test: 0.14673339157736376, f1_train: 0.19036634050896634


100%|██████████| 111/111 [13:14<00:00,  7.16s/it]
100%|██████████| 28/28 [01:44<00:00,  3.75s/it]
100%|██████████| 111/111 [05:10<00:00,  2.79s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 8, f1_test: 0.18370243637682568, f1_train: 0.23171105066151887


100%|██████████| 111/111 [08:51<00:00,  4.79s/it]
100%|██████████| 28/28 [00:57<00:00,  2.04s/it]
100%|██████████| 111/111 [04:21<00:00,  2.36s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 9, f1_test: 0.2502567657499155, f1_train: 0.3006708938285688


100%|██████████| 111/111 [09:54<00:00,  5.35s/it]
100%|██████████| 28/28 [01:02<00:00,  2.25s/it]
100%|██████████| 111/111 [04:22<00:00,  2.36s/it]
  0%|          | 0/111 [00:00<?, ?it/s]

epoch: 10, f1_test: 0.23752302161354205, f1_train: 0.31317624776374475


 61%|██████▏   | 68/111 [05:36<04:07,  5.77s/it]

KeyboardInterrupt: 

In [12]:
# make a model
model_name = 'baseline_fulldiv.pt'
#model = BaseLineModel().cuda()
model = BaseLineModel()
model.load_state_dict(torch.load(os.path.join('..', model_name)))
model.eval()
forecast = []
with torch.no_grad():
    for wavs in tqdm(test_loader):
        #wavs = wavs.cuda()
        outputs = model.inference(wavs)
        outputs = outputs.detach().cpu().numpy().argmax(axis=1)
        forecast.append(outputs)
forecast = [x for sublist in forecast for x in sublist]
decoder = {classes_dict[cl]:cl for cl in classes_dict}
forecast = pd.Series(forecast).map(decoder)
df_test['label'] = forecast
df_test.to_csv(f'{model_name}.csv', index=None)

Loaded pretrained weights for efficientnet-b1




  1%|          | 1/93 [00:11<16:59, 11.09s/it][A
  2%|▏         | 2/93 [00:17<14:32,  9.59s/it][A
  3%|▎         | 3/93 [00:22<12:23,  8.26s/it][A
  4%|▍         | 4/93 [00:34<13:55,  9.38s/it][A
  5%|▌         | 5/93 [00:44<14:15,  9.72s/it][A
  6%|▋         | 6/93 [00:54<13:50,  9.55s/it][A
  8%|▊         | 7/93 [00:59<11:48,  8.24s/it][A
  9%|▊         | 8/93 [01:03<09:59,  7.05s/it][A
 10%|▉         | 9/93 [01:07<08:32,  6.10s/it][A
 11%|█         | 10/93 [01:11<07:29,  5.41s/it][A
 12%|█▏        | 11/93 [01:15<06:47,  4.97s/it][A
 13%|█▎        | 12/93 [01:19<06:18,  4.67s/it][A
 14%|█▍        | 13/93 [01:22<05:35,  4.20s/it][A
 15%|█▌        | 14/93 [01:25<05:12,  3.95s/it][A
 16%|█▌        | 15/93 [01:30<05:24,  4.15s/it][A
 17%|█▋        | 16/93 [01:34<05:13,  4.07s/it][A
 18%|█▊        | 17/93 [01:37<04:50,  3.83s/it][A
 19%|█▉        | 18/93 [01:40<04:33,  3.65s/it][A
 20%|██        | 19/93 [01:43<04:24,  3.57s/it][A
 22%|██▏       | 20/93 [01:48<04:37,  