In [1]:
import torch.nn as nn
import numpy as np
import torch
import librosa
import os

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold



Using CUDA GPU for this project

In [2]:
num_labels = 24

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


Here I have created some functions that we'll be using for our processing and preparation phase

In [3]:
from skimage.transform import resize
from skimage.filters import gaussian
from skimage.color import rgb2gray
from skimage import exposure, util


def horizontal_flip(img):
    horizontal_flip_img = img[:, ::-1]
    return addChannels(horizontal_flip_img)

def vertical_flip(img):
    vertical_flip_img = img[::-1, :]
    return addChannels(vertical_flip_img)

def addNoisy(img):
    noise_img = util.random_noise(img)
    return addChannels(noise_img)

def contrast_stretching(img):
    contrast_img = exposure.rescale_intensity(img)
    return addChannels(contrast_img)

def randomGaussian(img):
    gaussian_img = gaussian(img)
    return addChannels(gaussian_img)

def grayScale(img):
    gray_img = rgb2gray(img)
    return addChannels(gray_img)

def randomGamma(img):
    img_gamma = exposure.adjust_gamma(img)
    return addChannels(img_gamma)

def addChannels(img):
    return np.stack((img, img, img))

def spec_to_image(spec):
    spec = resize(spec, (224, 400))
    eps=1e-6
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
    spec_scaled = spec_scaled.astype(np.uint8)
    spec_scaled = np.asarray(spec_scaled)
    return spec_scaled

In [4]:
import pandas as pd

sr = 48000
length = 10 * sr
data = pd.read_csv("../input/rfcx-species-audio-detection/train_tp.csv")

fmin = sr / 2
fmax = 0
for i in range(0, len(data)):
    if fmin > float(data.iloc[i]['f_min']):
        fmin = float(data.iloc[i]['f_min'])
    if fmax < float(data.iloc[i]['f_max']):
        fmax = float(data.iloc[i]['f_max'])
        
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)

label_list = []
data_list = []
audio_data = {}
for i in range(0, len(data)):
    recording_id = data.recording_id.values[i]
    species_id = int(data.species_id.values[i])
    data_list.append(recording_id)
    label_list.append(species_id)

    wav, sr = librosa.load('../input/rfcx-species-audio-detection/train/' + recording_id + '.flac', sr=None)
    t_min = float(data.t_min.values[i]) * sr
    t_max = float(data.t_max.values[i]) * sr
    center = np.round((t_min + t_max) / 2)
    beginning = center - length / 2
    if beginning < 0:
        beginning = 0
    ending = beginning + length
    if ending > len(wav):
        ending = len(wav)
        beginning = ending - length
    slice = wav[int(beginning):int(ending)]
    
    spec=librosa.feature.melspectrogram(y=slice, sr=sr, fmin=fmin, fmax=fmax)
    spec_db=librosa.power_to_db(spec, top_db=80)
    
    img = spec_to_image(spec_db)
    
    audio_data[recording_id] = img

In [5]:
import copy
from tqdm import tqdm

learning_rate = 1e-4
epochs = 20
loss_fn = nn.CrossEntropyLoss()

def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, scheduler):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    train_losses = []
    valid_losses = []
    
    for epoch in tqdm(range(1,epochs+1)):
        model.train()
        batch_losses=[]
        for _, data in enumerate(train_loader):
            x, y = data
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)

        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        
        for _, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())
        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        
        print("epoch = %d, train_loss = %.5f, val_loss = %.5f, val_accuracy = %.5f" % (epoch, np.mean(train_losses[-1]), np.mean(valid_losses[-1]), accuracy))

        scheduler.step(np.mean(valid_losses[-1]))
        if accuracy > best_acc:
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

    model.load_state_dict(best_model_wts)
    return model

In [6]:
import random

class AudioData(Dataset):
    def __init__(self, X, y, data_type):
        self.data = []
        self.labels = []
        self.augs = [addNoisy, contrast_stretching,randomGaussian,randomGamma, vertical_flip, horizontal_flip, addChannels]
        self.data_type=data_type
        for i in range(0, len(X)):
            recording_id = X[i]
            label = y[i]
            mel_spec = audio_data[recording_id]
            self.data.append(mel_spec)
            self.labels.append(label)
                
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.data_type == "train":
            aug= random.choice(self.augs)
            data = aug(self.data[idx])
        else:
            data = addChannels(self.data[idx])
        return data, self.labels[idx]

In [7]:
from torchvision.models import resnet101

def get_model():
    model = resnet101(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_labels)
    model = model.to(device)
    return model

In [8]:
fold_num = 4
skf = KFold(n_splits=fold_num, shuffle=True, random_state=32)

for fold_id, (train_index, val_index) in enumerate(skf.split(data_list, label_list)):
    X_train = np.take(data_list, train_index)
    y_train = np.take(label_list, train_index, axis = 0)
    X_val = np.take(data_list, val_index)
    y_val = np.take(label_list, val_index, axis = 0)

    train_data = AudioData(X_train, y_train, "train")
    valid_data = AudioData(X_val, y_val, "valid")
    train_loader = DataLoader(train_data, batch_size=8, shuffle=True, drop_last=True)
    valid_loader = DataLoader(valid_data, batch_size=8, shuffle=True, drop_last=True)

    model = get_model()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    model = train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, scheduler)
    torch.save(model.state_dict(), "./model" + str(fold_id) + ".pt")
    
    del train_data, valid_data, train_loader, valid_loader, model, X_train, X_val, y_train, y_val

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:00<00:00, 320MB/s]
  5%|▌         | 1/20 [00:25<08:03, 25.43s/it]

epoch = 1, train_loss = 2.92614, val_loss = 2.47244, val_accuracy = 0.28289


 10%|█         | 2/20 [00:45<06:38, 22.12s/it]

epoch = 2, train_loss = 2.40460, val_loss = 2.44471, val_accuracy = 0.35855


 15%|█▌        | 3/20 [01:05<05:59, 21.17s/it]

epoch = 3, train_loss = 2.14263, val_loss = 1.91514, val_accuracy = 0.50329


 20%|██        | 4/20 [01:25<05:29, 20.62s/it]

epoch = 4, train_loss = 1.79884, val_loss = 2.36572, val_accuracy = 0.47368


 25%|██▌       | 5/20 [01:44<05:05, 20.34s/it]

epoch = 5, train_loss = 1.67930, val_loss = 1.50339, val_accuracy = 0.62500


 30%|███       | 6/20 [02:04<04:42, 20.19s/it]

epoch = 6, train_loss = 1.62854, val_loss = 1.52348, val_accuracy = 0.66447


 35%|███▌      | 7/20 [02:24<04:20, 20.05s/it]

epoch = 7, train_loss = 1.46808, val_loss = 1.55482, val_accuracy = 0.63158


 40%|████      | 8/20 [02:44<03:59, 20.00s/it]

epoch = 8, train_loss = 1.37995, val_loss = 1.47514, val_accuracy = 0.66118


 45%|████▌     | 9/20 [03:04<03:39, 19.93s/it]

epoch = 9, train_loss = 1.23298, val_loss = 1.26223, val_accuracy = 0.67763


 50%|█████     | 10/20 [03:24<03:19, 19.91s/it]

epoch = 10, train_loss = 1.01967, val_loss = 1.26174, val_accuracy = 0.69079


 55%|█████▌    | 11/20 [03:43<02:58, 19.88s/it]

epoch = 11, train_loss = 0.93250, val_loss = 1.31678, val_accuracy = 0.72039


 60%|██████    | 12/20 [04:03<02:38, 19.85s/it]

epoch = 12, train_loss = 0.73795, val_loss = 1.11077, val_accuracy = 0.74342


 65%|██████▌   | 13/20 [04:23<02:18, 19.83s/it]

epoch = 13, train_loss = 0.71068, val_loss = 1.10550, val_accuracy = 0.74342


 70%|███████   | 14/20 [04:43<01:58, 19.83s/it]

epoch = 14, train_loss = 0.58921, val_loss = 1.24521, val_accuracy = 0.73684


 75%|███████▌  | 15/20 [05:03<01:39, 19.82s/it]

epoch = 15, train_loss = 0.68500, val_loss = 1.46085, val_accuracy = 0.71711


 80%|████████  | 16/20 [05:22<01:19, 19.80s/it]

epoch = 16, train_loss = 0.60242, val_loss = 1.33551, val_accuracy = 0.73026


 85%|████████▌ | 17/20 [05:42<00:59, 19.80s/it]

epoch = 17, train_loss = 0.50006, val_loss = 1.24672, val_accuracy = 0.73026


 90%|█████████ | 18/20 [06:02<00:39, 19.87s/it]

epoch = 18, train_loss = 0.42558, val_loss = 1.17422, val_accuracy = 0.75658


 95%|█████████▌| 19/20 [06:22<00:19, 19.84s/it]

epoch = 19, train_loss = 0.28955, val_loss = 1.18705, val_accuracy = 0.75658


100%|██████████| 20/20 [06:42<00:00, 20.11s/it]

epoch = 20, train_loss = 0.29546, val_loss = 1.18537, val_accuracy = 0.75987



  5%|▌         | 1/20 [00:19<06:17, 19.88s/it]

epoch = 1, train_loss = 2.88450, val_loss = 2.84229, val_accuracy = 0.32566


 10%|█         | 2/20 [00:39<05:58, 19.89s/it]

epoch = 2, train_loss = 2.34868, val_loss = 1.79276, val_accuracy = 0.49342


 15%|█▌        | 3/20 [00:59<05:38, 19.89s/it]

epoch = 3, train_loss = 2.05625, val_loss = 1.52492, val_accuracy = 0.58882


 20%|██        | 4/20 [01:19<05:18, 19.90s/it]

epoch = 4, train_loss = 1.86208, val_loss = 1.69311, val_accuracy = 0.59211


 25%|██▌       | 5/20 [01:39<04:58, 19.88s/it]

epoch = 5, train_loss = 1.68245, val_loss = 2.05217, val_accuracy = 0.54605


 30%|███       | 6/20 [01:59<04:38, 19.90s/it]

epoch = 6, train_loss = 1.60317, val_loss = 1.43755, val_accuracy = 0.64474


 35%|███▌      | 7/20 [02:19<04:18, 19.89s/it]

epoch = 7, train_loss = 1.30612, val_loss = 1.37936, val_accuracy = 0.65461


 40%|████      | 8/20 [02:39<03:58, 19.90s/it]

epoch = 8, train_loss = 1.15577, val_loss = 1.22019, val_accuracy = 0.68421


 45%|████▌     | 9/20 [02:59<03:38, 19.90s/it]

epoch = 9, train_loss = 1.03440, val_loss = 1.01303, val_accuracy = 0.73684


 50%|█████     | 10/20 [03:18<03:18, 19.89s/it]

epoch = 10, train_loss = 0.96547, val_loss = 1.04160, val_accuracy = 0.73355


 55%|█████▌    | 11/20 [03:38<02:58, 19.88s/it]

epoch = 11, train_loss = 0.81332, val_loss = 1.18325, val_accuracy = 0.71711


 60%|██████    | 12/20 [03:58<02:39, 19.93s/it]

epoch = 12, train_loss = 0.75437, val_loss = 1.25901, val_accuracy = 0.71053


 65%|██████▌   | 13/20 [04:18<02:19, 19.91s/it]

epoch = 13, train_loss = 0.62781, val_loss = 1.04495, val_accuracy = 0.76974


 70%|███████   | 14/20 [04:38<01:59, 19.89s/it]

epoch = 14, train_loss = 0.59439, val_loss = 0.86449, val_accuracy = 0.77961


 75%|███████▌  | 15/20 [04:58<01:39, 19.90s/it]

epoch = 15, train_loss = 0.39952, val_loss = 0.85027, val_accuracy = 0.78947


 80%|████████  | 16/20 [05:18<01:19, 19.89s/it]

epoch = 16, train_loss = 0.35069, val_loss = 0.84912, val_accuracy = 0.79605


 85%|████████▌ | 17/20 [05:38<00:59, 19.87s/it]

epoch = 17, train_loss = 0.33361, val_loss = 0.83801, val_accuracy = 0.78618


 90%|█████████ | 18/20 [05:58<00:39, 19.90s/it]

epoch = 18, train_loss = 0.27927, val_loss = 0.83336, val_accuracy = 0.79934


 95%|█████████▌| 19/20 [06:17<00:19, 19.86s/it]

epoch = 19, train_loss = 0.31615, val_loss = 0.84549, val_accuracy = 0.79934


100%|██████████| 20/20 [06:37<00:00, 19.88s/it]

epoch = 20, train_loss = 0.30092, val_loss = 0.84255, val_accuracy = 0.78947



  5%|▌         | 1/20 [00:19<06:17, 19.86s/it]

epoch = 1, train_loss = 2.97436, val_loss = 2.62278, val_accuracy = 0.25000


 10%|█         | 2/20 [00:39<05:56, 19.81s/it]

epoch = 2, train_loss = 2.51350, val_loss = 1.79711, val_accuracy = 0.48355


 15%|█▌        | 3/20 [00:59<05:38, 19.89s/it]

epoch = 3, train_loss = 2.20190, val_loss = 1.45737, val_accuracy = 0.61184


 20%|██        | 4/20 [01:19<05:17, 19.83s/it]

epoch = 4, train_loss = 1.90110, val_loss = 1.49529, val_accuracy = 0.59211


 25%|██▌       | 5/20 [01:39<04:57, 19.84s/it]

epoch = 5, train_loss = 1.62556, val_loss = 2.04761, val_accuracy = 0.54934


 30%|███       | 6/20 [01:59<04:37, 19.85s/it]

epoch = 6, train_loss = 1.44910, val_loss = 1.22015, val_accuracy = 0.72039


 35%|███▌      | 7/20 [02:18<04:17, 19.83s/it]

epoch = 7, train_loss = 1.16756, val_loss = 1.28790, val_accuracy = 0.70395


 40%|████      | 8/20 [02:38<03:57, 19.80s/it]

epoch = 8, train_loss = 1.04743, val_loss = 1.20326, val_accuracy = 0.69408


 45%|████▌     | 9/20 [02:58<03:37, 19.80s/it]

epoch = 9, train_loss = 1.02714, val_loss = 1.14464, val_accuracy = 0.71711


 50%|█████     | 10/20 [03:18<03:18, 19.80s/it]

epoch = 10, train_loss = 0.87603, val_loss = 1.10138, val_accuracy = 0.72697


 55%|█████▌    | 11/20 [03:37<02:58, 19.79s/it]

epoch = 11, train_loss = 0.78406, val_loss = 1.23765, val_accuracy = 0.72039


 60%|██████    | 12/20 [03:57<02:38, 19.79s/it]

epoch = 12, train_loss = 0.73023, val_loss = 1.16949, val_accuracy = 0.74342


 65%|██████▌   | 13/20 [04:17<02:18, 19.77s/it]

epoch = 13, train_loss = 0.62785, val_loss = 1.19756, val_accuracy = 0.73684


 70%|███████   | 14/20 [04:37<01:58, 19.77s/it]

epoch = 14, train_loss = 0.62688, val_loss = 1.13870, val_accuracy = 0.74342


 75%|███████▌  | 15/20 [04:57<01:38, 19.80s/it]

epoch = 15, train_loss = 0.42351, val_loss = 0.91624, val_accuracy = 0.77632


 80%|████████  | 16/20 [05:16<01:19, 19.80s/it]

epoch = 16, train_loss = 0.35154, val_loss = 0.95009, val_accuracy = 0.79276


 85%|████████▌ | 17/20 [05:36<00:59, 19.82s/it]

epoch = 17, train_loss = 0.34151, val_loss = 0.90111, val_accuracy = 0.79605


 90%|█████████ | 18/20 [05:56<00:39, 19.81s/it]

epoch = 18, train_loss = 0.29131, val_loss = 0.93694, val_accuracy = 0.78947


 95%|█████████▌| 19/20 [06:16<00:19, 19.80s/it]

epoch = 19, train_loss = 0.28521, val_loss = 0.91777, val_accuracy = 0.79934


100%|██████████| 20/20 [06:36<00:00, 19.81s/it]

epoch = 20, train_loss = 0.28692, val_loss = 0.90760, val_accuracy = 0.79605



  5%|▌         | 1/20 [00:19<06:15, 19.79s/it]

epoch = 1, train_loss = 2.93376, val_loss = 2.38411, val_accuracy = 0.35197


 10%|█         | 2/20 [00:39<05:56, 19.80s/it]

epoch = 2, train_loss = 2.42455, val_loss = 1.83946, val_accuracy = 0.50000


 15%|█▌        | 3/20 [00:59<05:36, 19.82s/it]

epoch = 3, train_loss = 2.04187, val_loss = 2.08748, val_accuracy = 0.51974


 20%|██        | 4/20 [01:19<05:17, 19.82s/it]

epoch = 4, train_loss = 1.81673, val_loss = 1.64839, val_accuracy = 0.56579


 25%|██▌       | 5/20 [01:39<04:56, 19.80s/it]

epoch = 5, train_loss = 1.62301, val_loss = 1.30015, val_accuracy = 0.64803


 30%|███       | 6/20 [01:58<04:36, 19.79s/it]

epoch = 6, train_loss = 1.24577, val_loss = 1.27454, val_accuracy = 0.71382


 35%|███▌      | 7/20 [02:18<04:17, 19.77s/it]

epoch = 7, train_loss = 1.16050, val_loss = 1.19803, val_accuracy = 0.70724


 40%|████      | 8/20 [02:38<03:57, 19.79s/it]

epoch = 8, train_loss = 1.03111, val_loss = 1.17241, val_accuracy = 0.72039


 45%|████▌     | 9/20 [02:58<03:37, 19.81s/it]

epoch = 9, train_loss = 0.90109, val_loss = 1.16044, val_accuracy = 0.72697


 50%|█████     | 10/20 [03:18<03:18, 19.82s/it]

epoch = 10, train_loss = 0.81624, val_loss = 1.11116, val_accuracy = 0.73026


 55%|█████▌    | 11/20 [03:37<02:58, 19.82s/it]

epoch = 11, train_loss = 0.79097, val_loss = 1.11586, val_accuracy = 0.75658


 60%|██████    | 12/20 [03:57<02:38, 19.80s/it]

epoch = 12, train_loss = 0.62296, val_loss = 1.19873, val_accuracy = 0.75658
epoch = 13, train_loss = 0.59505, val_loss = 1.11287, val_accuracy = 0.77961


 70%|███████   | 14/20 [04:37<01:58, 19.82s/it]

epoch = 14, train_loss = 0.59517, val_loss = 1.03186, val_accuracy = 0.77632


 75%|███████▌  | 15/20 [04:57<01:39, 19.81s/it]

epoch = 15, train_loss = 0.54444, val_loss = 1.22331, val_accuracy = 0.75329


 80%|████████  | 16/20 [05:16<01:19, 19.79s/it]

epoch = 16, train_loss = 0.50493, val_loss = 1.03708, val_accuracy = 0.77303


 85%|████████▌ | 17/20 [05:36<00:59, 19.76s/it]

epoch = 17, train_loss = 0.39782, val_loss = 1.24739, val_accuracy = 0.75987


 90%|█████████ | 18/20 [05:56<00:39, 19.78s/it]

epoch = 18, train_loss = 0.45195, val_loss = 1.12117, val_accuracy = 0.78289


 95%|█████████▌| 19/20 [06:16<00:19, 19.79s/it]

epoch = 19, train_loss = 0.32024, val_loss = 1.11997, val_accuracy = 0.80921


100%|██████████| 20/20 [06:35<00:00, 19.80s/it]

epoch = 20, train_loss = 0.22147, val_loss = 1.06084, val_accuracy = 0.80592





In [9]:
def load_test_file(f):
    wav, sr = librosa.load('../input/rfcx-species-audio-detection/test/' + f, sr=None)

    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        spec=librosa.feature.melspectrogram(y=slice, sr=sr, fmin=fmin, fmax=fmax)
        spec_db=librosa.power_to_db(spec,top_db=80)

        img = spec_to_image(spec_db)
        mel_spec = np.stack((img, img, img))
        mel_array.append(mel_spec)
    
    return mel_array

In [10]:
members = []
for i in range(fold_num):
    model = get_model()
    model.load_state_dict(torch.load('./model'+str(i)+'.pt'))
    model.eval()
    members.append(model)
    
os.remove('./model0.pt') 
os.remove('./model1.pt')
os.remove('./model2.pt') 
os.remove('./model3.pt')

In [11]:
import csv

with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir('../input/rfcx-species-audio-detection/test/')
    print(len(test_files))
    
    for i in range(0, len(test_files)):
        data = load_test_file(test_files[i])
        data = torch.tensor(data)
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()

        output_list = []
        for m in members:
            output = m(data)
            maxed_output = torch.max(output, dim=0)[0]
            maxed_output = maxed_output.cpu().detach()
            output_list.append(maxed_output)
        avg_maxed_output = torch.mean(torch.stack(output_list), dim=0)
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in avg_maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)


1992


  data = torch.tensor(data)
