### Imports and Definition of Constants

In [1]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader, RandomSampler
from resnest.torch import resnest50
from torch import nn
from torchvision import transforms

import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import librosa.display
import random

import matplotlib.pyplot as plt
import os
import csv

import sklearn
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_species = 24
batch_size = 64

fft = 2048
hop = 512 
# According to research, standard sampling bitrate is 48khz. Seen in discussion of kaggle competition as well. 
sr = 48000
length = 10*sr

data_path = '../Data/'

# should change this according to nvidia-smi output e.g. "3"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device('cuda')

tp = pd.read_csv(data_path + 'train_tp.csv')
fp = pd.read_csv(data_path + 'train_fp.csv')
fp['species_id'] = fp['species_id'].apply(lambda x : -x)

### Creating Melspectrograms specifically for ResNeSt

In [2]:
def create_mel_spectograms(df, df2, n_fft, hop_length, sample_rate):
    
    # returns list of tensors representing spectrograms
    
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(power=2.0, sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length)
    
    tensors = list()
    
    for idx,row in df.iterrows():

        wav, sr = torchaudio.load(data_path + 'train/' + row['recording_id'] + '.flac')
        
        # Slicing and centering spectograms 
        m = (int)((row['t_min'] + row['t_max'])*sr/2)
    
        l = (int)(m-(length/2))
        r = (int)(m+(length/2))
    
        #Assumes audio files are at least as long as length
        if l < 0:
            r += l
            l = 0
        elif r > len(wav):
            l -= r-len(wav)
            r = len(wav)
            
        melspec = mel_spectrogram_transform(wav[int(l):int(r)])
                
        tensors.append(melspec)
        
        
    for idx,row in df2.iterrows():
#         wav, sr = librosa.load(data_path + 'train/' + row['recording_id'] + '.flac', sr=None)
        wav, sr = torchaudio.load(data_path + 'train/' + row['recording_id'] + '.flac')
    
       # Slicing and centering spectograms 
        m = (int)((row['t_min'] + row['t_max'])*sr/2)
    
        l = (int)(m-(length/2))
        r = (int)(m+(length/2))
    
        #Assumes audio files are at least as long as length
        if l < 0:
            r += l
            l = 0
        elif r > len(wav):
            l -= r-len(wav)
            r = len(wav)
        
#         melspec = librosa.power_to_db(librosa.feature.melspectrogram(y=wav[int(l):int(r)], sr=sr))
        melspec = mel_spectrogram_transform(wav[int(l):int(r)])
        tensors.append(melspec)
        
    return tensors

In [3]:
tensors = create_mel_spectograms(tp, fp, fft, hop, sr)


In [4]:
df = pd.concat([fp, tp])

### Loading in Preproccessed Mel Spectrograms 

In [5]:
def to2DArray(x): 
    # casts object representation of specs stored in csv to a 2D numpy array 
    x=x.replace("[", '')
    x=x.replace("]", '')
    x=x.replace("...", '')
    x=x.replace("\n", '')
    y=np.array(x.split(" "))
    y = y[y != ""]
    y = np.asfarray(y, 'float64')
    y = np.reshape(y,(1, y.size))
    return y

### Dataset Definition and Additional Preprocessing

In [6]:
class RainforestDataset(Dataset):
    def __init__(self, df, list_of_tensors):
        
        self.data = []
        self.labels = []
        
        # additional preprocessing required for ResNeST, normalization outlined in paper
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        
        # custom label encoding to encourage the model to learn from false positive data and not be too confident
        # in predictions
        labels = df['species_id']
        for label in labels:
            label_arr = np.full(24, .043478)
            if label < 0:
                label_arr[label] = 0
            else:
                label_arr[label] = 1
            self.labels.append(label_arr)
             
        mspecs = list_of_tensors
        
        for i in range(len(mspecs)):
            current_mspec = mspecs[i]
            image_mspec = (transforms.ToPILImage()(current_mspec)).convert('RGB')
            preprocessed_mspec = self.preprocess(image_mspec)
            self.data.append(preprocessed_mspec)
            
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (self.data[idx], self.labels[idx])

### Model Definition and Loading to GPU

In [7]:
# Model class definition 
model = resnest50(pretrained=False)

# ResNeST pretrained model should be uploaded to this path with the notebook
model.load_state_dict(torch.load(data_path + 'resnest50-528c19ca.pth'))
model.eval()
model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, num_species)
)

# load model into GPU
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.0001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

loss_function = nn.BCEWithLogitsLoss()
loss_function.to(device)

BCEWithLogitsLoss()

### Definition of Training Loop

In [13]:
def training_loop(train_loader, val_loader, model, optimizer, scheduler, loss_function, e_poch):
    best_validation = float('inf')

    for e in range(0, e_poch):
        train_loss = []
        
        model.train()
        for batch, (data, target) in enumerate(train_loader):

            data = data.float()
            if torch.cuda.is_available():
                data, target = data.to('cuda'), target.to('cuda')

            optimizer.zero_grad()
            output = model(data)
            output = output.cuda()
            
            loss = loss_function(output, target)
            
            
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        for g in optimizer.param_groups:
            lr = g['lr']

        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Training Loss: ", str(sum(train_loss) / len(train_loss)))

        # Validation
        with torch.no_grad():
            val_loss = []
            val_corr = []

            model.eval()
            for batch, (data, target) in enumerate(val_loader):
                data = data.float()
                if torch.cuda.is_available():
                    data, target = data.cuda(), target.cuda()
                
                output = model(data)
                loss = loss_function(output, target)

                val_loss.append(loss.item())

                vals, answers = torch.max(output, 1)
                vals, targets = torch.max(target, 1)
                corrects = 0
                for i in range(0, len(answers)):
                    if answers[i] == targets[i]:
                        corrects = corrects + 1
                val_corr.append(corrects)
        
        validation_loss = (sum(val_loss) / len(val_loss))

        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Validation Loss: ", str(validation_loss))


        if validation_loss < best_validation:
            print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()) + ')')
            torch.save(model, 'new_best_model_resnest.pth')
            best_validation = validation_loss

        scheduler.step()

    del model
    

### Creating Training and Validation Sets

In [9]:
train_df = None
val_df = None

X = df.drop('species_id', axis=1)
y = df['species_id']

X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, random_state=rng_seed, test_size=.2)

train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)

### Training

In [10]:
e_poch = 20
train_dataset = RainforestDataset(train_df, tensors)
val_dataset = RainforestDataset(val_df, tensors)


In [14]:
train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = RandomSampler(val_dataset))

training_loop(train_loader, val_loader, model, optimizer, scheduler, loss_function, e_poch)

Epoch:  0
Learning Rate:  0.0004
Training Loss:  0.19556427630571518
Epoch:  0
Learning Rate:  0.0004
Validation Loss:  0.19514412417221874
Saving new best model at epoch 0 (1565/1800)
Epoch:  1
Learning Rate:  0.0004
Training Loss:  0.19547568089040243
Epoch:  1
Learning Rate:  0.0004
Validation Loss:  0.1946375986090105
Saving new best model at epoch 1 (1565/1800)
Epoch:  2
Learning Rate:  0.0004
Training Loss:  0.1953922952649959
Epoch:  2
Learning Rate:  0.0004
Validation Loss:  0.1956746716755553
Epoch:  3
Learning Rate:  0.0004
Training Loss:  0.19538059293610763
Epoch:  3
Learning Rate:  0.0004
Validation Loss:  0.19462992689975478
Saving new best model at epoch 3 (1565/1800)
Epoch:  4
Learning Rate:  0.0004
Training Loss:  0.19532688470517584
Epoch:  4
Learning Rate:  0.0004
Validation Loss:  0.19500620693308635
Epoch:  5
Learning Rate:  0.00016
Training Loss:  0.195330130054589
Epoch:  5
Learning Rate:  0.00016
Validation Loss:  0.19451531739171396
Saving new best model at epo

### Submission Generation

In [41]:
def create_mel_spectograms(file):
    fft = 2048
    hop = 512 
    # According to research, standard sampling bitrate is 48khz. Seen in discussion of kaggle competition as well. 
    sr = 48000
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(power=2.0, sample_rate=sr, n_fft=fft, hop_length=hop)
    preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(320),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    
    wav, sr = torchaudio.load(data_path + 'test/{}'.format(file))
    
    
    # Split for enough segments to not miss anything
    segments = len(wav) / (10*sr)
    segments = int(np.ceil(segments))
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        melspec = mel_spectrogram_transform(wav)
        
        image_mspec = (transforms.ToPILImage()(melspec)).convert('RGB')
        
        preprocessed_mspec = preprocess(image_mspec).numpy()

        mel_array.append(preprocessed_mspec)
    
    return mel_array
#     wav, sr = librosa.load(data_path + "test/{}".format(file), sr=None)

#     mel_spec = librosa.feature.melspectrogram(wav, n_fft=fft, hop_length=hop, sr=sr) 
    

In [42]:
# Model class definition 
model = resnest50(pretrained=False)

torch.load('new_best_model_resnest.pth')
model.eval()
model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, num_species)
)


# load model into GPU
model = model.to(device)


if torch.cuda.is_available():
    print('cuda available')
    model.cuda()
    
# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir(data_path + 'test/') 
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = create_mel_spectograms(test_files[i])
        data = torch.tensor(data)
        if torch.cuda.is_available():
            data = data.cuda()

        output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

cuda available
Starting prediction loop
1992


  data = torch.tensor(data)


Predicted for 100 of 1993 files
Predicted for 200 of 1993 files
Predicted for 300 of 1993 files
Predicted for 400 of 1993 files
Predicted for 500 of 1993 files
Predicted for 600 of 1993 files
Predicted for 700 of 1993 files
Predicted for 800 of 1993 files
Predicted for 900 of 1993 files
Predicted for 1000 of 1993 files
Predicted for 1100 of 1993 files
Predicted for 1200 of 1993 files
Predicted for 1300 of 1993 files
Predicted for 1400 of 1993 files
Predicted for 1500 of 1993 files
Predicted for 1600 of 1993 files
Predicted for 1700 of 1993 files
Predicted for 1800 of 1993 files
Predicted for 1900 of 1993 files
Submission generated
