### Imports and Definition of Constants

In [5]:
!pip install torchaudio torchtext

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torchtext
  Downloading torchtext-0.12.0-cp38-cp38-manylinux1_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 13.0 MB/s eta 0:00:01
Installing collected packages: torchtext
Successfully installed torchtext-0.12.0


In [2]:
import torch
import torchaudio
import torchvision
from torchvision import transform
from torchaudio.transforms import MelSpectrogram
from PIL import Image
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import librosa.display
import random

import matplotlib.pyplot as plt
import os
from torch.utils.data import Dataset, DataLoader, RandomSampler
from resnest.torch import resnest50
from sklearn.model_selection import StratifiedKFold
from skimage.transform import resize
import csv

import sklearn
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

import csv
import torch.utils.data as td
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_species = 24
batch_size = 32

fft = 2048
hop = 512 
# According to research, standard sampling bitrate is 48khz. Seen in discussion of kaggle competition as well. 
sr = 48000
length = 10*sr

data_path = '../Data/'

# should change this according to nvidia-smi output e.g. "3"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device('cuda')

tp = pd.read_csv(data_path + 'train_tp.csv')
fp = pd.read_csv(data_path + 'train_fp.csv')
fp['species_id'] = fp['species_id'].apply(lambda x : -x)

NameError: name '_C' is not defined

### Creating Melspectrograms specifically for ResNeSt

In [55]:
def create_mel_spectograms(df, df2):
    
    mel_spectrogram_transform = MelSpectrogram(power=2.0, n_fft)
    
    df['spec'] = np.nan
    df['spec'] = df['spec'].astype(object)
    
    df2['spec'] = np.nan
    df2['spec'] = df2['spec'].astype(object)
    
    for idx,row in df.iterrows():

        wav, sr = librosa.load(data_path + 'train/' + row['recording_id'] + '.flac', sr=None)
        
        # Slicing and centering spectograms 
        m = (int)((row['t_min'] + row['t_max'])*sr/2)
    
        l = (int)(m-(length/2))
        r = (int)(m+(length/2))
    
        #Assumes audio files are at least as long as length
        if l < 0:
            r += l
            l = 0
        elif r > len(wav):
            l -= r-len(wav)
            r = len(wav)
        
        melspec = librosa.power_to_db(librosa.feature.melspectrogram(y=wav[int(l):int(r)], sr=sr))
        
        df.at[idx, 'spec'] = melspec
        
        
    for idx,row in df2.iterrows():
        wav, sr = librosa.load(data_path + 'train/' + row['recording_id'] + '.flac', sr=None)
    
       # Slicing and centering spectograms 
        m = (int)((row['t_min'] + row['t_max'])*sr/2)
    
        l = (int)(m-(length/2))
        r = (int)(m+(length/2))
    
        #Assumes audio files are at least as long as length
        if l < 0:
            r += l
            l = 0
        elif r > len(wav):
            l -= r-len(wav)
            r = len(wav)
        
        melspec = librosa.power_to_db(librosa.feature.melspectrogram(y=wav[int(l):int(r)], sr=sr))
        df2.at[idx, 'spec'] = mspec
        
    return pd.concat([df,df2])

In [56]:
df = create_mel_spectograms(tp, fp)

NameError: name 'mspec' is not defined

### Loading in Preproccessed Mel Spectrograms 

In [5]:
def to2DArray(x): 
    # casts object representation of specs stored in csv to a 2D numpy array 
    x=x.replace("[", '')
    x=x.replace("]", '')
    x=x.replace("...", '')
    x=x.replace("\n", '')
    y=np.array(x.split(" "))
    y = y[y != ""]
    y = np.asfarray(y, 'float64')
    y = np.reshape(y,(1, y.size))
    return y

In [6]:
# df['spec'] = df['spec'].apply(lambda x: to2DArray(x))
df.head()

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,spec
0,003bec244,14,1,44.544,2531.25,45.1307,5531.25,"[[0.01252426, 0.0023889802, 0.00061887037, 0.0..."
1,006ab765f,23,1,39.9615,7235.16,46.0452,11283.4,"[[1.1748381, 0.41643876, 1.1397215, 1.662797, ..."
2,007f87ba2,12,1,39.136,562.5,42.272,3281.25,"[[0.02010685, 0.027764402, 0.022443173, 0.0535..."
3,0099c367b,17,4,51.4206,1464.26,55.1996,4565.04,"[[0.0031949885, 0.0043321564, 0.0011921478, 0...."
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.7,"[[0.018652013, 0.02159359, 0.06152439, 0.31434..."


### Dataset Definition and Additional Preprocessing

In [33]:
class RainforestDataset(Dataset):
    def __init__(self, df):
        
        self.data = []
        self.labels = []
        
        # additional preprocessing required for ResNeST, normalization outlined in paper
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(320),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
                
        labels = df['species_id']
        for label in labels:
            label_arr = np.full(24, .043478)
            if label < 0:
                label_arr[label] = 0
            else:
                label_arr[label] = 1
            self.labels.append(label_arr)
             
        mspecs = df['spec']
        
        for i in range(len(mspecs)):
            fig = plt.Figure()
            canvas = FigureCanvas(fig)
            current_mspec = librosa.display.specshow(mpecs[i])
            current_mspec = self.preprocess(current_mspec)
            self.data.append(current_mspec)
            
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (self.data[idx], self.labels[idx])

### Model Definition and Loading to GPU

In [34]:
# Model class definition 
model = resnest50(pretrained=False)

# ResNeST pretrained model should be uploaded to this path with the notebook
model.load_state_dict(torch.load(data_path + 'resnest50-528c19ca.pth'))
model.eval()
model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, num_species)
)

# load model into GPU
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.0001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

loss_function = nn.BCEWithLogitsLoss()
loss_function.cuda()

BCEWithLogitsLoss()

### Definition of Training Loop

In [35]:
def training_loop(train_loader, val_loader, model, optimizer, scheduler, loss_function, e_poch):
    best_correct = 0

    for e in range(0, e_poch):
        train_loss = []
        
        model.train()
        for batch, (data, target) in enumerate(train_loader):

            data = data.float()
            if torch.cuda.is_available():
                data, target = data.to('cuda'), target.to('cuda')

            optimizer.zero_grad()
            output = model(data)
            output = output.cuda()
            
            loss = loss_function(output, target)
            
            
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        for g in optimizer.param_groups:
            lr = g['lr']

        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Training Loss: ", str(sum(train_loss) / len(train_loss)))

        # Validation
        with torch.no_grad():
            val_loss = []
            val_corr = []

            model.eval()
            for batch, (data, target) in enumerate(val_loader):
                data = data.float()
                if torch.cuda.is_available():
                    data, target = data.cuda(), target.cuda()
                
        
                
                output = model(data)
                loss = loss_function(output, target)

                val_loss.append(loss.item())

                vals, answers = torch.max(output, 1)
                vals, targets = torch.max(target, 1)
                corrects = 0
                for i in range(0, len(answers)):
                    if answers[i] == targets[i]:
                        corrects = corrects + 1
                val_corr.append(corrects)


        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Validation Loss: ", str(sum(val_loss) / len(val_loss)))


        if sum(val_corr) > best_correct:
            print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()) + ')')
            # torch.save(model, 'best_model_resnest.pth')
            best_correct = sum(val_corr)

        scheduler.step()

    del model
    

### Creating Training and Validation Sets

In [36]:
train_df = None
val_df = None

X = df.drop('species_id', axis=1)
y = df['species_id']

strat = StratifiedKFold(n_splits=2, shuffle=True, random_state=rng_seed)
train_dfs=[]
val_dfs=[]


for fold, (train_index, val_index) in enumerate(strat.split(X,y)):
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    train_dfs.append(train_df)
    val_dfs.append(val_df)

### Training

In [37]:
e_poch = 20
train_dataset = RainforestDataset(train_dfs[0])
val_dataset = RainforestDataset(val_dfs[0])

train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = td.RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = td.RandomSampler(val_dataset))

training_loop(train_loader, val_loader, model, optimizer, scheduler, loss_function, e_poch)

(128, 938)
(128, 938)
(128, 938)
(128, 401)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 89)
(128, 938)
(128, 938)
(128, 938)
(128, 922)
(128, 938)
(128, 938)
(128, 898)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 859)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 408)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 172)
(128, 938)
(128, 938)
(128, 938)
(128, 531)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 574)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)


(128, 938)
(128, 938)
(128, 938)
(128, 182)
(128, 938)
(128, 249)
(128, 938)
(128, 467)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 874)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)
(128, 938)


KeyboardInterrupt: 

### Submission Generation

In [15]:
def create_mel_spectograms(file):
    wav, sr = librosa.load(data_path + "test/{}".format(file), sr=None)

    mel_spec = librosa.feature.melspectrogram(wav, n_fft=fft, hop_length=hop, sr=sr)    
    
    preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(320),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    
    current_mspec = (Image.fromarray(mel_spec, 'RGB'))
  
    current_mspec = preprocess(current_mspec)
    
    return current_mspec

In [16]:
# Model class definition 
model = resnest50(pretrained=False)

model = torch.load('best_model_resnest.pth')
model.eval()

model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(1024, num_species)
)

# load model into GPU
model = model.to(device)


if torch.cuda.is_available():
    print('cuda available')
    model.cuda()
    
# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir(data_path + 'test/') 
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = create_mel_spectograms(test_files[i])
        if torch.cuda.is_available():
            data = data.cuda()

        output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

cuda available
Starting prediction loop
1992


RuntimeError: Expected 4-dimensional input for 4-dimensional weight [32, 3, 3, 3], but got 3-dimensional input of size [3, 320, 320] instead