## Installing ResNest

Pip install of resnest is currently borked, need to download pretrained model manually
according to the github, model is stored as ../Data/resnest*.pth on github and loaded below


In [1]:
# !pip install resnest

## Imports and Definition of Constants

In [1]:
import torch
from torch import nn
from torchvision import transforms
from PIL import Image
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import torch
import random
import os
from torch.utils.data import Dataset, DataLoader, RandomSampler
from resnest.torch import resnest50
from sklearn.model_selection import StratifiedKFold
from skimage.transform import resize
import csv

rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_species = 24
batch_size = 8

fft = 2048
hop = 512 
# According to research, standard sampling bitrate is 48khz. Seen in discussion of kaggle competition as well. 
sr = 48000
length = 10*sr

# ResNeSt50 input layer is 224 x 224 x 3, specifying dimensions here  
mel_spec_dimensions = (224,224)

data_path = '../Data/'

# should change this according to nvidia-smi output e.g. "3"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
device = torch.device('cuda')


## Loading in Preproccessed Mel Spectrograms 

In [3]:
def to2DArray(x): 
    # casts object representation of specs stored in csv to a 2D numpy array 
    x=x.replace("[", '')
    x=x.replace("]", '')
    x=x.replace("...", '')
    x=x.replace("\n", '')
    y=np.array(x.split(" "))
    y = y[y != ""]
    y = np.asfarray(y, 'float64')
    y = np.reshape(y,(1, y.size))
    return y

In [4]:
df = pd.read_csv(data_path + 'csv/train_tp_data.csv')
df['mspec_db'] = df['mspec_db'].apply(lambda x: to2DArray(x))
df.head()

Unnamed: 0.1,Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,mspec_db,chroma_db,stft_db
0,0,003bec244,14,1,44.544,2531.25,45.1307,5531.25,"[[-43.52676276, -42.22672291, -40.10429537, -2...",[[ -4.77050225 -15.83399347 -10.23624425 ... ...,[[-38.94561364 -38.17916253 -36.60923968 ... -...
1,1,006ab765f,23,1,39.9615,7235.16,46.0452,11283.4,"[[-20.40886579, -17.40162276, -18.72003747, -5...",[[-2.48446742 -2.87657713 -1.8470297 ... -4.0...,[[-23.07769726 -19.43518602 -19.32889517 ... -...
2,2,007f87ba2,12,1,39.136,562.5,42.272,3281.25,"[[-54.39303891, -55.47439706, -60.4253212, -23...",[[-10.80191837 -5.18153825 -2.21800291 ... ...,[[-34.0305794 -37.85167318 -42.5896169 ... -...
3,3,0099c367b,17,4,51.4206,1464.26,55.1996,4565.04,"[[-10.84098544, -14.24867814, -13.64287614, -1...",[[-1.95534572 -1.56999388 -3.34319785 ... -0.8...,[[-17.41080114 -19.76300067 -18.89967515 ... -...
4,4,009b760e6,10,1,50.0854,947.461,52.5293,10852.7,"[[-19.11522228, -19.91873376, -17.13115801, -4...",[[-1.77162691 -0.72739473 -0.8605218 ... -1.1...,[[-23.5923663 -24.55557959 -23.3306552 ... -...


## Dataset Definition and Additional Preprocessing

In [5]:
class RainforestDataset(Dataset):
    def __init__(self, df):
        
        self.data = []
        self.labels = []
        
        # additional preprocessing required for ResNeST, normalization outlined in paper
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
                
        labels = df['species_id'].to_array()
        for label in labels:
            label_arr = np.zeros(24, dtype=np.single)
            label_arr[label] = 1
            self.labels.append(label_arr)
             
        mspecs = df['mspec_db']
        
        for i in range(len(mspecs)):
            current_mspec = (Image.fromarray(mspecs[i])).convert('RGB')
            current_mspec = self.preprocess(current_mspec)
            self.data.append(current_mspec)
            
            
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (self.data[idx], self.labels[idx])

## Model Definition and Loading to GPU

In [6]:
# Model class definition 
model = resnest50(pretrained=False)

# ResNeST pretrained model should be uploaded to this path with the notebook
model.load_state_dict(torch.load(data_path + 'resnest50-528c19ca.pth'))
model.eval()
model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_species)
)

pos_weight = (torch.ones(num_species) * num_species)

# load model into GPU
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)
loss_function = nn.BCEWithLogitsLoss(pos_weight)

loss_function.to(device)

BCEWithLogitsLoss()

## Definition of Training Loop

In [7]:
def training_loop(train_loader, val_loader, model, optimizer, scheduler, pos_weight, loss_function):
    best_corrects = 0


    for e in range(0, 20):
        train_loss = []

        model.train()
        for batch, (data, target) in enumerate(train_loader):
            data = data.float()
            if torch.cuda.is_available():
                data, target = data.to('cuda'), target.to('cuda')

            optimizer.zero_grad()
            output = model(data)
            output = output.cuda()
            
            loss = loss_function(output, target)
        
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        for g in optimizer.param_groups:
            lr = g['lr']

        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Training Loss: ", str(sum(train_loss) / len(train_loss)))

        # Validation
        with torch.no_grad():
            val_loss = []
            val_corr = []

            model.eval()
            for batch, (data, target) in enumerate(val_loader):
                data = data.float()
                if torch.cuda.is_available():
                    data, target = data.cuda(), target.cuda()
                
                output = model(data)
                loss = loss_function(output, target)

                val_loss.append(loss.item())

                vals, answers = torch.max(output, 1)
                vals, targets = torch.max(target, 1)
                corrects = 0
                for i in range(0, len(answers)):
                    if answers[i] == targets[i]:
                        corrects = corrects + 1
                val_corr.append(corrects)


        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Validation Loss: ", str(sum(val_loss) / len(val_loss)))


        if sum(val_corr) > best_corrects:
            print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()) + ')')
            torch.save(model, data_path + 'best_model_resnest.pt')
            best_corrects = sum(val_corr)

        scheduler.step()

    del model
    
    return 1

In [8]:
train_df = None
val_df = None

X = df.drop('species_id', axis=1)
y = df['species_id']

strat = StratifiedKFold(n_splits=2, shuffle=True, random_state=rng_seed)

for fold, (train_index, val_index) in enumerate(strat.split(X,y)):
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    train_df = train_df.reset_index(drop=True)

    val_df = val_df.reset_index(drop=True)
    train_dataset = RainforestDataset(train_df)
    val_dataset = RainforestDataset(val_df)
    
    train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = RandomSampler(train_dataset))
    val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = RandomSampler(val_dataset))
    training_loop(train_loader, val_loader, model, optimizer, scheduler, pos_weight, loss_function)

Epoch:  0
Learning Rate:  0.001
Training Loss:  6.631200467285357
Epoch:  0
Learning Rate:  0.001
Validation Loss:  4.261406390290511
Saving new best model at epoch 0 (50/608)
Epoch:  1
Learning Rate:  0.001
Training Loss:  4.232210689469388
Epoch:  1
Learning Rate:  0.001
Validation Loss:  8.304376363754272
Epoch:  2
Learning Rate:  0.001
Training Loss:  4.23956668063214
Epoch:  2
Learning Rate:  0.001
Validation Loss:  5.422140096363268
Epoch:  3
Learning Rate:  0.001
Training Loss:  4.216808400656047
Epoch:  3
Learning Rate:  0.001
Validation Loss:  149.1436058847528
Epoch:  4
Learning Rate:  0.001
Training Loss:  4.214287139867482
Epoch:  4
Learning Rate:  0.001
Validation Loss:  7.570223425564013
Epoch:  5
Learning Rate:  0.001
Training Loss:  4.221502592689113
Epoch:  5
Learning Rate:  0.001
Validation Loss:  4.1888581326133325
Epoch:  6
Learning Rate:  0.001
Training Loss:  4.197579085826874
Epoch:  6
Learning Rate:  0.001
Validation Loss:  9.0743475462261
Epoch:  7
Learning Rat

In [2]:
def create_mel_spectograms(df):
    wav, sr = librosa.load(data_path + "test/{}".format(df), sr=None)

    # Split for enough segments to not miss anything
    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        mel_spec = librosa.feature.melspectrogram(slice, n_fft=fft, hop_length=hop, sr=sr)
        mel_spec = resize(mel_spec, mel_spec_dimensions)
    
        mel_spec = mel_spec - np.min(mel_spec)
        mel_spec = mel_spec / np.max(mel_spec)
        
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)
    
    return mel_array

In [4]:
model = torch.load(data_path + 'best_model_resnest.pt')
model.eval()

# Scoring does not like many files:(
#if save_to_disk == 0:
#    for f in os.listdir('/kaggle/working/'):
#        os.remove('/kaggle/working/' + f)

if torch.cuda.is_available():
    model.cuda()
    
# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir(data_path + 'test/') 
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = create_mel_spectograms(test_files[i])
        data = torch.tensor(data)
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()

        output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

Starting prediction loop
1992


KeyboardInterrupt: 