In [4]:
import pandas as pd
import numpy as np 
import soundfile as sf 
# !pip install librosa
import librosa
from skimage.transform import resize 
from PIL import Image

In [5]:
df = pd.read_csv('./Data/train_tp.csv')

In [107]:
fft = 2048
hop = 512 
sr = 48000
length = 10*sr

df = pd.read_csv('./Data/train_tp.csv')[0:304]
df.head()

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max
0,003bec244,14,1,44.544,2531.25,45.1307,5531.25
1,006ab765f,23,1,39.9615,7235.16,46.0452,11283.4
2,007f87ba2,12,1,39.136,562.5,42.272,3281.25
3,0099c367b,17,4,51.4206,1464.26,55.1996,4565.04
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.7


In [108]:
# Creating bitmap images for each spectogram to run through resnet50

# Source: https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners

for idx, row in df.iterrows():
    wav, sr = librosa.load('./Data/train/' + str(row['recording_id']) + '.flac', sr=None)

    t_min = row['t_min']*sr
    t_max = row['t_max']*sr
    
    center = np.round((t_min + t_max) / 2) 
    beginning = center - length / 2 
    if beginning < 0: 
        beginning = 0 
    
    ending = beginning + length 
    if ending > len(wav):
        ending = len(wav)
        beginning = ending - length
            
    mel_spec = librosa.feature.melspectrogram(y=wav[int(beginning):int(ending)], n_fft=fft, hop_length=hop, sr=sr, fmin=0, fmax=24000, power=1.5)
    mel_spec = resize(mel_spec, (224, 400))
    
    # Normalizing spectrogram 
    mel_spec = (mel_spec - np.min(mel_spec)) / np.max(mel_spec)
    
    mel_spec = mel_spec * 255
    mel_spec = np.round(mel_spec)    
    mel_spec = mel_spec.astype('uint8')
    mel_spec = np.asarray(mel_spec)
    
    bmp = Image.fromarray(mel_spec, 'L')
    bmp.save('./Data/working/train/' + str(row['recording_id']) + '_' + str(row['species_id']) + '_' + str(center) + '.bmp')
    
    if idx % 100 == 0:
        print('Processed ' + str(idx) + ' train examples from ' + str(len(df)))

Processed 0 train examples from 304
Processed 100 train examples from 304
Processed 200 train examples from 304
Processed 300 train examples from 304


In [120]:
import os
import torch
import random 

num_species = 24
batch_size = 8


rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [121]:
import torch.utils.data as td 

# Creating torch dataset
class RFCXDataset(td.Dataset):
    def __init__(self, files):
        self.data = []
        self.labels = []
        
        for file in files:
            # One-hot encoded labels 
            label = int(str.split(file, '_')[1])
            label_arr = np.zeros(num_species, dtype=np.single)
            label_arr[label] = 1.
            self.labels.append(label_arr)
            
            # Open and save spectrogram
            
            img = Image.open('./Data/working/train/' + file)
            mel_spec = np.array(img)
            img.close()
            
            # bmp -> (0,1)
            mel_spec = mel_spec/255
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))
            
            self.data.append(mel_spec)
            
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, item):
        return self.data[item], self.labels[item]

In [122]:
# Setting up training and testing split 
file_list = []
label_list = []

for f in os.listdir('./Data/working/train'):
    if '.bmp' in f:
        file_list.append(f)
        label = str.split(f, '_')[1]
        label_list.append(label)
        
from sklearn.model_selection import StratifiedKFold


skf = StratifiedKFold(n_splits = 5, shuffle= True, random_state=rng_seed)

train_files = []
val_files = []

for fold_id, (tr_idx, val_idx) in enumerate(skf.split(file_list, label_list)):
    
    if fold_id == 0: # Just one fold 
        
        train_files = np.take(file_list, tr_idx)
        val_files = np.take(file_list, val_idx)
        
        
print('Training on ' + str(len(train_files)) + ' examples')
print('Validating on ' + str(len(val_files)) + ' examples')

Training on 243 examples
Validating on 61 examples




In [112]:
# Create neural network for baseline 
import torch
from torch import nn 
from torch.utils.data import DataLoader 
# !pip install resnest
# !pip install torchdata
import torch.utils.data as td
from resnest.torch import resnest50
import torchvision
from torchvision import models

In [123]:
train_dataset = RFCXDataset(train_files)
val_dataset = RFCXDataset(val_files)

train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = td.RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = td.RandomSampler(val_dataset))

model = models.resnet50(pretrained=True)

model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_species)
)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.0001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

pos_weights = torch.ones(num_species)
pos_weights = pos_weights * num_species
loss_function = nn.BCEWithLogitsLoss(pos_weight = pos_weights)

In [125]:
for batch, (data, target) in enumerate(train_loader):
    print(batch)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [124]:
best_corrects = 0

# Train loop
print('Starting training loop')
for e in range(0, 8):
    # Stats
    train_loss = []
    train_corr = []
    
    # Single epoch - train
    model.train()
    for batch, (data, target) in enumerate(train_loader):
        print('batch tick')
        data = data.float()
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
        
        optimizer.zero_grad()
        
        output = model(data)
        loss = loss_function(output, target)
        
        loss.backward()
        optimizer.step()
        
        # Stats
        vals, answers = torch.max(output, 1)
        vals, targets = torch.max(target, 1)
        corrects = 0
        for i in range(0, len(answers)):
            if answers[i] == targets[i]:
                corrects = corrects + 1
        train_corr.append(corrects)
        
        train_loss.append(loss.item())
    
    # Stats
    for g in optimizer.param_groups:
        lr = g['lr']
    print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
          ', Correct answers: ' + str(sum(train_corr)) + '/' + str(train_dataset.__len__()))
    
    # Single epoch - validation
    with torch.no_grad():
        # Stats
        val_loss = []
        val_corr = []
        
        model.eval()
        for batch, (data, target) in enumerate(val_loader):
            data = data.float()
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            
            output = model(data)
            loss = loss_function(output, target)
            
            # Stats
            vals, answers = torch.max(output, 1)
            vals, targets = torch.max(target, 1)
            corrects = 0
            for i in range(0, len(answers)):
                if answers[i] == targets[i]:
                    corrects = corrects + 1
            val_corr.append(corrects)
        
            val_loss.append(loss.item())
    
    # Stats
    print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
          ', Correct answers: ' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()))
    
    # If this epoch is better than previous on validation, save model
    # Validation loss is the more common metric, but in this case our loss is misaligned with competition metric, making accuracy a better metric
    if sum(val_corr) > best_corrects:
        print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(val_dataset.__len__()) + ')')
        torch.save(model, 'best_model.pt')
        best_corrects = sum(val_corr)
        
    # Call every epoch
    scheduler.step()

# Free memory
del model

Starting training loop
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
Epoch 0 training end. LR: 0.01, Loss: 1.3659581292060115, Correct answers: 14/243
Epoch 0 validation end. LR: 0.01, Loss: 1.34653802216053, Correct answers: 4/61
Saving new best model at epoch 0 (4/61)
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
batch tick
Epoch 1 training end. LR: 0.01, Loss: 1.325682678530293, Correct answers: 14/243
Epoch 1 vali

In [133]:
def load_test_file(f):
    wav, sr = librosa.load('./Data/test/' + f, sr=None)

    # Split for enough segments to not miss anything
    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            sl = wav[len(wav) - length:len(wav)]
        else:
            sl = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        mel_spec = librosa.feature.melspectrogram(y=sl, n_fft=fft, hop_length=hop, sr=sr, power=1.5)
        mel_spec = resize(mel_spec, (224, 400))
    
        mel_spec = mel_spec - np.min(mel_spec)
        mel_spec = mel_spec / np.max(mel_spec)
        
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)
    
    return mel_array

In [137]:
import csv
model = models.resnet50(pretrained=True)

model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_species)
)

model = torch.load('best_model.pt')
model.eval()

if torch.cuda.is_available():
    model.cuda()
    
print("Starting prediction")
with open("predictions.csv", 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    test_files = os.listdir('./Data/test')
    print(len(test_files))
    
    for i in range(0, int(len(test_files)/4)):
        data = load_test_file(test_files[i])
        print('tick')
        data = torch.tensor(data)
        print('tick')
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()
            
        output = model(data)
        
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
            
            
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')
    
print('Submission Generated')

Starting prediction
1992
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
tick
