In [2]:
import pandas as pd
import numpy as np 
import soundfile as sf 
import librosa
from skimage.transform import resize 
from PIL import Image
import os
import torch
import random 
from torch import nn 
from torch.utils.data import DataLoader 
import torch.utils.data as td
import torchvision
from torchvision import models
from torchvision import transforms
from sklearn.model_selection import StratifiedKFold
import torch.utils.data as td
import pywt



# Setting seeds for reproducible results 
rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_species = 24
batch_size = 32

fft = 2048
hop = 512 
# According to research, standard sampling bitrate is 48khz. Seen in discussion of kaggle competition as well. 
sr = 48000
length = 10*sr
# ResNet50 input layer is 224 x 224 x 3, so I'm resizing the image to fit the first input dimension. 
mel_spec_dimensions = (224,224)

data_path = '../Data/'

preproc_type = 'spec'

### Cuda Device Selection

Use cuda:{device_num} to select cuda device that is not being used already

Make sure that this device is selected by exporting CUDA_VISIBLE_DEVICES={device_num} on the shell that's running the notebook server

In [3]:
os.system('nvidia-smi')

Thu Mar 31 16:09:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.39.01    Driver Version: 510.39.01    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   53C    P0   267W / 300W |  14448MiB / 16384MiB |     69%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   42C    P0    71W / 300W |  12485MiB / 16384MiB |     31%      Default |
|       

0

cuda


In [5]:
print(data_path)

../Data/


In [6]:
df = pd.read_csv(data_path + 'train_Augmented.csv') #File Containing the Spectograms as String

In [7]:
df.shape

(1216, 9)

In [8]:
def to2DArray(x): 
    x=x.replace("[", '')
    x=x.replace("]", '')
    x=x.replace("...", '')
    x=x.replace("\n", '')
    y=np.array(x.split(" "))
    y = y[y != ""]
    y = np.asfarray(y, 'float64')
    y = np.reshape(y,(1, y.size))
    return y

In [9]:
df[preproc_type] = df[preproc_type].apply(lambda x: to2DArray(x))

### Creating PyTorch Dataset Class

Note: Have to stack the spectrograms so that they're (224 x 224 x 3) to fit the input dimensions of ResNet50

In [10]:
class RFCXDatasetFromArr(td.Dataset):
    def __init__(self, df, preproc_type):
        
        self.data = []
        self.labels = []
         # need this to transform data to tensors    
        self.transform = transforms.ToTensor()
                
        labels = df['species_id'].to_list()
        for label in labels:
            label_arr = np.zeros(24, dtype=np.single)
            label_arr[label] = 1.
            self.labels.append(label_arr)
             
        specs = df[preproc_type]
#         print(specs)
            
        for spec in specs:
            stack = np.stack([spec, spec, spec])
            self.data.append(stack)
            
            
#         for i in range(len(specs)):
#             current_spec = np.array(specs[i])
#             dwt_decomp = pywt.dwt2(current_spec, 'bior1.3')
#             LL, (LH, HL, HH) = dwt_decomp
#             stack = np.stack([LH, HL, HH])
#             self.data.append((LH, HL, HH))
            
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx]), torch.tensor(self.labels[idx]))        

### Configuring Models

ResNet50 Research Reference: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5#data-augmentation

After reading up on ResNet at the above link, SGD was recommended as an optimizer. Went with a recommended learning rate scheduler from a related notebook in Kaggle. The above link recommends a different scheduler. We chose to use BCE w/ Logits Loss also based on recommendations from related work. We plan on trying out multiple different loss functions to see what works best for our problem. 

In [11]:


# Model definition 
model = models.resnet50(pretrained=True)
model.fc = nn.Sequential(
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_species)
)

# load model into GPU
model = model.to(device)

Below, we can see the shape of our model. Note that ResNet50 has an output dimension of 2048, which we pass through a fully connected layer. The output of our fc layer is in agreement with competition standards. We designed the FC layer based on related work, and will optimize it in later phases.

### EfficientNet

As per Abha's request, we're trying out EfficientNet here. The main advantages for us of using EfficientNet are that it is built to parallelize more efficiently and runs faster than ResNet50. It also has been shown that it is effective on other CV tasks, so it's worth a shot.

There are 8 different EfficientNet variations. Each one takes the same input dimensionality as ResNet50, but the output classifier has different dimensions in each variation. Here we use the most complex model, EfficientNet-B7. The classifier for this model uses a linear activation layer that outputs our 24 probabilistic classes. 

In [12]:
effnet = models.efficientnet_b7(pretrained=True)
effnet.classifier = nn.Sequential(
    nn.Dropout(p=0.2, inplace=True),
    nn.Linear(in_features=2560, out_features=24, bias=True),
)

effnet = effnet.to('cuda')

In [13]:
effnet.classifier

Sequential(
  (0): Dropout(p=0.2, inplace=True)
  (1): Linear(in_features=2560, out_features=24, bias=True)
)

### Training Loop

Training loop based on the work of another Kaggle notebook: https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners

Maintains a validation accuracy statistic (Does the most probable class match the ground-truth label?) as the model trains, and saves the model with the highest validation accuracy to the project directory.

In [14]:
def training_loop(train_loader, val_loader, model, optimizer, scheduler, pos_weight, loss_function, max_epochs, highest_score):
    best_corrects = highest_score
    
    for e in range(0, max_epochs):
        train_loss = []
        
        model.train()
        for batch, (data, target) in enumerate(train_loader):

            data = data.float()
            if torch.cuda.is_available():
                data, target = data.to('cuda'), target.to('cuda')

            optimizer.zero_grad()
            output = model(data)
            output = output.cuda()
            
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        for g in optimizer.param_groups:
            lr = g['lr']
            
        # Train Results
        print("====TRAIN====")
        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Training Loss: ", str(sum(train_loss) / len(train_loss)))

        # Validation
        with torch.no_grad():
            val_loss = []
            val_corr = []

            model.eval()
            for batch, (data, target) in enumerate(val_loader):
                data = data.float()
                if torch.cuda.is_available():
                    data, target = data.cuda(), target.cuda()
                    
                output = model(data)
                loss = loss_function(output, target)

                val_loss.append(loss.item())

                vals, answers = torch.max(output, 1)
                vals, targets = torch.max(target, 1)
                corrects = 0
                for i in range(0, len(answers)):
                    if answers[i] == targets[i]:
                        corrects = corrects + 1
                val_corr.append(corrects)

        # Val Results 
        
        print("====VAL====")
        print("Epoch: ", str(e))
        print("Learning Rate: ", str(lr))
        print("Validation Loss: ", str(sum(val_loss) / len(val_loss)))


        if sum(val_corr) > best_corrects:
            print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(len(val_loader.dataset)) + ')')
            torch.save(model, f'best_model_{model.__class__.__name__}.pt')
            best_corrects = sum(val_corr)

        scheduler.step()

    del model
    
    return best_corrects

In [15]:
# training_loop(train_loader, val_loader, effnet, optimizer, scheduler, pos_weight, loss_function, 32, 0)

### Setting Up Cross Validation Loop 

Use this code cell to test hyperparameters using 5-fold cross validation. Just have to put list of 5 CV parameters in the specified location and change the parameters to variables in the loop.

In [16]:
effnet = models.efficientnet_b7(pretrained=True)
effnet.classifier = nn.Sequential(
    nn.Dropout(p=0.2, inplace=True),
    nn.Linear(in_features=2560, out_features=24, bias=True),
)

effnet = effnet.to('cuda')


train_df = None
val_df = None

X = df.drop('species_id', axis=1)
y = df['species_id']

strat = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng_seed)

highest_correct = 0

### INSERT CV PARAMETERS HERE ### 



#################################

## Make sure to change cv parameters to variables in for loop as well (e.g. lr = possible_lr_values[fold])

for fold, (train_index, val_index) in enumerate(strat.split(X,y)):
    print("CV FOLD ", fold)
    
    train_df = df.iloc[train_index].reset_index(drop=True)
    val_df = df.iloc[val_index].reset_index(drop=True)

    train_dataset = RFCXDatasetFromArr(train_df, preproc_type)
    val_dataset = RFCXDatasetFromArr(val_df, preproc_type)

    train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = td.RandomSampler(train_dataset))
    val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = td.RandomSampler(val_dataset))

    # Need to add in loss, optimizer and scheduler, set up cv for the different folds 
    pos_weight = (torch.ones(num_species) * num_species)
    
    optimizer = torch.optim.SGD(effnet.parameters(), lr=0.0001, weight_decay=0.01, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    loss_function = nn.BCEWithLogitsLoss(pos_weight)
    loss_function.to('cuda')
    
    best_score = training_loop(train_loader, val_loader, effnet, optimizer, scheduler, pos_weight, loss_function, 16, highest_correct)
    
    if best_score > highest_correct: 
        highest_correct = best_score


CV FOLD  0


KeyboardInterrupt: 

### Generating Submission

In [59]:
# Train Test Split for submission 
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.15, stratify=df['species_id'])

In [60]:
train = train.drop('Unnamed: 0', axis=1)

In [61]:
train_dataset = RFCXDatasetFromArr(train, preproc_type)

In [62]:
# train_df = df.iloc[train_index].reset_index(drop=True)
# val_df = df.iloc[val_index].reset_index(drop=True)

# train_dataset = RFCXDatasetFromArr(train_df, preproc_type)
# val_dataset = RFCXDatasetFromArr(val_df, preproc_type)



train_loader = DataLoader(train_dataset, batch_size = batch_size, sampler = td.RandomSampler(train_dataset))
# val_loader = DataLoader(val_dataset, batch_size = batch_size, sampler = td.RandomSampler(val_dataset))

# Need to add in loss, optimizer and scheduler, set up cv for the different folds 
pos_weight = pos_weight = (torch.ones(num_species) * num_species)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
loss_function = nn.BCEWithLogitsLoss(pos_weight)
loss_function.to('cuda')

# best_score = training_loop(train_loader, val_loader, effnet, optimizer, scheduler, pos_weight, loss_function, 32, highest_correct)


for e in range(0, 32):
    print("Epoch ", e)
    train_loss = []

    effnet.train()
    for batch, (data, target) in enumerate(train_loader):

        data = data.float()
        if torch.cuda.is_available():
            data, target = data.to('cuda'), target.to('cuda')

        optimizer.zero_grad()
        output = effnet(data)
        output = output.cuda()

        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

    for g in optimizer.param_groups:
        lr = g['lr']
        
    scheduler.step()
    
torch.save(effnet, f'effnet_baseline.pt')


Epoch  0
Epoch  1
Epoch  2
Epoch  3
Epoch  4
Epoch  5
Epoch  6
Epoch  7
Epoch  8
Epoch  9
Epoch  10
Epoch  11
Epoch  12
Epoch  13
Epoch  14
Epoch  15
Epoch  16
Epoch  17
Epoch  18
Epoch  19
Epoch  20
Epoch  21
Epoch  22
Epoch  23
Epoch  24
Epoch  25
Epoch  26
Epoch  27
Epoch  28
Epoch  29
Epoch  30
Epoch  31


In [63]:
test = test.drop("Unnamed: 0", axis=1)

In [64]:
test_dataset = RFCXDatasetFromArr(test, preproc_type)

In [17]:
def create_mel_spectograms(df):
    wav, sr = librosa.load("../../Data/test/{}".format(df), sr=None)

    # Split for enough segments to not miss anything
    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        mel_spec = librosa.feature.melspectrogram(slice, n_fft=fft, hop_length=hop, sr=sr)
        mel_spec = resize(mel_spec, mel_spec_dimensions)
    
        mel_spec = mel_spec - np.min(mel_spec)
        mel_spec = mel_spec / np.max(mel_spec)
        
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)
    
    return mel_array

In [18]:
import csv
model = torch.load('./best_models/66-406/best_model_EfficientNet.pt')
model.eval()

if torch.cuda.is_available():
    model.cuda()
    
# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir('../../Data/test/') 
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = create_mel_spectograms(test_files[i])
        data = torch.tensor(data)
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()

        output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())
    
        submission_writer.writerow(write_array)
        
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

Starting prediction loop
1992


  data = torch.tensor(data)


Predicted for 100 of 1993 files
Predicted for 200 of 1993 files
Predicted for 300 of 1993 files
Predicted for 400 of 1993 files
Predicted for 500 of 1993 files
Predicted for 600 of 1993 files
Predicted for 700 of 1993 files
Predicted for 800 of 1993 files
Predicted for 900 of 1993 files
Predicted for 1000 of 1993 files
Predicted for 1100 of 1993 files
Predicted for 1200 of 1993 files
Predicted for 1300 of 1993 files
Predicted for 1400 of 1993 files
Predicted for 1500 of 1993 files
Predicted for 1600 of 1993 files
Predicted for 1700 of 1993 files
Predicted for 1800 of 1993 files
Predicted for 1900 of 1993 files
Submission generated
