<a href="https://colab.research.google.com/github/AndreyKuratov/project_mldm_21/blob/main/MLDM_TP_short_cuts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from skimage.transform import resize
from skimage.util import random_noise
from skimage.filters import gaussian
from skimage import exposure
import cv2
import numpy as np
import random

def addNoisy(img):
    noise_img = random_noise(img)
    return addChannels(noise_img)

def contrast_stretching(img):
    p2, p98 = np.percentile(img, (2, 98))
    contrast_img = exposure.rescale_intensity(img, in_range=(p2, p98))
    return addChannels(contrast_img)

def log_correction(img):
    log_img = exposure.adjust_log(img)
    return addChannels(log_img)

def randomGaussian(img):
    gaussian_img = gaussian(img, sigma=random.randint(0, 5))
    return addChannels(gaussian_img)

def addChannels(img):
    return np.stack((img, img, img))

def spec_to_image(spec):    
    spec = resize(spec, (224, 400))
    eps=1e-6
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
    spec_scaled = spec_scaled.astype(np.uint8)
    spec_scaled = np.asarray(spec_scaled)
    return spec_scaled

In [None]:
import librosa
from torch.utils.data import Dataset, DataLoader

class AudioData(Dataset):
    def __init__(self, _data, data_type):
        self.data = []
        self.labels = []
        for i in range(0, len(_data)):
            # All sound files are 48000 bitrate, no need to slowly resample
            wav, sr = librosa.load('/content/drive/MyDrive/MLDM_proj/train/' + _data[i][0] + '.flac', sr=None)

            t_min = float(_data[i][3]) * sr
            t_max = float(_data[i][5]) * sr

            # Positioning sound slice
            center = np.round((t_min + t_max) / 2)
            beginning = center - length / 2
            if beginning < 0:
                beginning = 0

            ending = beginning + length
            if ending > len(wav):
                ending = len(wav)
                beginning = ending - length

            slice = wav[int(beginning):int(ending)]
            
            spec=librosa.feature.melspectrogram(slice, sr=sr,n_fft=fft,hop_length=hop,fmin=fmin,fmax=fmax)
            spec_db=librosa.power_to_db(spec,top_db=80)
            
            img = spec_to_image(spec_db)
            mel_spec = np.stack((img, img, img))
            self.data.append(mel_spec)
            label = int(_data[i][1])
            self.labels.append(label)
            
            if data_type == "train":
                augmentation_functions = [
                    addNoisy, contrast_stretching,
                    randomGaussian, log_correction
                ]
                for fun in augmentation_functions:
                    mel_spec = fun(img)
                    self.data.append(mel_spec)
                    self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [None]:
import csv
import random

fft = 2048
hop = 512
# Less rounding errors this way
sr = 48000
length = 10 * sr

with open('/content/drive/MyDrive/MLDM_proj/train_tp.csv') as f:
    reader = csv.reader(f)
    next(reader, None)
    data = list(reader)

# Check minimum/maximum frequencies for bird calls
# Not neccesary, but there are usually plenty of noise in low frequencies, and removing it helps
fmin = 24000
fmax = 0

# Skip header row (recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max) and start from 1 instead of 0
for i in range(0, len(data)):
    if fmin > float(data[i][4]):
        fmin = float(data[i][4])
    if fmax < float(data[i][6]):
        fmax = float(data[i][6])
# Get some safety margin
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)
print('Minimum frequency: ' + str(fmin) + ', maximum frequency: ' + str(fmax))

percentage_train = 90
random.shuffle(data)
total = len(data)
train_data_amount = round(total / 100 * percentage_train)
# train_audio = data[:train_data_amount]
# val_audio = data[train_data_amount:]
# train_data = AudioData(train_audio, "train")
# valid_data = AudioData(val_audio, "valid")

Minimum frequency: 84, maximum frequency: 15056


In [None]:
import pickle

In [None]:
common_dir = '/content/drive/MyDrive/MLDM_proj/'

In [None]:
with open(common_dir + 'train_data_for_DL.pkl', 'wb') as f:
  pickle.dump(train_data, f)

with open(common_dir + 'valid_data_for_DL.pkl', 'wb') as f:
  pickle.dump(valid_data, f)

In [None]:
with open(common_dir + 'train_data_for_DL.pkl', 'rb') as f:
  train_data = pickle.load(f)

with open(common_dir + 'valid_data_for_DL.pkl', 'rb') as f:
  valid_data = pickle.load(f)

In [None]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=True)

In [None]:
print("train", len(train_data))
print("valid", len(valid_data))

train 5470
valid 122


In [None]:
from torchvision.models import resnet50
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import copy

In [None]:
num_birds = 24

if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device=torch.device('cpu')

resnet_model = resnet50(pretrained=True)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, num_birds)
resnet_model = resnet_model.to(device)

In [None]:
from tqdm import tqdm

learning_rate = 2e-4
optimizer = torch.optim.Adam(resnet_model.parameters(), lr=learning_rate)
epochs = 20
loss_fn = nn.CrossEntropyLoss()

def setlr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

def lr_decay(optimizer, epoch):
    if epoch%10==0:
        new_lr = learning_rate / (10**(epoch//10))
        optimizer = setlr(optimizer, new_lr)
        print(f'Changed learning rate to {new_lr}')
    return optimizer

def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, change_lr=None):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    train_losses = []
    valid_losses = []
    
    for epoch in tqdm(range(1,epochs+1)):
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for i, data in enumerate(train_loader):
            x, y = data
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
            
        train_losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        
        for i, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())
        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')
        # deep copy the model
        if accuracy > best_acc:
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
resnet_model = train(resnet_model, loss_fn, train_loader, valid_loader, epochs, optimizer, lr_decay)

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 2.136324789788988


  5%|▌         | 1/20 [01:05<20:40, 65.28s/it]

Epoch - 1 Valid-Loss : 1.087562769651413 Valid-Accuracy : 0.6967213114754098
Epoch - 2 Train-Loss : 1.2858019316057017


 10%|█         | 2/20 [02:09<19:21, 64.54s/it]

Epoch - 2 Valid-Loss : 1.0000904835760593 Valid-Accuracy : 0.7704918032786885
Epoch - 3 Train-Loss : 0.7556469241778055


 15%|█▌        | 3/20 [03:13<18:13, 64.31s/it]

Epoch - 3 Valid-Loss : 0.7308412455022335 Valid-Accuracy : 0.7786885245901639
Epoch - 4 Train-Loss : 0.4789674313009134


 20%|██        | 4/20 [04:17<17:06, 64.16s/it]

Epoch - 4 Valid-Loss : 0.8124755509197712 Valid-Accuracy : 0.8114754098360656
Epoch - 5 Train-Loss : 0.29495610265682143


 25%|██▌       | 5/20 [05:21<16:01, 64.11s/it]

Epoch - 5 Valid-Loss : 0.6613836102187634 Valid-Accuracy : 0.819672131147541
Epoch - 6 Train-Loss : 0.18250837183301474


 30%|███       | 6/20 [06:25<14:56, 64.07s/it]

Epoch - 6 Valid-Loss : 0.585271842777729 Valid-Accuracy : 0.8852459016393442
Epoch - 7 Train-Loss : 0.14337755670792787


 35%|███▌      | 7/20 [07:29<13:52, 64.06s/it]

Epoch - 7 Valid-Loss : 0.6586902406997979 Valid-Accuracy : 0.8524590163934426
Epoch - 8 Train-Loss : 0.14124047669598408


 40%|████      | 8/20 [08:33<12:48, 64.03s/it]

Epoch - 8 Valid-Loss : 0.5371331474743783 Valid-Accuracy : 0.8852459016393442
Epoch - 9 Train-Loss : 0.09151945298412346


 45%|████▌     | 9/20 [09:37<11:43, 63.98s/it]

Epoch - 9 Valid-Loss : 0.6235972139984369 Valid-Accuracy : 0.8442622950819673
Changed learning rate to 2e-05
Epoch - 10 Train-Loss : 0.036384831412766154


 50%|█████     | 10/20 [10:41<10:39, 63.98s/it]

Epoch - 10 Valid-Loss : 0.5331049095839262 Valid-Accuracy : 0.8770491803278688
Epoch - 11 Train-Loss : 0.011244049768933034


 55%|█████▌    | 11/20 [11:45<09:35, 63.95s/it]

Epoch - 11 Valid-Loss : 0.5521361278370023 Valid-Accuracy : 0.8770491803278688
Epoch - 12 Train-Loss : 0.009660310752224177


 60%|██████    | 12/20 [12:48<08:31, 63.95s/it]

Epoch - 12 Valid-Loss : 0.5766796600073576 Valid-Accuracy : 0.8852459016393442
Epoch - 13 Train-Loss : 0.006813151791248585


 65%|██████▌   | 13/20 [13:52<07:27, 63.93s/it]

Epoch - 13 Valid-Loss : 0.5255470285192132 Valid-Accuracy : 0.8770491803278688
Epoch - 14 Train-Loss : 0.006946088260445439


 70%|███████   | 14/20 [14:56<06:23, 63.94s/it]

Epoch - 14 Valid-Loss : 0.5555051788687706 Valid-Accuracy : 0.860655737704918
Epoch - 15 Train-Loss : 0.0049047641170535195


 75%|███████▌  | 15/20 [16:01<05:20, 64.07s/it]

Epoch - 15 Valid-Loss : 0.5522448448464274 Valid-Accuracy : 0.8852459016393442
Epoch - 16 Train-Loss : 0.00422320132925344


 80%|████████  | 16/20 [17:05<04:16, 64.18s/it]

Epoch - 16 Valid-Loss : 0.5201553450897336 Valid-Accuracy : 0.8934426229508197
Epoch - 17 Train-Loss : 0.004250912655965965


 85%|████████▌ | 17/20 [18:10<03:13, 64.43s/it]

Epoch - 17 Valid-Loss : 0.5591669762507081 Valid-Accuracy : 0.8770491803278688
Epoch - 18 Train-Loss : 0.004222019907507136


 90%|█████████ | 18/20 [19:14<02:08, 64.37s/it]

Epoch - 18 Valid-Loss : 0.5251013187225908 Valid-Accuracy : 0.8770491803278688
Epoch - 19 Train-Loss : 0.004588028710437272


 95%|█████████▌| 19/20 [20:19<01:04, 64.32s/it]

Epoch - 19 Valid-Loss : 0.5755440257489681 Valid-Accuracy : 0.8770491803278688
Changed learning rate to 2e-06
Epoch - 20 Train-Loss : 0.006344433967032598


100%|██████████| 20/20 [21:23<00:00, 64.17s/it]

Epoch - 20 Valid-Loss : 0.5500521569047123 Valid-Accuracy : 0.8852459016393442





In [16]:
torch.save(resnet_model.state_dict(), common_dir + 'model_short_slices')

In [17]:
def load_test_file(f):
    wav, sr = librosa.load('/content/drive/MyDrive/MLDM_proj/test/' + f, sr=None)

    # Split for enough segments to not miss anything
    segments = len(wav) / length
    segments = int(np.ceil(segments))
    
    mel_array = []
    
    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            slice = wav[len(wav) - length:len(wav)]
        else:
            slice = wav[i * length:(i + 1) * length]
        
        # Same mel spectrogram as before
        spec=librosa.feature.melspectrogram(slice, sr=sr,n_fft=fft,hop_length=hop,fmin=fmin,fmax=fmax)
        spec_db=librosa.power_to_db(spec,top_db=80)

        img = spec_to_image(spec_db)
        mel_spec = np.stack((img, img, img))
        mel_array.append(mel_spec)
    
    return mel_array

In [18]:
import os
    
# Prediction loop
print('Starting prediction loop')
with open(common_dir + 'submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])
    
    test_files = os.listdir(common_dir + 'test/')
    print(len(test_files))
    
    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = load_test_file(test_files[i])
        data = torch.tensor(data)
        data = data.float()
        if torch.cuda.is_available():
            data = data.cuda()

        output = resnet_model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()
        
        file_id = str.split(test_files[i], '.')[0]
        write_array = [file_id]
        
        for out in maxed_output:
            write_array.append(out.item())

        submission_writer.writerow(write_array)
        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files) + 1) + ' files')

print('Submission generated')

Starting prediction loop
1992


  app.launch_new_instance()


Predicted for 100 of 1993 files
Predicted for 200 of 1993 files
Predicted for 300 of 1993 files
Predicted for 400 of 1993 files
Predicted for 500 of 1993 files
Predicted for 600 of 1993 files
Predicted for 700 of 1993 files
Predicted for 800 of 1993 files
Predicted for 900 of 1993 files
Predicted for 1000 of 1993 files
Predicted for 1100 of 1993 files
Predicted for 1200 of 1993 files
Predicted for 1300 of 1993 files
Predicted for 1400 of 1993 files
Predicted for 1500 of 1993 files
Predicted for 1600 of 1993 files
Predicted for 1700 of 1993 files
Predicted for 1800 of 1993 files
Predicted for 1900 of 1993 files
Submission generated
