# Drive and env

In [1]:
%pip install git+https://github.com/Mo5mami/wtfml.git

In [2]:
%pip install torchaudio librosa pretrainedmodels albumentations==0.4.6 imblearn

In [3]:
!pip freeze | grep torch

torch==1.7.0
torchaudio==0.7.0a0+ac17b64
torchtext==0.8.0a0+cd6902d
torchvision==0.8.1


In [4]:
from __future__ import print_function
import argparse
import sys
import os
import random
import librosa
from tqdm.notebook import tqdm
import scipy
import numpy as np
import pandas as pd
import torch
import torchaudio
import torchvision
from scipy.io import wavfile
import IPython.display as ipd
import torch
from torch import nn
from torch.nn import functional as F
#from utils import one_hot_embedding
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import KFold,StratifiedKFold,StratifiedShuffleSplit
import albumentations

from albumentations.pytorch.transforms import ToTensor

from wtfml.utils import EarlyStopping
from wtfml.engine import Engine
import pretrainedmodels
from pretrainedmodels.models import nasnetamobile
import cv2
import gc
import math
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from scipy.stats.mstats import gmean
from librosa.display import specshow
from sklearn.utils import class_weight
from torch.optim.lr_scheduler import _LRScheduler
import io


  '"sox" backend is being deprecated. '


In [5]:
def seed_all(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch on the CPU. When CUDA is
    available the GPU generators are seeded too and cuDNN is switched to
    deterministic, non-benchmarking mode.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Both flags are set together so cuDNN does not pick kernels by autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


num_seed = 42
seed_all(num_seed)

In [6]:
!nvidia-smi

Mon Nov 23 13:05:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla M60           On   | 000068DF:00:00.0 Off |                  Off |
| N/A   32C    P8    14W / 150W |      3MiB /  8129MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
torch.cuda.is_available()

True

# Utils and settings

## general settings

In [8]:
class Config:
    """General training / experiment settings, used as a plain namespace."""

    # Run on the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Training loop.
    epochs = 40
    random_state = 42
    train_batchsize = 4
    test_batchsize = 4
    val_every = 5
    print_every = 20
    logdir = "logs"

    # Dataset folders.
    DATASET_PATH = "audio_files"
    DATASET2_PATH = "latest_keywords"
    DATASET3_PATH = "nlp_keywords"

    # Data splitting.
    n_folds = 10
    test_size = 0.1

    # Learning rates.
    lr = 1.2*1e-4
    aftertrain_lr = 2*1e-6
    min_lr = 0.1*1e-4

    # Directory created right after this cell to hold experiment artefacts.
    experiment_id = "models"

In [9]:
# exist_ok=True keeps this cell idempotent: re-running the notebook must not
# fail just because the experiment directory was created by an earlier run.
os.makedirs(Config.experiment_id, exist_ok=True)

## audio settings

In [10]:
class AudioConfig:
    """Audio loading, spectrogram and augmentation settings (plain namespace)."""

    # --- Waveform -----------------------------------------------------------
    audio_length = 3
    sr = 44100                      # sampling rate passed to librosa.load
    fixed_sr = audio_length*sr      # fixed sample count used by the wav loaders
    min_seconds = 0.1               # shortest clip read_audio will return
    duration = 3.5

    # --- Mel spectrogram ----------------------------------------------------
    hop_length = 276
    fmin = 20
    fmax = 8000
    n_mels = 64
    n_mfcc = 13
    n_fft = n_mels*20

    # --- Augmentation / layout ----------------------------------------------
    WRAP_PAD_PROB = 0.5
    pad = 400                       # target width handed to PadToSize
    spec_aug_prob = 0.8             # probability of the SpecAugment branch
    mixer_prob = 0.0
    audio_crop_prob = 0.5           # probability of the AudioCrop step
    height = 228                    # Resize target passed to get_transforms
    width = 400

## plot functions

In [11]:
def plot_signal(signals):
    """Draw one 1-D signal on a wide (20x5) figure titled "sig".

    Args:
        signals: Iterable of sample values; it is materialised with ``list``.
    """
    _, ax = plt.subplots(nrows=1, ncols=1, sharex=False,
                         sharey=True, figsize=(20, 5))
    ax.set_title("sig")
    ax.plot(list(signals))
    
    
def plot_signals(signals):
    """Plot the first ten entries of a ``{title: signal}`` mapping on a 2x5 grid.

    Args:
        signals: Mapping whose keys become subplot titles and whose values
            are the plotted series. At least ten entries are required.
    """
    fig, axes = plt.subplots(nrows=2, ncols=5, sharex=False,
                             sharey=True, figsize=(20, 5))
    fig.suptitle('Time Series', size=16)
    # Walk the grid row by row; the flat position doubles as the entry index.
    for position, ax in enumerate(axes.flat):
        ax.set_title(list(signals.keys())[position])
        ax.plot(list(signals.values())[position])
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

def plot_fft(fft):
    """Plot the first ten entries of a ``{title: (Y, freq)}`` mapping on a 2x5 grid.

    Args:
        fft: Mapping whose values are indexable pairs; element 0 is drawn on
            the y axis against element 1 on the x axis.
    """
    fig, axes = plt.subplots(nrows=2, ncols=5, sharex=False,
                             sharey=True, figsize=(20, 5))
    fig.suptitle('Fourier Transforms', size=16)
    for position, ax in enumerate(axes.flat):
        entry = list(fft.values())[position]
        magnitudes, frequencies = entry[0], entry[1]
        ax.set_title(list(fft.keys())[position])
        ax.plot(frequencies, magnitudes)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

def plot_fbank(fbank):
    """Show the first ten entries of a ``{title: 2-D array}`` mapping as heat maps.

    Args:
        fbank: Mapping whose values are rendered with ``imshow`` using the
            'hot' colour map on a 2x5 grid.
    """
    fig, axes = plt.subplots(nrows=2, ncols=5, sharex=False,
                             sharey=True, figsize=(20, 5))
    fig.suptitle('Filter Bank Coefficients', size=16)
    for position, ax in enumerate(axes.flat):
        ax.set_title(list(fbank.keys())[position])
        ax.imshow(list(fbank.values())[position],
                  cmap='hot', interpolation='nearest')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

def plot_mfccs(mfccs):
    """Display a spectrogram-like array with time / mel axes and a dB colour bar.

    Args:
        mfccs: 2-D array handed to ``specshow``; axis scaling uses the
            sampling rate, hop length and frequency range from AudioConfig.
    """
    _, ax = plt.subplots(nrows=1, ncols=1, sharex=False,
                         sharey=True, figsize=(20, 5))
    ax.set_title("mfcc")

    specshow(
        mfccs,
        x_axis='time',
        y_axis='mel',
        sr=AudioConfig.sr,
        hop_length=AudioConfig.hop_length,
        fmin=AudioConfig.fmin,
        fmax=AudioConfig.fmax,
    )
    plt.colorbar(format='%+2.0f dB')

    plt.show()
def get_plot_mfccs(mfccs):
    """Render a spectrogram with ``specshow`` and return the picture as an array.

    The figure is drawn without axes or margins, saved into an in-memory
    buffer, and decoded with ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Args:
        mfccs: 2-D array handed to ``specshow``.

    Returns:
        The decoded image array produced by OpenCV.
    """
    fig, ax = plt.subplots(1)
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
    ax.axis('off')

    specshow(
        mfccs,
        x_axis="time",
        y_axis="mel",
        sr=AudioConfig.sr,
        hop_length=AudioConfig.hop_length,
        fmin=AudioConfig.fmin,
        fmax=AudioConfig.fmax,
    )
    # Switched off a second time, after specshow has configured the axes.
    ax.axis('off')

    buffer = io.BytesIO()
    fig.savefig(buffer, bbox_inches='tight', pad_inches=0.0)
    encoded = np.frombuffer(buffer.getvalue(), dtype='uint8')
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    buffer.close()
    plt.close()
    return img



def plot_class_dist(X):
    """Show the share of each label as a pie chart.

    Args:
        X: DataFrame with a "label" column and an "fn" column; rows are
            counted per label.
    """
    per_class = X.groupby("label")["fn"].count()

    _, ax = plt.subplots()
    ax.set_title('Class Distribution', y=1.08)
    ax.pie(
        per_class,
        labels=per_class.index,
        autopct='%1.1f%%',
        shadow=False,
        startangle=90,
    )
    ax.axis('equal')
    plt.show()


def show_melspectrogram(mels, title='Log-frequency power spectrogram'):
    """Display a mel spectrogram with a dB colour bar.

    Axis scaling is taken from AudioConfig (the previous code referenced an
    undefined ``conf`` object and raised NameError when called).

    Args:
        mels: 2-D spectrogram array handed to ``librosa.display.specshow``.
        title: Figure title.
    """
    librosa.display.specshow(mels, x_axis='time', y_axis='mel',
                             sr=AudioConfig.sr, hop_length=AudioConfig.hop_length,
                             fmin=AudioConfig.fmin, fmax=AudioConfig.fmax)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()


## util functions

In [13]:
"""
Read audio from a path
"""

def read_wav(filepath):
    """Read a WAV file with ``scipy.io.wavfile``.

    Args:
        filepath: Path of the WAV file.

    Returns:
        Tuple ``(sample_rate, samples)`` where ``samples`` is a NumPy array.
    """
    rate_and_data = wavfile.read(filepath)
    return rate_and_data[0], np.array(rate_and_data[1])

"""
Listen to audio sample
"""
def listen(samples, sample_rate):
    """Wrap samples in an IPython audio widget played back at ``sample_rate``."""
    player = ipd.Audio(samples, rate=sample_rate)
    return player

"""
Wav loader for DatasetFolder using librosa
"""
def wav_loader(path, sr=AudioConfig.sr, fixed_sr=AudioConfig.fixed_sr):
    """Load audio with librosa into a fixed-length ``(1, fixed_sr)`` tensor.

    Shorter clips are zero-padded at the end; longer clips are truncated.

    Args:
        path: Audio file path.
        sr: Sampling rate requested from ``librosa.load``.
        fixed_sr: Number of samples in the returned tensor.

    Returns:
        Tuple ``(tensor, sr)`` with the sampling rate reported by librosa.
    """
    waveform, sr = librosa.load(path, sr=sr)
    n_keep = min(fixed_sr, len(waveform))
    fixed = torch.zeros(1, fixed_sr)
    fixed[0, :n_keep] = torch.tensor(waveform[:n_keep])
    return (fixed, sr)
    

"""
Wav loader for DatasetFolder using torchaudio
"""

def torch_wav_loader(path, sr=AudioConfig.sr, fixed_sr=AudioConfig.fixed_sr):
    """Load a WAV with torchaudio into a fixed-length ``(1, fixed_sr)`` tensor.

    Only the first channel is kept. Shorter clips are zero-padded at the end
    and longer clips are truncated.

    Args:
        path: WAV file path.
        sr: Accepted for signature parity with ``wav_loader``; the value is
            overwritten by the rate torchaudio reports.
        fixed_sr: Number of samples in the returned tensor.

    Returns:
        Tuple ``(tensor, sr)`` with the file's own sampling rate.
    """
    waveform, sr = torchaudio.load_wav(path)
    n_keep = min(fixed_sr, waveform.shape[1])
    fixed = torch.zeros(1, fixed_sr)
    fixed[0, :n_keep] = waveform[0, :n_keep]
    return (fixed, sr)

"""
accuracy measure
"""
def accuracy(predictions, real):
    """Return the percentage of positions where ``predictions`` equals ``real``.

    Args:
        predictions: Array-like supporting element-wise ``==`` and ``len``.
        real: Ground-truth values aligned with ``predictions``.
    """
    n_correct = (predictions == real).sum()
    return n_correct*100/len(predictions)

def enveloppe(sig,sr,threshhold):
  mask=[]
  sig=pd.Series(sig).apply(np.abs)
  sig_mean=sig.rolling(window=int(sr/10),min_periods=1,center=True).mean()
  for mean in sig_mean:
    if mean>threshhold:
      mask.append(True)
    else:  mask.append(False)
  return mask


def read_audio(file_path,top_db=60):
    """Load an audio file, trim leading/trailing silence, and enforce a minimum length.

    Args:
        file_path: Audio file path, loaded at ``AudioConfig.sr``.
        top_db: Threshold (in dB) passed to ``librosa.effects.trim``.

    Returns:
        1-D array with at least ``int(AudioConfig.min_seconds * AudioConfig.sr)``
        samples.
    """
    min_samples = int(AudioConfig.min_seconds * AudioConfig.sr)
    y, _ = librosa.load(file_path, sr=AudioConfig.sr)

    trim_y, trim_idx = librosa.effects.trim(y,top_db=top_db,frame_length=AudioConfig.n_fft, hop_length=AudioConfig.hop_length)

    if len(trim_y) < min_samples:
        # Re-cut a min_samples window around the MIDPOINT of the trimmed
        # region. The previous formula, (end - start) // 2, is half the
        # region's length and so pointed near the start of the file instead.
        center = (trim_idx[0] + trim_idx[1]) // 2
        left_idx = max(0, center - min_samples // 2)
        right_idx = min(len(y), center + min_samples // 2)
        trim_y = y[left_idx:right_idx]

        if len(trim_y) < min_samples:
            # Still too short (window clipped by the file edges): zero-pad evenly.
            padding = min_samples - len(trim_y)
            offset = padding // 2
            trim_y = np.pad(trim_y, (offset, padding - offset), 'constant')

    return trim_y

def test_top_db(filepath,top_db,print_mask=True):
  """Compare silence trimming at the default ``top_db`` with a custom value.

  Both versions are plotted; the returned audio widget plays the custom
  version when ``print_mask`` is true and the default version otherwise.
  """
  reference=read_audio(filepath)
  plot_signal(reference)
  trimmed=read_audio(filepath,top_db=top_db)
  plot_signal(trimmed)
  chosen=trimmed if print_mask else reference
  return listen(chosen,AudioConfig.sr)


def audio_to_melspectrogram(audio):
    """Convert a waveform to a float32 log-mel spectrogram in dB.

    The power spectrogram is computed with the AudioConfig settings and
    converted to decibels relative to its own maximum (``ref=np.max``).

    Args:
        audio: 1-D waveform array.

    Returns:
        ``np.float32`` array produced by ``librosa.power_to_db``.
    """
    # The waveform is passed as ``y=`` because recent librosa releases accept
    # it by keyword only; the keyword form works on older releases as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=AudioConfig.sr,
                                                 n_mels=AudioConfig.n_mels,
                                                 n_fft=AudioConfig.n_fft,
                                                 hop_length=AudioConfig.hop_length,
                                                 fmin=AudioConfig.fmin,
                                                 fmax=AudioConfig.fmax,
                                                 power=2
                                                 )
    spectrogram = librosa.power_to_db(spectrogram,ref=np.max)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def read_as_melspectrogram(file_path,time_stretch=1.0, pitch_shift=0.0,
                           debug_display=False):
    """Read an audio file and return its log-mel spectrogram.

    Args:
        file_path: Audio file path.
        time_stretch: Stretch rate applied to the waveform; 1.0 disables it.
        pitch_shift: Pitch shift in steps; 0.0 disables it.
        debug_display: When true, show an audio player and the spectrogram.

    Returns:
        Tuple ``(mels, AudioConfig.sr)``.
    """
    x = read_audio(file_path)
    if time_stretch != 1.0:
        x = librosa.effects.time_stretch(x, rate=time_stretch)

    if pitch_shift != 0.0:
        # The shifted signal must be assigned back; previously the result was
        # discarded and an undefined ``config`` object was referenced.
        x = librosa.effects.pitch_shift(x, sr=AudioConfig.sr, n_steps=pitch_shift)

    mels = audio_to_melspectrogram(x)
    if debug_display:
        import IPython
        IPython.display.display(IPython.display.Audio(x, rate=AudioConfig.sr))
        show_melspectrogram(mels)
    return (mels,AudioConfig.sr)

def mix_up(x, y):
    """Blend each sample with a randomly chosen partner from the same batch.

    A single mixing weight is drawn from Beta(1, 1) and applied to both the
    inputs and the targets.

    Args:
        x: Batch of inputs; converted to a float32 array.
        y: Batch of targets; must support NumPy-style index arrays.

    Returns:
        Tuple ``(mixed_x, mixed_y)``.
    """
    x = np.array(x, np.float32)
    weight = np.random.beta(1.0, 1.0)
    batch_size = int(len(x))
    identity = np.arange(batch_size)
    partners = np.arange(batch_size)
    np.random.shuffle(partners)

    blended_inputs = weight * x[identity] + (1 - weight) * x[partners]
    blended_targets = weight * y[identity] + (1 - weight) * y[partners]
    return blended_inputs, blended_targets

def oversample(dataframe):
    """Randomly oversample minority classes so every label is equally frequent.

    Args:
        dataframe: DataFrame with a "label" column used as the class target.

    Returns:
        New DataFrame with the same columns and a fresh 0..n-1 index.
    """
    # ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
    # releases no longer provide.
    X, _ = RandomOverSampler(random_state=42).fit_resample(dataframe, dataframe["label"])
    return pd.DataFrame(X,columns=dataframe.columns).reset_index(drop=True)

def to_categorical(y, num_classes):
    """One-hot encode class indices by selecting rows of an identity matrix.

    Args:
        y: Index (or indices) into ``range(num_classes)``.
        num_classes: Size of the one-hot vectors.

    Returns:
        Tensor created with ``dtype=float``.
    """
    identity = torch.eye(num_classes, dtype=float)
    return identity[y]

"""
function to test the enveloppe
"""
def test_enveloppe(filepath,thresh=0.0005,print_mask=True):
  """Visualise the effect of the envelope mask on one audio file.

  Prints the peak sample value, plots the signal before and after masking,
  and returns an audio widget for the masked signal (``print_mask`` true) or
  the original one.
  """
  signal=read_audio(filepath)
  print(signal.max())
  plot_signal(signal)
  keep=enveloppe(signal,AudioConfig.sr,thresh)
  plot_signal(signal[keep])
  if not print_mask:
    return listen(signal,AudioConfig.sr)
  return listen(signal[keep],AudioConfig.sr)

def create_csv_dataset_from_path(dataset_path):
    """Index a ``<dataset_path>/<class>/<file>`` folder tree as a DataFrame.

    Args:
        dataset_path: Directory whose sub-directories are class names.

    Returns:
        Tuple ``(dataset, classes)``: ``dataset`` has columns "fn" (file path)
        and "label" (class name), shuffled with ``random_state=42``;
        ``classes`` is the list of sub-directory names as returned by
        ``os.listdir``.
    """
    classes = list(os.listdir(dataset_path))

    records = []
    for label in classes:
        label_dir = os.path.join(dataset_path, label)
        records.extend(
            (os.path.join(label_dir, file_name), label)
            for file_name in os.listdir(label_dir)
        )

    dataset = pd.DataFrame(data={
        "fn": [file_path for file_path, _ in records],
        "label": [label for _, label in records],
    })
    dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
    return dataset, classes

def onset_test(path):
    """Plot the normalised onset-strength envelope of one audio file.

    Prints the index of the strongest onset and the envelope's shape, then
    plots ``1 + envelope / max(envelope)`` against the spectrogram time axis.
    """
    waveform = read_audio(path)
    frame_times = librosa.times_like(audio_to_melspectrogram(waveform))
    envelope = librosa.onset.onset_strength(
        y=waveform,
        sr=AudioConfig.sr,
        aggregate=np.median,
        n_fft=AudioConfig.n_fft,
        hop_length=AudioConfig.hop_length,
        fmax=8000,
        n_mels=160,
    )
    print(envelope.argmax())
    print(envelope.shape)
    plt.plot(frame_times, 1 + envelope / envelope.max(), alpha=0.8,
             label='Median (custom mel)')

## Transformation

### Waveform transformations

In [16]:
class ChangeAmplitude(object):
    """Changes amplitude of an audio randomly."""

    def __init__(self, amplitude_range=(0.7, 1.1)):
        """Store the ``(low, high)`` range the gain factor is drawn from."""
        self.amplitude_range = amplitude_range

    def __call__(self, image,**kwargs):
        """Scale ``image`` by a gain drawn uniformly from ``amplitude_range``.

        Returns the scaled signal (the previous code returned an undefined
        name ``data`` and raised NameError).
        """
        image = image * random.uniform(*self.amplitude_range)
        return image

class ChangeSpeedAndPitchAudio(object):
    """Change the speed of an audio. This transform also changes the pitch of the audio."""

    def __init__(self, max_scale=0.2):
        """Store the maximum relative speed change (drawn from ±``max_scale``)."""
        self.max_scale = max_scale

    def __call__(self, image,**kwargs):
        """Resample ``image`` by linear interpolation at a random speed.

        Returns a float32 array whose length grows or shrinks with the
        sampled speed factor. The sampling rate is not needed for this, so
        the unused ``sample_rate`` local was dropped.
        """
        samples = image
        scale = random.uniform(-self.max_scale, self.max_scale)
        speed_fac = 1.0  / (1 + scale)
        # Read the signal at positions spaced ``speed_fac`` samples apart.
        positions = np.arange(0, len(samples), speed_fac)
        image = np.interp(positions, np.arange(0,len(samples)), samples).astype(np.float32)
        return image

class StretchAudio(object):
    """Stretches an audio randomly."""

    def __init__(self, max_scale=0.2):
        """Store the maximum relative stretch (rate is drawn from 1 ± ``max_scale``)."""
        self.max_scale = max_scale

    def __call__(self, image,**kwargs):
        """Time-stretch ``image`` with librosa at a randomly drawn rate."""
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is passed by keyword: recent librosa releases no longer
        # accept it positionally, and older releases accept the keyword too.
        image = librosa.effects.time_stretch(image, rate=1+scale)
        return image
class TimeshiftAudio(object):
    """Shifts an audio randomly."""

    def __init__(self, max_shift_seconds=0.2):
        """Store the maximum shift, in seconds, applied in either direction."""
        self.max_shift_seconds = max_shift_seconds

    def __call__(self, image,**kwargs):
        """Shift ``image`` left or right by a random number of samples.

        The vacated side is zero-filled and the output keeps the input length.
        """
        # random.randint needs integer bounds; sr * seconds is a float.
        max_shift = int(AudioConfig.sr * self.max_shift_seconds)
        shift = random.randint(-max_shift, max_shift)
        pad_left = max(0, -shift)
        pad_right = max(0, shift)
        padded = np.pad(image, (pad_left, pad_right), "constant")
        # Drop as many samples from the opposite end as were padded in.
        image = padded[:len(padded) - pad_left] if pad_left else padded[pad_right:]
        return image


In [17]:
def gauss_noise(k,sig):
    """Draw zero-mean Gaussian noise scaled to a signal's peak value.

    Args:
        k: Noise level as a fraction of ``np.max(sig)``.
        sig: Reference signal; sets both the scale and the output length.

    Returns:
        Array of ``len(sig)`` noise samples.
    """
    sigma = k*np.max(sig)
    return np.random.normal(scale=sigma, size=len(sig))

def gn(samples,k=2e-2):
    """Return ``samples`` with Gaussian noise of relative level ``k`` added."""
    return samples+gauss_noise(k,samples)

class GN(object):
    """Adds a random background noise."""

    def __init__(self):
        """No configuration is needed; the noise level is drawn on each call."""

    def __call__(self, image,**kwargs):
        """Add Gaussian noise at a level drawn uniformly from [8e-3, 4e-2)."""
        k=np.random.uniform(low=8e-3, high=4e-2, size=None)
        # ``gn`` is the module-level helper; the class has no ``gn`` method,
        # so the previous ``self.gn(...)`` raised AttributeError.
        sample=gn(image,k=k)
        return sample

"""
testing gaussian noise
"""
def test_transform(filepath,k=2e-2,print_mask=False):
  """Plot one audio file before and after adding Gaussian noise.

  Returns an audio widget for the noisy signal when ``print_mask`` is true,
  otherwise for the clean signal.
  """
  clean=read_audio(filepath)
  plot_signal(clean)

  noisy=gn(clean,k=k)
  plot_signal(noisy)

  chosen=noisy if print_mask else clean
  return listen(chosen,AudioConfig.sr)




### Spectrogram transformations

In [18]:
class ToMfcc(object):
    """Callable transform that turns a waveform into a log-mel spectrogram."""

    def __call__(self, image, **kwargs):
        """Delegate to ``audio_to_melspectrogram``; extra kwargs are ignored."""
        spectrogram = audio_to_melspectrogram(image)
        return spectrogram

In [19]:
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6,**kwargs):
    """Standardise a 3-D array and rescale it to uint8 in [0, 255].

    The first two axes are swapped, the data is standardised with ``mean`` /
    ``std``, clipped to ``[norm_min, norm_max]`` and mapped linearly to 0-255.

    Args:
        X: 3-D array; axes 0 and 1 are swapped before processing.
        mean: Mean used for standardisation; defaults to ``X.mean()``.
        std: Standard deviation used; defaults to ``X.std()``.
        norm_max: Upper clip bound; defaults to the standardised maximum.
        norm_min: Lower clip bound; defaults to the standardised minimum.
        eps: Guards the division and detects a (near-)constant input.

    Returns:
        uint8 array; all zeros when the standardised range is within ``eps``.
    """
    X=X.transpose(1, 0, 2)
    # Explicit ``is None`` checks: with ``value or default`` a legitimate
    # argument of 0 / 0.0 was silently replaced by the computed statistic.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V
  

class ToColor:
    """Callable transform wrapping ``mono_to_color`` with fixed statistics."""

    def __init__(self, mean=None, std=None):
        """Remember the optional mean / std forwarded on every call."""
        self.mean = mean
        self.std = std

    def __call__(self, image, **kwargs):
        """Convert ``image`` to a uint8 picture; extra kwargs are ignored."""
        colored = mono_to_color(image, self.mean, self.std)
        return colored

In [20]:
class AudioCrop:
    """Randomly crop a spectrogram along its second axis.

    The kept fraction of the width is drawn uniformly between ``percentage``
    and 1; the full height is always kept.
    """

    def __init__(self, percentage=0.75):
        """Store the smallest fraction of the width that may be kept."""
        self.percentage = percentage

    def __call__(self, image, **kwargs):
        """Return a random crop of ``image``; extra kwargs are ignored."""
        kept_fraction = np.random.random()*(1-self.percentage)+self.percentage
        crop_height = image.shape[0]
        crop_width = int(image.shape[1]*kept_fraction)
        cropper = albumentations.RandomCrop(crop_height, crop_width, p=1)
        return cropper(image=image)["image"]


In [21]:
class Onset:
    """Crop a spectrogram to a window centred on its strongest onset."""

    def __init__(self,size):
        """Store the nominal window width; the crop spans ``2 * (size // 2)`` columns."""
        self.size=size

    def __call__(self,image,**kwargs):
        """Return the crop around the onset-strength peak of ``image``.

        The window is slid back inside the image when the peak lies close to
        either edge, so the crop coordinates never leave the image (the
        unclamped coordinates could be negative or exceed the width, which the
        crop rejects). Images narrower than the window are cropped to their
        full width.
        """
        onset_env = librosa.onset.onset_strength(S=image)
        argmax=int(onset_env.argmax())
        half=self.size//2
        total_width=image.shape[1]
        # Identical to (argmax - half, argmax + half) whenever that fits.
        x_min=max(0, min(argmax-half, total_width-2*half))
        x_max=min(total_width, x_min+2*half)
        return albumentations.Crop(x_min=x_min, y_min=0, x_max=x_max, y_max=AudioConfig.n_mels,p=1)(image=image)["image"]


In [22]:
class PadToSize:
    """Pad a 2-D array along its second axis up to a minimum width.

    Padding is split between both sides, with the extra column (if any) on
    the right. Arrays that are already wide enough are returned untouched.
    """

    def __init__(self, size, mode='constant'):
        """Store the target width and the padding mode.

        ``mode='constant'`` pads with zeros; any other value pads by wrapping.
        """
        self.size = size
        self.mode = mode

    def __call__(self, image, **kwargs):
        """Return ``image`` padded to ``self.size`` columns; kwargs are ignored."""
        missing = self.size - image.shape[1]
        if missing <= 0:
            return image

        before = missing // 2
        widths = ((0, 0), (before, missing - before))
        if self.mode == 'constant':
            return np.pad(image, widths, 'constant', constant_values=0)
        return np.pad(image, widths, 'wrap')


In [23]:
class AudioPad:
    """Widen a spectrogram by a fixed fraction of its current width."""

    def __init__(self, percentage=0.10, mode='constant'):
        """Store the relative growth and the padding mode given to PadToSize."""
        self.percentage = percentage
        self.mode = mode

    def __call__(self, image, **kwargs):
        """Pad ``image`` to ``int(width * (1 + percentage))`` columns."""
        target_width = int(image.shape[1]*(self.percentage+1))
        padder = PadToSize(target_width, self.mode)
        return padder(image=image)


In [24]:
class ImageStack:
    """Stack a spectrogram with its first- and second-order deltas."""

    def __call__(self, image, **kwargs):
        """Return a float32 array with a new trailing axis of size 3.

        The three planes are the input, ``librosa.feature.delta(image)`` and
        the order-2 delta, in that order.
        """
        planes = [
            image,
            librosa.feature.delta(image),
            librosa.feature.delta(image, order=2),
        ]
        return np.stack(planes, axis=-1).astype(np.float32)
        


In [25]:
def spec_augment(spec: np.ndarray,
                 num_mask=2,
                 freq_masking=0.15,
                 time_masking=0.20,
                 value=0):
    """Mask random frequency and time bands of a 2-D spectrogram.

    Between 1 and ``num_mask`` rounds are applied; each round overwrites one
    horizontal band and one vertical band with ``value``.

    Args:
        spec: 2-D array; it is copied, never modified in place.
        num_mask: Maximum number of masking rounds.
        freq_masking: Largest fraction of rows one band may cover.
        time_masking: Largest fraction of columns one band may cover.
        value: Fill value written into the masked bands.

    Returns:
        The masked copy of ``spec``.
    """
    def pick_band(total, max_fraction):
        # Band width first (``random``), then its start (``np.random``).
        width = int(random.uniform(0.0, max_fraction) * total)
        start = int(np.random.uniform(low=0.0, high=total - width))
        return start, start + width

    masked = spec.copy()
    rounds = random.randint(1, num_mask)
    for _ in range(rounds):
        n_freqs, n_frames = masked.shape

        row_lo, row_hi = pick_band(n_freqs, freq_masking)
        masked[row_lo:row_hi, :] = value

        col_lo, col_hi = pick_band(n_frames, time_masking)
        masked[:, col_lo:col_hi] = value
    return masked


class SpecAugment:
    """Callable transform applying ``spec_augment`` with stored settings.

    Masked regions are filled with the minimum value of the input image.
    """

    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20):
        """Store the masking settings forwarded to ``spec_augment``."""
        self.num_mask = num_mask
        self.freq_masking = freq_masking
        self.time_masking = time_masking

    def __call__(self, image, **kwargs):
        """Return a masked copy of ``image``; extra kwargs are ignored."""
        fill_value = image.min()
        return spec_augment(image,
                            self.num_mask,
                            self.freq_masking,
                            self.time_masking,
                            fill_value)
        
  

### Get transformation

In [26]:
def get_transforms(train, height,width,
                   wrap_pad_prob=0.5,
                   resize_scale=(1, 0.8),
                   resize_ratio=(1, 2.4),
                   resize_prob=0.4,
                   spec_num_mask=2,
                   spec_freq_masking=0.15,
                   spec_time_masking=0.20,
                   spec_prob=0.5):
    """Build the albumentations pipeline for training or evaluation.

    Both pipelines end with the same tail: stack deltas, convert to uint8,
    normalise with fixed per-channel statistics, convert to a tensor.

    Args:
        train: True for the augmenting pipeline, False for the evaluation one.
        height, width: Target size given to the resize step.
        resize_scale, resize_ratio: Forwarded to ``RandomResizedCrop``.
        wrap_pad_prob, resize_prob, spec_num_mask, spec_freq_masking,
            spec_time_masking, spec_prob: Accepted but not used by the
            current pipeline definition.

    Returns:
        An ``albumentations.Compose`` instance.
    """
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    def model_input_tail():
        # Fresh transform objects for the steps shared by both pipelines.
        return [
            albumentations.Lambda(ImageStack(),p=1),
            albumentations.Lambda(ToColor(),p=1),
            albumentations.Normalize (mean=mean, std=std, max_pixel_value=255.0, p=1.0),
            albumentations.pytorch.transforms.ToTensor(),
        ]

    if not train:
        evaluation_head = [
            albumentations.Lambda(PadToSize(AudioConfig.pad,mode="wrap"),p=1),
            albumentations.CenterCrop(AudioConfig.n_mels,width,p=1),
            albumentations.Resize(height,width,p=1),
        ]
        return albumentations.Compose(evaluation_head + model_input_tail())

    # Pad with zeros or by wrapping, chosen with equal weight.
    random_pad = albumentations.OneOf([
        albumentations.Lambda(PadToSize(AudioConfig.pad,mode="constant"),p=0.5),
        albumentations.Lambda(PadToSize(AudioConfig.pad,mode="wrap"),p=0.5),
    ],p=1)
    random_crop = albumentations.Lambda(AudioCrop(percentage=0.9), p=AudioConfig.audio_crop_prob)
    # Kept in the pipeline with p=0.0, exactly as configured.
    resized_crop = albumentations.RandomResizedCrop(height,width,scale=resize_scale, ratio=resize_ratio,p=0.0)
    resize = albumentations.Resize(height,width,p=1)
    masking = albumentations.OneOf([
        albumentations.Lambda(SpecAugment(num_mask=2,freq_masking=0.10,time_masking=0.16)),
    ],p=AudioConfig.spec_aug_prob)

    training_head = [random_pad, random_crop, resized_crop, resize, masking]
    return albumentations.Compose(training_head + model_input_tail())

## Mixers

In [28]:
def get_random_sample(dataset):
    """Draw one random, transformed ``(audio, target)`` pair from ``dataset``.

    Args:
        dataset: Object exposing ``df``, ``path_col``, ``tensor_dict``,
            ``class_to_idx``, ``classes``, ``transform``, ``target_transform``
            and ``__len__`` (as ``PathDataset`` does).

    Returns:
        Tuple ``(audio, target)`` after the dataset's own transforms.
    """
    rnd_idx = random.randint(0, len(dataset) - 1)
    # Look the row up in the dataset's OWN frame. The notebook-level ``df``
    # used before has a different index space than a subset dataset, so it
    # could return samples that do not belong to ``dataset`` at all.
    rnd_path = dataset.df.loc[rnd_idx, dataset.path_col]
    rnd_sample, rnd_label = dataset.tensor_dict[rnd_path]
    rnd_target = dataset.class_to_idx[rnd_label]

    # Element 0 of the cached sample is the audio, as in PathDataset.__getitem__.
    rnd_audio = rnd_sample[0]
    if dataset.transform is not None:
        rnd_audio = dataset.transform(image=rnd_audio)["image"]
    if dataset.target_transform is not None:
        rnd_target = dataset.target_transform(rnd_target, num_classes=len(dataset.classes))
    return rnd_audio, rnd_target

class AddMixer:
    """Blend a sample with a random partner using a convex combination."""

    def __init__(self, alpha_dist='uniform'):
        """Choose how the blend weight is drawn: 'uniform' or 'beta'."""
        assert alpha_dist in ['uniform', 'beta']
        self.alpha_dist = alpha_dist

    def sample_alpha(self):
        """Draw the partner's weight.

        'uniform' draws from [0, 0.5]; 'beta' draws from Beta(0.4, 0.4).
        Any other stored value yields ``None``.
        """
        if self.alpha_dist == 'beta':
            return np.random.beta(0.4, 0.4)
        if self.alpha_dist == 'uniform':
            return random.uniform(0, 0.5)
        return None

    def __call__(self, dataset, image, target):
        """Mix ``image`` / ``target`` with a random sample from ``dataset``."""
        partner_image, partner_target = get_random_sample(dataset)
        alpha = self.sample_alpha()
        mixed_image = (1 - alpha) * image + alpha * partner_image
        mixed_target = (1 - alpha) * target + alpha * partner_target
        return mixed_image, mixed_target


class SigmoidConcatMixer:
    """Cross-fade a sample into a random partner along the last axis.

    The fade follows a sigmoid curve; targets are summed and clipped to [0, 1].
    """

    def __init__(self, sigmoid_range=(3, 12)):
        """Store the inclusive integer range the sigmoid half-width is drawn from."""
        self.sigmoid_range = sigmoid_range

    def sample_mask(self, size):
        """Build a float32 mask of shape ``(size[0], size[1])``.

        Every row holds the same sigmoid ramp, evaluated on ``size[1]`` evenly
        spaced points of ``[-r, r)`` for a randomly drawn half-width ``r``.
        """
        x_radius = random.randint(*self.sigmoid_range)

        # linspace yields exactly size[1] points. np.arange with a float step
        # can produce one point too many through rounding, which makes the
        # mask wider than the image it is multiplied with.
        x = np.linspace(-x_radius, x_radius, num=size[1], endpoint=False)
        y = torch.sigmoid(torch.from_numpy(x)).numpy()
        mix_mask = np.tile(y, (size[0], 1))
        return torch.from_numpy(mix_mask.astype(np.float32))

    def __call__(self, dataset, image, target):
        """Mix ``image`` / ``target`` with a random sample from ``dataset``."""
        rnd_image, rnd_target = get_random_sample(dataset)

        mix_mask = self.sample_mask(image.shape[-2:])
        rnd_mix_mask = 1 - mix_mask

        image = mix_mask * image + rnd_mix_mask * rnd_image
        target = target + rnd_target
        target = np.clip(target, 0.0, 1.0)
        return image, target


class RandomMixer:
    """Apply one mixer chosen at random from a list on every call."""

    def __init__(self, mixers, p=None):
        """Store the candidate mixers and their optional selection weights."""
        self.mixers = mixers
        self.p = p

    def __call__(self, dataset, image, target):
        """Pick a mixer with ``np.random.choice`` and return its result."""
        chosen = np.random.choice(self.mixers, p=self.p)
        mixed_image, mixed_target = chosen(dataset, image, target)
        return mixed_image, mixed_target


class UseMixerWithProb:
    """Apply a wrapped mixer with a given probability, else pass through."""

    def __init__(self, mixer, prob=.5):
        """Store the mixer and the probability of applying it."""
        self.mixer = mixer
        self.prob = prob

    def __call__(self, dataset, image, target):
        """Return the mixer's output with probability ``prob``.

        Otherwise ``image`` and ``target`` are returned unchanged. (An
        unreachable debug ``print`` after the ``return`` was removed.)
        """
        if random.random() < self.prob:
            return self.mixer(dataset, image, target)
        return image, target


# Load Data

In [29]:
# Labelled training rows and the sample-submission template, both read from CSV.
df1=pd.read_csv("Train.csv")
submission=pd.read_csv("SampleSubmission.csv")
# Every submission row gets the same placeholder label.
# NOTE(review): presumably so the dataset classes, which map labels through
# class_to_idx, can index these rows; confirm.
submission["label"]="akawuka"
# Two further labelled sets are indexed from their class-per-folder trees.
df2,_=create_csv_dataset_from_path(Config.DATASET2_PATH)
df3,_=create_csv_dataset_from_path(Config.DATASET3_PATH)
# Training frame: all three sources stacked with a fresh 0..n-1 index.
df=pd.concat([df1,df2,df3],ignore_index=True).reset_index(drop=True)
# Alternative kept for reference: train on the first source only.
#df=df1
# df_all is built by the same expression as df; it is the frame used below to
# create the cached dataset and the path -> sample lookup.
df_all=pd.concat([df1,df2,df3],ignore_index=True).reset_index(drop=True)

In [30]:
print("Files in the dataset : ",len(os.listdir(Config.DATASET_PATH)))
print("train1 shape : ",df1.shape)
print("train2 shape : ",df2.shape)
print("train3 shape : ",df3.shape)
print("train all shape : ",df.shape)
print("test shape : ",submission.shape)

Files in the dataset :  2126
train1 shape :  (1109, 2)
train2 shape :  (1740, 2)
train3 shape :  (1860, 2)
train all shape :  (4709, 2)
test shape :  (1017, 195)


In [31]:
# Distinct labels in order of first appearance, plus lookups in both directions.
classes = df["label"].unique()
class_to_idx = {label: position for position, label in enumerate(classes)}
idx_to_class = dict(enumerate(classes))

# Dataset definition and loading data

In [32]:
class CSVDataset(Dataset):
    """Dataset over a DataFrame of file paths ("fn") and labels ("label").

    Samples are produced by ``loader(path)`` on demand, or served from an
    in-memory list once ``load_data`` / ``load_tensor`` has been called.
    """

    def __init__(self, df, loader,classes=None, transform=None,
                 target_transform=None,device=torch.device("cpu")):
        """Store the frame, loader and transforms and build the class maps.

        Args:
            df: DataFrame with "fn" and "label" columns; its index is reset.
            loader: Callable ``path -> sample``; element 0 of the sample is
                the audio and element 1 the sample rate.
            classes: Ordered class names; defaults to the unique labels of ``df``.
            transform: Optional callable used as ``transform(image=audio)["image"]``.
            target_transform: Optional callable applied to the class index.
            device: Stored on the instance; not used by this class itself.
        """
        # ``super(Dataset, self)`` started the lookup AFTER Dataset in the
        # MRO and therefore skipped Dataset's own initialiser.
        super().__init__()

        self.df=df.reset_index(drop=True)
        self.loader=loader
        self.transform=transform

        self.target_transform=target_transform
        self.device=device
        self.loaded=False
        self.loaded_samples=[]
        self.path_col="fn"
        self.target_col="label"
        if classes is None:
            self.classes=df[self.target_col].unique()
        else :
            self.classes=classes

        self.class_to_idx={classe:idx for idx,classe in enumerate(self.classes)}
        self.idx_to_class={idx:classe for idx,classe in enumerate(self.classes)}


    def load_data(self):
        """Run the loader on every row and cache ``[sample, target]`` pairs in row order."""
        self.loaded_samples=[]
        # The stray positional ``0`` (taken by tqdm as its description) was dropped.
        for ind in tqdm(range(len(self.df))):
            path=self.df.loc[ind,self.path_col]
            target=self.df.loc[ind,self.target_col]
            sample = self.loader(path)
            self.loaded_samples.append([sample,target])
        self.loaded=True

    def save_tensor(self,path):
        """Serialise the cached samples with ``torch.save``; requires a loaded cache."""
        assert self.loaded==True
        torch.save(self.loaded_samples,path)

    def load_tensor(self,path):
        """Restore cached samples written by ``save_tensor``."""
        # NOTE(review): torch.load unpickles the file; only load trusted files.
        self.loaded_samples=torch.load(path)
        self.loaded=True

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            dict {"audio" "sample_rate" "target"}
        """
        if self.loaded:
            sample, target = self.loaded_samples[index]

        else:
            path=self.df.loc[index,self.path_col]
            target=self.df.loc[index,self.target_col]
            sample = self.loader(path)

        audio=sample[0]
        sample_rate=sample[1]
        target=self.class_to_idx[target]

        if self.transform is not None:
            audio = self.transform(image=audio)["image"]

        if self.target_transform is not None:
            target = self.target_transform(target)

        return {"audio":audio,"sample_rate":sample_rate ,"target":target}

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]



In [33]:
class PathDataset(Dataset):
    """Dataset serving pre-loaded samples looked up by file path.

    ``tensor_dict`` maps each path in the "fn" column to a ``(sample, label)``
    pair, where element 0 of the sample is the audio and element 1 the sample
    rate.
    """

    def __init__(self, df, tensor_dict,classes=None, transform=None,
                 target_transform=None,mixer=None,device=torch.device("cpu")):
        """Store the frame, lookup table, transforms and mixer.

        Args:
            df: DataFrame with "fn" and "label" columns; its index is reset.
            tensor_dict: Mapping ``path -> (sample, label)``.
            classes: Ordered class names; defaults to the unique labels of ``df``.
            transform: Optional callable used as ``transform(image=audio)["image"]``.
            target_transform: Optional callable called as
                ``target_transform(index, num_classes=...)``.
            mixer: Optional callable ``(dataset, audio, target) -> (audio, target)``.
            device: Stored on the instance; not used by this class itself.
        """
        super(PathDataset, self).__init__()

        self.df=df.reset_index(drop=True)
        self.tensor_dict=tensor_dict
        self.transform=transform
        self.mixer=mixer
        self.target_transform=target_transform
        self.device=device
        self.path_col="fn"
        self.target_col="label"
        if classes is None:
            self.classes=df[self.target_col].unique()
        else :
            self.classes=classes

        self.class_to_idx={classe:idx for idx,classe in enumerate(self.classes)}
        self.idx_to_class={idx:classe for idx,classe in enumerate(self.classes)}

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            dict {"audio" "sample_rate" "target"}
        """
        path=self.df.loc[index,self.path_col]
        # The label comes from the cached entry, not from the frame.
        sample,target = self.tensor_dict[path]
        audio=sample[0]
        sample_rate=sample[1]
        # Use this dataset's own mapping; the notebook-level ``class_to_idx``
        # used before ignored the ``classes`` passed to the constructor.
        target=self.class_to_idx[target]

        if self.transform is not None:
            audio = self.transform(image=audio)["image"]

        if self.target_transform is not None:
            target = self.target_transform(target,num_classes=len(self.classes))

        if self.mixer is not None:
            audio, target = self.mixer(self, audio, target)

        return {"audio":audio,"sample_rate":sample_rate ,"target":target}

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]

In [34]:
train_transform = get_transforms(train=True,height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)

In [35]:
# Candidate mixers: the sigmoid cross-fade is selected with weight 0.6 and the
# additive blend with weight 0.4.
amixer = RandomMixer([
        SigmoidConcatMixer(sigmoid_range=(3, 12)),
        AddMixer(alpha_dist='uniform')
    ], p=[0.6, 0.4])
# prob=0.0 means the wrapped mixer is never applied: UseMixerWithProb only
# fires when random.random() < prob.
amixer = UseMixerWithProb(amixer, prob=0.0)

In [36]:
audio_set=CSVDataset(df_all,read_as_melspectrogram,classes=classes,transform=train_transform)


In [37]:
submission_set=CSVDataset(submission,read_as_melspectrogram,classes=classes,transform=train_transform)

In [45]:
# Populate the dataset from the "dataset.pth" file; the load_data() call is
# left disabled (presumably the slower path that reads the raw audio).
#audio_set.load_data()
audio_set.load_tensor("dataset.pth")

In [46]:
# Map every file path in df_all to the loaded sample at the same row position.
tensor_dict={path:audio_set.loaded_samples[idx] for idx,path in enumerate(df_all[audio_set.path_col])}

In [47]:
# Dataset that serves the pre-loaded samples from tensor_dict by file path,
# applying the training transform on access.
loaded_set=PathDataset(df_all,tensor_dict,classes=classes,transform=train_transform,)

In [48]:
# Spot-check the shape of one transformed sample.
loaded_set[457]["audio"].shape

torch.Size([3, 400, 228])

In [49]:
# Populate the submission dataset from "submission.pth"; load_data() is left
# disabled here as well.
#submission_set.load_data()
submission_set.load_tensor("submission.pth")

In [50]:
# Map every submission file path to the loaded sample at the same row position.
submission_tensor_dict={path:submission_set.loaded_samples[idx] for idx,path in enumerate(submission[submission_set.path_col])}

# Class weights

In [64]:
# Integer class id for each label, looked up in class_to_idx.
df1["num_label"]=df1.label.apply(lambda x:class_to_idx[x])

In [65]:
# 'balanced' per-class weights computed from the labels in df1, ordered like
# `classes`.  All arguments are passed by keyword: scikit-learn deprecated
# positional `classes` / `y` and newer releases reject them.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=classes,
                                                  y=df1.label)
class_weights

 'ensujju' 'okulimibwa' 'mpeke' 'okusaasaana' 'ebigimusa' 'ekikolo' 'farm'
 'kisaanyi' 'kikajjo' 'ekisaanyi' 'ndwadde' 'omusiri' 'butterfly'
 'munyeera' 'eggobe' 'ebiwojjolo' 'ebisoolisooli' 'namuginga' 'okugimusa'
 'maize streak virus' 'ekirime' 'miceere' 'sikungula' 'lumonde'
 'okukungula' 'cassava' 'ebirime' 'ebijanjaalo' 'weeding' 'garden'
 'drought' 'leaves' 'insect' 'akatungulu' 'seed' 'pepper'
 'matooke seedlings' 'harvesting' 'medicine' 'nursery bed' 'mucungwa'
 'endwadde' 'pawpaw' 'enkota' 'ensiringanyi' 'kassooli' 'okufuuyira'
 'caterpillars' 'ekijanjaalo' 'okukkoola' 'crop' 'okulima' 'endagala'
 'kaamulali' 'ennima' 'omuceere' 'micungwa' 'ebisaanyi' 'plant' 'eddagala'
 'ennimiro' 'amakoola' 'ebiwuka' 'ekigimusa' 'bibala' 'beans' 'nnimiro'
 'ebinyebwa' 'passion fruit' 'Spinach' 'okuzifuuyira' 'ekirwadde'
 'nakavundira' 'nfukirira' 'onion' 'ddagala' 'muwogo' 'irrigate'
 'akasaanyi' 'ekikajjo' 'emmwanyi' 'ekiwojjolo' 'orange' 'ebibala'
 'ebyobulimi' 'ensuku' 'farmer' 'spray' 'o

array([0.5746114 , 1.4365285 , 0.63845711, 0.95768566, 1.1492228 ,
       0.82087343, 0.71826425, 0.95768566, 1.1492228 , 0.95768566,
       0.63845711, 0.47884283, 1.4365285 , 0.82087343, 0.71826425,
       1.4365285 , 0.63845711, 0.522374  , 1.1492228 , 1.91537133,
       0.522374  , 1.1492228 , 0.522374  , 0.82087343, 0.5746114 ,
       0.5746114 , 1.4365285 , 1.1492228 , 0.95768566, 0.95768566,
       0.95768566, 0.95768566, 1.91537133, 1.1492228 , 0.95768566,
       0.71826425, 0.82087343, 0.95768566, 0.82087343, 1.1492228 ,
       0.95768566, 0.82087343, 1.4365285 , 1.91537133, 0.82087343,
       1.4365285 , 0.82087343, 1.1492228 , 0.522374  , 1.4365285 ,
       0.95768566, 0.522374  , 0.95768566, 0.5746114 , 1.4365285 ,
       0.95768566, 1.4365285 , 0.71826425, 1.1492228 , 0.95768566,
       0.82087343, 1.1492228 , 1.4365285 , 1.1492228 , 0.63845711,
       0.82087343, 0.63845711, 1.1492228 , 1.4365285 , 0.522374  ,
       0.71826425, 0.95768566, 1.91537133, 1.1492228 , 1.14922

# Create folds

In [67]:
# -1 marks rows that have not been assigned to a validation fold.
df["folds"]=-1
df2["folds"]=-1
df3["folds"]=-1

# The split is unshuffled, so it is deterministic on its own.  random_state is
# not passed: scikit-learn ignores it when shuffle=False and newer releases
# raise a ValueError for that combination.
kf = StratifiedKFold(n_splits=Config.n_folds, shuffle=False)
folds_col = df.columns.get_loc("folds")
for fold, (_, val_index) in enumerate(kf.split(df,df["label"])):
        # split() yields row positions, so write positionally; this gives the
        # same result as label-based access when df has a default RangeIndex
        # and stays correct when it does not.
        df.iloc[val_index, folds_col] = fold



# Model

In [73]:
class Net(torch.nn.Module):
    """Backbone from ``pretrainedmodels`` followed by a linear classification head.

    ``forward`` runs the backbone's ``features`` extractor, global-average-pools
    the result to one vector per sample and feeds it to the linear head ``l0``.

    Args:
        arch (str): Key into ``pretrainedmodels.__dict__`` selecting the backbone.
        num_classes (int): Output size of the head.
        pretrained: Forwarded to the backbone constructor (default ``'imagenet'``).
    """

    def __init__(self,arch,num_classes ,pretrained='imagenet'):
        super().__init__()
        build_backbone = pretrainedmodels.__dict__[arch]
        self.base_model = build_backbone(pretrained=pretrained)

        # 1 -> 3 channel convolution.  It is registered on the module (and so
        # contributes parameters) but forward() never calls it.
        stem = torch.nn.Sequential()
        stem.add_module('conv', nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, padding=1, stride=1,
                                          bias=False))
        self.prepare = stem

        # Input width of the head for each architecture handled explicitly.
        head_in_features = {
            "dpn98": 2688,
            "se_resnext50_32x4d": 2048,
            "resnet101": 2048,
            "dpn68": 832,
            "resnet18": 512,
            "vgg19": 512,
            "se_resnet50": 2048,
            "resnet50": 2048,
            "senet154": 2048,
            "se_resnext101_32x4d": 2048,
            "dpn107": 2688,
            "densenet121": 1024,
        }
        # Any other architecture falls back to a 4098-wide head.
        self.l0 = torch.nn.Linear(head_in_features.get(arch, 4098), num_classes)

        if arch == "densenet121":
            # The backbone's own classifier is swapped for a fixed 7168 -> 193
            # linear layer; forward() does not call last_linear.
            self.base_model.last_linear = nn.Sequential(nn.Linear(7168, 193))

    def forward(self, audio, target,sample_rate):
        """Return ``(logits, loss)`` for a batch.

        Args:
            audio: 4-D input batch passed to the backbone's ``features``.
            target: Per-sample class scores; reduced to class indices with
                ``argmax`` over dim 1 before the cross-entropy loss
                (presumably one-hot rows - TODO confirm).
            sample_rate: Accepted but not used.
        """
        n_samples, _, _, _ = audio.shape

        feature_maps = self.base_model.features(audio)
        pooled = F.adaptive_avg_pool2d(feature_maps, 1).reshape(n_samples, -1)
        logits = self.l0(pooled)

        class_indices = torch.argmax(target, dim=1)
        loss = F.cross_entropy(logits, class_indices)

        return logits, loss

# Training

In [74]:
# Backbone selection: exactly one model_name is active; the commented lines
# are the alternatives.  `pretrained` is forwarded to the backbone constructor.
model_name="dpn68"
#model_name="dpn98"
#model_name="resnet18"
#model_name="densenet121"
pretrained="imagenet"

In [75]:
# Notebook-level instance; train(), predict() and the other per-fold functions
# construct their own Net rather than using this one.
model = Net(model_name,num_classes=len(classes),pretrained=pretrained)

In [76]:
def train(fold):
    """Train a freshly built model with ``fold`` held out for validation.

    Rows of the global ``df`` whose ``folds`` value differs from ``fold`` form
    the training split; the remaining rows form the validation split.  After
    each epoch the validation loss is appended to ``out.txt``, passed to a
    ``ReduceLROnPlateau`` scheduler and to ``EarlyStopping``, which receives
    ``model_fold_{fold}.bin`` under ``Config.experiment_id`` as its model path.

    Args:
        fold (int): Index of the validation fold.
    """
    seed_all(num_seed)
    checkpoint_path = os.path.join(Config.experiment_id, f"model_fold_{fold}.bin")

    train_rows = df.loc[df["folds"] != fold].reset_index(drop=True)
    valid_rows = df.loc[df["folds"] == fold].reset_index(drop=True)
    # Per-row class weight, stored as an extra column on the training frame.
    train_rows["weights"] = train_rows["label"].apply(
        lambda label: class_weights[class_to_idx[label]]
    )

    print("-------------",train_rows.shape,"---------------",valid_rows.shape,"-------------")

    # Both pipelines share the same geometry settings; only `train` differs.
    transform_kwargs = dict(
        height=AudioConfig.height,
        width=AudioConfig.width,
        wrap_pad_prob=AudioConfig.WRAP_PAD_PROB,
    )
    train_tfms = get_transforms(train=True, **transform_kwargs)
    valid_tfms = get_transforms(train=False, **transform_kwargs)

    # Only the training dataset gets a mixer.
    train_mixer = UseMixerWithProb(
        RandomMixer([AddMixer(alpha_dist='uniform')], p=[1]),
        prob=AudioConfig.mixer_prob,
    )

    train_loader = DataLoader(
        PathDataset(
            train_rows,
            tensor_dict,
            classes=classes,
            transform=train_tfms,
            target_transform=to_categorical,
            mixer=train_mixer,
        ),
        batch_size=Config.train_batchsize,
        shuffle=True,
        num_workers=8,
    )
    valid_loader = DataLoader(
        PathDataset(
            valid_rows,
            tensor_dict,
            classes=classes,
            transform=valid_tfms,
            target_transform=to_categorical,
        ),
        batch_size=Config.test_batchsize,
        shuffle=False,
        num_workers=8,
    )

    net = Net(model_name, num_classes=len(classes), pretrained=pretrained)
    net.to(Config.device)
    net.train()

    optimizer = torch.optim.AdamW(net.parameters(), lr=Config.lr)
    # patience=0: the learning rate is multiplied by 0.6 after any epoch
    # without improvement, down to Config.min_lr.
    plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=0, factor=0.6, min_lr=Config.min_lr, verbose=True
    )
    stopper = EarlyStopping(patience=8, mode="min")
    engine = Engine(net, optimizer, device=Config.device)

    for epoch in range(Config.epochs):
        engine.train(train_loader)
        valid_loss, _ = engine.evaluate(valid_loader, return_predictions=True)

        with open('out.txt', 'a') as log_file:
            log_file.write(f"Fold = {fold}  Epoch = {epoch}, valid loss = {valid_loss}\n")

        plateau_scheduler.step(valid_loss)
        stopper(valid_loss, net, model_path=checkpoint_path)
        if stopper.early_stop:
            print("Early stopping")
            break
    


In [77]:
def after_train(fold):
    """Continue training the saved model of ``fold`` with a cosine-annealed learning rate.

    The checkpoint ``model_fold_{fold}.bin`` under ``Config.experiment_id`` is
    loaded, and the same path is handed to ``EarlyStopping`` as its model
    path.  Epoch 0 only evaluates (no training step), so the loaded weights
    provide the first validation loss that ``EarlyStopping`` sees.  Each
    epoch's validation loss is appended to ``out.txt``.

    Args:
        fold (int): Index of the validation fold.
    """
    seed_all(num_seed)
    # The checkpoint is read from and written back to the same file.
    checkpoint_path = os.path.join(Config.experiment_id, f"model_fold_{fold}.bin")

    train_rows = df.loc[df["folds"] != fold].reset_index(drop=True)
    valid_rows = df.loc[df["folds"] == fold].reset_index(drop=True)
    train_rows["weights"] = train_rows["label"].apply(
        lambda label: class_weights[class_to_idx[label]]
    )

    print("-------------",train_rows.shape,"---------------",valid_rows.shape,"-------------")

    transform_kwargs = dict(
        height=AudioConfig.height,
        width=AudioConfig.width,
        wrap_pad_prob=AudioConfig.WRAP_PAD_PROB,
    )
    train_tfms = get_transforms(train=True, **transform_kwargs)
    valid_tfms = get_transforms(train=False, **transform_kwargs)

    train_mixer = UseMixerWithProb(
        RandomMixer([AddMixer(alpha_dist='uniform')], p=[1]),
        prob=AudioConfig.mixer_prob,
    )

    train_loader = DataLoader(
        PathDataset(
            train_rows,
            tensor_dict,
            classes=classes,
            transform=train_tfms,
            target_transform=to_categorical,
            mixer=train_mixer,
        ),
        batch_size=Config.train_batchsize,
        shuffle=True,
        num_workers=8,
    )
    valid_loader = DataLoader(
        PathDataset(
            valid_rows,
            tensor_dict,
            classes=classes,
            transform=valid_tfms,
            target_transform=to_categorical,
        ),
        batch_size=Config.test_batchsize,
        shuffle=False,
        num_workers=8,
    )

    net = Net(model_name, num_classes=len(classes), pretrained=pretrained)
    net.load_state_dict(torch.load(checkpoint_path))
    net.to(Config.device)
    net.train()

    optimizer = torch.optim.AdamW(net.parameters(), lr=Config.aftertrain_lr)
    # The scheduler is handed to the Engine, which decides when to step it.
    cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=10, eta_min=0.2*1e-6, last_epoch=-1
    )
    stopper = EarlyStopping(patience=6, mode="min")
    engine = Engine(net, optimizer, scheduler=cosine_scheduler, device=Config.device)

    for epoch in range(Config.epochs):
        if epoch > 0:
            engine.train(train_loader)
        valid_loss, _ = engine.evaluate(valid_loader, return_predictions=True)

        with open('out.txt', 'a') as log_file:
            log_file.write(f"Fold = {fold}  Epoch = {epoch}, valid loss = {valid_loss}\n")

        stopper(valid_loss, net, model_path=checkpoint_path)
        if stopper.early_stop:
            print("Early stopping")
            break
    


In [78]:
def eval_train(fold):
    """Report the saved fold model's loss on its training and validation splits.

    Loads ``model_fold_{fold}.bin`` from ``Config.experiment_id`` and runs
    ``Engine.evaluate`` over both loaders.  The training loader keeps the
    training transforms, the mixer and shuffling, so the reported train loss
    is measured on that pipeline rather than on clean data.

    Args:
        fold (int): Fold whose rows form the validation split.

    Returns:
        tuple: ``(train_loss, valid_loss)`` as returned by ``Engine.evaluate``.
    """
    seed_all(num_seed)
    model_path=os.path.join(Config.experiment_id,f"model_fold_{fold}.bin")
    df_train = df[df["folds"] != fold].reset_index(drop=True)
    df_valid = df[df["folds"] == fold].reset_index(drop=True)

    train_transfrom = get_transforms(train=True,
                                     height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)
    valid_transfrom = get_transforms(train=False,
                                     height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)
    mixer = RandomMixer([

        AddMixer(alpha_dist='uniform')
    ], p=[1])
    mixer = UseMixerWithProb(mixer, prob=AudioConfig.mixer_prob)
    train_dataset =PathDataset(df_train,tensor_dict,classes=classes,transform=train_transfrom,target_transform=to_categorical,mixer=mixer)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=Config.train_batchsize, shuffle=True, num_workers=8
    )

    valid_dataset =PathDataset(df_valid,tensor_dict,classes=classes,transform=valid_transfrom,target_transform=to_categorical)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=Config.test_batchsize, shuffle=False, num_workers=8
    )
    model = Net(model_name,num_classes=len(classes),pretrained=pretrained)
    # Load onto the CPU first so a checkpoint saved from a GPU run can be
    # restored on any machine; the model is moved to Config.device right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(Config.device)
    # This function only measures loss, so the model is put in inference mode
    # (no dropout, fixed batch-norm statistics) instead of training mode.
    model.eval()
    # No optimisation step is taken here; the optimizer is built only because
    # it is passed to Engine as a positional argument.
    optimizer = torch.optim.AdamW(model.parameters(), lr=Config.lr)
    eng = Engine(model, optimizer, device=Config.device)

    train_loss, _ = eng.evaluate(train_loader, return_predictions=True)
    valid_loss, _ = eng.evaluate(valid_loader, return_predictions=True)

    print(f"train loss = {train_loss}, valid loss = {valid_loss} ")
    return train_loss,valid_loss

    


In [79]:
def predict(fold):
    """Predict class probabilities for the submission set with one fold's model.

    Loads ``model_fold_{fold}.bin`` from ``Config.experiment_id`` and runs
    ``Engine.predict`` over the submission data using the non-training
    (``train=False``) transforms.

    Args:
        fold (int): Fold whose saved model is used.

    Returns:
        torch.Tensor: Concatenated model outputs passed through a softmax
        over dim 1.
    """
    seed_all(num_seed)
    model_path=os.path.join(Config.experiment_id,f"model_fold_{fold}.bin")
    test_transfrom = get_transforms(train=False,
                                     height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)

    test_dataset =PathDataset(submission,submission_tensor_dict,classes=classes,transform=test_transfrom,target_transform=to_categorical)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=Config.test_batchsize, shuffle=False, num_workers=8
    )

    model = Net(model_name,num_classes=len(classes),pretrained=pretrained)
    # Load onto the CPU first so a checkpoint saved from a GPU run can be
    # restored on any machine; the model is moved to Config.device right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(Config.device)
    model.eval()
    # No optimisation step is taken here; the optimizer is built only because
    # it is passed to Engine as a positional argument.
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.lr)
    eng = Engine(model, optimizer, device=Config.device)
    predictions = eng.predict(test_loader)
    predictions=torch.nn.Softmax(dim=1)(torch.cat(predictions))
    return predictions

In [80]:
def predict_tta(fold, n_tta=30):
    """Predict submission probabilities with repeated passes over transformed inputs.

    The submission dataset is deliberately built with the *training*
    transforms, so each pass over the loader re-applies them (presumably with
    fresh random augmentation - confirm against ``get_transforms``).  The
    per-pass softmax outputs are combined with a geometric mean.

    Args:
        fold (int): Fold whose saved model is used.
        n_tta (int): Number of passes over the submission set.  Defaults to
            30, the value that was previously hard-coded.

    Returns:
        Array of element-wise geometric means over the ``n_tta`` passes.

    Raises:
        ValueError: If ``n_tta`` is smaller than 1.
    """
    if n_tta < 1:
        raise ValueError("n_tta must be at least 1")

    seed_all(num_seed)
    model_path=os.path.join(Config.experiment_id,f"model_fold_{fold}.bin")
    train_transfrom = get_transforms(train=True,
                                     height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)

    test_dataset =PathDataset(submission,submission_tensor_dict,classes=classes,transform=train_transfrom,target_transform=to_categorical)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=Config.test_batchsize, shuffle=False, num_workers=8
    )

    model = Net(model_name,num_classes=len(classes),pretrained=pretrained)
    # Load onto the CPU first so a checkpoint saved from a GPU run can be
    # restored on any machine; the model is moved to Config.device right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(Config.device)
    model.eval()
    # No optimisation step is taken here; the optimizer is built only because
    # it is passed to Engine as a positional argument.
    optimizer = torch.optim.AdamW(model.parameters(), lr=Config.lr)
    eng = Engine(model, optimizer, device=Config.device)

    softmax = torch.nn.Softmax(dim=1)
    all_predictions = [
        softmax(torch.cat(eng.predict(test_loader))).numpy()
        for _ in range(n_tta)
    ]

    # axis=0 is the pass axis (the default, spelled out for clarity).
    predictions=gmean(all_predictions, axis=0)

    return predictions

In [81]:
def generate_submission_csv(fold):
    """Return the validation rows of ``fold`` with one probability column per class.

    Despite the name, nothing is written to disk: the function loads
    ``model_fold_{fold}.bin`` from ``Config.experiment_id``, predicts on the
    rows of the global ``df`` whose ``folds`` value equals ``fold``, and
    returns those rows as a dataframe with the softmax outputs stored under
    the ``classes`` columns.

    Args:
        fold (int): Fold whose validation rows and saved model are used.

    Returns:
        pandas.DataFrame: Validation rows plus per-class probability columns.
    """
    seed_all(num_seed)
    model_path=os.path.join(Config.experiment_id,f"model_fold_{fold}.bin")
    df_valid = df[df["folds"] == fold].reset_index(drop=True)

    # Only the validation pipeline is needed for prediction.
    valid_transfrom = get_transforms(train=False,
                                     height=AudioConfig.height,
                                     width=AudioConfig.width,
                                     wrap_pad_prob=AudioConfig.WRAP_PAD_PROB)

    valid_dataset =PathDataset(df_valid,tensor_dict,classes=classes,transform=valid_transfrom,target_transform=to_categorical)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=Config.test_batchsize, shuffle=False, num_workers=8
    )
    model = Net(model_name,num_classes=len(classes),pretrained=pretrained)
    # Load onto the CPU first so a checkpoint saved from a GPU run can be
    # restored on any machine; the model is moved to Config.device right after.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(Config.device)
    # Inference mode, matching predict() and predict_tta().
    model.eval()
    # No optimisation step is taken here; the optimizer is built only because
    # it is passed to Engine as a positional argument.
    optimizer = torch.optim.AdamW(model.parameters(), lr=Config.lr)
    eng = Engine(model, optimizer, device=Config.device)

    predictions = eng.predict(valid_loader)

    predictions=torch.nn.Softmax(dim=1)(torch.cat(predictions))
    # Fresh copy of the validation rows to hold the output columns.
    sample = df[df["folds"] == fold].reset_index(drop=True)

    sample.loc[:, classes] = predictions
    return sample

    


11:54

### Training folds

In [84]:
%%capture
# For every fold: train from scratch, then continue training the saved
# checkpoint with after_train().  %%capture suppresses the cell output.
for fold in range(0,Config.n_folds):
    print("Fold : ",fold)
    train(fold)
    after_train(fold)


In [87]:
# Collect the train / validation loss of every fold's saved model.
train_losses=[]
valid_losses=[]
for fold in range(Config.n_folds):
    train_loss,valid_loss=eval_train(fold)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.12384458328696091, valid loss = 1.187116670897783 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.12307793921582348, valid loss = 0.7459480790243784 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.08145786040942492, valid loss = 0.5414975580874776 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.05222611909441596, valid loss = 0.49727889978064427 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.06339400710921321, valid loss = 0.42139920602420394 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.05416902270058103, valid loss = 0.43164998805316873 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.07797903835741618, valid loss = 0.23521054835775 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.06480202990594235, valid loss = 0.2807040279104682 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.06779549509014512, valid loss = 0.14814615108554205 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1060.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))


train loss = 0.05680204783891655, valid loss = 0.1526668972801417 


In [88]:
# Mean validation loss across folds.
np.mean(valid_losses)

0.4641618026501558

In [91]:
# Softmax predictions for the submission set from each fold's model, as NumPy arrays.
p=[]
for fold in range(Config.n_folds):
    p.append(predict(fold).numpy())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=255.0), HTML(value='')))




In [92]:
# Ensemble the folds with an element-wise geometric mean (axis 0 indexes the folds).
predictions=gmean(p,axis=0)

In [93]:
# Sanity check: one row per submission sample, one column per class.
predictions.shape

(1017, 193)

In [94]:
# Name the output after the active architecture instead of repeating it in a
# placeholder-free f-string; with model_name == "dpn68" the file name is unchanged.
prediction_file=f"{model_name}.csv"
# Fill the class columns of the sample submission with the ensembled predictions.
sample = pd.read_csv("SampleSubmission.csv")
sample.loc[:, classes] = predictions
sample.to_csv(prediction_file, index=False)


In [97]:
# Preview the first rows of the written submission table.
sample.head()

Unnamed: 0,fn,maize streak virus,disease,okukkoola,muwogo,mpeke,mucungwa,greens,garden,mango,...,kasaanyi,suckers,insects,fertilizer,nakavundira,ekiwojjolo,akawuka,ddagala,ebiwojjolo,obutungulu
0,audio_files/00118N3.wav,0.0004982374,2.936035e-05,1.333825e-05,3.919042e-06,2.633253e-05,8.151552e-06,0.001120957,0.00931372,0.00283508,...,2.5673e-06,0.0001674182,7.750516e-05,0.000594537,8.61863e-07,9.44892e-07,7.436629e-05,0.0004465838,2.752912e-07,1.438401e-06
1,audio_files/00P0NMV.wav,4.956156e-10,3.945247e-10,1.653673e-10,4.860184e-10,2.68699e-09,8.620225e-09,2.600239e-10,1.43883e-08,9.735198e-10,...,2.857995e-09,3.664304e-09,3.635627e-10,7.494829e-08,0.9991038,8.150544e-09,3.170091e-07,8.116902e-07,1.520849e-09,8.244088e-09
2,audio_files/01QEEZI.wav,3.727014e-07,2.840558e-06,1.689248e-06,9.05607e-05,1.190358e-06,7.33529e-07,4.384382e-07,7.363206e-07,1.020058e-07,...,8.333769e-07,1.431371e-08,2.882434e-07,2.873111e-07,3.088863e-07,7.111642e-07,4.648003e-08,3.707795e-07,5.742815e-06,9.233232e-07
3,audio_files/037YAED.wav,2.327935e-05,9.565458e-06,7.227104e-08,1.340523e-06,0.0003220598,3.877383e-06,3.17939e-06,0.4161313,1.187128e-05,...,9.186658e-06,0.003707174,8.858315e-05,0.0004840702,8.9072e-06,1.463914e-06,9.296805e-05,0.0003652636,1.415184e-06,3.81017e-06
4,audio_files/0382N0Y.wav,7.332573e-08,1.563314e-06,7.388575e-08,4.681138e-07,2.524634e-07,5.709543e-07,7.408599e-08,7.139526e-08,6.32246e-08,...,1.688224e-07,4.939552e-08,5.181317e-08,1.411171e-07,8.881476e-07,1.860517e-06,2.594567e-06,7.256855e-08,1.065122e-05,1.903768e-07


In [98]:
# Sum of all columns after the first for one row.  The geometric-mean ensemble
# is not renormalised, so the sum need not be exactly 1.
sample.iloc[700,1:].sum()

0.9982855086786987

In [99]:
# Largest value among the same columns for that row.
sample.iloc[700,1:].max()

0.9969573616981506