In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#Load in the libraries we need 
import torch
import numpy as np
import pandas as pd
import argparse
from glob import glob
import librosa
from sed_model import load_model # this creates our model

from sed_dataloader import * # this is the data file
from metrics import LWLRAP
from tqdm.auto import tqdm

  'The interface of "soundfile" backend is planned to change in 0.8.0 to '


### Load in the dataset to evaluate or test on

Here we are using Christian's test set

In [3]:
df = pd.read_csv('../labels_test_CD_20210309.csv');df.head()

Unnamed: 0,Label,File,Event_ID,X_min,X_max,Y_min,Y_max,Species,EngName,Group,Date,recID,wave,duration
0,AshDro,extr_1670_B09_20190727_060000_All_Day_3h.txt,AshDro_1,3657.588,9824.903,1.788462,4.25,Dicrurus leucophaeus,Ashy Drongo,Birds,27/07/2019,B09,extr_1670_B09_20190727_060000_All_Day_3h.wav,6167.315
1,AshDro,extr_1670_B09_20190727_060000_All_Day_3h.txt,AshDro_2,10622.568,12178.988,2.153846,4.153846,Dicrurus leucophaeus,Ashy Drongo,Birds,27/07/2019,B09,extr_1670_B09_20190727_060000_All_Day_3h.wav,1556.42
2,AshDro,extr_1676_B01_20190718_060000_All_Day_3h.txt,AshDro_3,291.829,2295.72,1.361702,3.593381,Dicrurus leucophaeus,Ashy Drongo,Birds,18/07/2019,B01,extr_1676_B01_20190718_060000_All_Day_3h.wav,2003.891
3,AshDro2,extr_1463_B05_20181212_000009_All_Day.txt,AshDro2_1,5243.02,8965.874,1.73236,2.355231,Dicrurus leucophaeus,Ashy Drongo,Birds,12/12/2018,B05,extr_1463_B05_20181212_000009_All_Day.wav,3722.854
4,AshDro3,extr_1673_B05_20190729_060000_All_Day_3h.txt,AshDro3_1,5700.389,7821.012,1.73494,5.686747,Dicrurus leucophaeus,Ashy Drongo,Birds,29/07/2019,B05,extr_1673_B05_20190729_060000_All_Day_3h.wav,2120.623


## Load the classes
We need to load the classes that the models were trained on, in this case, that is the 51 class model

In [4]:
classes = np.loadtxt('classes.txt', dtype='str', delimiter='\n')
classes = list(classes)

We first preprocess the data, so that it can be used by our dataloaders. 

This includes adding a recording_id so that it may load the correct audio file, and a species_id if you are evaluating some data. There is also the oppertunity to drop classes that are not in the classes file to test how well it performs on just these classes.  

This function below ouputs a a preprocessed CSV file, a prediction and labels file as well. 

In [5]:
def preprocess(df, classes=[], drop_classes= False, mode='test'):
           
    #appends a species id
    #if the species is not within the classes it is labeled as an unknown#
    if mode !='test':
        df['species_id'] = [-1 if s not in classes else classes.index(s) for s in df.Label] #we choose what we are looking at 
        if drop_classes == True:
            df = df[df.Species.isin(classes)]
    df = df.reset_index(drop=True)
    #preprare a output file of the predictions from the model
    species_cols = [f'{classes[i]}' for i in range(len(classes))]
    cv_preds = pd.DataFrame(columns=species_cols)
    cv_preds['filepath'] = df['filepath'].drop_duplicates()
    cv_preds.loc[:, species_cols] = 0
    cv_preds = cv_preds.reset_index(drop=True)
    
    if mode == 'test':
        return df, cv_preds, species_cols
    return df, cv_preds,species_cols, cv_preds.copy()

## Evaluation loop

This is the evaluation loop, that loads the model, creates a dataloader and passes the data to the model. It then processes the predictions from the model, and addits it to a prediction csv, and the appropriate label to the labels csv file. 


In [6]:
def evaluation_loop(paths,species_cols, df, config, cv_preds, label_df, device):

    for path in paths:
        print(f'loading model {path}')
        model = load_model(path, config)
        model.to(device)
        #load the data into dataloaders
        dataloader = get_dataloader(df, config=config, mode='valid')

        tk = tqdm(dataloader, total=len(dataloader))
        sub_index = 0
        with torch.no_grad():
            #pass it to the cpu or gpu
            _preds, labels =[],[]
            for i, (x,l) in enumerate(tk):
            
                x = x.to(device)
                l = l.to(device)
                bs, seq, w = x.shape #we need to reshape the input it as contains lots of segments of raw audio files that are not in the right shape for the CNN
                x = x.reshape(bs*seq, w)
                x = x.float()
                preds = model(x, None, None)# We do the same as above, but we don't mix the labels up, 

                #we also need to reshape the framewise output to calculate the evaluation metric
                preds = torch.sigmoid(torch.max(preds['framewise_output'], dim=1)[0])
                preds = preds.reshape(bs, seq, -1)
                preds = torch.sum(preds, dim=1)
                val_lwlrap = LWLRAP(preds, l)#get metrics and log them
                o = preds.cpu().numpy()
                _preds.extend(o)
                labels.extend(l.cpu().numpy())
                for val, ll in zip(o,l.cpu().numpy()):
                    cv_preds.loc[sub_index, species_cols] += list(val)
                    label_df.loc[sub_index, species_cols] = ll
                    sub_index += 1
    
            p = torch.from_numpy(np.array(_preds)) 
            t = torch.from_numpy(np.array(labels))
            print(f"lwlrap: {LWLRAP(p, t):.6}")
    return cv_preds, label_df

## Inference loop
This is similar to the above loop, the only difference is that it only passes a predictions csv file back.  
This is the loop you would use to predict on data, you don't know what is within the audio files. 

In [7]:
def inference_loop(paths,species_cols, df,config, cv_preds,device):

    for path in paths:
        print(f'loading model {path}')
        model = load_model(path, config)
        model.to(device)
        #load the data into dataloaders
        dataloader = get_dataloader(df, config=config, mode='test')

        tk = tqdm(dataloader, total=len(dataloader))
        sub_index = 0
        with torch.no_grad():

            for i, (x,l) in enumerate(tk):
            
                x = x.to(device)
                bs, seq, w = x.shape #we need to reshape the input it as contains lots of segments of raw audio files that are not in the right shape for the CNN
                x = x.reshape(bs*seq, w)
                x = x.float()
                preds = model(x, None, None)# We do the same as above, but we don't mix the labels up, 

                #we also need to reshape the framewise output to calculate the evaluation metric
                preds = torch.sigmoid(torch.max(preds['framewise_output'], dim=1)[0])
                preds = preds.reshape(bs, seq, -1)
                preds = torch.sum(preds, dim=1)

                o = preds.cpu().numpy()

                for val in o:
                    cv_preds.loc[sub_index, species_cols] += list(val)
                    sub_index += 1

    return cv_preds

## Config
The config controls, the batch size( number of images on the GPU), num_works, the sliding window length, duration and other  parameters that may be changed. It also contains the path to where the models weights reside and the folder that contains the audio files.  

In [8]:
df['filepath'] = df.wave

In [9]:
num_classes = len(classes) #this is the number of classes we are using check to make sure this is label
class Config:
    exp_name = "Efficientnet_B0" #experiment and save path
    save_path = 'weights'
    pretrain_weights = None 
    model_param = {
        'encoder' : 'tf_efficientnet_b3_ns', #we pass in which pretrained CNN we wish to you
        'sample_rate': 44100,
        'window_size' : 512, #1024 for better results
        'hop_size' : 512, # #mel spectrogram params
        'mel_bins' : 128, # 60
        'fmin' : 0,
        'fmax' : 13000,
        'classes_num' : num_classes #this is the number of classes that you are training on.
    }
    duration = 4 #if you set this less than 4 it will cause the training to crash
    stride=4
    original_sr = 44100
    num_classes = num_classes 
    ROOT = '.'
    total_duration =15.
    batch_size = 16 #number of images that are passed to the GPU
    num_workers = 0# number of workers, this must be 0 on windows! 
    data_root = osp.join(ROOT, "../Test_data_CD")
    output_csv = 'output.csv'
    model_path = f'{save_path}/{exp_name}/models' #path to where models are kept
    mode='val'
config = Config()


In [10]:
def main(df,classes=[], drop_classes=False, config=Config()):
    #we preprocess the data
    proccessed = preprocess(df, classes=classes, drop_classes=drop_classes, mode = config.mode)
    if config.mode =='test':
        df, cv_preds, species_cols = proccessed
    else:
        df, cv_preds, species_cols, labels_df = proccessed
    # if there is a GPU we load the audio onto the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #we load a model from the models class
    #and get the paths of models from the models path
    paths = glob(f'{config.model_path}/*.pth')

    if config.mode !='test':
        cv_preds, labels_df = evaluation_loop(paths,species_cols, df,config,  cv_preds, labels_df, device)
    else:
        cv_preds = inference_loop(paths,species_cols, df,config, cv_preds,device)
    #divide by length of folds
    cv_preds.loc[:, species_cols] /=len(paths)

    print(f'Saving predictions to {config.output_csv}')
    cv_preds.to_csv(config.output_csv, index=False)
    if config.mode=='test':
        return cv_preds
    #we evaluate the model using LWlRAP and LRAP
    preds = cv_preds.loc[:,species_cols].values.astype(np.float32)
    preds = torch.from_numpy(preds)

    labels = labels_df.loc[:,species_cols].values.astype(np.float32)
    labels = torch.from_numpy(labels)

    print(f"Label weighted label ranking average precision: {LWLRAP(preds, labels):.6}")
    return cv_preds, labels_df

This will evaluate on all of the data within the CSV file

In [11]:
p,l = main(df, classes, drop_classes=False)

loading model weights/Efficientnet_B0/models\model_0.pth


  "Empty filters detected in mel frequency basis. "


  0%|          | 0/10 [00:00<?, ?it/s]

lwlrap: 0.610217
Saving predictions to output.csv
Label weighted label ranking average precision: 0.610217


This will evaluate on only classes the model has been trained with, and that are within the CSV file

In [12]:
#changing the mode of the config file from eval to test
config = Config()
config.mode='test'

This will predict on the audio files that are within the CSV file, it will not evaluate on the CSV file. 

In [13]:
p = main(df, classes, drop_classes=False, config=config) 

loading model weights/Efficientnet_B0/models\model_0.pth


  0%|          | 0/10 [00:00<?, ?it/s]

Saving predictions to output.csv


### Inference on longer clips

In [85]:
def prediction_for_clip(clip: np.ndarray, config,  model, device):
    
    dataloader = get_dataloader_single_clip(clip, config)
    preds_ = []
    for i, (x) in enumerate(dataloader):
        with torch.no_grad():
            x = x.to(device)
            x = x.float()
            preds = model(x)# We do the same as above, but we don't mix the labels up, 

        #we also need to reshape the framewise output to calculate the evaluation metric
        preds = torch.sigmoid(torch.max(preds['framewise_output'], dim=1)[0])
        o = preds.cpu().numpy()
        preds_.extend(o)


    return np.array(preds_).max(axis=0) #return the CSV
    

In [122]:
from collections import defaultdict
def prediction(test_df,config, path,): 
    #framewise predictions for getting the onset and offset of bird calls
    #using one model
    df, cv_preds, species_cols = preprocess(test_df, classes=classes, drop_classes=False, mode = config.mode)

    model = load_model(path, config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    sub_idx=0
    for filepath in tqdm(cv_preds.filepath): # for the audio files
        clip, _ = librosa.load(f'{config.data_root}/{filepath}',sr=None,
                               res_type="kaiser_fast") #we load the audio

        pred = prediction_for_clip(clip, config,  model, device)

        cv_preds.loc[sub_idx,species_cols] = pred
        sub_idx+=1
    print(f'Saving predictions to {config.output_csv}')
    cv_preds.to_csv(config.output_csv, index=False)
    return cv_preds

In [123]:
preds = prediction(df, config, glob(f'{config.model_path}/*.pth')[0])

  "Empty filters detected in mel frequency basis. "


  0%|          | 0/153 [00:00<?, ?it/s]

Saving predictions to output.csv


In [124]:
preds

Unnamed: 0,BarCucDov,BroThrBar1,BroThrBar3,CheBelPar,CreCheBab1,EyeWreBab1,FlaFroBar1,GreHeaCan,HorWreBab1,HorWreBab3,...,SnoBroFly1,SnoBroFly2,SunBruCuc1,SunBruCuc2,SunBusWar,SunCuc1,SunCucShr1,SunWar,TriShrVir1,filepath
0,0.503500,0.509965,0.505194,0.579459,0.718772,0.522245,0.535120,0.524652,0.522902,0.599663,...,0.574349,0.544880,0.525028,0.512050,0.539900,0.594786,0.504708,0.731058,0.597082,extr_1670_B09_20190727_060000_All_Day_3h.wav
1,0.509763,0.518879,0.505969,0.565183,0.731030,0.728696,0.581311,0.532075,0.723048,0.512448,...,0.513172,0.536872,0.606059,0.516536,0.666535,0.612820,0.521001,0.542102,0.540889,extr_1676_B01_20190718_060000_All_Day_3h.wav
2,0.504265,0.504124,0.503738,0.511782,0.599826,0.726585,0.554468,0.538623,0.731058,0.602205,...,0.504770,0.504192,0.641876,0.520661,0.573991,0.609488,0.514470,0.514113,0.523751,extr_1463_B05_20181212_000009_All_Day.wav
3,0.504016,0.508311,0.503862,0.548234,0.548771,0.519193,0.545046,0.542701,0.726175,0.505037,...,0.504275,0.505756,0.583905,0.549811,0.594032,0.598947,0.606853,0.509814,0.606657,extr_1673_B05_20190729_060000_All_Day_3h.wav
4,0.502838,0.506977,0.503140,0.511359,0.550509,0.579363,0.539103,0.600442,0.530400,0.522639,...,0.502556,0.516032,0.577347,0.526771,0.662753,0.575786,0.514367,0.511239,0.586876,extr_2153_B04_20190226_000231_All_Day.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.506955,0.506442,0.502954,0.508359,0.559692,0.517290,0.549071,0.558185,0.518121,0.501056,...,0.503123,0.503243,0.599052,0.533823,0.578982,0.605591,0.536398,0.510248,0.518811,extr_1607_B12_20190726_060014_All_Day_3h.wav
149,0.511526,0.503636,0.507650,0.642264,0.730920,0.514964,0.698827,0.535997,0.521807,0.703726,...,0.516958,0.509303,0.522436,0.505653,0.558264,0.574572,0.524679,0.546167,0.572987,extr_5014_B06_20181006_064554_All_Day [-7.2658...
150,0.506813,0.505579,0.506911,0.610660,0.718606,0.512943,0.713959,0.529309,0.717101,0.551894,...,0.542584,0.518759,0.559418,0.525457,0.535822,0.573771,0.502780,0.639179,0.597479,extr_5001_B01_20180925_064600_All_Day [-7.2861...
151,0.503351,0.501821,0.504663,0.518281,0.704145,0.731059,0.552575,0.524876,0.530051,0.519836,...,0.509042,0.526114,0.583678,0.527495,0.526931,0.597851,0.502243,0.528691,0.511990,extr_5003_B02_20180920_063200_All_Day [-7.2733...


## Framewise Onset and offset Prediction

In [133]:
def prediction_for_clip_framewise(test_df: pd.DataFrame, clip: np.ndarray, config,  model,threshold=0.5):
    PERIOD = config.duration
    SR=config.original_sr
    #we break the clip into multiple audio clips to pass to the GPU, we process each clip seperately
    audios = []
    y = clip.astype(np.float32)
    len_y = len(y)
    start = 0
    end = int(PERIOD * SR)
    while True:
        y_batch = y[start:end].astype(np.float32) #the first batch
        if len(y_batch) != PERIOD * SR: #we keep on adding till we get to the end of the audio clip
            y_pad = np.zeros(PERIOD * SR, dtype=np.float32)
            y_pad[:len(y_batch)] = y_batch
            audios.append(y_pad)
            break
        start = end
        end += int(PERIOD * SR)
        audios.append(y_batch) #add the audio clip to anarrya
    array = np.asarray(audios)
    tensors = torch.from_numpy(array) #convert the audop batches to a tensor for the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()  #the model is placed to evaluate mode
    estimated_event_list = []
    global_time = 0.0
    audio_id = test_df["filepath"].values[0] #get the audio ID
    for image in tqdm(tensors): #for each image in the tensor
        image = image.view(1, image.size(0)) #we convert to a batch of one and send to the GPU
        image = image.to(device)
        #we predict and get the framewise output. 
        with torch.no_grad():
            prediction = model(image)
            framewise_outputs = prediction["framewise_output"].detach(
                ).cpu().numpy()[0]
        #we threshold the outputs, and get rid of items that are below the 
        thresholded = framewise_outputs >= threshold

        for target_idx in range(thresholded.shape[1]): #thresholded.shape[1] is the number of classes youve trained on
            if thresholded[:, target_idx].mean() == 0: #if mean is zero, nothing is detected
                pass
            else:
                detected = np.argwhere(thresholded[:, target_idx]).reshape(-1) #else we get all of the thresholds at the target idx

                head_idx = 0
                tail_idx = 0 
                while True:
                    if (tail_idx + 1 == len(detected)) or (
                            detected[tail_idx + 1] - 
                            detected[tail_idx] != 1): #if the tail idx is not larger than the detected class
                        onset = 0.01 * detected[
                            head_idx] + global_time #we loop through the detected classes and add to the onset and offset the detected is the time domain
                        offset = 0.01 * detected[
                            tail_idx] + global_time
                        onset_idx = detected[head_idx]
                        offset_idx = detected[tail_idx]
                        max_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].max() #we get the mean and max confidence scores 
                        mean_confidence = framewise_outputs[
                            onset_idx:offset_idx, target_idx].mean()
                        estimated_event = {
                            "audio_id": audio_id,
                            "ebird_code": classes[target_idx],
                            "onset": onset,
                            "offset": offset,
                            "max_confidence": max_confidence,
                            "mean_confidence": mean_confidence
                        }
                        estimated_event_list.append(estimated_event) #we append the estimated event for the class
                        head_idx = tail_idx + 1
                        tail_idx = tail_idx + 1
                        if head_idx >= len(detected):
                            break
                    else:
                        tail_idx += 1
        global_time += PERIOD #we add to the global time and create a dataframe from the estimated time list
        prediction_df = pd.DataFrame(estimated_event_list) 
    return prediction_df #return the CSV
    

In [134]:
from collections import defaultdict
def prediction_framewise(test_df,model, config, thresh=0.5): 
    #framewise predictions for getting the onset and offset of bird calls
    #using one model
    unique_audio_id = test_df.filepath.unique() #we get unique audio filepaths in the test_csv

    prediction_dfs = [] #create a predictions df list
    for filepath in tqdm(unique_audio_id): # for the audio files
        clip, _ = librosa.load(f'{config.data_root}/{filepath}',sr=None,
                               res_type="kaiser_fast") #we load the audio
        
        test_df_for_audio_id = test_df.query(
            f"filepath == '{filepath}'").reset_index(drop=True) #create a csv file for the filepath
        prediction_df = prediction_for_clip_framewise(test_df_for_audio_id, #pass this to the prediction per clip
                                                clip=clip,
                                                config=config,
                                                model=model,
                                                threshold=thresh) #threshold of what confidence in a score we are looking for

        prediction_dfs.append(prediction_df)
    prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True) #ad all of the lists together
    return prediction_df

In [136]:
device

device(type='cuda')

In [137]:
model = load_model(f'{config.save_path}/{config.exp_name}/models/model_0.pth', config)
model.to(device)
list_of_prediction_df = prediction_framewise(df, model, config, thresh=0.9)

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [141]:
list_of_prediction_df

Unnamed: 0,audio_id,ebird_code,onset,offset,max_confidence,mean_confidence
0,extr_1670_B09_20190727_060000_All_Day_3h.wav,CreCheBab1,2.1,2.39,0.938380,0.938381
1,extr_1670_B09_20190727_060000_All_Day_3h.wav,MouLeaWar1,0.0,2.69,0.999935,0.988133
2,extr_1670_B09_20190727_060000_All_Day_3h.wav,MouLeaWar1,4.0,5.49,0.996988,0.983448
3,extr_1670_B09_20190727_060000_All_Day_3h.wav,MouLeaWar1,8.6,9.79,0.998533,0.982834
4,extr_1670_B09_20190727_060000_All_Day_3h.wav,SunWar,12.0,13.49,0.999996,0.993999
...,...,...,...,...,...,...
640,extr_5003_B02_20180920_063200_All_Day [-7.2733...,EyeWreBab1,9.5,10.69,1.000000,0.999614
641,extr_5003_B02_20180920_063200_All_Day [-7.2733...,EyeWreBab1,12.3,13.79,1.000000,0.999683
642,extr_5007_B01_20180910_064734_All_Day [-7.2963...,HorWreBab1,5.2,6.09,0.982359,0.966977
643,extr_5007_B01_20180910_064734_All_Day [-7.2963...,HorWreBab1,8.9,9.49,0.935119,0.923277
