# READ CSV
##### Purpose :
Reads csv from S2L Sound Component Training Data Collection (Combined) OR DEMO.csv and creates a class pkl containing ROI metadata

#####  Notes :
- Runs in ROI_generation conda env (envs/ROI_generation.yml)


In [3]:
# read csv 
import pandas as pd
import numpy as np
import csv
import joblib
import os 

import csv
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd

import joblib

import warnings

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [4]:
csv_path = "../data/demo_ROIs.csv" # ROI tags

In [53]:
df = pd.read_csv(csv_path, index_col=None,)
df

Unnamed: 0,File_name,Selection,View,Channel,Begin_Time_s,End_Time_s,Low_Freq_Hz,High_Freq_Hz,Annotation
0,s2llg003_190530_2019-05-30_17-10.wav,6.0,Spectrogram 1,1.0,28.1631,31.0,844.268,11627.866,OQU
1,s2lam051_190531_2019-06-01_21-30.wav,4.0,Spectrogram 1,1.0,11.2957,13.0,0.0,329.63,AMU
2,s2llg008_180502_2018-05-08_09-00.wav,2.0,Spectrogram 1,1.0,7.4828,8.0,3647.714,8215.637,BBI
3,s2llg003_190530_2019-06-01_09-20.wav,3.0,Spectrogram 1,1.0,32.4845,34.0,440.816,2400.0,AMA
4,s2lam028_190412_2019-04-15_07-30.wav,7.0,Spectrogram 1,1.0,0.0,5.0,0.0,2093.264,AVT


The dataframe contains 5 S2L ROIs to generate mel specs and run through the modeling pipeline. Each 'Annotation' is a subclass of ABGQI:
- OQU = quiet
- AMU = anthrophony music
- BBI = biophony bird
- AMA = anthrophony machine
- AVT = anthrophony vehicle traffic

In [59]:
# Number of wav files
len(df['File_name'].unique())

5

In [60]:
# average ROI length
length = df['End_Time_s'] - df['Begin_Time_s']
length.mean()

2.3147800000000003

In [61]:
# ROI variance
length.var()

2.931460986999999

In [62]:
# Summary of ROI subtypes ('Annotation')
df.groupby(['Annotation']).sum()

Unnamed: 0_level_0,Selection,Channel,Begin_Time_s,End_Time_s,Low_Freq_Hz,High_Freq_Hz
Annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AMA,3.0,1.0,32.4845,34.0,440.816,2400.0
AMU,4.0,1.0,11.2957,13.0,0.0,329.63
AVT,7.0,1.0,0.0,5.0,0.0,2093.264
BBI,2.0,1.0,7.4828,8.0,3647.714,8215.637
OQU,6.0,1.0,28.1631,31.0,844.268,11627.866


In [63]:
# quick gathering of the class info (sub classes of ABGQI, here)
classes = []
cnt = "ROI_code"

# open csv
with open(csv_path,'rt')as f:
    data = csv.reader(f)
    next(data, None) # skip header
    
    # iterate through each row finding class annotations (e.g. AVT)
    for row in data:
        temp_class = row[8] # column == Annotation
 
        # aggregate classes
        if(temp_class != cnt):
            classes.append(temp_class)
            cnt = temp_class

In [64]:
# becasue there were duplicates!!
mylist = list(dict.fromkeys(classes))
print("There are", len(mylist), "classes")
classes = mylist

There are 5 classes


In [65]:
classes

['OQU', 'AMU', 'BBI', 'AMA', 'AVT']

In [66]:
# Gather ROI metadata
roi_list = []

# Iterate through each class
for i in range(len(classes)):
    roi_cnt = 0
    roi = classes[i]
    dur = 0 
    
    with open(csv_path,'rt') as f:
        data = csv.reader(f)
        next(data, None) # skip header
        
        for row in data:
            # ensure we have an annotation [8] with st [4] and stop [5] times
            if(row[8]==roi and row[5] != '' and row[4] != ''): 
                roi_cnt = roi_cnt+1
                dur = dur + (np.float16(row[5])-np.float16(row[4]))
    
    # if we have any valid ROIs save info in a list for each class          
    if(roi_cnt!=0)            :
        print(roi_cnt,'ROIs for',classes[i])
        print("Avg. duration: dur:",dur/roi_cnt)
        print('-'*50)
        roi_list.append([roi_cnt,classes[i]])

1 ROIs for OQU
Avg. duration: dur: 2.84375
--------------------------------------------------
1 ROIs for AMU
Avg. duration: dur: 1.703125
--------------------------------------------------
1 ROIs for BBI
Avg. duration: dur: 0.515625
--------------------------------------------------
1 ROIs for AMA
Avg. duration: dur: 1.5
--------------------------------------------------
1 ROIs for AVT
Avg. duration: dur: 5.0
--------------------------------------------------


In [67]:
# count total ROIs and get names (metadata)
(roi_list)
roi_sum = 0
indx = 0
total_roi_w_cutoff = 0
class_name_lst = []
cutoff_class_name = []
cutoff = 1 # th nuo. or ROIs should at least be these many


for i in range(len(roi_list)):
    roi_sum = roi_sum + roi_list[i][0]
    
    if(roi_list[i][0] >=cutoff):
        indx+=1
        print(indx, roi_list[i])
        cutoff_class_name.append(roi_list[i])
        total_roi_w_cutoff += roi_list[i][0]
        class_name_lst.append(roi_list[i])
print("ROI_SUM with cutoff",total_roi_w_cutoff)
print("ROI_SUM for all classes",roi_sum)


1 [1, 'OQU']
2 [1, 'AMU']
3 [1, 'BBI']
4 [1, 'AMA']
5 [1, 'AVT']
ROI_SUM with cutoff 5
ROI_SUM for all classes 5


In [68]:
# save class data
joblib.dump(cutoff_class_name, '../data/cutoff_class_name_demo.pkl')
print("Dumped")

Dumped


### CLIP WAV FILES BASED ON ABOVE ROI PROCESSING

In [69]:
dir_path = "../data/wavs/" # demo wavs
alt_dir_path = '' # possible other wav data
class_pkl = '../data/cutoff_class_name_demo.pkl'
os.path.exists(dir_path)

True

In [70]:
#load in class names, can skip script above if already exists
cutoff_class_name = joblib.load(class_pkl)

In [72]:
#warnings.filterwarnings('ignore')
sr = 22050 # sampling rate here

# iterate through each ROI class
for iii in range(len(cutoff_class_name)):
    excep = 0 # counting exceptions if any occurs
    print(cutoff_class_name[iii][1])
    class_name = cutoff_class_name[iii][1] # class of interest
    roi_temp = []
    roi_cnt = 0
        
    # open the ROI csv
    with open(csv_path,'rt')as f:
        data = csv.reader(f)
        next(data, None) 
        
        # iterate through each csv row
        for row in data:
            if(row[8] == class_name and row[0] != 'NA'):
                roi_cnt +=1
                print(class_name,":",roi_cnt)
                st = round((float(row[4]))*sr) #start time
                en = round((float(row[5]))*sr) #stop time

                y1 = round((float(row[6]))) # low freq
                y2 = round((float(row[7]))) # upper freq

                # read in wav file
                try:
                    x, sr = librosa.load(dir_path+row[0])
                    pth_ = dir_path+row[0]

                # try alternate wv dir
                except:
                    print("Main Dir doesn't have file... ")
                    print(dir_path+row[0] +"\n")
                    try:
                        x, sr = librosa.load(alt_dir_path+row[0])
                        pth_ = alt_dir_path+row[0]

                    except Exception as error:
                        print('***********************Caught this error: ' + repr(error))
                        break
                        excep +=1

                roi_temp.append((x[st:en],pth_,st,en, y1, y2))
    
    jl_file_dmp = os.path.join("../data/ROI_metadata/",cutoff_class_name[iii][1] +'.pkl')
    joblib.dump(roi_temp, jl_file_dmp)
    print('No. of ROIs for',cutoff_class_name[iii][1],'were:',len(roi_temp))
    print("Saved at",jl_file_dmp)
    print("Done with",cutoff_class_name[iii][1],'with',excep,'Exceptions.')
    print('-'*80)

OQU
OQU : 1
No. of ROIs for OQU were: 1
Saved at ../data/ROI_metadata/OQU.pkl
Done with OQU with 0 Exceptions.
--------------------------------------------------------------------------------
AMU
AMU : 1
No. of ROIs for AMU were: 1
Saved at ../data/ROI_metadata/AMU.pkl
Done with AMU with 0 Exceptions.
--------------------------------------------------------------------------------
BBI
BBI : 1
No. of ROIs for BBI were: 1
Saved at ../data/ROI_metadata/BBI.pkl
Done with BBI with 0 Exceptions.
--------------------------------------------------------------------------------
AMA
AMA : 1
No. of ROIs for AMA were: 1
Saved at ../data/ROI_metadata/AMA.pkl
Done with AMA with 0 Exceptions.
--------------------------------------------------------------------------------
AVT
AVT : 1
No. of ROIs for AVT were: 1
Saved at ../data/ROI_metadata/AVT.pkl
Done with AVT with 0 Exceptions.
--------------------------------------------------------------------------------


In [74]:
# check for pickle dump success. Should match above total ROI counts
dir_path = '../data/ROI_metadata/'
dir_file_list = os.listdir(dir_path)

for i in range(len(dir_file_list)):
    jl = joblib.load(os.path.join(dir_path,dir_file_list[i]))
    print(np.shape(jl),dir_file_list[i].split('_')[0])

del jl

(1, 6) AMA.pkl
(1, 6) AMU.pkl
(1, 6) AVT.pkl
(1, 6) BBI.pkl
(1, 6) OQU.pkl
