In [2]:
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science

from IPython.display import Audio # for listening to our insects
import IPython
from scipy.fft import fft # function to calculate Fast Fourier Transform

import wave
from glob import glob
import librosa
import torchaudio
from tqdm.auto import tqdm          # library to display progress bar while doing apply on pandas dataframe

import matplotlib.pyplot as plt  # allows creation of insightful plots
import seaborn as sns # another library to make even more beautiful plots

sys.path.append('../../src') # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
# enable rendering plots under the code cell that created it
%matplotlib inline

from eda_utils import show_sampling, signal_generator, plot_random_spec, plot_spec, plot_waveform # functions to create plots for and from audio data
from gdsc_utils import download_directory, PROJECT_DIR # function to download GDSC data from S3 bucket and our root directory
from config import DEFAULT_BUCKET  # S3 bucket with the GDSC data

os.chdir(PROJECT_DIR) # changing our directory to root

In [3]:
# Fetch the GDSC data from the S3 bucket DEFAULT_BUCKET, passing 'data/' as the directory argument.
# NOTE(review): the meaning of the second positional argument (None) is defined in gdsc_utils - confirm there.
download_directory('data/', None, DEFAULT_BUCKET)

In [49]:
# Metadata table of the complete dataset.
df = pd.read_csv(filepath_or_buffer='data/metadata.csv')
# Metadata table of the "Raffaela3-5s_crop" production data.
df_3_5 = pd.read_csv(
    filepath_or_buffer='data/production_data/Raffaela3-5s_crop/metadata.csv'
)

In [57]:
# Readable class name "<species> (<label>)" for every row of the full metadata.
df['species and label'] = df.apply(
    lambda row: f"{row['species']} ({str(row['label'])})",
    axis=1,
)

# Full metadata, per class: summed `length` and number of rows.
df_stats = (
    df.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
)

# Crop metadata, per label: number of rows.
df_35_stats = (
    df_3_5.groupby(['label'])
    .agg(count=('label', 'count'))
    .reset_index()
)

df_35_stats.head()

In [51]:
# Factor applied to the per-label crop counts.
# NOTE(review): presumably the duration in seconds of one crop in the
# "Raffaela3-5s_crop" set, turning a count into a total length - confirm.
value = 3.5

# Recompute the scaled column from the raw crop metadata instead of using `*=`:
# an in-place multiplication would scale the column again every time this cell
# is re-run, whereas this assignment always yields (rows per label) * value.
df_35_stats['count'] = df_35_stats['label'].map(df_3_5['label'].value_counts()) * value

In [55]:
# Summary statistics of the sample rate and the length columns of the full metadata.
df.loc[:, ["sample_rate", "length"]].describe()

In [58]:
# Summary statistics of the per-label `count` column of the crop set.
df_35_stats.loc[:, ["count"]].describe()

In [54]:
# Overlay, per class, the `count` column of the crop statistics (cyan) and the
# summed `length` of the full metadata (orange).
# NOTE(review): the cyan bars are positioned by the numeric `label`, the orange
# bars by the "species (label)" string; they line up only if the labels are
# 0..N-1 in the same order as df_stats - verify.
plt.figure(figsize = (20,6))
plt.title('Total length of files per species', fontsize = 20)
plt.xticks(rotation = 90)
plt.bar(df_35_stats['label'], df_35_stats['count'], color = 'cyan', label = 'crop set (df_35_stats count)')
plt.bar(df_stats['species and label'], df_stats['length'], color = 'orange', label = 'full metadata (summed length)')
plt.xlabel("Species and label")
plt.ylabel("length")  # was misspelled "lenght"
plt.legend()  # identifies the two bar series, which were previously unlabeled
plt.show()

In [5]:
# Split the metadata by subset; the *_value frames additionally keep only
# recordings with length > 40 sec.
train = df[df['subset'] == 'train']
train_value = train[train['length'] > 40]
validation = df[df['subset'] == 'validation']
validation_value = validation[validation['length'] > 40]
test = df[df['subset'] == 'test']

In [6]:
# Order every subset by label first and by recording length second (both ascending).
train_srt, train_value_srt, val_srt, val_value_srt, test_srt = (
    subset.sort_values(by=['label', 'length'])
    for subset in (train, train_value, validation, validation_value, test)
)

In [7]:
# VISUALIZATION:
# every sorted subset gets a "<species> (<label>)" column used as the x-axis category
for _frame in (train_srt, train_value_srt, val_srt, val_value_srt, test_srt):
    _frame['species and label'] = _frame.apply(
        lambda row: f"{row['species']} ({str(row['label'])})",
        axis=1,
    )

In [10]:
# Per class (label / species): summed recording length and number of files,
# computed separately for each subset. Together with the average length added
# afterwards this shows how much material exists per species.
sorted_train, sorted_train_value, sorted_val, sorted_val_value, sorted_test = (
    subset.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
    for subset in (train_srt, train_value_srt, val_srt, val_value_srt, test_srt)
)

In [11]:
# Mean length of one audio sample per class = summed length / number of files.
for _stats in (sorted_train, sorted_train_value, sorted_val, sorted_val_value, sorted_test):
    _stats['avg_len'] = _stats['length'] / _stats['count']

In [19]:
# Files per class in the train subset: all files (cyan) with the > 40 sec. files drawn on top (orange).
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Number of files per species train-dataset', fontsize=20)
ax.tick_params(axis='x', labelrotation=90)
ax.bar(sorted_train['species and label'], sorted_train['count'],
       color='cyan', label="whole set")
ax.bar(sorted_train_value['species and label'], sorted_train_value['count'],
       color='orange', label="> 40 sec.")
ax.set_xlabel("Species and label")
ax.set_ylabel("count")
ax.legend()
plt.show()

In [20]:
# Files per class in the validation subset: all files (cyan) with the > 40 sec. files drawn on top (orange).
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Number of files per species validation-dataset', fontsize=20)
ax.tick_params(axis='x', labelrotation=90)
ax.bar(sorted_val['species and label'], sorted_val['count'],
       color='cyan', label="whole set")
ax.bar(sorted_val_value['species and label'], sorted_val_value['count'],
       color='orange', label="> 40 sec.")
ax.set_xlabel("Species and label")
ax.set_ylabel("count")
ax.legend()
plt.show()

In [21]:
# Files per class: train subset (cyan) with the validation subset drawn on top (orange).
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Number of files per species train vs. validation dataset', fontsize=20)
ax.tick_params(axis='x', labelrotation=90)
ax.bar(sorted_train['species and label'], sorted_train['count'],
       color='cyan', label="train")
ax.bar(sorted_val['species and label'], sorted_val['count'],
       color='orange', label=" validation")
ax.set_xlabel("Species and label")
ax.set_ylabel("count")
ax.legend()
plt.show()

In [None]:
# Rows of the full metadata that belong to class 51 ...
label = df[df['label'] == 51]
# ... and the number of non-null entries in each of their columns.
cnt = label.count()
print(cnt.loc['unique_file'])

In [75]:
# Recordings with length < 5 sec., taken separately from each subset.
_shorter_than_5 = df['length'] < 5
train_5 = df[(df['subset'] == 'train') & _shorter_than_5]
validation_5 = df[(df['subset'] == 'validation') & _shorter_than_5]
test_5 = df[(df['subset'] == 'test') & _shorter_than_5]

# Order by label, then by length (both ascending).
train_5_srt, val_5_srt, test_5_srt = (
    subset.sort_values(by=['label', 'length'])
    for subset in (train_5, validation_5, test_5)
)

# Readable class name "<species> (<label>)".
for _frame in (train_5_srt, val_5_srt, test_5_srt):
    _frame['species and label'] = _frame.apply(
        lambda row: f"{row['species']} ({str(row['label'])})",
        axis=1,
    )

# Per class: summed recording length and number of files.
sorted_train_5, sorted_val_5, sorted_test_5 = (
    subset.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
    for subset in (train_5_srt, val_5_srt, test_5_srt)
)

# Per class: mean length of one audio sample.
for _stats in (sorted_train_5, sorted_val_5, sorted_test_5):
    _stats['avg_len'] = _stats['length'] / _stats['count']

In [14]:
# Files per class in the train subset: all files (cyan) with the < 5 sec. files drawn on top (orange).
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Number of files in train-dataset (< 5 Sec.)', fontsize=20)
ax.tick_params(axis='x', labelrotation=90)
ax.bar(sorted_train['species and label'], sorted_train['count'],
       color='cyan', label="whole set")
ax.bar(sorted_train_5['species and label'], sorted_train_5['count'],
       color='orange', label="< 5 sec.")
ax.set_xlabel("Species and label")
ax.set_ylabel("count")
ax.legend()
plt.show()

In [76]:
# Recordings with 5 <= length < 10 sec., taken separately from each subset.
_in_5_10 = (df['length'] >= 5) & (df['length'] < 10)
train_10 = df[(df['subset'] == 'train') & _in_5_10]
validation_10 = df[(df['subset'] == 'validation') & _in_5_10]
test_10 = df[(df['subset'] == 'test') & _in_5_10]

# Order by label, then by length (both ascending).
train_10_srt, val_10_srt, test_10_srt = (
    subset.sort_values(by=['label', 'length'])
    for subset in (train_10, validation_10, test_10)
)

# Readable class name "<species> (<label>)".
for _frame in (train_10_srt, val_10_srt, test_10_srt):
    _frame['species and label'] = _frame.apply(
        lambda row: f"{row['species']} ({str(row['label'])})",
        axis=1,
    )

# Per class: summed recording length and number of files.
sorted_train_10, sorted_val_10, sorted_test_10 = (
    subset.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
    for subset in (train_10_srt, val_10_srt, test_10_srt)
)

# Per class: mean length of one audio sample.
for _stats in (sorted_train_10, sorted_val_10, sorted_test_10):
    _stats['avg_len'] = _stats['length'] / _stats['count']

In [25]:
# Files per class in the train subset: all files (cyan) with the files of
# length in [5, 10) sec. drawn on top (orange).
# The title and legend previously read "< 10 sec.", but sorted_train_10 is built
# from `length >= 5 & length < 10`, so the plot text now states that interval.
plt.figure(figsize = (20,6))
plt.title('Number of files in train-dataset ([5, 10) sec.)', fontsize = 20)
plt.xticks(rotation = 90)
plt.bar(sorted_train['species and label'], sorted_train['count'], color = 'cyan')
plt.bar(sorted_train_10['species and label'], sorted_train_10['count'], color = 'orange')
plt.xlabel("Species and label")
plt.ylabel("count")
plt.legend(["whole set", "[5, 10) sec."])
plt.show()

In [77]:
# Recordings with 10 <= length < 15 sec., taken separately from each subset.
_in_10_15 = (df['length'] >= 10) & (df['length'] < 15)
train_15 = df[(df['subset'] == 'train') & _in_10_15]
validation_15 = df[(df['subset'] == 'validation') & _in_10_15]
test_15 = df[(df['subset'] == 'test') & _in_10_15]

# Order by label, then by length (both ascending).
train_15_srt, val_15_srt, test_15_srt = (
    subset.sort_values(by=['label', 'length'])
    for subset in (train_15, validation_15, test_15)
)

# Readable class name "<species> (<label>)".
for _frame in (train_15_srt, val_15_srt, test_15_srt):
    _frame['species and label'] = _frame.apply(
        lambda row: f"{row['species']} ({str(row['label'])})",
        axis=1,
    )

# Per class: summed recording length and number of files.
sorted_train_15, sorted_val_15, sorted_test_15 = (
    subset.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
    for subset in (train_15_srt, val_15_srt, test_15_srt)
)

# Per class: mean length of one audio sample.
for _stats in (sorted_train_15, sorted_val_15, sorted_test_15):
    _stats['avg_len'] = _stats['length'] / _stats['count']

In [27]:
# Files per class in the train subset: all files (cyan) with the files of
# length in [10, 15) sec. drawn on top (orange).
# The title and legend previously read "< 15 sec.", but sorted_train_15 is built
# from `length >= 10 & length < 15`, so the plot text now states that interval.
plt.figure(figsize = (20,6))
plt.title('Number of files in train-dataset ([10, 15) sec.)', fontsize = 20)
plt.xticks(rotation = 90)
plt.bar(sorted_train['species and label'], sorted_train['count'], color = 'cyan')
plt.bar(sorted_train_15['species and label'], sorted_train_15['count'], color = 'orange')
plt.xlabel("Species and label")
plt.ylabel("count")
plt.legend(["whole set", "[10, 15) sec."])
plt.show()

In [78]:
# Whole subsets, without any length filter.
train = df[df['subset'] == 'train']
validation = df[df['subset'] == 'validation']
test = df[df['subset'] == 'test']

# Order by label, then by length (both ascending).
train_srt, val_srt, test_srt = (
    subset.sort_values(by=['label', 'length'])
    for subset in (train, validation, test)
)

# Readable class name "<species> (<label>)".
for _frame in (train_srt, val_srt, test_srt):
    _frame['species and label'] = _frame.apply(
        lambda row: f"{row['species']} ({str(row['label'])})",
        axis=1,
    )

# Per class: summed recording length and number of files.
sorted_train, sorted_val, sorted_test = (
    subset.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
    for subset in (train_srt, val_srt, test_srt)
)

# Per class: mean length of one audio sample.
for _stats in (sorted_train, sorted_val, sorted_test):
    _stats['avg_len'] = _stats['length'] / _stats['count']

In [43]:
# Duration of one test recording = number of frames / frames per second.
with wave.open('data/test/99.wav') as mywav:
    n_frames = mywav.getnframes()
    frame_rate = mywav.getframerate()
    duration_seconds = n_frames / frame_rate
    print(f"Length of the WAV file: {duration_seconds:.1f} s")

In [8]:
# Duration in seconds of one train recording = number of frames / frames per second.
with wave.open('data/train/Roeselianaroeselii_XC751814-dat028-019_edit1.wav') as mywav:
    n_frames = mywav.getnframes()
    frame_rate = mywav.getframerate()
    duration_seconds = n_frames / frame_rate
    print(duration_seconds)

In [52]:
def length_test_set (wave_paths):
    """Return the duration in seconds of every audio file in ``wave_paths``.

    Parameters
    ----------
    wave_paths : iterable of path-like
        Audio files to measure; iterated once, with a tqdm progress bar.

    Returns
    -------
    list of float
        One duration per input file, in input order (empty list for no input).
    """
    file_length = []
    for filename in tqdm(wave_paths):
        # sr=None makes librosa keep the file's native sampling rate and return
        # it, so each file is decoded once instead of being loaded with
        # torchaudio first only to learn the sample rate.
        audio, sample_rate = librosa.load(filename, sr=None)
        file_length.append(librosa.get_duration(y=audio, sr=sample_rate))

    return file_length

In [22]:
# Spot check of the duration computation on a single test file.
# The tensor is bound to `waveform` rather than `wave`: assigning to `wave` at
# top level replaced the imported stdlib `wave` module in the kernel, which made
# the `wave.open(...)` cells fail when re-run after this one.
waveform, sample_rate = torchaudio.load('data/test/0.wav')
# Load with librosa at the sample rate reported by torchaudio.
audio, _ = librosa.load('data/test/0.wav', sr=sample_rate)
file_length = librosa.get_duration(y=audio, sr=sample_rate)
print(file_length)

In [53]:
# `paths` was not defined by any cell of this notebook, so a fresh
# "Restart & Run All" stopped here with a NameError.
# NOTE(review): presumably it held every WAV file of the test set (the other
# cells read 'data/test/0.wav' and 'data/test/99.wav') - verify the pattern.
paths = sorted(glob('data/test/*.wav'))
length = length_test_set(paths)

In [54]:
# Show the Python type of the object returned by length_test_set.
type(length)

In [61]:
# Bin the test-set durations into [0, 5), [5, 10) and [10, 15) sec.
result_5 = [duration for duration in length if duration < 5]
result_10 = [duration for duration in length if 5 <= duration < 10]
result_15 = [duration for duration in length if 10 <= duration < 15]

In [81]:
# For each subset: total number of files, number and share of files in the
# length bins [0, 5), [5, 10), [10, 15) sec., and the remaining share (>= 15 sec.).
# The remaining share is 100 minus the three rounded bin percentages.

# total amount in train-set
amount_train = sum(sorted_train['count'])
amount_train_5 = sum(sorted_train_5['count'])
amount_train_10 = sum(sorted_train_10['count'])
amount_train_15 = sum(sorted_train_15['count'])
percent_5 = round((amount_train_5 / amount_train) * 100 , 2)
percent_10 = round((amount_train_10 / amount_train) * 100 , 2)
percent_15 = round((amount_train_15 / amount_train) * 100 , 2)
percent_train = round(100 - (percent_5 + percent_10 + percent_15),2)
print("# files train-set =", amount_train)
print("# files train-set [0, 5) sec.  =", amount_train_5, "(",percent_5,"%",")")
print("# files train-set [5, 10) sec. =", amount_train_10, "(",percent_10,"%",")")
print("# files train-set [10, 15) sec. =", amount_train_15, "(",percent_15,"%",")")
print('# files train-set >= 15 sec. =', "(",percent_train,"%",")")
print("--------------------------------------------")

# total amount in validation-set
amount_val = sum(sorted_val['count'])
amount_val_5 = sum(sorted_val_5['count'])
amount_val_10 = sum(sorted_val_10['count'])
amount_val_15 = sum(sorted_val_15['count'])
percent_v5 = round((amount_val_5 / amount_val) * 100 , 2)
percent_v10 = round((amount_val_10 / amount_val) * 100 , 2)
percent_v15 = round((amount_val_15 / amount_val) * 100 , 2)
percent_val = round(100 -(percent_v5 + percent_v10 + percent_v15),2)
print("# files validation-set =", amount_val)
print("# files validation-set [0, 5) sec.  =", amount_val_5, "(",percent_v5,"%",")")
print("# files validation-set [5, 10) sec. =", amount_val_10, "(",percent_v10,"%",")")
print("# files validation-set [10, 15) sec. =", amount_val_15,  "(",percent_v15,"%",")")
print('# files validation-set >= 15 sec. =', "(",percent_val,"%",")")
print("--------------------------------------------")

# total amount in test-set (taken from the measured durations in `length`,
# not from the metadata-based tables used for train and validation)
amount_test = len(length)
amount_test_5 = len(result_5)
amount_test_10 = len(result_10)
amount_test_15 = len(result_15)
percent_test5 = round((amount_test_5 / amount_test) * 100 , 2)
percent_test10 = round((amount_test_10 / amount_test) * 100 , 2)
percent_test15 = round((amount_test_15 / amount_test) * 100 , 2)
# Use the test-set percentages here; the train-set ones (percent_5/10/15) were
# used before, so the ">= 15 sec." share printed for the test set was wrong.
percent_test = round(100 - (percent_test5 + percent_test10 + percent_test15),2)
print("# files test-set =", amount_test)
print("# files test-set [0, 5) sec.  =", amount_test_5, "(",percent_test5,"%",")")
print("# files test-set [5, 10) sec. =", amount_test_10, "(",percent_test10,"%",")")
print("# files test-set [10, 15) sec. =", amount_test_15, "(",percent_test15,"%",")")
print('# files test-set >= 15 sec. =', "(",percent_test,"%",")")