In [2]:
import matplotlib.pyplot as plt     # allows creation of insightful plots
import numpy as np                  # for performing calculations on numerical arrays
import pandas as pd                 # home of the DataFrame construct, _the_ most important object for Data Science
import seaborn as sns               # allows creation of insightful plots, but a bit prettier
import sys                          # Python system library needed to load custom functions
import torchaudio                   # library that will allow us to load the audio files
import os                           # for changing the directory

from scipy.fft import fft, fftfreq  # functions for calculating the fourier transform and frequencies from audio data
from tqdm.auto import tqdm          # library to display progress bar while doing apply on pandas dataframe

sys.path.append('../../src')
pd.set_option('display.max_columns', None) # All the columns in a dataframe are shown 
tqdm.pandas()                       # integrate tqdm with Pandas
# line to render the plots under the code cell that created it
%matplotlib inline

from eda_utils import plot_spec     # functions to create plots for and from audio data
from gdsc_utils import PROJECT_DIR # our root directory
os.chdir(PROJECT_DIR) # changing our directory to root

In [3]:
# Fetch the 'data/' directory using the project's download helper.
# NOTE(review): neither `download_directory` nor `DEFAULT_BUCKET` is imported in the
# visible import cell, so this line raises NameError on a fresh kernel unless they are
# brought into scope elsewhere - presumably they come from the project's helper/config
# modules; confirm and add the import to the first cell.
# NOTE(review): the meaning of the second argument (None) is not visible from here.
download_directory('data/', None, DEFAULT_BUCKET)

In [4]:
# Metadata table of the audio files; the cells below rely on its
# 'path', 'label', 'species', 'subset' and 'length' columns.
df = pd.read_csv(filepath_or_buffer='data/metadata.csv')

In [12]:
# Hand-picked example recordings of Cyclochila australasiae.
# The `18` in the variable names is the label that the comparison plot further down
# calls "species 18".
# train subset
ep_train_18_1 = 'data/train/Cyclochilaaustralasiae_GBIF2883236324_IN61761362_140722.wav'
ep_train_18_2 = 'data/train/Cyclochilaaustralasiae_GBIF2988654650_IN65392091_150201.wav'
ep_train_18_3 = 'data/train/Cyclochilaaustralasiae_GBIF2883127714_IN61461296_140099.wav'
ep_train_18_4 = 'data/train/Cyclochilaaustralasiae_GBIF2988310032_IN64913166_148835.wav'
ep_train_18_5 = 'data/train/Cyclochilaaustralasiae_GBIF2988576050_IN64914163_148834.wav'
ep_train_18_6 = 'data/train/Cyclochilaaustralasiae_GBIF2898419788_IN62243743_141893_edit1.wav'
ep_train_18_7 = 'data/train/Cyclochilaaustralasiae_GBIF2429268043_IN33470690_47652.wav'
ep_train_18_8 = 'data/train/Cyclochilaaustralasiae_GBIF2465305282_IN36155866_51319.wav'

# validation subset (not referenced by any of the visible cells below)
ep_val_18_1 = 'data/val/Cyclochilaaustralasiae_GBIF1949995530_IN18893186_28098_edit1.wav'
ep_val_18_2 = 'data/val/Cyclochilaaustralasiae_GBIF2882927607_IN61523430_140217.wav'
ep_val_18_3 = 'data/val/Cyclochilaaustralasiae_GBIF1978841250_IN19182596_28530.wav'

In [13]:
# Hand-picked example recordings of Bicolorana bicolor from the train subset.
# The `6` in the variable names is the label that the comparison plot further down
# calls "species 6". Only ep_train_6_1 .. ep_train_6_8 are used by the visible cells.
ep_train_6_1 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit1.wav'
ep_train_6_2 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit2.wav'
ep_train_6_3 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit4.wav'
ep_train_6_4 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit5.wav'
ep_train_6_5 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit6.wav'
ep_train_6_6 ='data/train/Bicoloranabicolor_XC751441-dat044-005.wav'
ep_train_6_7 ='data/train/Bicoloranabicolor_XC752542-dat061-002_edit3.wav'
ep_train_6_8 ='data/train/Bicoloranabicolor_GBIF3327990654_IN86271646_271178.wav'
ep_train_6_9 ='data/train/Bicoloranabicolor_XC752106-dat061-001_edit1.wav'
ep_train_6_10 ='data/train/Bicoloranabicolor_XC753587-Bicolorana-bicolor-Cereglio-30-Giugno-2018_2CH96K16.wav'
ep_train_6_11 ='data/train/Bicoloranabicolor_XC752106-dat061-001_edit2.wav'
ep_train_6_12 ='data/train/Bicoloranabicolor_XC751442-dat044-006.wav'


In [6]:
def extract_peak_frequency(path, freq_threshold=300, n_top=1):
    """Return the ``n_top`` strongest frequencies (in Hz) of an audio file.

    Only the first channel and at most the first 20 seconds of the recording are
    analysed. Frequencies at or below ``freq_threshold`` are ignored because there is
    a lot of noise near 0 Hz.

    Parameters
    ----------
    path : str
        Location of the audio file; passed to ``torchaudio.load``.
    freq_threshold : float, default 300
        Lower cut-off in Hz. This is a hyperparameter you can play with.
    n_top : int, default 1
        Number of peak frequencies to return.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_top,)`` with the peak frequencies in Hz, ordered from the
        strongest to the weakest spectral magnitude.

    Raises
    ------
    ValueError
        If the file contains no samples, if ``n_top`` is smaller than 1, or if fewer
        than ``n_top`` frequency bins lie above ``freq_threshold``.
    """
    data, sampling_rate = torchaudio.load(path)          # loading audio file
    data = data[0, :20 * sampling_rate].numpy()          # first channel, first 20 seconds
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError(f"no audio samples found in {path!r}")

    # The spectrum of a real signal is mirrored: bin k and bin n-k have the same
    # magnitude and the same absolute frequency. Keeping both halves would make every
    # peak show up twice in the top-n, so only bins 0 .. n//2 (DC up to Nyquist) are used.
    n_unique = n_samples // 2 + 1
    magnitudes = np.abs(fft(data)[:n_unique])
    # fftfreq is in cycles/sample; abs() turns the even-length Nyquist bin (-0.5) positive
    freqs_hz = np.abs(fftfreq(n_samples)[:n_unique]) * sampling_rate

    above_threshold = freqs_hz > freq_threshold
    magnitudes = magnitudes[above_threshold]
    freqs_hz = freqs_hz[above_threshold]
    if n_top < 1 or n_top > len(magnitudes):
        raise ValueError(
            f"n_top must be between 1 and {len(magnitudes)} "
            f"(number of frequency bins above {freq_threshold} Hz), got {n_top}"
        )

    top_bins = np.argpartition(magnitudes, -n_top)[-n_top:]        # indices of the n largest magnitudes
    top_bins = top_bins[np.argsort(magnitudes[top_bins])[::-1]]    # deterministic order: strongest first
    return freqs_hz[top_bins]

In [15]:
# Top-10 peak frequencies for each of the eight "species 18" example recordings.
(
    top10_freq_train1,
    top10_freq_train2,
    top10_freq_train3,
    top10_freq_train4,
    top10_freq_train5,
    top10_freq_train6,
    top10_freq_train7,
    top10_freq_train8,
) = (
    extract_peak_frequency(wav_path, freq_threshold=300, n_top=10)
    for wav_path in (
        ep_train_18_1,
        ep_train_18_2,
        ep_train_18_3,
        ep_train_18_4,
        ep_train_18_5,
        ep_train_18_6,
        ep_train_18_7,
        ep_train_18_8,
    )
)

In [16]:
# Top-10 peak frequencies for the first eight "species 6" example recordings.
(
    top10_freq_train1_6,
    top10_freq_train2_6,
    top10_freq_train3_6,
    top10_freq_train4_6,
    top10_freq_train5_6,
    top10_freq_train6_6,
    top10_freq_train7_6,
    top10_freq_train8_6,
) = (
    extract_peak_frequency(wav_path, freq_threshold=300, n_top=10)
    for wav_path in (
        ep_train_6_1,
        ep_train_6_2,
        ep_train_6_3,
        ep_train_6_4,
        ep_train_6_5,
        ep_train_6_6,
        ep_train_6_7,
        ep_train_6_8,
    )
)

In [18]:
# Compare the top-10 peak frequencies of species 18 (solid lines) and species 6 (dashed lines).
plt.title('Top 10 frequencies train-data, species 18 vs species 6', fontsize = 10)
peak_positions = np.arange(0, 10)

# The first solid and the first dashed line are drawn in black and plotted first,
# so that the two legend entries below are attached to exactly these two lines.
plt.plot(peak_positions, top10_freq_train1, color="black")
plt.plot(peak_positions, top10_freq_train1_6, '--', color="black")

# remaining species 18 recordings: solid, default colour cycle
for freqs_species_18 in (
    top10_freq_train2,
    top10_freq_train3,
    top10_freq_train4,
    top10_freq_train5,
    top10_freq_train6,
    top10_freq_train7,
    top10_freq_train8,
):
    plt.plot(peak_positions, freqs_species_18)

# remaining species 6 recordings: dashed, default colour cycle
for freqs_species_6 in (
    top10_freq_train2_6,
    top10_freq_train3_6,
    top10_freq_train4_6,
    top10_freq_train5_6,
    top10_freq_train6_6,
    top10_freq_train7_6,
    top10_freq_train8_6,
):
    plt.plot(peak_positions, freqs_species_6, '--')

plt.legend(["species 18", "species 6"])
plt.show()

In [19]:
# Order the metadata by label and, within a label, by clip length.
df_sorted = df.sort_values(by=['label', 'length'], ascending=[True, True])
# Strongest frequency of every file: a one-column frame (column 0), indexed like df_sorted.
df_freqs = df_sorted['path'].progress_apply(
    lambda wav_path: pd.Series(extract_peak_frequency(wav_path))
)
# Attach the new feature to the metadata and give its column a readable name.
df_metadata = df_sorted.join(df_freqs).rename(columns={0: 'top_frequency'})

In [21]:
# Distribution of the strongest frequency per species, over every file in the metadata.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species', y="top_frequency", data=df_metadata, ax=ax)
plt.xticks(rotation=90)   # species names are long, so print them vertically
plt.show()

In [7]:
# Strongest frequency per file, restricted to the "top" classes.
# Axis label used by the boxplots: "<species> (<label>)".
df['species and label'] = df.apply(lambda x: f"{x['species']} ({str(x['label'])})", axis=1)
df_top1 = df.query('label == 15 or label == 18 or label == 20 or label == 22 or label == 31 or label == 32 or label == 34 or label == 35 or label == 36 or label == 37 or label == 38 or label == 40 or label == 41 or label == 48 or label == 64')
df_sorted_top1 = df_top1.sort_values(by=['label', 'length'], ascending=[True, True])
# one-column frame (column 0) holding the strongest frequency of each file
df_freqs_top1 = df_sorted_top1['path'].progress_apply(
    lambda wav_path: pd.Series(extract_peak_frequency(wav_path))
)
# attach the feature to the metadata and name its column
df_metadata_top1 = df_sorted_top1.join(df_freqs_top1).rename(columns={0: 'top_frequency'})

In [9]:
# top 10 frequencies (top classes)
df['species and label'] = df.apply(lambda x: f"{x['species']} ({str(x['label'])})", axis = 1)
df_top10 = df.query('label == 15 or label == 18 or label == 20 or label == 22 or label == 31 or label == 32 or label == 34 or label == 35 or label == 36 or label == 37 or label == 38 or label == 40 or label == 41 or label == 48 or label == 64')
df_sorted_top10 = df_top10.sort_values(['label', 'length'], ascending =[True, True])
# Extract the 10 strongest frequencies per file: one row per file, columns 0..9.
# n_top must be passed explicitly - with the default (n_top=1) this cell would just
# reproduce the top-1 result under a "top 10" name.
df_freqs_top10 = df_sorted_top10['path'].progress_apply(lambda x: pd.Series(extract_peak_frequency(x, n_top=10)))
# Reshape to long format (one row per file and peak) so that every extracted peak ends
# up in the single 'top_frequency' column that the boxplot reads. to_numpy().ravel()
# walks the frame row by row, which matches repeating each file's index once per column.
df_freqs_top10_long = pd.Series(
    df_freqs_top10.to_numpy().ravel(),
    index=df_freqs_top10.index.repeat(df_freqs_top10.shape[1]),
    name='top_frequency',
)
# Join on the file index: every metadata row is repeated once per peak frequency.
df_metadata_top10 = df_sorted_top10.join(df_freqs_top10_long)

In [12]:
# top 50 frequencies (top classes)
df['species and label'] = df.apply(lambda x: f"{x['species']} ({str(x['label'])})", axis = 1)
df_top50 = df.query('label == 15 or label == 18 or label == 20 or label == 22 or label == 31 or label == 32 or label == 34 or label == 35 or label == 36 or label == 37 or label == 38 or label == 40 or label == 41 or label == 48 or label == 64')
df_sorted_top50 = df_top50.sort_values(['label', 'length'], ascending =[True, True])
# Extract the 50 strongest frequencies per file: one row per file, columns 0..49.
# n_top must be passed explicitly - with the default (n_top=1) this cell would just
# reproduce the top-1 result under a "top 50" name.
df_freqs_top50 = df_sorted_top50['path'].progress_apply(lambda x: pd.Series(extract_peak_frequency(x, n_top=50)))
# Reshape to long format (one row per file and peak) so that every extracted peak ends
# up in the single 'top_frequency' column that the boxplot reads. to_numpy().ravel()
# walks the frame row by row, which matches repeating each file's index once per column.
df_freqs_top50_long = pd.Series(
    df_freqs_top50.to_numpy().ravel(),
    index=df_freqs_top50.index.repeat(df_freqs_top50.shape[1]),
    name='top_frequency',
)
# Join on the file index: every metadata row is repeated once per peak frequency.
df_metadata_top50 = df_sorted_top50.join(df_freqs_top50_long)

In [58]:
# Boxplot of df_metadata_top1['top_frequency'] per species for the "top" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_top1, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 1 frequency (top classes)', fontsize=20)
plt.show()

In [65]:
# Boxplot of df_metadata_top10['top_frequency'] per species for the "top" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_top10, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 10 frequency (top classes)', fontsize=20)
plt.show()

In [72]:
# Boxplot of df_metadata_top50['top_frequency'] per species for the "top" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_top50, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 50 frequency (top classes)', fontsize=20)
plt.show()

In [8]:
# Strongest frequency per file, restricted to the "low" classes.
df_low1 = df.query('label == 6 or label == 7 or label == 13 or label == 17 or label == 28 or label == 30 or label == 45 or label == 46 or label == 47 or label == 49 or label == 54 or label == 55 or label == 57 or label == 58 or label == 62')
df_sorted_low1 = df_low1.sort_values(by=['label', 'length'], ascending=[True, True])
# one-column frame (column 0) holding the strongest frequency of each file
df_freqs_low1 = df_sorted_low1['path'].progress_apply(
    lambda wav_path: pd.Series(extract_peak_frequency(wav_path))
)
# attach the feature to the metadata and name its column
df_metadata_low1 = df_sorted_low1.join(df_freqs_low1).rename(columns={0: 'top_frequency'})

In [10]:
# top 10 frequencies (low classes)
df_low10 = df.query('label == 6 or label == 7 or label == 13 or label == 17 or label == 28 or label == 30 or label == 45 or label == 46 or label == 47 or label == 49 or label == 54 or label == 55 or label == 57 or label == 58 or label == 62')
df_sorted_low10 = df_low10.sort_values(['label', 'length'], ascending =[True, True])
# Extract the 10 strongest frequencies per file: one row per file, columns 0..9.
# n_top must be passed explicitly - with the default (n_top=1) this cell would just
# reproduce the top-1 result under a "top 10" name.
df_freqs_low10 = df_sorted_low10['path'].progress_apply(lambda x: pd.Series(extract_peak_frequency(x, n_top=10)))
# Reshape to long format (one row per file and peak) so that every extracted peak ends
# up in the single 'top_frequency' column that the boxplot reads. to_numpy().ravel()
# walks the frame row by row, which matches repeating each file's index once per column.
df_freqs_low10_long = pd.Series(
    df_freqs_low10.to_numpy().ravel(),
    index=df_freqs_low10.index.repeat(df_freqs_low10.shape[1]),
    name='top_frequency',
)
# Join on the file index: every metadata row is repeated once per peak frequency.
df_metadata_low10 = df_sorted_low10.join(df_freqs_low10_long)

In [13]:
# top 50 frequencies (low classes)
df_low50 = df.query('label == 6 or label == 7 or label == 13 or label == 17 or label == 28 or label == 30 or label == 45 or label == 46 or label == 47 or label == 49 or label == 54 or label == 55 or label == 57 or label == 58 or label == 62')
df_sorted_low50 = df_low50.sort_values(['label', 'length'], ascending =[True, True])
# Extract the 50 strongest frequencies per file: one row per file, columns 0..49.
# n_top must be passed explicitly - with the default (n_top=1) this cell would just
# reproduce the top-1 result under a "top 50" name.
df_freqs_low50 = df_sorted_low50['path'].progress_apply(lambda x: pd.Series(extract_peak_frequency(x, n_top=50)))
# Reshape to long format (one row per file and peak) so that every extracted peak ends
# up in the single 'top_frequency' column that the boxplot reads. to_numpy().ravel()
# walks the frame row by row, which matches repeating each file's index once per column.
df_freqs_low50_long = pd.Series(
    df_freqs_low50.to_numpy().ravel(),
    index=df_freqs_low50.index.repeat(df_freqs_low50.shape[1]),
    name='top_frequency',
)
# Join on the file index: every metadata row is repeated once per peak frequency.
df_metadata_low50 = df_sorted_low50.join(df_freqs_low50_long)

In [79]:
# Boxplot of df_metadata_low1['top_frequency'] per species for the "low" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_low1, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 1 frequency (low classes)', fontsize=20)
plt.show()

In [78]:
# Boxplot of df_metadata_low10['top_frequency'] per species for the "low" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_low10, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 10 frequency (low classes)', fontsize=20)
plt.show()

In [83]:
# Boxplot of df_metadata_low50['top_frequency'] per species for the "low" classes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='species and label', y="top_frequency", data=df_metadata_low50, ax=ax)
plt.xticks(rotation=90)   # long category names are printed vertically
ax.set_title('Top 50 frequency (low classes)', fontsize=20)
plt.show()

In [26]:
# bad classes: file counts per species, split by subset (train / validation) and by
# clip-length bucket. The buckets are half-open and disjoint: [0, 5), [5, 10), [10, 15).
# Every count* variable is the Series returned by DataFrame.count() (non-null entries
# per column); later cells read its 'length' entry.

# species 6
df_6 = df_low1.query('label == 6')
df_train_6 = df_low1.query('subset == "train" & label == 6')
df_val_6 = df_low1.query('subset == "validation" & label == 6')
count6_train = df_train_6.count()
count6_val = df_val_6.count()
# files in [0, 5) sec.
df_train_6_5 = df_train_6.query('length < 5')
df_val_6_5 = df_val_6.query('length < 5')
count6_train_5 = df_train_6_5.count()
count6_val_5 = df_val_6_5.count()
# files in [5, 10) sec.
df_train_6_10 = df_train_6.query('length >= 5 & length < 10')
df_val_6_10 = df_val_6.query('length >= 5 & length < 10')
count6_train_10 = df_train_6_10.count()
count6_val_10 = df_val_6_10.count()
# files in [10, 15) sec.
df_train_6_15 = df_train_6.query('length >= 10 & length < 15')
df_val_6_15 = df_val_6.query('length >= 10 & length < 15')
count6_train_15 = df_train_6_15.count()
count6_val_15 = df_val_6_15.count()

# species 7
df_7 = df_low1.query('label == 7')
df_train_7 = df_low1.query('subset == "train" & label == 7')
df_val_7 = df_low1.query('subset == "validation" & label == 7')
count7_train = df_train_7.count()
count7_val = df_val_7.count()
# files in [0, 5) sec.
df_train_7_5 = df_train_7.query('length < 5')
df_val_7_5 = df_val_7.query('length < 5')
count7_train_5 = df_train_7_5.count()
count7_val_5 = df_val_7_5.count()
# files in [5, 10) sec.
df_train_7_10 = df_train_7.query('length >= 5 & length < 10')
df_val_7_10 = df_val_7.query('length >= 5 & length < 10')
count7_train_10 = df_train_7_10.count()
count7_val_10 = df_val_7_10.count()
# files in [10, 15) sec.
df_train_7_15 = df_train_7.query('length >= 10 & length < 15')
df_val_7_15 = df_val_7.query('length >= 10 & length < 15')
count7_train_15 = df_train_7_15.count()
count7_val_15 = df_val_7_15.count()

# species 58
df_58 = df_low1.query('label == 58')
df_train_58 = df_low1.query('subset == "train" & label == 58')
df_val_58 = df_low1.query('subset == "validation" & label == 58')
count58_train = df_train_58.count()
count58_val = df_val_58.count()
# files in [0, 5) sec.
df_train_58_5 = df_train_58.query('length < 5')
df_val_58_5 = df_val_58.query('length < 5')
count58_train_5 = df_train_58_5.count()
count58_val_5 = df_val_58_5.count()
# files in [5, 10) sec.
df_train_58_10 = df_train_58.query('length >= 5 & length < 10')
df_val_58_10 = df_val_58.query('length >= 5 & length < 10')
count58_train_10 = df_train_58_10.count()
count58_val_10 = df_val_58_10.count()
# files in [10, 15) sec.
df_train_58_15 = df_train_58.query('length >= 10 & length < 15')
df_val_58_15 = df_val_58.query('length >= 10 & length < 15')
count58_train_15 = df_train_58_15.count()
count58_val_15 = df_val_58_15.count()

# species 45
df_45 = df_low1.query('label == 45')
df_train_45 = df_low1.query('subset == "train" & label == 45')
df_val_45 = df_low1.query('subset == "validation" & label == 45')
count45_train = df_train_45.count()
count45_val = df_val_45.count()
# files in [0, 5) sec.
df_train_45_5 = df_train_45.query('length < 5')
df_val_45_5 = df_val_45.query('length < 5')
count45_train_5 = df_train_45_5.count()
count45_val_5 = df_val_45_5.count()
# files in [5, 10) sec.
df_train_45_10 = df_train_45.query('length >= 5 & length < 10')
df_val_45_10 = df_val_45.query('length >= 5 & length < 10')
count45_train_10 = df_train_45_10.count()
count45_val_10 = df_val_45_10.count()
# files in [10, 15) sec.
df_train_45_15 = df_train_45.query('length >= 10 & length < 15')
df_val_45_15 = df_val_45.query('length >= 10 & length < 15')
count45_train_15 = df_train_45_15.count()
count45_val_15 = df_val_45_15.count()

# species 57
df_57 = df_low1.query('label == 57')
df_train_57 = df_low1.query('subset == "train" & label == 57')
df_val_57 = df_low1.query('subset == "validation" & label == 57')
count57_train = df_train_57.count()
count57_val = df_val_57.count()
# files in [0, 5) sec.
df_train_57_5 = df_train_57.query('length < 5')
df_val_57_5 = df_val_57.query('length < 5')
count57_train_5 = df_train_57_5.count()
count57_val_5 = df_val_57_5.count()
# files in [5, 10) sec.
df_train_57_10 = df_train_57.query('length >= 5 & length < 10')
df_val_57_10 = df_val_57.query('length >= 5 & length < 10')
count57_train_10 = df_train_57_10.count()
count57_val_10 = df_val_57_10.count()
# files in [10, 15) sec.
df_train_57_15 = df_train_57.query('length >= 10 & length < 15')
df_val_57_15 = df_val_57.query('length >= 10 & length < 15')
count57_train_15 = df_train_57_15.count()
count57_val_15 = df_val_57_15.count()

In [10]:
# good classes: file counts per species, split by subset (train / validation) and by
# clip-length bucket. The buckets are half-open and disjoint: [0, 5), [5, 10), [10, 15).
# Every count* variable is the Series returned by DataFrame.count() (non-null entries
# per column); later cells read its 'length' entry.

# species 48
df_train_48 = df_top1.query('subset == "train" & label == 48')
df_val_48 = df_top1.query('subset == "validation" & label == 48')
count48_train = df_train_48.count()
count48_val = df_val_48.count()
# files in [0, 5) sec.
df_train_48_5 = df_train_48.query('length < 5')
df_val_48_5 = df_val_48.query('length < 5')
count48_train_5 = df_train_48_5.count()
count48_val_5 = df_val_48_5.count()
# files in [5, 10) sec.
df_train_48_10 = df_train_48.query('length >= 5 & length < 10')
df_val_48_10 = df_val_48.query('length >= 5 & length < 10')
count48_train_10 = df_train_48_10.count()
count48_val_10 = df_val_48_10.count()
# files in [10, 15) sec.
df_train_48_15 = df_train_48.query('length >= 10 & length < 15')
df_val_48_15 = df_val_48.query('length >= 10 & length < 15')
count48_train_15 = df_train_48_15.count()
count48_val_15 = df_val_48_15.count()

# species 15
df_train_15 = df_top1.query('subset == "train" & label == 15')
df_val_15 = df_top1.query('subset == "validation" & label == 15')
count15_train = df_train_15.count()
count15_val = df_val_15.count()
# files in [0, 5) sec.
df_train_15_5 = df_train_15.query('length < 5')
df_val_15_5 = df_val_15.query('length < 5')
count15_train_5 = df_train_15_5.count()
count15_val_5 = df_val_15_5.count()
# files in [5, 10) sec.
df_train_15_10 = df_train_15.query('length >= 5 & length < 10')
df_val_15_10 = df_val_15.query('length >= 5 & length < 10')
count15_train_10 = df_train_15_10.count()
count15_val_10 = df_val_15_10.count()
# files in [10, 15) sec.
df_train_15_15 = df_train_15.query('length >= 10 & length < 15')
df_val_15_15 = df_val_15.query('length >= 10 & length < 15')
count15_train_15 = df_train_15_15.count()
count15_val_15 = df_val_15_15.count()

# species 18
df_train_18 = df_top1.query('subset == "train" & label == 18')
df_val_18 = df_top1.query('subset == "validation" & label == 18')
count18_train = df_train_18.count()
count18_val = df_val_18.count()
# files in [0, 5) sec.
df_train_18_5 = df_train_18.query('length < 5')
df_val_18_5 = df_val_18.query('length < 5')
count18_train_5 = df_train_18_5.count()
count18_val_5 = df_val_18_5.count()
# files in [5, 10) sec.
df_train_18_10 = df_train_18.query('length >= 5 & length < 10')
df_val_18_10 = df_val_18.query('length >= 5 & length < 10')
count18_train_10 = df_train_18_10.count()
count18_val_10 = df_val_18_10.count()
# files in [10, 15) sec.
df_train_18_15 = df_train_18.query('length >= 10 & length < 15')
df_val_18_15 = df_val_18.query('length >= 10 & length < 15')
count18_train_15 = df_train_18_15.count()
count18_val_15 = df_val_18_15.count()

# species 20
df_train_20 = df_top1.query('subset == "train" & label == 20')
df_val_20 = df_top1.query('subset == "validation" & label == 20')
count20_train = df_train_20.count()
count20_val = df_val_20.count()
# files in [0, 5) sec.
df_train_20_5 = df_train_20.query('length < 5')
df_val_20_5 = df_val_20.query('length < 5')
count20_train_5 = df_train_20_5.count()
count20_val_5 = df_val_20_5.count()
# files in [5, 10) sec.
df_train_20_10 = df_train_20.query('length >= 5 & length < 10')
df_val_20_10 = df_val_20.query('length >= 5 & length < 10')
count20_train_10 = df_train_20_10.count()
count20_val_10 = df_val_20_10.count()
# files in [10, 15) sec.
df_train_20_15 = df_train_20.query('length >= 10 & length < 15')
df_val_20_15 = df_val_20.query('length >= 10 & length < 15')
count20_train_15 = df_train_20_15.count()
count20_val_15 = df_val_20_15.count()

# species 64
df_train_64 = df_top1.query('subset == "train" & label == 64')
df_val_64 = df_top1.query('subset == "validation" & label == 64')
count64_train = df_train_64.count()
count64_val = df_val_64.count()
# files in [0, 5) sec.
df_train_64_5 = df_train_64.query('length < 5')
df_val_64_5 = df_val_64.query('length < 5')
count64_train_5 = df_train_64_5.count()
count64_val_5 = df_val_64_5.count()
# files in [5, 10) sec.
df_train_64_10 = df_train_64.query('length >= 5 & length < 10')
df_val_64_10 = df_val_64.query('length >= 5 & length < 10')
count64_train_10 = df_train_64_10.count()
count64_val_10 = df_val_64_10.count()
# files in [10, 15) sec.
df_train_64_15 = df_train_64.query('length >= 10 & length < 15')
df_val_64_15 = df_val_64.query('length >= 10 & length < 15')
count64_train_15 = df_train_64_15.count()
count64_val_15 = df_val_64_15.count()

In [11]:
# Number of files per species (train + validation).
total_6 = count6_train['length']+count6_val['length']
total_7 = count7_train['length']+count7_val['length']
total_58 = count58_train['length']+count58_val['length']
total_45 = count45_train['length']+count45_val['length']
total_57 = count57_train['length']+count57_val['length']

# Percent of all files (train + validation) that are >= 15 sec.
# The total covers both subsets, so the short-file buckets of BOTH subsets have to be
# removed from it; subtracting only the validation buckets would count every short
# train file as ">= 15 sec".
# Assumes the *_5 / *_10 / *_15 counts are the disjoint [0,5), [5,10), [10,15) buckets -
# verify against the cell that builds them.
# Each percent_* stays a Series (scalar minus count Series); read its 'length' entry.
percent_6 = round(((total_6
                    - count6_train_5 - count6_train_10 - count6_train_15
                    - count6_val_5 - count6_val_10 - count6_val_15) / total_6) * 100 , 2)
percent_7 = round(((total_7
                    - count7_train_5 - count7_train_10 - count7_train_15
                    - count7_val_5 - count7_val_10 - count7_val_15) / total_7) * 100 , 2)
percent_58 = round(((total_58
                     - count58_train_5 - count58_train_10 - count58_train_15
                     - count58_val_5 - count58_val_10 - count58_val_15) / total_58) * 100 , 2)
percent_45 = round(((total_45
                     - count45_train_5 - count45_train_10 - count45_train_15
                     - count45_val_5 - count45_val_10 - count45_val_15) / total_45) * 100 , 2)
percent_57 = round(((total_57
                     - count57_train_5 - count57_train_10 - count57_train_15
                     - count57_val_5 - count57_val_10 - count57_val_15) / total_57) * 100 , 2)

In [12]:
# Number of files per species (train + validation).
total_48 = count48_train['length']+count48_val['length']
total_15 = count15_train['length']+count15_val['length']
total_18 = count18_train['length']+count18_val['length']
total_20 = count20_train['length']+count20_val['length']
total_64 = count64_train['length']+count64_val['length']

# Percent of all files (train + validation) that are >= 15 sec.
# The total covers both subsets, so the short-file buckets of BOTH subsets have to be
# removed from it; subtracting only the validation buckets would count every short
# train file as ">= 15 sec".
# Assumes the *_5 / *_10 / *_15 counts are the disjoint [0,5), [5,10), [10,15) buckets -
# verify against the cell that builds them.
# Each percent_* stays a Series (scalar minus count Series); read its 'length' entry.
percent_48 = round(((total_48
                     - count48_train_5 - count48_train_10 - count48_train_15
                     - count48_val_5 - count48_val_10 - count48_val_15) / total_48) * 100 , 2)
percent_15 = round(((total_15
                     - count15_train_5 - count15_train_10 - count15_train_15
                     - count15_val_5 - count15_val_10 - count15_val_15) / total_15) * 100 , 2)
percent_18 = round(((total_18
                     - count18_train_5 - count18_train_10 - count18_train_15
                     - count18_val_5 - count18_val_10 - count18_val_15) / total_18) * 100 , 2)
percent_20 = round(((total_20
                     - count20_train_5 - count20_train_10 - count20_train_15
                     - count20_val_5 - count20_val_10 - count20_val_15) / total_20) * 100 , 2)
percent_64 = round(((total_64
                     - count64_train_5 - count64_train_10 - count64_train_15
                     - count64_val_5 - count64_val_10 - count64_val_15) / total_64) * 100 , 2)

In [13]:
# NOTE(review): this cell recomputes the same totals and percentages for species
# 6, 7, 58, 45 and 57 as an earlier cell of this notebook; one of the two can be removed.
# Number of files per species (train + validation).
total_6 = count6_train['length']+count6_val['length']
total_7 = count7_train['length']+count7_val['length']
total_58 = count58_train['length']+count58_val['length']
total_45 = count45_train['length']+count45_val['length']
total_57 = count57_train['length']+count57_val['length']

# Percent of all files (train + validation) that are >= 15 sec.
# The total covers both subsets, so the short-file buckets of BOTH subsets have to be
# removed from it; subtracting only the validation buckets would count every short
# train file as ">= 15 sec".
# Assumes the *_5 / *_10 / *_15 counts are the disjoint [0,5), [5,10), [10,15) buckets -
# verify against the cell that builds them.
# Each percent_* stays a Series (scalar minus count Series); read its 'length' entry.
percent_6 = round(((total_6
                    - count6_train_5 - count6_train_10 - count6_train_15
                    - count6_val_5 - count6_val_10 - count6_val_15) / total_6) * 100 , 2)
percent_7 = round(((total_7
                    - count7_train_5 - count7_train_10 - count7_train_15
                    - count7_val_5 - count7_val_10 - count7_val_15) / total_7) * 100 , 2)
percent_58 = round(((total_58
                     - count58_train_5 - count58_train_10 - count58_train_15
                     - count58_val_5 - count58_val_10 - count58_val_15) / total_58) * 100 , 2)
percent_45 = round(((total_45
                     - count45_train_5 - count45_train_10 - count45_train_15
                     - count45_val_5 - count45_val_10 - count45_val_15) / total_45) * 100 , 2)
percent_57 = round(((total_57
                     - count57_train_5 - count57_train_10 - count57_train_15
                     - count57_val_5 - count57_val_10 - count57_val_15) / total_57) * 100 , 2)

In [41]:
# Length report for the "bad" classes, one five-line entry per species.
# Each row holds the two label strings whose spacing differs between species, followed by
# the train / validation counts, the total, the three validation bucket counts and the
# percentage Series.
bad_class_report = (
    ("species 6: ", " validation =", count6_train, count6_val, total_6,
     count6_val_5, count6_val_10, count6_val_15, percent_6),
    ("species 7: ", "  validation =", count7_train, count7_val, total_7,
     count7_val_5, count7_val_10, count7_val_15, percent_7),
    ("species 58:", " validation =", count58_train, count58_val, total_58,
     count58_val_5, count58_val_10, count58_val_15, percent_58),
    ("species 45:", " validation =", count45_train, count45_val, total_45,
     count45_val_5, count45_val_10, count45_val_15, percent_45),
    ("species 57:", " validation =", count57_train, count57_val, total_57,
     count57_val_5, count57_val_10, count57_val_15, percent_57),
)
for (species_text, validation_text, n_train, n_val, n_total,
     val_lt5, val_5to10, val_10to15, pct_ge15) in bad_class_report:
    print(species_text, "train =", n_train['length'], validation_text, n_val['length'], "(", n_total, ")")
    # the three bucket lines show the counts of the validation subset
    print("            # files [0,5) sec.:  ", val_lt5['length'])
    print("            # files [5,10) sec.: ", val_5to10['length'])
    print("            # files [10,15) sec.:", val_10to15['length'])
    print("            # files >= 15 sec.:  ", pct_ge15['length'], "%")

In [62]:
# Length report for the "good" classes, one five-line entry per species.
# Each row holds the two label strings whose spacing differs between species, followed by
# the train / validation counts, the total, the three validation bucket counts and the
# percentage Series.
good_class_report = (
    ("species 48: ", " validation =", count48_train, count48_val, total_48,
     count48_val_5, count48_val_10, count48_val_15, percent_48),
    ("species 15: ", "  validation =", count15_train, count15_val, total_15,
     count15_val_5, count15_val_10, count15_val_15, percent_15),
    ("species 18:", " validation =", count18_train, count18_val, total_18,
     count18_val_5, count18_val_10, count18_val_15, percent_18),
    ("species 20:", " validation =", count20_train, count20_val, total_20,
     count20_val_5, count20_val_10, count20_val_15, percent_20),
    ("species 64:", " validation =", count64_train, count64_val, total_64,
     count64_val_5, count64_val_10, count64_val_15, percent_64),
)
for (species_text, validation_text, n_train, n_val, n_total,
     val_lt5, val_5to10, val_10to15, pct_ge15) in good_class_report:
    print(species_text, "train =", n_train['length'], validation_text, n_val['length'], "(", n_total, ")")
    # the three bucket lines show the counts of the validation subset
    print("            # files [0,5) sec.:  ", val_lt5['length'])
    print("            # files [5,10) sec.: ", val_5to10['length'])
    print("            # files [10,15) sec.:", val_10to15['length'])
    print("            # files >= 15 sec.:  ", pct_ge15['length'], "%")

In [21]:
# Clip lengths of species 28: all files, train files only and validation files only.
df_28 = df.query('label == 28')
df_train_28 = df.query('subset == "train" & label == 28')
df_val_28 = df.query('subset == "validation" & label == 28')
# the same three frames, ordered from the shortest to the longest clip
df_sort_28, df_sort_train_28, df_sort_val_28 = (
    frame.sort_values(by='length', ascending=True)
    for frame in (df_28, df_train_28, df_val_28)
)

In [22]:
# Sorted clip lengths of species 28; x is simply the rank of the file within its group.
plt.title('length files species 28', fontsize = 10)
length_curves = (
    (df_sort_28, ('--',), "black"),        # all files, dashed
    (df_sort_train_28, (), "orange"),      # train subset
    (df_sort_val_28, (), "cyan"),          # validation subset
)
for sorted_frame, line_format, line_colour in length_curves:
    plt.plot(np.arange(len(sorted_frame)), sorted_frame['length'], *line_format, color=line_colour)
plt.legend(["total", "train", "validation"])
plt.show()

In [79]:
# bad classes: one metadata frame per label
df_13, df_17, df_30, df_45, df_47, df_49, df_54 = (
    df.query(f'label == {label_id}')
    for label_id in (13, 17, 30, 45, 47, 49, 54)
)

In [80]:
# good classes: one metadata frame per label
df_15, df_18, df_20, df_22, df_31, df_48, df_64 = (
    df.query(f'label == {label_id}')
    for label_id in (15, 18, 20, 22, 31, 48, 64)
)

In [81]:
# Shortest and longest clip per bad class, stored as [minimum, maximum]
# (note the order: despite the variable name, the minimum comes first).
max_min_13 = [df_13['length'].min(), df_13['length'].max()]
max_min_17 = [df_17['length'].min(), df_17['length'].max()]
max_min_45 = [df_45['length'].min(), df_45['length'].max()]
max_min_47 = [df_47['length'].min(), df_47['length'].max()]
max_min_49 = [df_49['length'].min(), df_49['length'].max()]
max_min_54 = [df_54['length'].min(), df_54['length'].max()]

In [82]:
# Shortest and longest clip per good class, stored as [minimum, maximum]
# (note the order: despite the variable name, the minimum comes first).
max_min_15 = [df_15['length'].min(), df_15['length'].max()]
max_min_18 = [df_18['length'].min(), df_18['length'].max()]
max_min_20 = [df_20['length'].min(), df_20['length'].max()]
max_min_22 = [df_22['length'].min(), df_22['length'].max()]
max_min_48 = [df_48['length'].min(), df_48['length'].max()]
max_min_64 = [df_64['length'].min(), df_64['length'].max()]

In [87]:
# One point per bad class: x = shortest clip, y = longest clip.
plt.title('Bad classes', fontsize = 10)
# each max_min_* list is [minimum, maximum]; the plotting order matches the legend order
for shortest_clip, longest_clip in (
    max_min_13,
    max_min_17,
    max_min_45,
    max_min_47,
    max_min_49,
    max_min_54,
):
    plt.scatter(x=shortest_clip, y=longest_clip)
plt.xlabel("Minimum length")
plt.ylabel("Maximum length")
plt.legend(["species 13", "species 17", "species 45", "species 47", "species 49", "species 54"])
plt.show()

In [88]:
# One point per good class: x = shortest clip, y = longest clip.
plt.title('Good classes', fontsize = 10)
# each max_min_* list is [minimum, maximum]; the plotting order matches the legend order
for shortest_clip, longest_clip in (
    max_min_15,
    max_min_18,
    max_min_20,
    max_min_22,
    max_min_48,
    max_min_64,
):
    plt.scatter(x=shortest_clip, y=longest_clip)
plt.xlabel("Minimum length")
plt.ylabel("Maximum length")
plt.legend(["species 15", "species 18", "species 20", "species 22", "species 48", "species 64"])
plt.show()

In [3]:
# Validation results of run maxmin_6_v1_cw.
df1cw = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v1_cw/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df1cw = df1cw.drop(index=df1cw.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df1cw = df1cw.loc[:, ['f1-score']].mean()

In [4]:
# Validation results of run maxmin_6_v2_cw.
df2cw = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v2_cw/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2cw = df2cw.drop(index=df2cw.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2cw = df2cw.loc[:, ['f1-score']].mean()

In [5]:
# Validation results of run maxmin_6_v3_cw.
df3cw = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v3_cw/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df3cw = df3cw.drop(index=df3cw.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df3cw = df3cw.loc[:, ['f1-score']].mean()

In [6]:
# Validation results of run maxmin_6_v2_cw_LR.
df2lrcw = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v2_cw_LR/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2lrcw = df2lrcw.drop(index=df2lrcw.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2lrcw = df2lrcw.loc[:, ['f1-score']].mean()

In [7]:
# Validation results of run maxmin_6_v2_LR.
df2lr = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v2_LR/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2lr = df2lr.drop(index=df2lr.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2lr = df2lr.loc[:, ['f1-score']].mean()

In [8]:
# Validation results of run maxmin_6_v2.
df2 = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v2/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2 = df2.drop(index=df2.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2 = df2.loc[:, ['f1-score']].mean()

In [9]:
# Validation results of run maxmin_6_2_v2_cw.
df2cw_2 = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_2_v2_cw/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2cw_2 = df2cw_2.drop(index=df2cw_2.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2cw_2 = df2cw_2.loc[:, ['f1-score']].mean()

In [10]:
# Validation results of run maxmin_6_v2_cw_LR_val.
df2cwlr_val = pd.read_csv('notebooks/Raffaela/effnet_baseline/maxmin_6_v2_cw_LR_val/val_evaluation.csv')
# Remove the last three rows before averaging - presumably the summary rows of the
# evaluation report rather than per-class rows; confirm against the CSV.
df2cwlr_val = df2cwlr_val.drop(index=df2cwlr_val.index[-3:])
# mean of the 'f1-score' column, kept as a one-entry Series
f1_df2cwlr_val = df2cwlr_val.loc[:, ['f1-score']].mean()

In [12]:
# Print the mean F1 score of every evaluated run, one run per print call.
f1_overview = (
    ("f1_df1cw =", f1_df1cw),
    ("f1_df2cw =", f1_df2cw),
    ("f1_df3cw =", f1_df3cw),
    ("f1_df2lrcw =", f1_df2lrcw),
    ("f1_df2lr =", f1_df2lr),
    ("f1_df2 =", f1_df2),
    ("f1_df2cw_2 =", f1_df2cw_2),
    ("f1_df2cwlr_val=", f1_df2cwlr_val),
)
for run_caption, run_f1 in f1_overview:
    print(run_caption, run_f1)