In [2]:
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science

from numpy import save  

from IPython.display import Audio # for listening to our insects
import IPython
from scipy.fft import fft # function to calculate Fast Fourier Transform

import wave
from glob import glob
import librosa
import torchaudio
from tqdm.auto import tqdm          # library to display progress bar while doing apply on pandas dataframe

import matplotlib.pyplot as plt  # allows creation of insightful plots
import seaborn as sns # another library to make even more beautiful plots

sys.path.append('../../src') # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
# enable rendering plots under the code cell that created it
%matplotlib inline

from eda_utils import show_sampling, signal_generator, plot_random_spec, plot_spec, plot_waveform # functions to create plots for and from audio data
from gdsc_utils import download_directory, PROJECT_DIR # function to download GDSC data from S3 bucket and our root directory
from config import DEFAULT_BUCKET  # S3 bucket with the GDSC data

os.chdir(PROJECT_DIR) # changing our directory to root

In [3]:
# Fetch the 'data/' directory from the GDSC S3 bucket (DEFAULT_BUCKET) so the
# metadata CSVs read below exist locally.
# NOTE(review): the meaning of the second argument (None) is not visible in
# this notebook — confirm against gdsc_utils.download_directory.
download_directory('data/', None, DEFAULT_BUCKET)

In [3]:
# Metadata tables used throughout the notebook. Paths are relative to the
# working directory, which the setup cell switched to PROJECT_DIR.
ORIGINAL_METADATA_CSV = 'data/metadata.csv'
CROP_35_METADATA_CSV = 'data/production_data/Raffaela3-5s_crop/metadata.csv'

df = pd.read_csv(ORIGINAL_METADATA_CSV)        # original recordings
df_3_5 = pd.read_csv(CROP_35_METADATA_CSV)     # crops from the 'Raffaela3-5s_crop' set

In [4]:
# Readable class name of the form "<species> (<label>)". Built with vectorised
# string concatenation instead of a Python-level lambda applied to every row.
df['species and label'] = df['species'].astype(str) + ' (' + df['label'].astype(str) + ')'

# Original set, one row per class: summed 'length' and number of non-null
# 'species' entries.
df_stats = (
    df.groupby(['label', 'species and label'])
    .agg(length=('length', 'sum'), count=('species', 'count'))
    .reset_index()
)

# Crop set, one row per label: number of crop files carrying that label.
df_35_stats = (
    df_3_5.groupby(['label'])
    .agg(count=('label', 'count'))
    .reset_index()
)

In [5]:
# Turn the per-label crop counts into a per-label total by multiplying with
# `value`.
# presumably `value` is the duration of one crop in seconds — TODO confirm
value = 3.5

# The column is rebuilt from df_3_5 on every run instead of being scaled in
# place: with `*=` a second execution of this cell would multiply the already
# scaled numbers by 3.5 again.
crops_per_label = df_3_5['label'].value_counts()
df_35_stats['count'] = df_35_stats['label'].map(crops_per_label) * value

In [6]:
# Overlay the scaled per-label crop totals (cyan) with the summed file length
# of the original set (orange).
plt.figure(figsize=(20, 6))
plt.title('Total length of files per species', fontsize=20)
plt.xticks(rotation=90)

# Each series carries its own label. The previous positional legend list named
# the series in the wrong order (the first bars drawn are the crops, not the
# original set), so the colours were mislabelled.
# NOTE(review): the first series uses numeric labels on x and the second uses
# strings; they only line up if both frames are ordered by the same
# consecutive labels — confirm.
plt.bar(df_35_stats['label'], df_35_stats['count'], color='cyan', label='3.5s crops')
plt.bar(df_stats['species and label'], df_stats['length'], color='orange', label='original set')

plt.xlabel("Species and label")
plt.ylabel("length")
plt.legend()
plt.show()

In [7]:
# Count how many crop files belong to each class of the 3.5 s crop set.
df_35_sort = df_3_5.sort_values(by=['label'], ascending=True, ignore_index=True)

# np.bincount returns an array whose position i holds the number of rows with
# label i. It replaces a hand-written run-length loop that
#   * dropped the last class whenever that class had exactly one row, and
#   * went out of sync with the data as soon as one label value was missing.
# assumes labels are non-negative integers — TODO confirm
# A label with no rows gets a count of 0 here, which leads to a division by
# zero when the weights are computed from class_total.
class_total = np.bincount(df_35_sort['label'].astype(int).to_numpy())
amount = class_total.tolist()   # list form, kept for cells that still read it
cl = len(class_total) - 1       # highest label; `cl + 1` is the number of classes

class_sec = class_total * 3.5   # per-class count scaled by 3.5
total = len(df_3_5)             # total number of files

In [8]:
# Three candidate class-weighting schemes, all computed from the per-class file
# counts (class_total), the overall file count (total) and the class counter
# (cl) left behind by the counting cell.
n_classes = cl + 1

result1 = np.divide(total, class_total)               # total / count
result2 = 1 - np.divide(class_total, total)           # 1 - share of the class
result3 = np.divide(total, class_total * n_classes)   # total / (n_classes * count)

In [9]:
# Bar chart of the first weighting scheme (result1), one bar per class index.
# The x positions follow the length of the weight vector; a hard-coded 66 makes
# plt.bar fail as soon as the number of classes differs.
x = list(range(len(result1)))
plt.figure(figsize=(20, 6))
plt.title('Class weights (version1)', fontsize=20)
plt.xticks(rotation=90)
plt.bar(x, result1, color='cyan')
plt.xlabel("Label")
plt.ylabel("weight")  # the bars show weights, not lengths
plt.show()

In [10]:
# Bar chart of the second weighting scheme (result2), one bar per class index.
# The x positions follow the length of the weight vector; a hard-coded 66 makes
# plt.bar fail as soon as the number of classes differs.
x = list(range(len(result2)))
plt.figure(figsize=(20, 6))
plt.title('Class weights (version2)', fontsize=20)
plt.xticks(rotation=90)
plt.bar(x, result2, color='cyan')
plt.xlabel("Label")
plt.ylabel("weight")  # the bars show weights, not lengths
plt.show()

In [11]:
# Bar chart of the third weighting scheme (result3), one bar per class index.
# The x positions follow the length of the weight vector; a hard-coded 66 makes
# plt.bar fail as soon as the number of classes differs.
x = list(range(len(result3)))
plt.figure(figsize=(20, 6))
plt.title('Class weights (version3)', fontsize=20)
plt.xticks(rotation=90)
plt.bar(x, result3, color='cyan')
plt.xlabel("Label")
plt.ylabel("weight")  # the bars show weights, not lengths
plt.show()

In [12]:
# Write the three weight vectors computed for the 3.5 s crop set to .npy files.
for npy_path, weights in (
    ('notebooks/Raffaela/class_weights_1.npy', result1),
    ('notebooks/Raffaela/class_weights_2.npy', result2),
    ('notebooks/Raffaela/class_weights_3.npy', result3),
):
    np.save(npy_path, weights)

In [4]:
# Metadata of the crop set stored under '6s_crop'; path is relative to the
# working directory set in the setup cell.
CROP_6_METADATA_CSV = 'data/production_data/6s_crop/metadata.csv'
df_6 = pd.read_csv(CROP_6_METADATA_CSV)

In [5]:
# Count how many crop files belong to each class of the 6 s crop set.
df_6_sort = df_6.sort_values(by=['label'], ascending=True, ignore_index=True)

# np.bincount returns an array whose position i holds the number of rows with
# label i. It replaces a hand-written run-length loop that
#   * dropped the last class whenever that class had exactly one row, and
#   * went out of sync with the data as soon as one label value was missing.
# assumes labels are non-negative integers — TODO confirm
# A label with no rows gets a count of 0 here, which leads to a division by
# zero when the weights are computed from class_total.
class_total = np.bincount(df_6_sort['label'].astype(int).to_numpy())
amount = class_total.tolist()   # list form, kept for cells that still read it
cl = len(class_total) - 1       # highest label; `cl + 1` is the number of classes

class_sec = class_total * 6     # per-class count scaled by 6
total = len(df_6)               # total number of files

In [5]:
# Same three weighting schemes as for the 3.5 s set, recomputed from the
# per-class counts of the 6 s crop set.
n_classes = cl + 1

result1 = np.divide(total, class_total)               # total / count
result2 = 1 - np.divide(class_total, total)           # 1 - share of the class
result3 = np.divide(total, class_total * n_classes)   # total / (n_classes * count)

In [6]:
# Write the three weight vectors computed for the 6 s crop set to .npy files.
for npy_path, weights in (
    ('notebooks/Raffaela/class_weights6_1.npy', result1),
    ('notebooks/Raffaela/class_weights6_2.npy', result2),
    ('notebooks/Raffaela/class_weights6_3.npy', result3),
):
    np.save(npy_path, weights)