# Data Exploration

In [2]:
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science

from IPython.display import Audio # for listening to our insects
import IPython
from scipy.fft import fft # function to calculate Fast Fourier Transform

import matplotlib.pyplot as plt  # allows creation of insightful plots
import seaborn as sns # another library to make even more beautiful plots

sys.path.append('../../src') # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
# enable rendering plots under the code cell that created it
%matplotlib inline

from eda_utils import show_sampling, signal_generator, plot_random_spec, plot_spec, plot_waveform # functions to create plots for and from audio data
from gdsc_utils import download_directory, PROJECT_DIR # function to download GDSC data from S3 bucket and our root directory
from config import DEFAULT_BUCKET  # S3 bucket with the GDSC data

os.chdir(PROJECT_DIR) # changing our directory to root

In [3]:
# Download the 'data/' directory from the S3 bucket that holds the GDSC data (DEFAULT_BUCKET).
# NOTE(review): the meaning of the second argument (None) is not visible in this notebook — confirm against gdsc_utils.
download_directory('data/', None, DEFAULT_BUCKET)

In [4]:
# Load the metadata table shipped with the downloaded data.
metadata = pd.read_csv('data/metadata.csv')

# Human-readable class name of the form "<species> (<label>)", used for printing and as plot tick labels.
# Built with vectorised string concatenation instead of a row-wise apply();
# astype(str) keeps the f-string behaviour of rendering a missing value as "nan".
metadata['species and label'] = (
    metadata['species'].astype(str) + ' (' + metadata['label'].astype(str) + ')'
)
metadata.head()

In [5]:
# Total number of rows in the metadata table
metadata.shape[0]

### Split into training and validation subsets, sorted by length (descending)

In [6]:
# Rows of the training subset, ordered by the `length` column from largest to smallest.
train_metadata = (
    metadata
    .loc[metadata["subset"].eq("train")]
    .sort_values(by="length", ascending=False)
)
train_metadata

In [7]:
# Rows of the validation subset, ordered by the `length` column from largest to smallest.
val_metadata = (
    metadata
    .loc[metadata["subset"].eq("validation")]
    .sort_values(by="length", ascending=False)
)
val_metadata

### Filter files by length (training: > 40s, validation: < 10s) and sort by label

In [8]:
# Keep the training rows whose `length` exceeds 40, ordered by label.
# NOTE(review): the `_gr_20` suffix suggests a threshold of 20 but the filter uses 40;
# the name is kept because later cells refer to it.
train_metadata_gr_20 = (
    train_metadata
    .loc[lambda df: df["length"].gt(40)]
    .sort_values(by="label")
)
train_metadata_gr_20

In [9]:
# Per-class file counts for the filtered training set: one row per
# (label, 'species and label') pair, with `count` = number of non-null `species` entries.
# (Only counts are computed here — no total or average recording length.)
train_metadata_gr_20_stats = (
    train_metadata_gr_20
    .groupby(['label', 'species and label'])['species']
    .count()
    .reset_index(name='count')
)

# Preview the first rows of the per-class counts
train_metadata_gr_20_stats.head()

In [10]:
# Order the rows by label so the bars appear in label order.
train_metadata_gr_20_stats = train_metadata_gr_20_stats.sort_values(by='label')

# One bar per class: x = "<species> (<label>)", height = number of files.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=train_metadata_gr_20_stats,
    x='species and label',
    y='count',
    color='royalblue',
    ax=ax,
)
ax.set_title('Number of files per species', fontsize=20)
plt.xticks(rotation=90)  # class names are long; rotate them so they do not overlap
plt.show()

In [11]:
# Keep the validation rows whose `length` is below 10, ordered by label.
# NOTE(review): despite the `_gr_20` suffix this keeps SHORT files (< 10) — confirm that the
# threshold and direction are intended; the name is kept because later cells refer to it.
val_metadata_gr_20 = (
    val_metadata
    .loc[lambda df: df["length"].lt(10)]
    .sort_values(by="label")
)
val_metadata_gr_20

In [12]:
# Per-class file counts for the filtered validation set: one row per
# (label, 'species and label') pair, with `count` = number of non-null `species` entries.
# (Only counts are computed here — no total or average recording length.)
val_metadata_gr_20_stats = (
    val_metadata_gr_20
    .groupby(['label', 'species and label'])['species']
    .count()
    .reset_index(name='count')
)

# Preview the first rows of the per-class counts
val_metadata_gr_20_stats.head()

In [13]:
# Order the rows by label so the bars appear in label order.
val_metadata_gr_20_stats = val_metadata_gr_20_stats.sort_values(by='label')

# One bar per class: x = "<species> (<label>)", height = number of files.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=val_metadata_gr_20_stats,
    x='species and label',
    y='count',
    color='royalblue',
    ax=ax,
)
ax.set_title('Number of files per species', fontsize=20)
plt.xticks(rotation=90)  # class names are long; rotate them so they do not overlap
plt.show()

### Do a few plots

In [14]:
# File names and display labels of the filtered training files.
# Both are Series that keep the row index of train_metadata_gr_20, so they stay aligned.
train_paths = train_metadata_gr_20['file_name']
labels_explore_train = train_metadata_gr_20['species and label']

# Number of selected training files — placed last so the notebook actually displays it.
len(train_paths)

In [14]:
# File names and display labels of the filtered validation files.
# Both are Series that keep the row index of val_metadata_gr_20, so they stay aligned.
val_paths = val_metadata_gr_20['file_name']
labels_explore_val = val_metadata_gr_20['species and label']

# Number of selected validation files — placed last so the notebook actually displays it.
len(val_paths)

In [16]:
#train_paths=train_paths.set_index('file_name').to_dict()['species and label']

In [17]:
# Show the "<species> (<label>)" strings of the filtered validation files.
labels_explore_val

In [16]:
# train_paths is a Series that still carries the original metadata row index (filtered and
# re-sorted), so `train_paths[0]` would look up the row *labelled* 0 — which may not exist
# in this subset or may not be the first entry. Use positional access instead.
first_train_file = train_paths.iloc[0]
plot_waveform(f'data/train/{first_train_file}')
# NOTE(review): closing the figure right after plotting may keep it from being rendered
# inline unless plot_waveform shows it itself — confirm this is intended.
plt.close()

In [None]:
#for i in train_paths[:2]:
    #plot_waveform(f'data/train/{i}')

In [18]:
def explorations_train(path, labels=None):
    """Print the label, plot the spectrogram and embed an audio player for training files.

    Parameters
    ----------
    path : iterable of str
        File names relative to ``data/train/``. The singular name is kept for backward
        compatibility; the argument is a collection (list or pandas Series).
    labels : iterable of str, optional
        Labels to print, one per file, in the same order as ``path``. When omitted they
        are taken from the notebook-level ``labels_explore_train`` Series: aligned by
        index if ``path`` is a Series (so any slice gets its own labels), otherwise
        matched by position starting at the first label.

    Raises
    ------
    ValueError
        If fewer labels than file names are available.
    """
    if labels is None:
        if isinstance(path, pd.Series):
            # Index-aligned lookup; a plain `labels_explore_train[i]` would treat the
            # loop counter as an index label and hit the wrong row or raise KeyError.
            labels = labels_explore_train.loc[path.index]
        else:
            labels = labels_explore_train

    file_names = list(path)
    label_list = list(labels)
    if len(label_list) < len(file_names):
        raise ValueError(
            f"got {len(file_names)} file names but only {len(label_list)} labels"
        )

    for label, file_name in zip(label_list, file_names):
        print(label)
        plot_spec([f'data/train/{file_name}'])
        IPython.display.display(Audio(f'data/train/{file_name}'))

In [None]:
# Show the file names of the filtered training files.
train_paths

In [None]:
# Explore the first ten selected training files.
# The helper defined above is `explorations_train`; `explorations` does not exist and raised NameError.
explorations_train(train_paths.iloc[:10])

In [22]:
# Plain Python list of the selected validation file names
# (replaces the Series previously stored under the same name).
val_paths = val_metadata_gr_20.loc[:, 'file_name'].tolist()
len(val_paths)

In [17]:
# Show the file names of the filtered validation files.
val_paths

In [18]:
def explorations_val(path, labels=None):
    """Print the label, plot waveform and spectrogram, and embed an audio player for validation files.

    Parameters
    ----------
    path : iterable of str
        File names relative to ``data/val/``. The singular name is kept for backward
        compatibility; the argument is a collection (list or pandas Series).
    labels : iterable of str, optional
        Labels to print, one per file, in the same order as ``path``. When omitted they
        are taken from the notebook-level ``labels_explore_val`` Series: aligned by
        index if ``path`` is a Series (so any slice gets its own labels), otherwise
        matched by position starting at the first label.

    Raises
    ------
    ValueError
        If fewer labels than file names are available.
    """
    if labels is None:
        if isinstance(path, pd.Series):
            labels = labels_explore_val.loc[path.index]
        else:
            labels = labels_explore_val

    file_names = list(path)
    # Materialise the labels once instead of calling .to_list() on every iteration.
    label_list = list(labels)
    if len(label_list) < len(file_names):
        raise ValueError(
            f"got {len(file_names)} file names but only {len(label_list)} labels"
        )

    for label, file_name in zip(label_list, file_names):
        print(label)
        plot_waveform(f'data/val/{file_name}')
        plot_spec([f'data/val/{file_name}'])
        IPython.display.display(Audio(f'data/val/{file_name}'))

In [19]:
# Quick check on the first two selected validation files
explorations_val(val_paths[:2])

In [None]:
# Run the exploration for every selected validation file
# (prints a label, draws plots and embeds an audio player per file).
explorations_val(val_paths)