In [87]:
from desed_task.dataio import StronglyAnnotatedSet, WeakSet, UnlabeledSet
import pandas as pd
import numpy as np
import yaml

import sys
sys.path.append('../')
from encoder import ManyHotEncoder


In [88]:
# Load the experiment parameters (data paths, feature settings) from the
# project's YAML file into the `configs` dict used by the cells below.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open("../master_src/params.yaml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

In [89]:
# Read the three tab-separated annotation tables whose paths are listed
# under the "data" section of the configuration.
data_paths = configs["data"]
tsv_entries_strong = pd.read_csv(data_paths["strong_tsv"], sep="\t")
tsv_entries_weak = pd.read_csv(data_paths["weak_tsv"], sep="\t")
tsv_entries_unlabeled = pd.read_csv(data_paths["unlabeled_tsv"], sep="\t")

# Preview the first rows of each table, in the order they were loaded.
for annotation_table in (tsv_entries_strong, tsv_entries_weak, tsv_entries_unlabeled):
    print(annotation_table.head())

                         filename  onset  offset         event_label
0   Y--OMDPXfO6o_9.000_19.000.wav  0.000   9.785  Alarm_bell_ringing
1   Y--OMDPXfO6o_9.000_19.000.wav  1.556   9.415              Speech
2  Y--dr8rXrv8k_23.000_33.000.wav  1.667   2.657              Speech
3  Y--dr8rXrv8k_23.000_33.000.wav  0.000   0.541              Speech
4  Y--dr8rXrv8k_23.000_33.000.wav  2.849   3.480                 Cat
                         filename               event_labels
0  YKK227gPpRn4_30.000_40.000.wav  Alarm_bell_ringing,Speech
1   Y-6p32k2WUCs_0.000_10.000.wav         Alarm_bell_ringing
2    Y-Zjgm_sZd10_0.000_4.000.wav         Alarm_bell_ringing
3    Y-ikrSfqB0LU_0.000_3.000.wav         Alarm_bell_ringing
4   Y26ubekDhG_Y_0.000_10.000.wav         Alarm_bell_ringing
                         filename
0  Y--CE2f-ttEQ_30.000_40.000.wav
1  Y--ERHDSdxGQ_30.000_40.000.wav
2  Y-0A1_JR5f34_16.000_26.000.wav
3  Y-0BScjoz3Z0_10.000_20.000.wav
4  Y-0nSANmuPxU_10.000_20.000.wav


In [90]:
# Alphabetically sorted list of the distinct event classes that occur in
# the strong annotation table; this list is handed to the encoder below.
distinct_events = tsv_entries_strong["event_label"].unique()
labels = sorted(distinct_events.tolist())
labels

['Alarm_bell_ringing',
 'Blender',
 'Cat',
 'Dishes',
 'Dog',
 'Electric_shaver_toothbrush',
 'Frying',
 'Running_water',
 'Speech',
 'Vacuum_cleaner']

In [91]:
# Build the label encoder from the class list and the audio/feature
# settings stored in the configuration.
data_cfg = configs["data"]
feats_cfg = configs["feats"]
encoder = ManyHotEncoder(
    labels,
    audio_len=data_cfg["audio_max_len"],
    frame_len=feats_cfg["n_window"],
    frame_hop=feats_cfg["hop_length"],
    fs=feats_cfg["sample_rate"],
    net_pooling=data_cfg["net_subsample"],
)
# Number of frames the encoder reports for these settings.
encoder.n_frames

618

In [92]:
# Encode the strong annotations once and reuse the result for both the
# shape and the values (the original encoded the same table twice).
strong_encoding = encoder.encode_strong_df(tsv_entries_strong)
# NOTE(review): the full annotation table (every clip) is passed in a single
# call and the printed matrix is all ones -- presumably events from all files
# are merged into one frame grid. If a per-clip encoding was intended, filter
# the table to a single filename first. TODO confirm.
print(strong_encoding.shape, strong_encoding)

(618, 10) [[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]


In [93]:
# Dataset over the strongly annotated clips: audio folder from the config,
# annotations from the strong TSV table, targets produced by `encoder`.
strong_audio_dir = configs["data"]["strong_folder_44k"]
dataset_strong = StronglyAnnotatedSet(
    audio_folder=strong_audio_dir,
    tsv_entries=tsv_entries_strong,
    encoder=encoder,
)

In [94]:
# Shape of the second element (index 1) of the first strong dataset item.
first_strong_item = dataset_strong[0]
first_strong_item[1].shape

torch.Size([10, 618])

In [95]:
# Look up the metadata entry stored for the first key in the dataset's
# example list.
first_example_key = dataset_strong.examples_list[0]
dataset_strong.examples[first_example_key]

{'mixture': '/mnt/d/DESED_dataset/audio/train/strong_label_real/Y--OMDPXfO6o_9.000_19.000.wav',
 'events': [{'event_label': 'Alarm_bell_ringing',
   'onset': 0.0,
   'offset': 9.785},
  {'event_label': 'Speech', 'onset': 1.556, 'offset': 9.415}]}

In [96]:
# Dataset over the weakly labelled clips, sharing the same encoder.
# NOTE(review): the audio folder is a hard-coded absolute path, unlike the
# strong set which reads its folder from `configs` -- consider moving it
# into the params file.
dataset_weak = WeakSet(
    audio_folder='/mnt/d/DESED_dataset/audio/train/weak/',
    tsv_entries=tsv_entries_weak,
    encoder=encoder,
)

In [97]:
# Show the first weak dataset item next to the stored metadata of one
# specific clip, as a two-element tuple.
first_weak_item = dataset_weak[0]
weak_clip_entry = dataset_weak.examples['YKK227gPpRn4_30.000_40.000.wav']
(first_weak_item, weak_clip_entry)

([tensor([-0.0231, -0.0146, -0.0064,  ...,  0.0057,  0.0065,  0.0091]),
  tensor([[1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  [1.0]],
 {'mixture': '/mnt/d/DESED_dataset/audio/train/weak/YKK227gPpRn4_30.000_40.000.wav',
  'events': ['Alarm_bell_ringing', 'Speech']})

In [98]:
# Dataset over the unlabelled in-domain clips; only a folder and the
# encoder are supplied (no annotation table).
# NOTE(review): hard-coded absolute path -- consider reading it from
# `configs` like the strong folder.
dataset_unlabeled = UnlabeledSet(
    unlabeled_folder='/mnt/d/DESED_dataset/audio/train/unlabel_in_domain/',
    encoder=encoder,
)

In [99]:
# Fetch and display the first item of the unlabelled dataset.
first_unlabeled_item = dataset_unlabeled[0]
first_unlabeled_item

[tensor([-0.0507, -0.0507, -0.0507,  ..., -0.0370, -0.0303, -0.0239]),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 [1.0]]

# Mel spectrogram

In [100]:
from torchaudio.transforms import MelSpectrogram, Spectrogram
import torch

In [101]:
# Mel-spectrogram transform configured entirely from the "feats" section:
# the FFT size and window length both use `n_window`, a non-periodic
# Hamming window is requested, and power=1 / center=False are set.
mel_feats = configs["feats"]
mel_spec = MelSpectrogram(
    sample_rate=mel_feats["sample_rate"],
    n_fft=mel_feats["n_window"],
    win_length=mel_feats["n_window"],
    hop_length=mel_feats["hop_length"],
    f_min=mel_feats["f_min"],
    f_max=mel_feats["f_max"],
    n_mels=mel_feats["n_mels"],
    window_fn=torch.hamming_window,
    wkwargs={"periodic": False},
    power=1,
    center=False,
)

In [102]:
# Linear-frequency spectrogram using the same window size and hop as the
# mel transform, with power=1 and center=False.
# NOTE(review): unlike `mel_spec`, no `wkwargs={"periodic": False}` is passed
# here, so the Hamming window uses whatever `torch.hamming_window` defaults
# to -- confirm whether the two transforms are meant to share a window.
stft_feats = configs["feats"]
spectrogram = Spectrogram(
    n_fft=stft_feats["n_window"],
    win_length=stft_feats["n_window"],
    hop_length=stft_feats["hop_length"],
    window_fn=torch.hamming_window,
    pad=0,
    power=1,
    center=False,
)

In [103]:
# Shape of the first element (index 0) of the first strong dataset item.
strong_waveform = dataset_strong[0][0]
strong_waveform.shape

torch.Size([160000])

In [104]:
# Apply the mel transform to the first strong item's first element and
# show the shape of the result.
mel_input = dataset_strong[0][0]
mel_spec(mel_input).shape

torch.Size([128, 618])

In [105]:
# Compute the linear spectrogram of the first strong item once and keep it
# in `spec`; the shape is read from that result instead of loading the item
# and running the transform a second time.
spec = spectrogram(dataset_strong[0][0])
spec.shape

torch.Size([1025, 618])