# Data Processing Notebook:

In [9]:
# Uncomment the lines below to install any missing dependencies.
# (%pip installs into the environment of the running kernel.)
# %pip install pydub
# %pip install librosa
# %pip install regex
# %pip install soundfile
# %pip install noisereduce

In [2]:
import os
import sys
sys.path.append('../')

from functions import processing_funcs, utils
from pydub.utils import mediainfo

## First, we need to get some files.

In [3]:
%%time
voice_dir = os.path.join('..', 'voice_data')
utils.make_dir(voice_dir)
utils.get_voice_data(voice_dir)

Creating directory at ../voice_data...

../voice_data already exists.

Downloading 31 files from https://media.talkbank.org/ca/CallFriend/eng-n/0wav/
12.9% done...
22.58% done...
32.26% done...
41.94% done...
51.61% done...
61.29% done...
70.97% done...
80.65% done...
90.32% done...
100.0% done...
All 31 downloaded to the ../voice_data directory
CPU times: user 10.7 s, sys: 8.77 s, total: 19.4 s
Wall time: 7min 4s


### Split stereo channels into mono:

In [4]:
%%time
mono_dir_path = os.path.join(voice_dir, 'mono_channels')
utils.make_dir(mono_dir_path)
processing_funcs.split_stereo_audio(voice_dir_path = '../voice_data/')

Creating directory at ../voice_data/mono_channels...

../voice_data/mono_channels already exists.

Collecting all files in ../voice_data/ matching regular expression [0-9]+\.wav.

Removing noise from the 31 files in ../voice_data/mono_channels
12.9% done...
22.58% done...
32.26% done...
41.94% done...
51.61% done...
61.29% done...
70.97% done...
80.65% done...
90.32% done...
100.0% done...
Splitting complete.

CPU times: user 12.8 s, sys: 903 ms, total: 13.7 s
Wall time: 18.6 s


### All of our stereo audio should now be split into mono channels:

In [5]:
# Collect the mono files produced by the split step: names made of digits
# followed by `_L` or `_R` and the `.wav` extension.
mono_file_pattern = r'[0-9]+\_(?:L|R)\.wav'
mono_dir = utils.read_dir_files(dir_path=mono_dir_path,
                                file_regex=mono_file_pattern)

# Report how many files matched, then list each one on its own line.
print(f'Total number of mono files: {len(mono_dir)} \n')

for mono_file in mono_dir:
    print(mono_file)

Collecting all files in ../voice_data/mono_channels matching regular expression [0-9]+\_(?:L|R)\.wav.

Total number of mono files: 62 

4175_L.wav
4175_R.wav
4504_L.wav
4504_R.wav
4708_L.wav
4708_R.wav
4745_L.wav
4745_R.wav
4823_L.wav
4823_R.wav
4874_L.wav
4874_R.wav
4889_L.wav
4889_R.wav
4984_L.wav
4984_R.wav
5000_L.wav
5000_R.wav
5051_L.wav
5051_R.wav
5220_L.wav
5220_R.wav
5635_L.wav
5635_R.wav
5926_L.wav
5926_R.wav
6015_L.wav
6015_R.wav
6062_L.wav
6062_R.wav
6065_L.wav
6065_R.wav
6093_L.wav
6093_R.wav
6126_L.wav
6126_R.wav
6157_L.wav
6157_R.wav
6193_L.wav
6193_R.wav
6239_L.wav
6239_R.wav
6255_L.wav
6255_R.wav
6278_L.wav
6278_R.wav
6372_L.wav
6372_R.wav
6379_L.wav
6379_R.wav
6476_L.wav
6476_R.wav
6862_L.wav
6862_R.wav
6869_L.wav
6869_R.wav
6899_L.wav
6899_R.wav
6938_L.wav
6938_R.wav
6952_L.wav
6952_R.wav


### We now have 62 mono files: 2 * 31 original files. This makes sense!

Now to check to make sure they look okay:

In [6]:
# Spot-check the metadata of the first few mono files: channel count,
# sampling rate, and duration.
PREVIEW_COUNT = 10  # how many files to report on; also used in the header line

print(f"Here is the file information for the first {PREVIEW_COUNT} files:")
for index, file in enumerate(mono_dir):
    # Stop once PREVIEW_COUNT files have been reported.
    if index >= PREVIEW_COUNT:
        break
    file_path = os.path.join(mono_dir_path, file)
    # `mediainfo` is used as a mapping here; only its 'channels' and
    # 'sample_rate' entries are read.
    file_info = mediainfo(file_path)
    print('File:', file_path, '|',
          'Number of Channels:', file_info['channels'], '|',
          'Sampling Rate:', file_info['sample_rate'], '|',
          'Duration (secs):', utils.get_file_duration(file_path))

Here is the file information for the first 10 files:
File: ../voice_data/mono_channels/4175_L.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 1800.000000
File: ../voice_data/mono_channels/4175_R.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 1800.000000
File: ../voice_data/mono_channels/4504_L.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 470.760000
File: ../voice_data/mono_channels/4504_R.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 470.760000
File: ../voice_data/mono_channels/4708_L.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 1800.000000
File: ../voice_data/mono_channels/4708_R.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 1800.000000
File: ../voice_data/mono_channels/4745_L.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (secs): 1425.450000
File: ../voice_data/mono_channels/4745_R.wav | Number of Channels: 1 | Sampling Rate: 8000 | Du

### Channels and duration look good, and the native sampling rate was preserved. Perfect.

### Now to eliminate noise as much as possible from the mono files:

We'll use the 'noisereduce' package and see how that affects our data:

Package information for `noisereduce` is available here:
https://pypi.org/project/noisereduce/

We'll be using the simplest implementation to start out with and we can try out different methods later on.

```bibtex
@software{tim_sainburg_2019_3243139,
  author       = {Tim Sainburg},
  title        = {timsainb/noisereduce: v1.0},
  month        = jun,
  year         = 2019,
  publisher    = {Zenodo},
  version      = {db94fe2},
  doi          = {10.5281/zenodo.3243139},
  url          = {https://doi.org/10.5281/zenodo.3243139}
}


@article{sainburg2020finding,
  title={Finding, visualizing, and quantifying latent structure across diverse animal vocal repertoires},
  author={Sainburg, Tim and Thielk, Marvin and Gentner, Timothy Q},
  journal={PLoS computational biology},
  volume={16},
  number={10},
  pages={e1008228},
  year={2020},
  publisher={Public Library of Science}
}
```

In [7]:
%%time

processing_funcs.reduce_noise(voice_dir_path = voice_dir)

Collecting all files in ../voice_data/mono_channels matching regular expression [0-9]+\_(?:L|R)\.wav.

Removing noise from the 62 files in ../voice_data/mono_channels
11.29% done...
20.97% done...
30.65% done...
40.32% done...
50.0% done...
61.29% done...
70.97% done...
80.65% done...
90.32% done...
100.0% done...
Noise reduction complete.

CPU times: user 4min 39s, sys: 22.9 s, total: 5min 1s
Wall time: 5min 6s


### Now to remove silence from the audio files:
Most of the recordings are 30 minutes long, and each one was recorded with two speakers, one per channel. Each single-channel file should therefore be reduced to just the time frames in which that individual is speaking.

In [8]:
%%time
silence_dir_path = os.path.join(mono_dir_path, 'silence_removed')
utils.make_dir(silence_dir_path)
processing_funcs.remove_silence(mono_channel_dir = mono_dir_path)

Creating directory at ../voice_data/mono_channels/silence_removed...

../voice_data/mono_channels/silence_removed already exists.

Collecting all files in ../voice_data/mono_channels matching regular expression [0-9]+\_(?:L|R)\.wav.

Removing periods of silence from 62 files...

11.29% done...
20.97% done...
30.65% done...
40.32% done...
50.0% done...
61.29% done...
70.97% done...
80.65% done...
90.32% done...
100.0% done...
Removing silence process complete.

CPU times: user 40.9 s, sys: 4.18 s, total: 45.1 s
Wall time: 46.4 s
